__init__.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. """Configurable environment variables for GPUStack."""
  2. import os
  3. # Database configuration
  4. DB_ECHO = os.getenv("GPUSTACK_DB_ECHO", "false").lower() == "true"
  5. DB_POOL_SIZE = int(os.getenv("GPUSTACK_DB_POOL_SIZE", 30))
  6. DB_MAX_OVERFLOW = int(os.getenv("GPUSTACK_DB_MAX_OVERFLOW", 20))
  7. DB_POOL_TIMEOUT = int(os.getenv("GPUSTACK_DB_POOL_TIMEOUT", 30))
  8. # Proxy configuration
  9. PROXY_TIMEOUT = int(os.getenv("GPUSTACK_PROXY_TIMEOUT_SECONDS", 1800))
  10. PROXY_UPSTREAM_IDLE_TIMEOUT = int(
  11. os.getenv("GPUSTACK_PROXY_UPSTREAM_IDLE_TIMEOUT_SECONDS", 3)
  12. )
  13. # HTTP client TCP connector configuration
  14. TCP_CONNECTOR_LIMIT = int(os.getenv("GPUSTACK_TCP_CONNECTOR_LIMIT", 1000))
  15. # JWT Expiration
  16. JWT_TOKEN_EXPIRE_MINUTES = int(os.getenv("GPUSTACK_JWT_TOKEN_EXPIRE_MINUTES", 120))
  17. # Higress plugin configuration
  18. HIGRESS_EXT_AUTH_TIMEOUT_MS = int(
  19. os.getenv("GPUSTACK_HIGRESS_EXT_AUTH_TIMEOUT_MS", 30000)
  20. )
  21. # Server Cache
  22. SERVER_CACHE_TTL_SECONDS = int(os.getenv("GPUSTACK_SERVER_CACHE_TTL_SECONDS", 600))
  23. SERVER_CACHE_LOCKS_MAX_SIZE = int(
  24. os.getenv("GPUSTACK_SERVER_CACHE_LOCKS_MAX_SIZE", 10000)
  25. )
  26. # Server event bus queue capacity. Configurable via env so large clusters can tune the buffer.
  27. EVENT_BUS_SUBSCRIBER_QUEUE_SIZE = int(
  28. os.getenv("GPUSTACK_EVENT_BUS_SUBSCRIBER_QUEUE_SIZE", 1024)
  29. )
  30. # Worker configuration
  31. WORKER_HEARTBEAT_INTERVAL = int(
  32. os.getenv("GPUSTACK_WORKER_HEARTBEAT_INTERVAL", 30)
  33. ) # in seconds
  34. WORKER_STATUS_SYNC_INTERVAL = int(
  35. os.getenv("GPUSTACK_WORKER_STATUS_SYNC_INTERVAL", 30)
  36. ) # in seconds
  37. WORKER_HEARTBEAT_GRACE_PERIOD = int(
  38. os.getenv("GPUSTACK_WORKER_HEARTBEAT_GRACE_PERIOD", 150)
  39. ) # 2.5 minutes in seconds
  40. WORKER_ORPHAN_WORKLOAD_CLEANUP_GRACE_PERIOD = int(
  41. os.getenv("GPUSTACK_WORKER_ORPHAN_WORKLOAD_CLEANUP_GRACE_PERIOD", 300)
  42. ) # 5 minutes in seconds
  43. WORKER_ORPHAN_BENCHMARK_WORKLOAD_CLEANUP_GRACE_PERIOD = int(
  44. os.getenv("GPUSTACK_WORKER_ORPHAN_BENCHMARK_WORKLOAD_CLEANUP_GRACE_PERIOD", 300)
  45. ) # 5 minutes in seconds
  46. # Worker unreachable check mode: auto, enabled, disabled
  47. # - auto: automatically disable check when worker count > 50 (default)
  48. # - enabled: always perform unreachable check
  49. # - disabled: never perform unreachable check
  50. WORKER_UNREACHABLE_CHECK_MODE = os.getenv(
  51. "GPUSTACK_WORKER_UNREACHABLE_CHECK_MODE", "auto"
  52. ).lower()
  53. # Model instance configuration
  54. MODEL_INSTANCE_RESCHEDULE_GRACE_PERIOD = int(
  55. os.getenv("GPUSTACK_MODEL_INSTANCE_RESCHEDULE_GRACE_PERIOD", 300)
  56. ) # 5 minutes in seconds
  57. MODEL_INSTANCE_HEALTH_CHECK_INTERVAL = int(
  58. os.getenv("GPUSTACK_MODEL_INSTANCE_HEALTH_CHECK_INTERVAL", 3)
  59. )
  60. DISABLE_OS_FILELOCK = os.getenv("GPUSTACK_DISABLE_OS_FILELOCK", "false").lower() in [
  61. "true",
  62. "1",
  63. ]
  64. # Opt out of automatically writing gpustack's configured port ranges to
  65. # /proc/sys/net/ipv4/ip_local_reserved_ports. Use when the environment already
  66. # manages the reservation, or when the configured ranges would starve the
  67. # ephemeral pool after reservation.
  68. SKIP_RESERVE_EPHEMERAL_PORTS = os.getenv(
  69. "GPUSTACK_SKIP_RESERVE_EPHEMERAL_PORTS", "false"
  70. ).lower() in ["true", "1"]
  71. # Add debug logs for slow worker status collection, default to 3 minutes
  72. WORKER_STATUS_COLLECTION_LOG_SLOW_SECONDS = float(
  73. os.getenv("GPUSTACK_WORKER_STATUS_COLLECTION_LOG_SLOW_SECONDS", 180)
  74. )
  75. # Model evaluation cache configuration
  76. MODEL_EVALUATION_CACHE_MAX_SIZE = int(
  77. os.getenv("GPUSTACK_MODEL_EVALUATION_CACHE_MAX_SIZE", 1000)
  78. )
  79. MODEL_EVALUATION_CACHE_TTL = int(os.getenv("GPUSTACK_MODEL_EVALUATION_CACHE_TTL", 3600))
  80. # Scheduler configuration (server-side)
  81. SCHEDULER_SCALE_UP_PLACEMENT_MAX_SCORE = float(
  82. os.getenv("GPUSTACK_SCHEDULER_SCALE_UP_PLACEMENT_MAX_SCORE", 100)
  83. )
  84. SCHEDULER_SCALE_UP_LOCALITY_MAX_SCORE = float(
  85. os.getenv("GPUSTACK_SCHEDULER_SCALE_UP_LOCALITY_MAX_SCORE", 5)
  86. )
  87. # Scale-down scoring weights (relative, normalized in score chain)
  88. SCHEDULER_SCALE_DOWN_STATUS_MAX_SCORE = float(
  89. os.getenv("GPUSTACK_SCHEDULER_SCALE_DOWN_STATUS_MAX_SCORE", 100)
  90. )
  91. SCHEDULER_SCALE_DOWN_OFFLOAD_MAX_SCORE = float(
  92. os.getenv("GPUSTACK_SCHEDULER_SCALE_DOWN_OFFLOAD_MAX_SCORE", 10)
  93. )
  94. SCHEDULER_SCALE_DOWN_PLACEMENT_MAX_SCORE = float(
  95. os.getenv("GPUSTACK_SCHEDULER_SCALE_DOWN_PLACEMENT_MAX_SCORE", 1)
  96. )
  97. MIGRATION_DATA_DIR = os.getenv("GPUSTACK_MIGRATION_DATA_DIR", None)
  98. DATA_MIGRATION = os.getenv("GPUSTACK_DATA_MIGRATION", "false").lower() == "true"
  99. GATEWAY_PORT_CHECK_INTERVAL = int(
  100. os.getenv("GPUSTACK_GATEWAY_PORT_CHECK_INTERVAL", 2)
  101. ) # in seconds
  102. GATEWAY_PORT_CHECK_RETRY_COUNT = int(
  103. os.getenv("GPUSTACK_GATEWAY_PORT_CHECK_RETRY_COUNT", 300)
  104. ) # number of retries
  105. GATEWAY_MIRROR_INGRESS_NAME = os.getenv(
  106. "GPUSTACK_GATEWAY_MIRROR_INGRESS_NAME", "gpustack"
  107. )
  108. GATEWAY_AI_STATISTICS_PLUGIN_CONTENT_TYPES = [
  109. ct.strip()
  110. for ct in os.getenv(
  111. "GPUSTACK_GATEWAY_AI_STATISTICS_PLUGIN_CONTENT_TYPES",
  112. "application/json,text/event-stream",
  113. ).split(",")
  114. if ct.strip()
  115. ]
  116. # Heuristics for partial-stream usage estimation.
  117. # Used by metrics_collector when a gateway report arrives with completed=false
  118. # (client disconnect, upstream cancel) and token fields are blank or partial.
  119. # Defaults target English-leaning GPT-style tokenizers; tune for CJK or other
  120. # tokenizer families as needed.
  121. # Clamped to >= 1 so an operator typo (e.g. ``=0``) can't make
  122. # ``_estimate_partial_usage`` divide by zero on every incomplete report.
  123. USAGE_ESTIMATED_BYTES_PER_INPUT_TOKEN = max(
  124. 1, int(os.getenv("GPUSTACK_USAGE_ESTIMATED_BYTES_PER_INPUT_TOKEN", 4))
  125. )
  126. USAGE_ESTIMATED_TOKENS_PER_OUTPUT_CHUNK = max(
  127. 1, int(os.getenv("GPUSTACK_USAGE_ESTIMATED_TOKENS_PER_OUTPUT_CHUNK", 1))
  128. )
  129. # Usage details archival.
  130. # Rows in ``model_usage_details`` older than the retention threshold (anchored
  131. # on COALESCE(completed_at, created_at)) are moved to
  132. # ``model_usage_details_archive`` by a leader-only background controller.
  133. # The controller runs once on server startup and then on the configured cron
  134. # schedule (UTC). Default ``0 3 * * *`` = daily at 03:00 UTC — picked to land
  135. # in a typical off-peak window for most regions.
  136. USAGE_DETAILS_RETENTION_MONTHS = int(
  137. os.getenv("GPUSTACK_USAGE_DETAILS_RETENTION_MONTHS", 13)
  138. )
  139. USAGE_DETAILS_ARCHIVE_CRON = os.getenv(
  140. "GPUSTACK_USAGE_DETAILS_ARCHIVE_CRON", "0 3 * * *"
  141. )
  142. # Per-batch row count for archival moves. Smaller batches keep transactions
  143. # short on environments with replication lag concerns; larger batches reduce
  144. # round-trip overhead.
  145. USAGE_DETAILS_ARCHIVE_BATCH_SIZE = int(
  146. os.getenv("GPUSTACK_USAGE_DETAILS_ARCHIVE_BATCH_SIZE", 1000)
  147. )
  148. # Hard cap on the in-memory ``gateway_details_buffer`` (per-request audit
  149. # rows held between flushes). Bounds memory growth when flushes fail
  150. # persistently (DB down, schema drift) and the failure-path re-buffer keeps
  151. # piling up alongside new ingest. Oldest entries are dropped on overflow
  152. # with a warning log; the rollup buffer is naturally bounded by key
  153. # cardinality so it does not need a separate cap.
  154. USAGE_DETAILS_BUFFER_MAX_SIZE = int(
  155. os.getenv("GPUSTACK_USAGE_DETAILS_BUFFER_MAX_SIZE", 100000)
  156. )
  157. DEFAULT_CLUSTER_KUBERNETES = (
  158. os.getenv("GPUSTACK_DEFAULT_CLUSTER_KUBERNETES", "false").lower() == "true"
  159. )
  160. # Benchmark configuration
  161. BENCHMARK_DATASET_SHAREGPT_PATH = os.getenv(
  162. "GPUSTACK_BENCHMARK_DATASET_SHAREGPT_PATH",
  163. "/workspace/benchmark-runner/sharegpt_data/ShareGPT_V3_unfiltered_cleaned_split.json",
  164. )
  165. BENCHMARK_REQUEST_TIMEOUT = int(
  166. os.getenv("GPUSTACK_BENCHMARK_REQUEST_TIMEOUT", 3600) # 1 hour
  167. ) # in seconds