| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191 |
"""Configurable environment variables for GPUStack."""
import os
# Database configuration
# DB_ECHO is a boolean switch: "true" or "1" (case-insensitive) turns it on,
# the same truthy set used by DISABLE_OS_FILELOCK and
# SKIP_RESERVE_EPHEMERAL_PORTS. Any other value leaves it off.
DB_ECHO = os.getenv("GPUSTACK_DB_ECHO", "false").lower() in ("true", "1")
# Integer pool settings; a non-numeric value raises ValueError at import time.
DB_POOL_SIZE = int(os.getenv("GPUSTACK_DB_POOL_SIZE", 30))
DB_MAX_OVERFLOW = int(os.getenv("GPUSTACK_DB_MAX_OVERFLOW", 20))
DB_POOL_TIMEOUT = int(os.getenv("GPUSTACK_DB_POOL_TIMEOUT", 30))
# Proxy configuration
# Both values are read from *_SECONDS variables and parsed as integers.
PROXY_TIMEOUT = int(os.getenv("GPUSTACK_PROXY_TIMEOUT_SECONDS", 1800))
PROXY_UPSTREAM_IDLE_TIMEOUT = int(
    os.getenv("GPUSTACK_PROXY_UPSTREAM_IDLE_TIMEOUT_SECONDS", 3)
)
# HTTP client TCP connector configuration
TCP_CONNECTOR_LIMIT = int(os.getenv("GPUSTACK_TCP_CONNECTOR_LIMIT", 1000))

# JWT expiration, in minutes (per the variable name)
JWT_TOKEN_EXPIRE_MINUTES = int(os.getenv("GPUSTACK_JWT_TOKEN_EXPIRE_MINUTES", 120))
# Higress plugin configuration
# Timeout in milliseconds (per the variable name), parsed as an integer.
HIGRESS_EXT_AUTH_TIMEOUT_MS = int(
    os.getenv("GPUSTACK_HIGRESS_EXT_AUTH_TIMEOUT_MS", 30000)
)
# Server cache
# TTL is in seconds (per the variable name); the locks limit is an entry count.
SERVER_CACHE_TTL_SECONDS = int(os.getenv("GPUSTACK_SERVER_CACHE_TTL_SECONDS", 600))
SERVER_CACHE_LOCKS_MAX_SIZE = int(
    os.getenv("GPUSTACK_SERVER_CACHE_LOCKS_MAX_SIZE", 10000)
)
# Server event bus queue capacity. Configurable via env so large clusters can
# tune the buffer.
EVENT_BUS_SUBSCRIBER_QUEUE_SIZE = int(
    os.getenv("GPUSTACK_EVENT_BUS_SUBSCRIBER_QUEUE_SIZE", 1024)
)
# Worker configuration
# All values below are integer durations in seconds.
WORKER_HEARTBEAT_INTERVAL = int(
    os.getenv("GPUSTACK_WORKER_HEARTBEAT_INTERVAL", 30)
)  # in seconds
WORKER_STATUS_SYNC_INTERVAL = int(
    os.getenv("GPUSTACK_WORKER_STATUS_SYNC_INTERVAL", 30)
)  # in seconds
WORKER_HEARTBEAT_GRACE_PERIOD = int(
    os.getenv("GPUSTACK_WORKER_HEARTBEAT_GRACE_PERIOD", 150)
)  # 2.5 minutes in seconds
WORKER_ORPHAN_WORKLOAD_CLEANUP_GRACE_PERIOD = int(
    os.getenv("GPUSTACK_WORKER_ORPHAN_WORKLOAD_CLEANUP_GRACE_PERIOD", 300)
)  # 5 minutes in seconds
WORKER_ORPHAN_BENCHMARK_WORKLOAD_CLEANUP_GRACE_PERIOD = int(
    os.getenv("GPUSTACK_WORKER_ORPHAN_BENCHMARK_WORKLOAD_CLEANUP_GRACE_PERIOD", 300)
)  # 5 minutes in seconds
# Worker unreachable check mode: auto, enabled, disabled
# - auto: automatically disable check when worker count > 50 (default)
# - enabled: always perform unreachable check
# - disabled: never perform unreachable check
# The value is lower-cased here but not validated against the three modes.
WORKER_UNREACHABLE_CHECK_MODE = os.getenv(
    "GPUSTACK_WORKER_UNREACHABLE_CHECK_MODE", "auto"
).lower()
# Model instance configuration
MODEL_INSTANCE_RESCHEDULE_GRACE_PERIOD = int(
    os.getenv("GPUSTACK_MODEL_INSTANCE_RESCHEDULE_GRACE_PERIOD", 300)
)  # 5 minutes in seconds
# NOTE(review): unit is not stated here; presumably seconds like the other
# intervals in this module — confirm against the health checker.
MODEL_INSTANCE_HEALTH_CHECK_INTERVAL = int(
    os.getenv("GPUSTACK_MODEL_INSTANCE_HEALTH_CHECK_INTERVAL", 3)
)
# Boolean switch: "true" or "1" (case-insensitive) turns it on; anything else,
# including unset, leaves it off.
DISABLE_OS_FILELOCK = os.getenv("GPUSTACK_DISABLE_OS_FILELOCK", "false").lower() in [
    "true",
    "1",
]
# Opt out of automatically writing gpustack's configured port ranges to
# /proc/sys/net/ipv4/ip_local_reserved_ports. Use when the environment already
# manages the reservation, or when the configured ranges would starve the
# ephemeral pool after reservation.
# Truthy values are "true" and "1" (case-insensitive).
SKIP_RESERVE_EPHEMERAL_PORTS = os.getenv(
    "GPUSTACK_SKIP_RESERVE_EPHEMERAL_PORTS", "false"
).lower() in ["true", "1"]
# Add debug logs for slow worker status collection, default to 3 minutes.
# Parsed as a float, so fractional seconds are accepted.
WORKER_STATUS_COLLECTION_LOG_SLOW_SECONDS = float(
    os.getenv("GPUSTACK_WORKER_STATUS_COLLECTION_LOG_SLOW_SECONDS", 180)
)
# Model evaluation cache configuration
MODEL_EVALUATION_CACHE_MAX_SIZE = int(
    os.getenv("GPUSTACK_MODEL_EVALUATION_CACHE_MAX_SIZE", 1000)
)
# NOTE(review): TTL unit is not stated here; the 3600 default suggests
# seconds (one hour) — confirm against the cache implementation.
MODEL_EVALUATION_CACHE_TTL = int(os.getenv("GPUSTACK_MODEL_EVALUATION_CACHE_TTL", 3600))
# Scheduler configuration (server-side)
# Every score below is parsed as a float.
SCHEDULER_SCALE_UP_PLACEMENT_MAX_SCORE = float(
    os.getenv("GPUSTACK_SCHEDULER_SCALE_UP_PLACEMENT_MAX_SCORE", 100)
)
SCHEDULER_SCALE_UP_LOCALITY_MAX_SCORE = float(
    os.getenv("GPUSTACK_SCHEDULER_SCALE_UP_LOCALITY_MAX_SCORE", 5)
)

# Scale-down scoring weights (relative, normalized in score chain)
SCHEDULER_SCALE_DOWN_STATUS_MAX_SCORE = float(
    os.getenv("GPUSTACK_SCHEDULER_SCALE_DOWN_STATUS_MAX_SCORE", 100)
)
SCHEDULER_SCALE_DOWN_OFFLOAD_MAX_SCORE = float(
    os.getenv("GPUSTACK_SCHEDULER_SCALE_DOWN_OFFLOAD_MAX_SCORE", 10)
)
SCHEDULER_SCALE_DOWN_PLACEMENT_MAX_SCORE = float(
    os.getenv("GPUSTACK_SCHEDULER_SCALE_DOWN_PLACEMENT_MAX_SCORE", 1)
)
# Data migration settings.
# MIGRATION_DATA_DIR is None when the variable is unset.
MIGRATION_DATA_DIR = os.getenv("GPUSTACK_MIGRATION_DATA_DIR", None)
# Boolean switch: "true" or "1" (case-insensitive), the same truthy set used
# by DISABLE_OS_FILELOCK and SKIP_RESERVE_EPHEMERAL_PORTS.
DATA_MIGRATION = os.getenv("GPUSTACK_DATA_MIGRATION", "false").lower() in (
    "true",
    "1",
)
# Gateway port check: delay between checks and how many times to retry.
GATEWAY_PORT_CHECK_INTERVAL = int(
    os.getenv("GPUSTACK_GATEWAY_PORT_CHECK_INTERVAL", 2)
)  # in seconds
GATEWAY_PORT_CHECK_RETRY_COUNT = int(
    os.getenv("GPUSTACK_GATEWAY_PORT_CHECK_RETRY_COUNT", 300)
)  # number of retries
# Name of the gateway mirror ingress; kept as a plain string.
GATEWAY_MIRROR_INGRESS_NAME = os.getenv(
    "GPUSTACK_GATEWAY_MIRROR_INGRESS_NAME", "gpustack"
)
# Content types for the gateway AI statistics plugin, given as a
# comma-separated list. Each entry is stripped of surrounding whitespace and
# empty entries (e.g. from a trailing comma) are dropped.
GATEWAY_AI_STATISTICS_PLUGIN_CONTENT_TYPES = [
    content_type
    for content_type in (
        part.strip()
        for part in os.getenv(
            "GPUSTACK_GATEWAY_AI_STATISTICS_PLUGIN_CONTENT_TYPES",
            "application/json,text/event-stream",
        ).split(",")
    )
    if content_type
]
# Heuristics for partial-stream usage estimation.
# Used by metrics_collector when a gateway report arrives with completed=false
# (client disconnect, upstream cancel) and token fields are blank or partial.
# Defaults target English-leaning GPT-style tokenizers; tune for CJK or other
# tokenizer families as needed.
# Clamped to >= 1 so an operator typo (e.g. ``=0``) can't make
# ``_estimate_partial_usage`` divide by zero on every incomplete report.
USAGE_ESTIMATED_BYTES_PER_INPUT_TOKEN = max(
    1, int(os.getenv("GPUSTACK_USAGE_ESTIMATED_BYTES_PER_INPUT_TOKEN", 4))
)
USAGE_ESTIMATED_TOKENS_PER_OUTPUT_CHUNK = max(
    1, int(os.getenv("GPUSTACK_USAGE_ESTIMATED_TOKENS_PER_OUTPUT_CHUNK", 1))
)
# Usage details archival.
# Rows in ``model_usage_details`` older than the retention threshold (anchored
# on COALESCE(completed_at, created_at)) are moved to
# ``model_usage_details_archive`` by a leader-only background controller.
# The controller runs once on server startup and then on the configured cron
# schedule (UTC). Default ``0 3 * * *`` = daily at 03:00 UTC — picked to land
# in a typical off-peak window for most regions.
USAGE_DETAILS_RETENTION_MONTHS = int(
    os.getenv("GPUSTACK_USAGE_DETAILS_RETENTION_MONTHS", 13)
)
# The cron expression is stored as-is; it is not parsed or validated here.
USAGE_DETAILS_ARCHIVE_CRON = os.getenv(
    "GPUSTACK_USAGE_DETAILS_ARCHIVE_CRON", "0 3 * * *"
)
# Per-batch row count for archival moves. Smaller batches keep transactions
# short on environments with replication lag concerns; larger batches reduce
# round-trip overhead.
USAGE_DETAILS_ARCHIVE_BATCH_SIZE = int(
    os.getenv("GPUSTACK_USAGE_DETAILS_ARCHIVE_BATCH_SIZE", 1000)
)
# Hard cap on the in-memory ``gateway_details_buffer`` (per-request audit
# rows held between flushes). Bounds memory growth when flushes fail
# persistently (DB down, schema drift) and the failure-path re-buffer keeps
# piling up alongside new ingest. Oldest entries are dropped on overflow
# with a warning log; the rollup buffer is naturally bounded by key
# cardinality so it does not need a separate cap.
USAGE_DETAILS_BUFFER_MAX_SIZE = int(
    os.getenv("GPUSTACK_USAGE_DETAILS_BUFFER_MAX_SIZE", 100000)
)
# Boolean switch: "true" or "1" (case-insensitive) turns it on, the same
# truthy set used by DISABLE_OS_FILELOCK and SKIP_RESERVE_EPHEMERAL_PORTS.
# NOTE(review): presumably makes the default cluster a Kubernetes cluster —
# confirm against the consumer of this flag.
DEFAULT_CLUSTER_KUBERNETES = os.getenv(
    "GPUSTACK_DEFAULT_CLUSTER_KUBERNETES", "false"
).lower() in ("true", "1")
# Benchmark configuration
# Path to the ShareGPT dataset file; kept as a plain string and not checked
# for existence here.
BENCHMARK_DATASET_SHAREGPT_PATH = os.getenv(
    "GPUSTACK_BENCHMARK_DATASET_SHAREGPT_PATH",
    "/workspace/benchmark-runner/sharegpt_data/ShareGPT_V3_unfiltered_cleaned_split.json",
)
BENCHMARK_REQUEST_TIMEOUT = int(
    os.getenv("GPUSTACK_BENCHMARK_REQUEST_TIMEOUT", 3600)  # 1 hour
)  # in seconds
|