sglang.py 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791
  1. import logging
  2. import os
  3. from typing import Dict, List, Optional, Tuple
  4. from gpustack_runtime.detector import ManufacturerEnum
  5. from packaging.version import Version
  6. from packaging.specifiers import SpecifierSet
  7. from gpustack_runtime.deployer import (
  8. Container,
  9. ContainerEnv,
  10. ContainerExecution,
  11. ContainerProfileEnum,
  12. WorkloadPlan,
  13. create_workload,
  14. ContainerRestartPolicyEnum,
  15. )
  16. from gpustack_runtime.deployer.__utils__ import compare_versions
  17. from gpustack.scheduler.model_registry import is_multimodal_model
  18. from gpustack.schemas.models import (
  19. ModelInstance,
  20. SpeculativeAlgorithmEnum,
  21. CategoryEnum,
  22. ModelInstanceDeploymentMetadata,
  23. )
  24. from gpustack.utils.command import (
  25. find_bool_parameter,
  26. find_parameter,
  27. extend_args_no_exist,
  28. format_backend_parameters,
  29. )
  30. from gpustack.utils.envs import sanitize_env
  31. from gpustack.worker.backends.base import (
  32. InferenceServer,
  33. cal_distributed_parallelism_arguments,
  34. is_ascend,
  35. is_ascend_310p,
  36. )
  37. logger = logging.getLogger(__name__)
  class SGLangServer(InferenceServer):
      """
      Containerized SGLang inference server backend using gpustack-runtime.

      This backend runs SGLang in a Docker container managed by gpustack-runtime,
      providing better isolation, resource management, and deployment consistency.

      Models whose categories include ``CategoryEnum.IMAGE`` are started through
      the diffusion path (``sglang serve``); all other models are started with
      ``python -m sglang.launch_server``.
      """

      # Set to True by start() when the model is served through the diffusion path.
      is_diffusion = False

      def start(self):  # noqa: C901
          """
          Start the model instance.

          Picks the diffusion or the regular launch path based on the model
          categories. Any exception raised on the way is handed to
          ``_handle_error`` instead of propagating to the caller.
          """
          try:
              if CategoryEnum.IMAGE in self._model.categories:
                  self.is_diffusion = True
                  self._start_diffusion()
              else:
                  self._start()
          except Exception as e:
              self._handle_error(e)

      def _start(self):
          """
          Launch a regular (non-diffusion) SGLang server workload.

          Raises:
              ValueError: If no compatible SGLang image can be resolved.
          """
          logger.info(f"Starting SGLang model instance: {self._model_instance.name}")

          deployment_metadata = self._get_deployment_metadata()

          # Setup environment variables
          env = self._get_configured_env(
              is_distributed=deployment_metadata.distributed,
          )

          image, command, command_script = self._resolve_image_and_entrypoint(env)

          # Build SGLang command arguments
          command_args, injected = self._build_command_args(
              port=self._get_serving_port(),
              is_distributed=deployment_metadata.distributed,
              is_distributed_leader=deployment_metadata.distributed_leader,
              entrypoint=command,
          )
          self._record_injected_parameters(injected)

          self._create_workload(
              deployment_metadata=deployment_metadata,
              command=command,
              command_script=command_script,
              command_args=command_args,
              env=env,
              image=image,
          )

      def _start_diffusion(self):
          """
          Launch an SGLang Diffusion server workload.

          The diffusion path never sets up the distributed environment and
          always requests non-distributed parallelism arguments.

          Raises:
              ValueError: If no compatible SGLang image can be resolved.
          """
          logger.info(
              f"Starting SGLang Diffusion model instance: {self._model_instance.name}"
          )

          deployment_metadata = self._get_deployment_metadata()

          # Setup environment variables
          env = self._get_configured_env(
              is_distributed=False,
          )

          image, command, command_script = self._resolve_image_and_entrypoint(env)

          command_args, injected = self._build_command_args_for_diffusion(
              port=self._get_serving_port(),
              entrypoint=command,
          )
          self._record_injected_parameters(injected)

          self._create_workload(
              deployment_metadata=deployment_metadata,
              command=command,
              command_script=command_script,
              command_args=command_args,
              env=env,
              image=image,
          )

      def _resolve_image_and_entrypoint(
          self, env: Dict[str, str]
      ) -> Tuple[str, Optional[List[str]], Optional[str]]:
          """
          Resolve the container image, entrypoint command and command script.

          Shared by the regular and the diffusion launch paths.

          Returns:
              A tuple of (image, command, command_script). ``command`` is None
              when no inference backend is configured.

          Raises:
              ValueError: If no compatible SGLang image can be resolved.
          """
          # Resolve image first so that backend_version is populated before
          # building command args (version-gated arguments depend on it).
          image = self._get_configured_image()
          if not image:
              raise ValueError("Can't find compatible SGLang image")

          command = None
          if self.inference_backend:
              command = self.inference_backend.get_container_entrypoint(
                  self._model.backend_version
              )

          command_script = self._get_serving_command_script(env)
          return image, command, command_script

      def _record_injected_parameters(self, injected: List[str]):
          """
          Persist the GPUStack-injected backend parameters on the model instance.

          This is best-effort: a failure is logged as a warning and does not
          abort the launch.
          """
          try:
              self._update_model_instance(
                  self._model_instance.id,
                  injected_backend_parameters=format_backend_parameters(injected) or None,
              )
          except Exception as e:
              logger.warning(
                  f"Failed to persist injected backend parameters for {self._model_instance.name}: {e}"
              )

      def _create_workload(
          self,
          deployment_metadata: ModelInstanceDeploymentMetadata,
          command: Optional[List[str]],
          command_script: Optional[str],
          command_args: List[str],
          env: Dict[str, str],
          image: str,
      ):
          """
          Build the container and workload plan, then create the workload.

          Args:
              deployment_metadata: Supplies the workload name.
              command: Container entrypoint, if any.
              command_script: Optional script that overrides ``command``.
              command_args: Arguments passed to the entrypoint.
              env: Environment variables for the container.
              image: Container image to run.

          Raises:
              ValueError: If a diffusion model is requested on an SGLang
                  version older than 0.5.5.
          """
          if (
              self.is_diffusion
              and compare_versions(self._model.backend_version, "0.5.5") < 0
          ):
              # The check rejects versions strictly older than 0.5.5, so the
              # message must not claim that 0.5.5 itself is unsupported.
              raise ValueError(
                  "SGLang versions < 0.5.5 do not support Diffusion models."
              )

          # Command script will override the given command,
          # so we need to prepend command to command args.
          if command_script and command:
              command_args = command + command_args
              command = None

          resources = self._get_configured_resources()
          mounts = self._get_configured_mounts()
          ports = self._get_configured_ports()

          # Read container config from environment variables
          container_config = self._get_container_env_config(env)

          run_container = Container(
              image=image,
              name="default",
              profile=ContainerProfileEnum.RUN,
              restart_policy=ContainerRestartPolicyEnum.NEVER,
              execution=ContainerExecution(
                  privileged=True,
                  command=command,
                  command_script=command_script,
                  args=command_args,
                  run_as_user=container_config.user,
                  run_as_group=container_config.group,
              ),
              envs=[
                  ContainerEnv(
                      name=name,
                      value=value,
                  )
                  for name, value in env.items()
              ],
              resources=resources,
              mounts=mounts,
              ports=ports,
          )

          logger.info(f"Creating SGLang container workload: {deployment_metadata.name}")
          logger.info(
              f"With image: {image}, "
              f"command: [{' '.join(command) if command else ''}], "
              f"arguments: [{' '.join(command_args)}], "
              f"ports: [{','.join([str(port.internal) for port in ports])}], "
              f"envs(inconsistent input items mean unchangeable):{os.linesep}"
              f"{os.linesep.join(f'{k}={v}' for k, v in sorted(sanitize_env(env).items()))}"
          )

          workload_plan = WorkloadPlan(
              name=deployment_metadata.name,
              host_network=True,
              # shm_size_gib is expressed in GiB; the plan takes bytes.
              shm_size=int(container_config.shm_size_gib * (1 << 30)),
              containers=[run_container],
              run_as_user=container_config.user,
              run_as_group=container_config.group,
          )
          create_workload(self._transform_workload_plan(workload_plan))

          logger.info(f"Created SGLang container workload: {deployment_metadata.name}")

      def _get_configured_env(self, is_distributed: bool) -> Dict[str, str]:
          """
          Get environment variables for SGLang service.

          Starts from the base class environment and fills in defaults; values
          already present in the environment always win over the defaults.

          Args:
              is_distributed: Whether to add the distributed (multi-node)
                  network interface variables.
          """
          # Apply GPUStack's inference environment setup
          env = super()._get_configured_env()

          # Optimize environment variables.
          # Each entry is re-inserted via pop(), which keeps an existing value
          # and only falls back to the default when the key is absent.
          defaults = (
              # -- Default OpenMP to a single thread to avoid resource contention
              #    during model loading.
              ("OMP_NUM_THREADS", "1"),
              # -- Enable safetensors GPU loading pass-through for faster model loading.
              ("SAFETENSORS_FAST_GPU", "1"),
              # -- Observe RUN:AI streamer model loading.
              ("RUNAI_STREAMER_MEMORY_LIMIT", "0"),
              ("RUNAI_STREAMER_LOG_TO_STDERR", "1"),
              ("RUNAI_STREAMER_LOG_LEVEL", "INFO"),
          )
          for name, default in defaults:
              env[name] = env.pop(name, default)

          # Apply distributed environment variables
          if is_distributed:
              self._set_distributed_env(env)

          # Apply Ascend-specific environment variables
          if is_ascend(self._get_selected_gpu_devices()):
              self._set_ascend_env(env)

          return env

      def _set_distributed_env(self, env: Dict[str, str]):
          """
          Set up environment variables for distributed execution.

          Binds the collective-communication sockets to the worker's network
          interface unless the user already chose one (HCCL on Ascend, NCCL
          otherwise).
          """
          # NOTE(review): the nesting of the two branches below was reconstructed
          # from a view without indentation — confirm against the repository.
          if is_ascend(self._get_selected_gpu_devices()):
              # See https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi-node_dsv3.2.html.
              if "HCCL_SOCKET_IFNAME" not in env:
                  env["HCCL_IF_IP"] = self._worker.ip
                  env["HCCL_SOCKET_IFNAME"] = f"={self._worker.ifname}"
                  env["GLOO_SOCKET_IFNAME"] = self._worker.ifname
                  env["TP_SOCKET_IFNAME"] = self._worker.ifname
              return

          if "NCCL_SOCKET_IFNAME" not in env:
              env["NCCL_SOCKET_IFNAME"] = f"={self._worker.ifname}"
              env["GLOO_SOCKET_IFNAME"] = self._worker.ifname

      def _set_ascend_env(self, env: Dict[str, str]):
          """
          Set up environment variables for Ascend devices.

          Existing values are kept; only missing keys receive the defaults.
          """
          # -- Optimize Pytorch NPU operations delivery performance.
          env["TASK_QUEUE_ENABLE"] = env.pop("TASK_QUEUE_ENABLE", "1")
          # -- Enable NUMA coarse-grained binding.
          env["CPU_AFFINITY_CONF"] = env.pop("CPU_AFFINITY_CONF", "1")
          # -- Reuse memory in multi-streams.
          env["PYTORCH_NPU_ALLOC_CONF"] = env.pop(
              "PYTORCH_NPU_ALLOC_CONF", "expandable_segments:True"
          )
          # -- Increase HCCL connection timeout to avoid issues in large clusters.
          env["HCCL_CONNECT_TIMEOUT"] = env.pop("HCCL_CONNECT_TIMEOUT", "7200")
          # -- Enable RDMA PCIe direct post with no strict mode for better performance.
          env["HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT"] = env.pop(
              "HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT", "TRUE"
          )
          # NOTE(review): both settings below are assumed to be skipped on 310P;
          # the nesting was reconstructed from a view without indentation.
          if not is_ascend_310p(self._get_selected_gpu_devices()):
              # -- Disable HCCL execution timeout for better stability.
              env["HCCL_EXEC_TIMEOUT"] = env.pop("HCCL_EXEC_TIMEOUT", "0")
              # -- Let AI Vector schedule the communication directly with ROCE,
              #    instead of AI CPU.
              env["HCCL_OP_EXPANSION_MODE"] = env.pop("HCCL_OP_EXPANSION_MODE", "AIV")

      def _build_command_args(
          self,
          port: int,
          is_distributed: bool,
          is_distributed_leader: bool,
          entrypoint: Optional[List[str]] = None,
      ) -> Tuple[List[str], List[str]]:
          """
          Build SGLang command arguments for container execution.

          Args:
              port: Serving port, used unless the arguments already set one.
              is_distributed: Whether the instance spans multiple workers.
              is_distributed_leader: Whether this worker is the leader node.
              entrypoint: Container entrypoint, forwarded to the helper that
                  computes the injected parameters.

          Returns:
              A tuple of (full_arguments, injected_backend_parameters) where
              injected_backend_parameters contains only the arguments automatically
              added by GPUStack, excluding the entrypoint and user-specified
              backend parameters.
          """
          arguments = [
              "python",
              "-m",
              "sglang.launch_server",
              "--model-path",
              self._model_path,
          ]
          # Allow version-specific command override if configured (before appending extra args)
          arguments = self.build_versioned_command_args(arguments)

          # Cap the context length at 8192 when the user did not choose one and
          # the derived model length is larger.
          specified_max_model_len = find_parameter(
              self._model.backend_parameters,
              ["context-length"],
          )
          if specified_max_model_len is None:
              derived_max_model_len = self._derive_max_model_len()
              if derived_max_model_len and derived_max_model_len > 8192:
                  arguments.extend(["--context-length", "8192"])

          # Add auto parallelism arguments if needed
          arguments.extend(
              get_auto_parallelism_arguments(
                  self._model.backend_parameters, self._model_instance, is_distributed
              )
          )

          # Add metrics arguments if needed
          arguments.extend(
              get_metrics_arguments(self._model.backend_parameters, self._model.env)
          )

          # Suppress high-frequency /metrics access logs by default.
          arguments.extend(
              get_access_log_arguments(
                  self._model.backend_parameters, self._model.backend_version
              )
          )

          # Expose prefix-cache hits as cached_tokens in OpenAI usage.
          arguments.extend(
              get_cache_report_arguments(
                  self._model.backend_parameters, self._model.backend_version
              )
          )

          # Add multimodal argument if needed (evaluated once, reused for Ascend below)
          is_multimodal = is_multimodal_model(self._get_model_architecture())
          if is_multimodal:
              arguments.append("--enable-multimodal")

          # Add speculative config arguments if needed
          arguments.extend(self._get_speculative_arguments())

          # Add multi-node deployment parameters if needed
          if is_distributed:
              arguments.extend(
                  self._get_multinode_arguments(
                      is_distributed_leader=is_distributed_leader
                  )
              )

          # Add hierarchical cache arguments if needed
          arguments.extend(self._get_hicache_arguments())

          # Use the scheduler-computed VRAM utilization unless the user set one.
          resource_claim = self._model_instance.computed_resource_claim
          if resource_claim and resource_claim.vram_utilization:
              input_utilization = find_parameter(
                  self._model.backend_parameters, ["mem-fraction-static"]
              )
              if not input_utilization:
                  arguments.extend(
                      [
                          "--mem-fraction-static",
                          str(resource_claim.vram_utilization),
                      ]
                  )

          # Add platform-specific parameters before user params so they appear in injected slice.
          if is_ascend(self._get_selected_gpu_devices()):
              # See https://github.com/sgl-project/sglang/pull/7722.
              arguments.extend(["--attention-backend", "ascend"])
              if is_multimodal:
                  arguments.extend(["--mm-attention-backend", "ascend_attn"])

          # Add user-defined backend parameters
          user_backend_parameters = self._flatten_backend_param()
          arguments.extend(user_backend_parameters)

          # Set host and port.
          extend_args_no_exist(
              arguments, ("--host", self._worker.ip), ("--port", str(port))
          )

          injected = self._get_injected_backend_parameters(
              arguments, user_backend_parameters, entrypoint
          )
          return arguments, injected

      def _build_command_args_for_diffusion(
          self, port: int, entrypoint: Optional[List[str]] = None
      ) -> Tuple[List[str], List[str]]:
          """
          Build ``sglang serve`` command arguments for diffusion models.

          Args:
              port: Serving port, used unless the arguments already set one.
              entrypoint: Container entrypoint, forwarded to the helper that
                  computes the injected parameters.

          Returns:
              A tuple of (full_arguments, injected_backend_parameters).
          """
          arguments = [
              "sglang",
              "serve",
              "--model-path",
              self._model_path,
          ]
          # Allow version-specific command override if configured (before appending extra args)
          arguments = self.build_versioned_command_args(arguments)

          # Add auto parallelism arguments if needed
          arguments.extend(
              get_auto_parallelism_arguments(
                  self._model.backend_parameters, self._model_instance, False
              )
          )

          arguments.extend(self._get_attention_backend_for_diffusion())

          # Add user-defined backend parameters
          user_backend_parameters = self._flatten_backend_param()
          arguments.extend(user_backend_parameters)

          # Set host and port.
          extend_args_no_exist(
              arguments, ("--host", self._worker.ip), ("--port", str(port))
          )

          injected = self._get_injected_backend_parameters(
              arguments, user_backend_parameters, entrypoint
          )
          return arguments, injected

      def _get_attention_backend_for_diffusion(self) -> List[str]:
          """
          Choose a default attention backend for diffusion models.

          Returns ``["--attention-backend", "torch_sdpa"]`` when every selected
          device is an NVIDIA device and at least one of them has a compute
          capability that is missing, unparsable, or outside ``>=8.0,<9.0``.
          Returns an empty list otherwise, including when the user already
          passed ``attention-backend``.
          """
          if (
              find_parameter(self._model.backend_parameters, ["attention-backend"])
              is not None
          ):
              return []

          devices = self._get_selected_gpu_devices()
          if not devices or not all(
              (d.vendor or "").lower() == ManufacturerEnum.NVIDIA for d in devices
          ):
              return []

          spec = SpecifierSet(">=8.0,<9.0")
          for d in devices:
              cap = d.compute_capability
              try:
                  v = Version(cap) if cap is not None else None
              except Exception:
                  # An unparsable capability is treated like a missing one.
                  v = None
              if v is None or v not in spec:
                  return ["--attention-backend", "torch_sdpa"]
          return []

      def _get_hicache_arguments(self) -> List[str]:
          """
          Get hierarchical KV cache arguments for SGLang.

          Returns an empty list when the extended KV cache is not enabled;
          otherwise enables the hierarchical cache and forwards every positive
          size setting.
          """
          extended_kv_cache = self._model.extended_kv_cache
          if not (extended_kv_cache and extended_kv_cache.enabled):
              return []

          arguments = ["--enable-hierarchical-cache"]
          optional_flags = (
              ("--page-size", extended_kv_cache.chunk_size),
              ("--hicache-size", extended_kv_cache.ram_size),
              ("--hicache-ratio", extended_kv_cache.ram_ratio),
          )
          for flag, value in optional_flags:
              if value and value > 0:
                  arguments.extend([flag, str(value)])
          return arguments

      def _get_multinode_arguments(self, is_distributed_leader: bool) -> List[str]:
          """
          Get multi-node deployment arguments for SGLang.

          Args:
              is_distributed_leader: Whether this worker is the leader, which
                  always gets node rank 0.

          Returns:
              ``--nnodes``, ``--node-rank`` and ``--dist-init-addr`` arguments,
              or an empty list when there are no subordinate workers.

          Raises:
              ValueError: If the model instance has fewer than two ports, since
                  the second port is used for distributed initialization.
          """
          # Check if this is a multi-node deployment
          distributed_servers = self._model_instance.distributed_servers
          if not (distributed_servers and distributed_servers.subordinate_workers):
              return []

          subordinate_workers = distributed_servers.subordinate_workers
          total_nodes = len(subordinate_workers) + 1  # +1 for the current node

          # During distributed setup,
          # we must get more than one port here,
          # so we use ports[1] for distributed initialization.
          ports = self._model_instance.ports or []
          if len(ports) < 2:
              raise ValueError(
                  "SGLang multi-node deployment requires at least two ports "
                  f"on the model instance, got {len(ports)}."
              )

          # Determine node rank based on worker IP
          current_worker_ip = self._worker.ip
          node_rank = 0  # Default to 0 (master node)
          if not is_distributed_leader:
              for idx, worker in enumerate(subordinate_workers):
                  if worker.worker_ip == current_worker_ip:
                      node_rank = idx + 1  # Subordinate workers start from rank 1
                      break
              else:
                  # Rank stays 0, which collides with the leader's rank.
                  logger.warning(
                      f"Worker {current_worker_ip} is not listed as a subordinate "
                      f"worker of {self._model_instance.name}; using node rank 0"
                  )

          # Add multi-node parameters
          return [
              "--nnodes",
              str(total_nodes),
              "--node-rank",
              str(node_rank),
              "--dist-init-addr",
              f"{self._model_instance.worker_ip}:{ports[1]}",
          ]

      def _get_speculative_arguments(self) -> List[str]:
          """
          Get speculative arguments for SGLang.

          Returns an empty list when speculative decoding is not enabled.
          Default ``--speculative-num-steps`` / ``--speculative-eagle-topk``
          values are added only when the user specified neither of them.
          """
          speculative_config = self._model.speculative_config
          if not speculative_config or not speculative_config.enabled:
              return []

          sglang_speculative_algorithm_mapping = {
              SpeculativeAlgorithmEnum.EAGLE3: "EAGLE3",
              SpeculativeAlgorithmEnum.MTP: "EAGLE",  # SGLang uses "EAGLE" for MTP
              SpeculativeAlgorithmEnum.NGRAM: "NGRAM",
          }

          arguments = []
          method = sglang_speculative_algorithm_mapping.get(
              speculative_config.algorithm, None
          )
          if method:
              arguments.extend(["--speculative-algorithm", method])

          # NOTE(review): the flags below are treated as independent of `method`;
          # the nesting was reconstructed from a view without indentation.
          if speculative_config.num_draft_tokens:
              arguments.extend(
                  [
                      "--speculative-num-draft-tokens",
                      str(speculative_config.num_draft_tokens),
                  ]
              )
          if speculative_config.ngram_max_match_length:
              arguments.extend(
                  [
                      "--speculative-ngram-max-match-window-size",
                      str(speculative_config.ngram_max_match_length),
                  ]
              )
          if speculative_config.ngram_min_match_length:
              arguments.extend(
                  [
                      "--speculative-ngram-min-match-window-size",
                      str(speculative_config.ngram_min_match_length),
                  ]
              )
          if speculative_config.draft_model and self._draft_model_path:
              arguments.extend(
                  [
                      "--speculative-draft-model",
                      self._draft_model_path,
                  ]
              )

          num_steps = find_parameter(
              self._model.backend_parameters, ["speculative-num-steps"]
          )
          topk = find_parameter(
              self._model.backend_parameters, ["speculative-eagle-topk"]
          )
          if num_steps is None and topk is None:
              default_steps, default_topk = self._get_default_speculative_steps_topk()
              arguments.extend(
                  [
                      "--speculative-num-steps",
                      str(default_steps),
                      "--speculative-eagle-topk",
                      str(default_topk),
                  ]
              )
          return arguments

      def _get_default_speculative_steps_topk(self) -> Tuple[int, int]:
          """
          Get the default speculative steps and topk for SGLang.

          The decision is based on the first entry of the pretrained config's
          ``architectures`` list.

          Ref: https://github.com/sgl-project/sglang/blob/67fca6b297bf0202941bde7b608c6da14f6a8776/python/sglang/srt/server_args.py#L4363

          Returns:
              ``(3, 1)`` for the listed architectures, ``(5, 4)`` otherwise.
          """
          architectures = getattr(self._pretrained_config, "architectures", []) or []
          arch = architectures[0] if architectures else ""
          if arch in (
              "DeepseekV32ForCausalLM",
              "DeepseekV3ForCausalLM",
              "DeepseekV2ForCausalLM",
              "GptOssForCausalLM",
              "BailingMoeForCausalLM",
              "BailingMoeV2ForCausalLM",
          ):
              return (3, 1)
          # The default value for all other models
          return (5, 4)
  def get_auto_parallelism_arguments(
      backend_parameters: List[str],
      model_instance: ModelInstance,
      is_distributed: bool,
  ) -> List[str]:
      """
      Get auto parallelism arguments for SGLang based on GPU configuration.

      Returns an empty list when the user already passed any tensor-,
      pipeline- or data-parallel size. Otherwise returns ``--tp-size`` and
      ``--pp-size`` for distributed instances, or ``--tp-size`` set to the
      number of GPU indexes when the instance uses more than one GPU.
      """
      user_parallelism = find_parameter(
          backend_parameters,
          [
              "tensor-parallel-size",
              "tp-size",
              "pipeline-parallel-size",
              "pp-size",
              "data-parallel-size",
              "dp-size",
          ],
      )
      # Any user-provided parallelism setting disables the automatic ones.
      if user_parallelism is not None:
          return []

      if is_distributed:
          # distributed across multiple workers
          tp, pp = cal_distributed_parallelism_arguments(model_instance)
          return ["--tp-size", str(tp), "--pp-size", str(pp)]

      # Single worker: spread the model over all assigned GPUs.
      gpu_indexes = model_instance.gpu_indexes
      if gpu_indexes and len(gpu_indexes) > 1:
          return ["--tp-size", str(len(gpu_indexes))]
      return []
  def get_metrics_arguments(
      backend_parameters: List[str], env: Optional[Dict[str, str]] = None
  ) -> List[str]:
      """
      Get metrics flag for SGLang.

      Returns ``["--enable-metrics"]`` unless the user already passed
      ``enable-metrics`` or ``GPUSTACK_DISABLE_METRICS`` is set in ``env``.
      """
      user_metrics_flag = find_parameter(backend_parameters, ["enable-metrics"])
      if user_metrics_flag is not None:
          return []

      # NOTE(review): any non-empty value disables metrics here, including
      # strings such as "0" or "false" — confirm that this is intended.
      metrics_disabled = bool(env) and bool(env.get("GPUSTACK_DISABLE_METRICS"))
      return [] if metrics_disabled else ["--enable-metrics"]
  def get_access_log_arguments(
      backend_parameters: List[str], backend_version: Optional[str] = None
  ) -> List[str]:
      """
      Get default SGLang access log filter arguments.

      --uvicorn-access-log-exclude-prefixes was introduced in SGLang v0.5.8.post1,
      so nothing is returned for an unknown or older backend version, nor when
      the user already passed the option.
      """
      # Unknown or too-old version: the flag is not available.
      if not backend_version or compare_versions(backend_version, "0.5.8.post1") < 0:
          return []

      user_filter = find_parameter(
          backend_parameters, ["uvicorn-access-log-exclude-prefixes"]
      )
      if user_filter is None:
          return ["--uvicorn-access-log-exclude-prefixes", "/metrics"]
      return []
  def get_cache_report_arguments(
      backend_parameters: List[str], backend_version: Optional[str] = None
  ) -> List[str]:
      """
      Auto-inject SGLang's --enable-cache-report so OpenAI responses include
      `usage.prompt_tokens_details.cached_tokens`.

      The flag has existed since SGLang v0.3.4. RadixAttention prefix caching
      is on by default, so no separate cache-enable flag is needed. Nothing is
      returned for an unknown or older backend version, nor when the user
      already enabled the flag.
      """
      # Unknown or too-old version: the flag is not available.
      if not backend_version or compare_versions(backend_version, "0.3.4") < 0:
          return []

      already_enabled = find_bool_parameter(backend_parameters, ["enable-cache-report"])
      return [] if already_enabled else ["--enable-cache-report"]
  669. return []
  670. if find_bool_parameter(backend_parameters, ["enable-cache-report"]):
  671. return []
  672. return ["--enable-cache-report"]