vllm.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766
  1. import json
  2. import logging
  3. import os
  4. from typing import Dict, List, Optional, Tuple
  5. from gpustack_runtime.deployer import (
  6. Container,
  7. ContainerEnv,
  8. ContainerExecution,
  9. ContainerProfileEnum,
  10. WorkloadPlan,
  11. create_workload,
  12. ContainerMount,
  13. ContainerPort,
  14. ContainerRestartPolicyEnum,
  15. )
  16. from gpustack_runtime.deployer.__utils__ import compare_versions
  17. from gpustack_runtime.detector import ManufacturerEnum, manufacturer_to_backend
  18. from gpustack.schemas.models import (
  19. ModelInstance,
  20. SpeculativeAlgorithmEnum,
  21. SpeculativeConfig,
  22. ModelInstanceDeploymentMetadata,
  23. is_audio_model,
  24. is_omni_model,
  25. )
  26. from gpustack.utils import network
  27. from gpustack.utils.command import (
  28. find_parameter,
  29. find_bool_parameter,
  30. find_int_parameter,
  31. extend_args_no_exist,
  32. format_backend_parameters,
  33. )
  34. from gpustack.utils.envs import sanitize_env
  35. from gpustack.utils.unit import byte_to_gib
  36. from gpustack.worker.backends.base import (
  37. InferenceServer,
  38. is_ascend_310p,
  39. is_ascend,
  40. cal_distributed_parallelism_arguments,
  41. )
  42. logger = logging.getLogger(__name__)
  43. class VLLMServer(InferenceServer):
  44. """
  45. Containerized vLLM inference server backend using gpustack-runtime.
  46. This backend runs vLLM in a Docker container managed by gpustack-runtime,
  47. providing better isolation, resource management, and deployment consistency.
  48. """
  49. def start(self): # noqa: C901
  50. try:
  51. self._start()
  52. except Exception as e:
  53. self._handle_error(e)
  54. def _start(self):
  55. logger.info(f"Starting vLLM model instance: {self._model_instance.name}")
  56. # Prepare distributed information.
  57. deployment_metadata = self._get_deployment_metadata()
  58. env = self._get_configured_env(
  59. is_distributed=deployment_metadata.distributed,
  60. )
  61. # Resolve image first so that backend_version is populated before
  62. # building command args (version-gated arguments depend on it).
  63. image = self._get_configured_image()
  64. if not image:
  65. raise ValueError("Failed to get vLLM backend image")
  66. command = None
  67. if self.inference_backend:
  68. command = self.inference_backend.get_container_entrypoint(
  69. self._model.backend_version
  70. )
  71. command_script = self._get_serving_command_script(env)
  72. command_args, injected = self._build_command_args(
  73. port=self._get_serving_port(),
  74. is_distributed=deployment_metadata.distributed,
  75. entrypoint=command,
  76. )
  77. try:
  78. self._update_model_instance(
  79. self._model_instance.id,
  80. injected_backend_parameters=format_backend_parameters(injected) or None,
  81. )
  82. except Exception as e:
  83. logger.warning(
  84. f"Failed to persist injected backend parameters for {self._model_instance.name}: {e}"
  85. )
  86. self._create_workload(
  87. deployment_metadata=deployment_metadata,
  88. command=command,
  89. command_script=command_script,
  90. command_args=command_args,
  91. env=env,
  92. image=image,
  93. )
  94. def _create_workload(
  95. self,
  96. deployment_metadata: ModelInstanceDeploymentMetadata,
  97. command: Optional[List[str]],
  98. command_script: Optional[str],
  99. command_args: List[str],
  100. env: Dict[str, str],
  101. image: str,
  102. ):
  103. # Command script will override the given command,
  104. # so we need to prepend command to command args.
  105. if command_script and command:
  106. command_args = command + command_args
  107. command = None
  108. resources = self._get_configured_resources()
  109. mounts = self._get_configured_mounts()
  110. ports = self._get_configured_ports()
  111. # Read container config from environment variables
  112. container_config = self._get_container_env_config(env)
  113. run_container = Container(
  114. image=image,
  115. name="default",
  116. profile=ContainerProfileEnum.RUN,
  117. restart_policy=ContainerRestartPolicyEnum.NEVER,
  118. execution=ContainerExecution(
  119. privileged=True,
  120. command=command,
  121. command_script=command_script,
  122. args=command_args,
  123. run_as_user=container_config.user,
  124. run_as_group=container_config.group,
  125. ),
  126. envs=[
  127. ContainerEnv(
  128. name=name,
  129. value=value,
  130. )
  131. for name, value in env.items()
  132. ],
  133. resources=resources,
  134. mounts=mounts,
  135. ports=ports,
  136. )
  137. # Adjust run container for distributed follower.
  138. if deployment_metadata.distributed_follower:
  139. ray_command, ray_command_args, ray_ports = self._build_ray_configuration(
  140. is_leader=False,
  141. )
  142. # Command script will override the given command,
  143. # so we need to prepend command to command args.
  144. if command_script:
  145. ray_command_args = ray_command + ray_command_args
  146. ray_command = None
  147. run_container.execution.command = ray_command
  148. # run_container.execution.command_script = command_script # already set
  149. run_container.execution.args = ray_command_args
  150. run_container.ports = ray_ports
  151. # Create sidecar container for distributed leader.
  152. sidecar_container = None
  153. if deployment_metadata.distributed_leader:
  154. run_container.mounts.append(
  155. ContainerMount(
  156. path="/tmp",
  157. volume="tmp-volume",
  158. ),
  159. )
  160. ray_command, ray_command_args, ray_ports = self._build_ray_configuration(
  161. is_leader=True,
  162. )
  163. # Command script will override the given command,
  164. # so we need to prepend command to command args.
  165. if command_script:
  166. ray_command_args = ray_command + ray_command_args
  167. ray_command = None
  168. # Copy envs and override RAY_LOG_TO_STDERR for the sidecar
  169. # so Ray head logs go to stderr (captured by container log stream),
  170. # while keeping RAY_LOG_TO_STDERR=0 in the main container to avoid
  171. # polluting vLLM's log output with Ray worker logs.
  172. sidecar_envs = list(run_container.envs)
  173. ray_stderr_found = False
  174. for i, e in enumerate(sidecar_envs):
  175. if e.name == "RAY_LOG_TO_STDERR":
  176. sidecar_envs[i] = ContainerEnv(name="RAY_LOG_TO_STDERR", value="1")
  177. ray_stderr_found = True
  178. break
  179. if not ray_stderr_found:
  180. sidecar_envs.append(ContainerEnv(name="RAY_LOG_TO_STDERR", value="1"))
  181. sidecar_container = Container(
  182. image=image,
  183. name="ray-head",
  184. profile=ContainerProfileEnum.RUN,
  185. restart_policy=ContainerRestartPolicyEnum.NEVER,
  186. execution=ContainerExecution(
  187. privileged=True,
  188. command=ray_command,
  189. command_script=command_script,
  190. args=ray_command_args,
  191. run_as_user=container_config.user,
  192. run_as_group=container_config.group,
  193. ),
  194. envs=sidecar_envs,
  195. resources=run_container.resources,
  196. mounts=run_container.mounts,
  197. ports=ray_ports,
  198. )
  199. logger.info(f"Creating vLLM container workload: {deployment_metadata.name}")
  200. logger.info(
  201. f"With image: {image}, "
  202. f"command: [{' '.join(command) if command else ''}], "
  203. f"arguments: [{' '.join(command_args)}], "
  204. f"ports: [{','.join([str(port.internal) for port in ports])}], "
  205. f"envs(inconsistent input items mean unchangeable):{os.linesep}"
  206. f"{os.linesep.join(f'{k}={v}' for k, v in sorted(sanitize_env(env).items()))}"
  207. )
  208. workload_plan = WorkloadPlan(
  209. name=deployment_metadata.name,
  210. host_network=True,
  211. shm_size=int(container_config.shm_size_gib * (1 << 30)),
  212. containers=(
  213. [run_container]
  214. if not sidecar_container
  215. else [run_container, sidecar_container]
  216. ),
  217. run_as_user=container_config.user,
  218. run_as_group=container_config.group,
  219. )
  220. create_workload(self._transform_workload_plan(workload_plan))
  221. logger.info(f"Created vLLM container workload: {deployment_metadata.name}")
  222. def _get_configured_env(self, is_distributed: bool) -> Dict[str, str]:
  223. """
  224. Get environment variables for vLLM service
  225. """
  226. # Apply GPUStack's inference environment setup
  227. env = super()._get_configured_env()
  228. # Optimize environment variables
  229. # -- Disable OpenMP parallelism to avoid resource contention, increases model loading.
  230. env["OMP_NUM_THREADS"] = env.pop("OMP_NUM_THREADS", "1")
  231. # -- Enable safetensors GPU loading pass-through for faster model loading.
  232. env["SAFETENSORS_FAST_GPU"] = env.pop("SAFETENSORS_FAST_GPU", "1")
  233. # -- Observe RUN:AI streamer model loading.
  234. env["RUNAI_STREAMER_MEMORY_LIMIT"] = env.pop("RUNAI_STREAMER_MEMORY_LIMIT", "0")
  235. env["RUNAI_STREAMER_LOG_TO_STDERR"] = env.pop(
  236. "RUNAI_STREAMER_LOG_TO_STDERR", "1"
  237. )
  238. env["RUNAI_STREAMER_LOG_LEVEL"] = env.pop("RUNAI_STREAMER_LOG_LEVEL", "INFO")
  239. # Persist the torch compile cache so repeated starts don't recompile.
  240. self._set_cache_env(env)
  241. # Apply LMCache environment variables if extended KV cache is enabled
  242. self._set_lmcache_env(env)
  243. # Apply distributed environment variables
  244. if is_distributed:
  245. self._set_distributed_env(env)
  246. # Apply Ascend-specific environment variables
  247. if is_ascend(self._get_selected_gpu_devices()):
  248. self._set_ascend_env(env)
  249. return env
  250. def _set_cache_env(self, env: Dict[str, str]):
  251. """
  252. Point VLLM_CACHE_ROOT at a persistent directory under gpustack's data dir
  253. so the torch compile cache survives container restarts. The directory is
  254. inherited by the inference container via gpustack-runtime's mirrored
  255. deployment (worker's data-dir mount is replicated to the vLLM container).
  256. """
  257. if "VLLM_CACHE_ROOT" in env:
  258. return
  259. if not self._config or not self._config.cache_dir:
  260. return
  261. cache_dir = os.path.join(self._config.cache_dir, "vllm")
  262. try:
  263. os.makedirs(cache_dir, exist_ok=True)
  264. except OSError as e:
  265. logger.warning(
  266. f"Failed to create vLLM cache dir {cache_dir}: {e}. "
  267. "Torch compile cache will not be persisted."
  268. )
  269. return
  270. env["VLLM_CACHE_ROOT"] = cache_dir
  271. def _set_lmcache_env(self, env: Dict[str, str]):
  272. """
  273. Set up LMCache environment variables if extended KV cache is enabled.
  274. """
  275. extended_kv_cache = self._model.extended_kv_cache
  276. if not (extended_kv_cache and extended_kv_cache.enabled):
  277. return
  278. if extended_kv_cache.chunk_size and extended_kv_cache.chunk_size > 0:
  279. env["LMCACHE_CHUNK_SIZE"] = str(extended_kv_cache.chunk_size)
  280. if extended_kv_cache.ram_size and extended_kv_cache.ram_size > 0:
  281. # Explicitly specified RAM size for KV cache
  282. env["LMCACHE_MAX_LOCAL_CPU_SIZE"] = str(extended_kv_cache.ram_size)
  283. elif extended_kv_cache.ram_ratio and extended_kv_cache.ram_ratio > 0:
  284. # Calculate RAM size based on ratio of total VRAM claim
  285. vram_claim = self._get_total_vram_claim()
  286. ram_size = int(vram_claim * extended_kv_cache.ram_ratio)
  287. env["LMCACHE_MAX_LOCAL_CPU_SIZE"] = str(byte_to_gib(ram_size))
  288. def _set_distributed_env(self, env: Dict[str, str]):
  289. """
  290. Set up environment variables for distributed execution.
  291. """
  292. # Configure Internal communication IP and port.
  293. # see https://docs.vllm.ai/en/stable/configuration/env_vars.html.
  294. env["VLLM_HOST_IP"] = self._worker.ip
  295. # During distributed setup,
  296. # we must get more than one port here,
  297. # so we use ports[-1] for distributed initialization.
  298. env["VLLM_PORT"] = str(self._model_instance.ports[-1])
  299. # Disable Ray logging to stderr by default,
  300. # see https://github.com/gpustack/gpustack/issues/4158#issuecomment-3809213348.
  301. env["RAY_LOG_TO_STDERR"] = env.pop("RAY_LOG_TO_STDERR", "0")
  302. # To reduce verbosity, set Ray backend log level to warning by default.
  303. env["RAY_BACKEND_LOG_LEVEL"] = env.pop("RAY_BACKEND_LOG_LEVEL", "warning")
  304. if is_ascend(self._get_selected_gpu_devices()):
  305. # See https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi-node_dsv3.2.html.
  306. if "HCCL_SOCKET_IFNAME" not in env:
  307. env["HCCL_IF_IP"] = self._worker.ip
  308. env["HCCL_SOCKET_IFNAME"] = f"={self._worker.ifname}"
  309. env["GLOO_SOCKET_IFNAME"] = self._worker.ifname
  310. env["TP_SOCKET_IFNAME"] = self._worker.ifname
  311. return
  312. if "NCCL_SOCKET_IFNAME" not in env:
  313. env["NCCL_SOCKET_IFNAME"] = f"={self._worker.ifname}"
  314. env["GLOO_SOCKET_IFNAME"] = self._worker.ifname
  315. def _set_ascend_env(self, env: Dict[str, str]):
  316. """
  317. Set up environment variables for Ascend devices.
  318. """
  319. # -- Optimize Pytorch NPU operations delivery performance.
  320. env["TASK_QUEUE_ENABLE"] = env.pop("TASK_QUEUE_ENABLE", "1")
  321. # -- Enable NUMA coarse-grained binding.
  322. env["CPU_AFFINITY_CONF"] = env.pop("CPU_AFFINITY_CONF", "1")
  323. # -- Reuse memory in multi-streams.
  324. env["PYTORCH_NPU_ALLOC_CONF"] = env.pop(
  325. "PYTORCH_NPU_ALLOC_CONF", "expandable_segments:True"
  326. )
  327. # -- Increase HCCL connection timeout to avoid issues in large clusters.
  328. env["HCCL_CONNECT_TIMEOUT"] = env.pop("HCCL_CONNECT_TIMEOUT", "7200")
  329. # -- Enable RDMA PCIe direct post with no strict mode for better performance.
  330. env["HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT"] = env.pop(
  331. "HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT", "TRUE"
  332. )
  333. if not is_ascend_310p(self._get_selected_gpu_devices()):
  334. # -- Disable HCCL execution timeout for better stability.
  335. env["HCCL_EXEC_TIMEOUT"] = env.pop("HCCL_EXEC_TIMEOUT", "0")
  336. # -- Enable the communication is scheduled by AI Vector directly with ROCE, instead of AI CPU.
  337. env["HCCL_OP_EXPANSION_MODE"] = env.pop("HCCL_OP_EXPANSION_MODE", "AIV")
  338. def _get_speculative_arguments(self) -> List[str]:
  339. """
  340. Get speculative arguments for vLLM.
  341. """
  342. speculative_config: SpeculativeConfig = self._model.speculative_config
  343. if not speculative_config or not speculative_config.enabled:
  344. return []
  345. vllm_speculative_algorithm_mapping = {
  346. SpeculativeAlgorithmEnum.EAGLE3: "eagle3",
  347. SpeculativeAlgorithmEnum.MTP: "mtp",
  348. SpeculativeAlgorithmEnum.NGRAM: "ngram",
  349. }
  350. method = vllm_speculative_algorithm_mapping.get(
  351. speculative_config.algorithm, None
  352. )
  353. if method:
  354. sp_dict = {
  355. "method": method,
  356. }
  357. if speculative_config.num_draft_tokens:
  358. sp_dict["num_speculative_tokens"] = speculative_config.num_draft_tokens
  359. if speculative_config.ngram_max_match_length:
  360. sp_dict["prompt_lookup_max"] = speculative_config.ngram_max_match_length
  361. if speculative_config.ngram_min_match_length:
  362. sp_dict["prompt_lookup_min"] = speculative_config.ngram_min_match_length
  363. if speculative_config.draft_model and self._draft_model_path:
  364. sp_dict["model"] = self._draft_model_path
  365. return [
  366. "--speculative-config",
  367. json.dumps(sp_dict),
  368. ]
  369. return []
  370. def _get_total_vram_claim(self) -> int:
  371. """
  372. Calculate total VRAM claim for the model instance on current worker.
  373. """
  374. vram = 0
  375. computed_resource_claim = self._model_instance.computed_resource_claim
  376. if self._worker.id != self._model_instance.worker_id:
  377. dservers = self._model_instance.distributed_servers
  378. subworkers = (
  379. dservers.subordinate_workers
  380. if dservers and dservers.subordinate_workers
  381. else []
  382. )
  383. for subworker in subworkers:
  384. if subworker.worker_id == self._worker.id:
  385. computed_resource_claim = subworker.computed_resource_claim
  386. break
  387. if not computed_resource_claim:
  388. return vram
  389. for _, vram_claim in computed_resource_claim.vram.items():
  390. vram += vram_claim
  391. return vram
  392. def _build_command_args(
  393. self,
  394. port: int,
  395. is_distributed: bool,
  396. entrypoint: Optional[List[str]] = None,
  397. ) -> Tuple[List[str], List[str]]:
  398. """
  399. Build vLLM command arguments for container execution.
  400. Returns:
  401. A tuple of (full_arguments, injected_backend_parameters) where
  402. injected_backend_parameters contains only the arguments automatically
  403. added by GPUStack, excluding the entrypoint/model path and
  404. user-specified backend parameters.
  405. """
  406. arguments = [
  407. "vllm",
  408. "serve",
  409. self._model_path,
  410. ]
  411. # Allow version-specific command override if configured (before appending extra args)
  412. arguments = self.build_versioned_command_args(arguments)
  413. # Omni modalities
  414. omni_enabled = find_bool_parameter(
  415. self._model.backend_parameters,
  416. ["omni"],
  417. )
  418. is_omni = is_omni_model(self._model)
  419. if is_omni and not omni_enabled:
  420. arguments.extend(
  421. [
  422. "--omni",
  423. ]
  424. )
  425. is_audio = is_audio_model(self._model)
  426. if not is_omni and not is_audio:
  427. specified_max_model_len = find_parameter(
  428. self._model.backend_parameters,
  429. ["max-model-len"],
  430. )
  431. if specified_max_model_len is None:
  432. derived_max_model_len = self._derive_max_model_len()
  433. if derived_max_model_len and derived_max_model_len > 8192:
  434. arguments.extend(["--max-model-len", "8192"])
  435. auto_parallelism_arguments = get_auto_parallelism_arguments(
  436. self._model.backend_parameters,
  437. self._model_instance,
  438. is_distributed,
  439. )
  440. arguments.extend(auto_parallelism_arguments)
  441. # Add speculative config arguments if needed
  442. speculative_config_arguments = self._get_speculative_arguments()
  443. arguments.extend(speculative_config_arguments)
  444. # Suppress high-frequency /metrics access logs by default.
  445. access_log_arguments = get_access_log_arguments(
  446. self._model.backend_parameters, self._model.backend_version
  447. )
  448. arguments.extend(access_log_arguments)
  449. # Expose prefix-cache hits as cached_tokens in OpenAI usage.
  450. cache_report_arguments = get_cache_report_arguments(
  451. self._model.backend_parameters, self._model.backend_version
  452. )
  453. arguments.extend(cache_report_arguments)
  454. if is_distributed:
  455. arguments.extend(["--distributed-executor-backend", "ray"])
  456. dps = find_int_parameter(
  457. self._model.backend_parameters, ["data-parallel-size", "dp"]
  458. )
  459. if dps and dps > 1:
  460. # Prefer to use Ray backend for data parallelism if DP size is specified.
  461. dpb = find_parameter(
  462. self._model.backend_parameters, ["data-parallel-backend", "dpb"]
  463. )
  464. if dpb is None:
  465. arguments.extend(["--data-parallel-backend", "ray"])
  466. # Specify a port for DP RPC communication,
  467. # we must get more than one port here, see gpustack/worker/serve_manager.py,
  468. # so we use ports[1] for DP RPC communication.
  469. arguments.extend(
  470. ["--data-parallel-rpc-port", str(self._model_instance.ports[1])]
  471. )
  472. if self._model.extended_kv_cache and self._model.extended_kv_cache.enabled:
  473. vendor, _, _ = self._get_device_info()
  474. if vendor in {
  475. manufacturer_to_backend(ManufacturerEnum.NVIDIA),
  476. manufacturer_to_backend(ManufacturerEnum.AMD),
  477. }:
  478. arguments.extend(
  479. [
  480. "--kv-transfer-config",
  481. '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}',
  482. ]
  483. )
  484. else:
  485. logger.warning(
  486. "Extended KV cache for vLLM is only supported on NVIDIA and AMD GPUs. Skipping LMCache configuration."
  487. )
  488. # For Ascend 310P, we need to enforce eager execution and default dtype to float16
  489. if is_ascend_310p(self._get_selected_gpu_devices()):
  490. arguments.extend(
  491. [
  492. "--enforce-eager",
  493. "--dtype",
  494. "float16",
  495. ]
  496. )
  497. # Inject user-defined backend parameters
  498. user_backend_parameters = self._flatten_backend_param()
  499. arguments.extend(user_backend_parameters)
  500. # Append immutable arguments to ensure proper operation for accessing.
  501. # Only add if not already present in arguments.
  502. extend_args_no_exist(
  503. arguments,
  504. ("--host", self._worker.ip),
  505. ("--port", str(port)),
  506. ("--served-model-name", self._model_instance.model_name),
  507. )
  508. injected = self._get_injected_backend_parameters(
  509. arguments, user_backend_parameters, entrypoint
  510. )
  511. return arguments, injected
  512. def _build_ray_configuration(
  513. self,
  514. is_leader: bool,
  515. ) -> (List[str], List[str], Optional[List[ContainerPort]]):
  516. # Parse the Ray port range from configuration,
  517. # assign ports in order as below:
  518. # 1. GCS server port (the first port of the range)
  519. # 2. Client port (reserved for compatibility, not used anymore, see https://github.com/gpustack/gpustack/issues/4171)
  520. # 3. Dashboard port
  521. # 4. Dashboard gRPC port (no longer used, since Ray 2.45.0 kept for backward compatibility)
  522. # 5. Dashboard agent gRPC port
  523. # 6. Dashboard agent listen port
  524. # 7. Metrics export port
  525. # 8. Node Manager port
  526. # 9. Object Manager port
  527. # 10. Raylet runtime env agent port
  528. # 11. Minimum port number for the worker
  529. # 12. Maximum port number for the worker (the last port of the range)
  530. start, end = network.parse_port_range(self._config.ray_port_range)
  531. gcs_server_port = start
  532. # client_port = start + 1
  533. dashboard_port = start + 2
  534. dashboard_grpc_port = start + 3
  535. dashboard_agent_grpc_port = start + 4
  536. dashboard_agent_listen_port = start + 5
  537. metrics_export_port = start + 6
  538. node_manager_port = start + 7
  539. object_manager_port = start + 8
  540. raylet_runtime_env_agent_port = start + 9
  541. worker_port_min = start + 10
  542. worker_port_max = end
  543. command = [
  544. "ray",
  545. "start",
  546. ]
  547. arguments = [
  548. "--block",
  549. "--disable-usage-stats",
  550. "--verbose",
  551. f"--node-manager-port={node_manager_port}",
  552. f"--object-manager-port={object_manager_port}",
  553. f"--runtime-env-agent-port={raylet_runtime_env_agent_port}",
  554. f"--dashboard-agent-grpc-port={dashboard_agent_grpc_port}",
  555. f"--dashboard-agent-listen-port={dashboard_agent_listen_port}",
  556. f"--metrics-export-port={metrics_export_port}",
  557. f"--min-worker-port={worker_port_min}",
  558. f"--max-worker-port={worker_port_max}",
  559. f"--node-ip-address={self._worker.ip}",
  560. ]
  561. ports = [
  562. ContainerPort(
  563. internal=port,
  564. )
  565. for port in [
  566. dashboard_grpc_port,
  567. dashboard_agent_grpc_port,
  568. dashboard_agent_listen_port,
  569. metrics_export_port,
  570. node_manager_port,
  571. object_manager_port,
  572. raylet_runtime_env_agent_port,
  573. ]
  574. ]
  575. if is_leader:
  576. arguments.extend(
  577. [
  578. "--head",
  579. f"--port={gcs_server_port}",
  580. f"--dashboard-host={self._worker.ip}",
  581. f"--dashboard-port={dashboard_port}",
  582. ]
  583. )
  584. ports.extend(
  585. [
  586. ContainerPort(
  587. internal=port,
  588. )
  589. for port in [gcs_server_port, dashboard_port]
  590. ]
  591. )
  592. else:
  593. arguments.extend(
  594. [
  595. f"--address={self._model_instance.worker_ip}:{gcs_server_port}",
  596. ]
  597. )
  598. return command, arguments, ports
  599. def get_auto_parallelism_arguments(
  600. backend_parameters: List[str],
  601. model_instance: ModelInstance,
  602. is_distributed: bool,
  603. ) -> List[str]:
  604. parallelism = find_parameter(
  605. backend_parameters,
  606. [
  607. "tensor-parallel-size",
  608. "tp",
  609. "pipeline-parallel-size",
  610. "pp",
  611. "data-parallel-size",
  612. "dp",
  613. ],
  614. )
  615. if parallelism is not None:
  616. return []
  617. if is_distributed:
  618. # distributed across multiple workers
  619. (tp, pp) = cal_distributed_parallelism_arguments(model_instance)
  620. return [
  621. "--tensor-parallel-size",
  622. str(tp),
  623. "--pipeline-parallel-size",
  624. str(pp),
  625. ]
  626. if model_instance.gpu_indexes is not None and len(model_instance.gpu_indexes) > 1:
  627. # single worker with multiple GPUs
  628. return [
  629. "--tensor-parallel-size",
  630. str(len(model_instance.gpu_indexes)),
  631. ]
  632. return []
  633. def get_access_log_arguments(
  634. backend_parameters: List[str], backend_version: Optional[str] = None
  635. ) -> List[str]:
  636. """
  637. Get default vLLM access log filter arguments.
  638. --disable-access-log-for-endpoints was introduced in vLLM 0.16.0.
  639. """
  640. if not backend_version:
  641. return []
  642. if compare_versions(backend_version, "0.16.0") < 0:
  643. return []
  644. access_log_filter = find_parameter(
  645. backend_parameters,
  646. ["disable-access-log-for-endpoints"],
  647. )
  648. if access_log_filter is not None:
  649. return []
  650. return ["--disable-access-log-for-endpoints", "/metrics"]
  651. def get_cache_report_arguments(
  652. backend_parameters: List[str], backend_version: Optional[str] = None
  653. ) -> List[str]:
  654. """
  655. Auto-inject `--enable-prompt-tokens-details` so vLLM populates
  656. `usage.prompt_tokens_details.cached_tokens` in OpenAI responses.
  657. Only injected for vLLM >= v0.9.0.1 — earlier V1 builds silently dropped
  658. the field (https://github.com/vllm-project/vllm/pull/18149).
  659. Prefix caching itself is the user's responsibility (`--enable-prefix-caching`):
  660. V1 has it on by default, V0 does not.
  661. """
  662. if not backend_version:
  663. return []
  664. if compare_versions(backend_version, "0.9.0.1") < 0:
  665. return []
  666. if find_bool_parameter(
  667. backend_parameters,
  668. ["enable-prompt-tokens-details", "no-enable-prompt-tokens-details"],
  669. ):
  670. return []
  671. return ["--enable-prompt-tokens-details"]