| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766 |
- import json
- import logging
- import os
- from typing import Dict, List, Optional, Tuple
- from gpustack_runtime.deployer import (
- Container,
- ContainerEnv,
- ContainerExecution,
- ContainerProfileEnum,
- WorkloadPlan,
- create_workload,
- ContainerMount,
- ContainerPort,
- ContainerRestartPolicyEnum,
- )
- from gpustack_runtime.deployer.__utils__ import compare_versions
- from gpustack_runtime.detector import ManufacturerEnum, manufacturer_to_backend
- from gpustack.schemas.models import (
- ModelInstance,
- SpeculativeAlgorithmEnum,
- SpeculativeConfig,
- ModelInstanceDeploymentMetadata,
- is_audio_model,
- is_omni_model,
- )
- from gpustack.utils import network
- from gpustack.utils.command import (
- find_parameter,
- find_bool_parameter,
- find_int_parameter,
- extend_args_no_exist,
- format_backend_parameters,
- )
- from gpustack.utils.envs import sanitize_env
- from gpustack.utils.unit import byte_to_gib
- from gpustack.worker.backends.base import (
- InferenceServer,
- is_ascend_310p,
- is_ascend,
- cal_distributed_parallelism_arguments,
- )
# Module-level logger, named after this module, shared by everything in this file.
logger = logging.getLogger(__name__)
- class VLLMServer(InferenceServer):
- """
- Containerized vLLM inference server backend using gpustack-runtime.
- This backend runs vLLM in a Docker container managed by gpustack-runtime,
- providing better isolation, resource management, and deployment consistency.
- """
def start(self):  # noqa: C901
    """
    Start the vLLM model instance.

    Runs `_start()` and routes any exception it raises to
    `_handle_error()`.
    """
    try:
        self._start()
    except Exception as exc:
        self._handle_error(exc)
def _start(self):
    """
    Prepare and launch the vLLM container workload for this model instance.

    In order: resolve deployment metadata and environment, resolve the
    container image, look up the backend entrypoint, build the serving
    command arguments, persist the GPUStack-injected backend parameters
    (failures are only logged), and create the workload.

    Raises:
        ValueError: If no vLLM backend image could be resolved.
    """
    logger.info(f"Starting vLLM model instance: {self._model_instance.name}")

    # Prepare distributed information.
    metadata = self._get_deployment_metadata()
    serving_env = self._get_configured_env(
        is_distributed=metadata.distributed,
    )

    # Resolve image first so that backend_version is populated before
    # building command args (version-gated arguments depend on it).
    backend_image = self._get_configured_image()
    if not backend_image:
        raise ValueError("Failed to get vLLM backend image")

    # The entrypoint is only known when an inference backend is configured.
    entrypoint = (
        self.inference_backend.get_container_entrypoint(self._model.backend_version)
        if self.inference_backend
        else None
    )

    serving_script = self._get_serving_command_script(serving_env)
    serving_args, injected = self._build_command_args(
        port=self._get_serving_port(),
        is_distributed=metadata.distributed,
        entrypoint=entrypoint,
    )

    # Recording the injected parameters must not block the start,
    # so a failure here is downgraded to a warning.
    try:
        self._update_model_instance(
            self._model_instance.id,
            injected_backend_parameters=format_backend_parameters(injected) or None,
        )
    except Exception as e:
        logger.warning(
            f"Failed to persist injected backend parameters for {self._model_instance.name}: {e}"
        )

    self._create_workload(
        deployment_metadata=metadata,
        command=entrypoint,
        command_script=serving_script,
        command_args=serving_args,
        env=serving_env,
        image=backend_image,
    )
def _create_workload(
    self,
    deployment_metadata: ModelInstanceDeploymentMetadata,
    command: Optional[List[str]],
    command_script: Optional[str],
    command_args: List[str],
    env: Dict[str, str],
    image: str,
):
    """
    Assemble the container(s) for this model instance and create the workload.

    A "default" run container is always built. For a distributed follower its
    command, arguments and ports are replaced by the Ray worker configuration.
    For a distributed leader a "/tmp" mount is added and a "ray-head" sidecar
    container is created next to the run container.

    Args:
        deployment_metadata: Supplies the workload name and the
            distributed leader/follower flags.
        command: Container entrypoint, or None.
        command_script: Optional command script; when set it overrides the
            command, so the command is folded into the arguments.
        command_args: Arguments of the serving command.
        env: Environment variables for the containers.
        image: Container image to run.
    """
    # Command script will override the given command,
    # so we need to prepend command to command args.
    if command_script and command:
        command_args = command + command_args
        command = None

    resources = self._get_configured_resources()
    mounts = self._get_configured_mounts()
    ports = self._get_configured_ports()

    # Read container config from environment variables
    container_config = self._get_container_env_config(env)

    main_execution = ContainerExecution(
        privileged=True,
        command=command,
        command_script=command_script,
        args=command_args,
        run_as_user=container_config.user,
        run_as_group=container_config.group,
    )
    main_envs = [ContainerEnv(name=key, value=val) for key, val in env.items()]
    run_container = Container(
        image=image,
        name="default",
        profile=ContainerProfileEnum.RUN,
        restart_policy=ContainerRestartPolicyEnum.NEVER,
        execution=main_execution,
        envs=main_envs,
        resources=resources,
        mounts=mounts,
        ports=ports,
    )

    # Adjust run container for distributed follower.
    if deployment_metadata.distributed_follower:
        ray_command, ray_command_args, ray_ports = self._build_ray_configuration(
            is_leader=False,
        )
        # Command script will override the given command,
        # so we need to prepend command to command args.
        if command_script:
            ray_command, ray_command_args = None, ray_command + ray_command_args
        # The command script itself is already set on the run container.
        run_container.execution.command = ray_command
        run_container.execution.args = ray_command_args
        run_container.ports = ray_ports

    # Start with the run container only; a leader adds the Ray head sidecar.
    containers = [run_container]

    # Create sidecar container for distributed leader.
    sidecar_container = None
    if deployment_metadata.distributed_leader:
        run_container.mounts.append(
            ContainerMount(
                path="/tmp",
                volume="tmp-volume",
            ),
        )
        ray_command, ray_command_args, ray_ports = self._build_ray_configuration(
            is_leader=True,
        )
        # Command script will override the given command,
        # so we need to prepend command to command args.
        if command_script:
            ray_command, ray_command_args = None, ray_command + ray_command_args

        # Copy envs and override RAY_LOG_TO_STDERR for the sidecar
        # so Ray head logs go to stderr (captured by container log stream),
        # while keeping RAY_LOG_TO_STDERR=0 in the main container to avoid
        # polluting vLLM's log output with Ray worker logs.
        sidecar_envs = list(run_container.envs)
        stderr_override = ContainerEnv(name="RAY_LOG_TO_STDERR", value="1")
        for index, existing in enumerate(sidecar_envs):
            if existing.name == "RAY_LOG_TO_STDERR":
                sidecar_envs[index] = stderr_override
                break
        else:
            # No existing entry was found, so add one.
            sidecar_envs.append(stderr_override)

        sidecar_container = Container(
            image=image,
            name="ray-head",
            profile=ContainerProfileEnum.RUN,
            restart_policy=ContainerRestartPolicyEnum.NEVER,
            execution=ContainerExecution(
                privileged=True,
                command=ray_command,
                command_script=command_script,
                args=ray_command_args,
                run_as_user=container_config.user,
                run_as_group=container_config.group,
            ),
            envs=sidecar_envs,
            resources=run_container.resources,
            mounts=run_container.mounts,
            ports=ray_ports,
        )
    if sidecar_container:
        containers.append(sidecar_container)

    logger.info(f"Creating vLLM container workload: {deployment_metadata.name}")
    logger.info(
        f"With image: {image}, "
        f"command: [{' '.join(command) if command else ''}], "
        f"arguments: [{' '.join(command_args)}], "
        f"ports: [{','.join([str(port.internal) for port in ports])}], "
        f"envs(inconsistent input items mean unchangeable):{os.linesep}"
        f"{os.linesep.join(f'{k}={v}' for k, v in sorted(sanitize_env(env).items()))}"
    )

    workload_plan = WorkloadPlan(
        name=deployment_metadata.name,
        host_network=True,
        # shm_size_gib is expressed in GiB; the plan takes bytes.
        shm_size=int(container_config.shm_size_gib * (1 << 30)),
        containers=containers,
        run_as_user=container_config.user,
        run_as_group=container_config.group,
    )
    create_workload(self._transform_workload_plan(workload_plan))
    logger.info(f"Created vLLM container workload: {deployment_metadata.name}")
def _get_configured_env(self, is_distributed: bool) -> Dict[str, str]:
    """
    Build the environment variables for the vLLM service.

    Starts from the environment prepared by the base class, fills in
    defaults for a few tuning variables (values already present win), then
    applies cache, LMCache, distributed and Ascend specific settings.

    Args:
        is_distributed: Whether distributed environment variables are needed.

    Returns:
        The environment mapping for the vLLM container.
    """
    # Apply GPUStack's inference environment setup
    env = super()._get_configured_env()

    # Optimize environment variables
    tuning_defaults = (
        # -- Default OpenMP to a single thread to avoid resource contention.
        ("OMP_NUM_THREADS", "1"),
        # -- Enable safetensors GPU loading pass-through for faster model loading.
        ("SAFETENSORS_FAST_GPU", "1"),
        # -- Observe RUN:AI streamer model loading.
        ("RUNAI_STREAMER_MEMORY_LIMIT", "0"),
        ("RUNAI_STREAMER_LOG_TO_STDERR", "1"),
        ("RUNAI_STREAMER_LOG_LEVEL", "INFO"),
    )
    for name, default in tuning_defaults:
        # Pop-then-assign keeps an existing value and only falls back to the default.
        env[name] = env.pop(name, default)

    # Persist the torch compile cache so repeated starts don't recompile.
    self._set_cache_env(env)

    # Apply LMCache environment variables if extended KV cache is enabled
    self._set_lmcache_env(env)

    # Apply distributed environment variables
    if is_distributed:
        self._set_distributed_env(env)

    # Apply Ascend-specific environment variables
    if is_ascend(self._get_selected_gpu_devices()):
        self._set_ascend_env(env)

    return env
- def _set_cache_env(self, env: Dict[str, str]):
- """
- Point VLLM_CACHE_ROOT at a persistent directory under gpustack's data dir
- so the torch compile cache survives container restarts. The directory is
- inherited by the inference container via gpustack-runtime's mirrored
- deployment (worker's data-dir mount is replicated to the vLLM container).
- """
- if "VLLM_CACHE_ROOT" in env:
- return
- if not self._config or not self._config.cache_dir:
- return
- cache_dir = os.path.join(self._config.cache_dir, "vllm")
- try:
- os.makedirs(cache_dir, exist_ok=True)
- except OSError as e:
- logger.warning(
- f"Failed to create vLLM cache dir {cache_dir}: {e}. "
- "Torch compile cache will not be persisted."
- )
- return
- env["VLLM_CACHE_ROOT"] = cache_dir
- def _set_lmcache_env(self, env: Dict[str, str]):
- """
- Set up LMCache environment variables if extended KV cache is enabled.
- """
- extended_kv_cache = self._model.extended_kv_cache
- if not (extended_kv_cache and extended_kv_cache.enabled):
- return
- if extended_kv_cache.chunk_size and extended_kv_cache.chunk_size > 0:
- env["LMCACHE_CHUNK_SIZE"] = str(extended_kv_cache.chunk_size)
- if extended_kv_cache.ram_size and extended_kv_cache.ram_size > 0:
- # Explicitly specified RAM size for KV cache
- env["LMCACHE_MAX_LOCAL_CPU_SIZE"] = str(extended_kv_cache.ram_size)
- elif extended_kv_cache.ram_ratio and extended_kv_cache.ram_ratio > 0:
- # Calculate RAM size based on ratio of total VRAM claim
- vram_claim = self._get_total_vram_claim()
- ram_size = int(vram_claim * extended_kv_cache.ram_ratio)
- env["LMCACHE_MAX_LOCAL_CPU_SIZE"] = str(byte_to_gib(ram_size))
def _set_distributed_env(self, env: Dict[str, str]):
    """
    Set up environment variables for distributed execution.

    Always sets the vLLM host IP/port and default Ray logging settings.
    Interface-related variables (HCCL on Ascend, NCCL otherwise) are only
    set when the respective *_SOCKET_IFNAME variable is not already present.
    """
    # Configure Internal communication IP and port.
    # see https://docs.vllm.ai/en/stable/configuration/env_vars.html.
    # During distributed setup,
    # we must get more than one port here,
    # so we use ports[-1] for distributed initialization.
    env.update(
        {
            "VLLM_HOST_IP": self._worker.ip,
            "VLLM_PORT": str(self._model_instance.ports[-1]),
        }
    )

    ray_defaults = (
        # Disable Ray logging to stderr by default,
        # see https://github.com/gpustack/gpustack/issues/4158#issuecomment-3809213348.
        ("RAY_LOG_TO_STDERR", "0"),
        # To reduce verbosity, set Ray backend log level to warning by default.
        ("RAY_BACKEND_LOG_LEVEL", "warning"),
    )
    for name, default in ray_defaults:
        env[name] = env.pop(name, default)

    if is_ascend(self._get_selected_gpu_devices()):
        # See https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi-node_dsv3.2.html.
        # NOTE(review): nesting reconstructed from a flattened source; all four
        # variables are assumed to be guarded by the HCCL_SOCKET_IFNAME check.
        if "HCCL_SOCKET_IFNAME" not in env:
            env.update(
                {
                    "HCCL_IF_IP": self._worker.ip,
                    "HCCL_SOCKET_IFNAME": f"={self._worker.ifname}",
                    "GLOO_SOCKET_IFNAME": self._worker.ifname,
                    "TP_SOCKET_IFNAME": self._worker.ifname,
                }
            )
        return

    if "NCCL_SOCKET_IFNAME" not in env:
        env.update(
            {
                "NCCL_SOCKET_IFNAME": f"={self._worker.ifname}",
                "GLOO_SOCKET_IFNAME": self._worker.ifname,
            }
        )
def _set_ascend_env(self, env: Dict[str, str]):
    """
    Set up environment variables for Ascend devices.

    Every variable is only defaulted: a value already present in `env`
    is kept.
    """
    common_defaults = (
        # -- Optimize Pytorch NPU operations delivery performance.
        ("TASK_QUEUE_ENABLE", "1"),
        # -- Enable NUMA coarse-grained binding.
        ("CPU_AFFINITY_CONF", "1"),
        # -- Reuse memory in multi-streams.
        ("PYTORCH_NPU_ALLOC_CONF", "expandable_segments:True"),
        # -- Increase HCCL connection timeout to avoid issues in large clusters.
        ("HCCL_CONNECT_TIMEOUT", "7200"),
        # -- Enable RDMA PCIe direct post with no strict mode for better performance.
        ("HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT", "TRUE"),
    )
    for name, default in common_defaults:
        env[name] = env.pop(name, default)

    if is_ascend_310p(self._get_selected_gpu_devices()):
        return

    # NOTE(review): nesting reconstructed from a flattened source; both
    # defaults below are assumed to apply only to non-310P devices.
    non_310p_defaults = (
        # -- Disable HCCL execution timeout for better stability.
        ("HCCL_EXEC_TIMEOUT", "0"),
        # -- Let AI Vector schedule the communication directly with ROCE, instead of AI CPU.
        ("HCCL_OP_EXPANSION_MODE", "AIV"),
    )
    for name, default in non_310p_defaults:
        env[name] = env.pop(name, default)
- def _get_speculative_arguments(self) -> List[str]:
- """
- Get speculative arguments for vLLM.
- """
- speculative_config: SpeculativeConfig = self._model.speculative_config
- if not speculative_config or not speculative_config.enabled:
- return []
- vllm_speculative_algorithm_mapping = {
- SpeculativeAlgorithmEnum.EAGLE3: "eagle3",
- SpeculativeAlgorithmEnum.MTP: "mtp",
- SpeculativeAlgorithmEnum.NGRAM: "ngram",
- }
- method = vllm_speculative_algorithm_mapping.get(
- speculative_config.algorithm, None
- )
- if method:
- sp_dict = {
- "method": method,
- }
- if speculative_config.num_draft_tokens:
- sp_dict["num_speculative_tokens"] = speculative_config.num_draft_tokens
- if speculative_config.ngram_max_match_length:
- sp_dict["prompt_lookup_max"] = speculative_config.ngram_max_match_length
- if speculative_config.ngram_min_match_length:
- sp_dict["prompt_lookup_min"] = speculative_config.ngram_min_match_length
- if speculative_config.draft_model and self._draft_model_path:
- sp_dict["model"] = self._draft_model_path
- return [
- "--speculative-config",
- json.dumps(sp_dict),
- ]
- return []
- def _get_total_vram_claim(self) -> int:
- """
- Calculate total VRAM claim for the model instance on current worker.
- """
- vram = 0
- computed_resource_claim = self._model_instance.computed_resource_claim
- if self._worker.id != self._model_instance.worker_id:
- dservers = self._model_instance.distributed_servers
- subworkers = (
- dservers.subordinate_workers
- if dservers and dservers.subordinate_workers
- else []
- )
- for subworker in subworkers:
- if subworker.worker_id == self._worker.id:
- computed_resource_claim = subworker.computed_resource_claim
- break
- if not computed_resource_claim:
- return vram
- for _, vram_claim in computed_resource_claim.vram.items():
- vram += vram_claim
- return vram
def _build_command_args(
    self,
    port: int,
    is_distributed: bool,
    entrypoint: Optional[List[str]] = None,
) -> Tuple[List[str], List[str]]:
    """
    Build vLLM command arguments for container execution.

    Arguments are appended in a fixed order: base command, GPUStack defaults
    (omni, max model length, parallelism, speculative config, access log,
    cache report, distributed/data-parallel, LMCache, Ascend 310P), then the
    user's backend parameters, and finally host/port/served-model-name if
    they are not present yet.

    Args:
        port: Serving port, passed as `--port` unless already present.
        is_distributed: Whether the instance runs distributed; selects the
            Ray executor backend.
        entrypoint: Container entrypoint, forwarded to
            `_get_injected_backend_parameters()`.

    Returns:
        A tuple of (full_arguments, injected_backend_parameters) where
        injected_backend_parameters contains only the arguments automatically
        added by GPUStack, excluding the entrypoint/model path and
        user-specified backend parameters.
    """
    arguments = [
        "vllm",
        "serve",
        self._model_path,
    ]
    # Allow version-specific command override if configured (before appending extra args)
    arguments = self.build_versioned_command_args(arguments)

    # Omni modalities: add --omni for omni models unless the user already enabled it.
    omni_enabled = find_bool_parameter(
        self._model.backend_parameters,
        ["omni"],
    )
    is_omni = is_omni_model(self._model)
    if is_omni and not omni_enabled:
        arguments.extend(
            [
                "--omni",
            ]
        )

    # Cap the context length at 8192 when the user gave no --max-model-len
    # and the derived length is larger. Skipped for omni and audio models.
    is_audio = is_audio_model(self._model)
    if not is_omni and not is_audio:
        specified_max_model_len = find_parameter(
            self._model.backend_parameters,
            ["max-model-len"],
        )
        if specified_max_model_len is None:
            derived_max_model_len = self._derive_max_model_len()
            if derived_max_model_len and derived_max_model_len > 8192:
                arguments.extend(["--max-model-len", "8192"])

    # NOTE(review): nesting reconstructed from a flattened source; the
    # parallelism arguments are assumed to apply to every model type,
    # not only to non-omni/non-audio models. Confirm against the repository.
    auto_parallelism_arguments = get_auto_parallelism_arguments(
        self._model.backend_parameters,
        self._model_instance,
        is_distributed,
    )
    arguments.extend(auto_parallelism_arguments)

    # Add speculative config arguments if needed
    speculative_config_arguments = self._get_speculative_arguments()
    arguments.extend(speculative_config_arguments)

    # Suppress high-frequency /metrics access logs by default.
    access_log_arguments = get_access_log_arguments(
        self._model.backend_parameters, self._model.backend_version
    )
    arguments.extend(access_log_arguments)

    # Expose prefix-cache hits as cached_tokens in OpenAI usage.
    cache_report_arguments = get_cache_report_arguments(
        self._model.backend_parameters, self._model.backend_version
    )
    arguments.extend(cache_report_arguments)

    if is_distributed:
        arguments.extend(["--distributed-executor-backend", "ray"])

        # NOTE(review): nesting reconstructed from a flattened source; the
        # data-parallel handling is assumed to live inside the distributed
        # branch (it relies on a second allocated port). Confirm against
        # the repository.
        dps = find_int_parameter(
            self._model.backend_parameters, ["data-parallel-size", "dp"]
        )
        if dps and dps > 1:
            # Prefer to use Ray backend for data parallelism if DP size is specified.
            dpb = find_parameter(
                self._model.backend_parameters, ["data-parallel-backend", "dpb"]
            )
            if dpb is None:
                arguments.extend(["--data-parallel-backend", "ray"])
            # Specify a port for DP RPC communication,
            # we must get more than one port here, see gpustack/worker/serve_manager.py,
            # so we use ports[1] for DP RPC communication.
            arguments.extend(
                ["--data-parallel-rpc-port", str(self._model_instance.ports[1])]
            )

    # Wire up LMCache as the KV connector when extended KV cache is enabled
    # and the device vendor is NVIDIA or AMD; otherwise only warn.
    if self._model.extended_kv_cache and self._model.extended_kv_cache.enabled:
        vendor, _, _ = self._get_device_info()
        if vendor in {
            manufacturer_to_backend(ManufacturerEnum.NVIDIA),
            manufacturer_to_backend(ManufacturerEnum.AMD),
        }:
            arguments.extend(
                [
                    "--kv-transfer-config",
                    '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}',
                ]
            )
        else:
            logger.warning(
                "Extended KV cache for vLLM is only supported on NVIDIA and AMD GPUs. Skipping LMCache configuration."
            )

    # For Ascend 310P, we need to enforce eager execution and default dtype to float16
    if is_ascend_310p(self._get_selected_gpu_devices()):
        arguments.extend(
            [
                "--enforce-eager",
                "--dtype",
                "float16",
            ]
        )

    # Inject user-defined backend parameters
    user_backend_parameters = self._flatten_backend_param()
    arguments.extend(user_backend_parameters)

    # Append immutable arguments to ensure proper operation for accessing.
    # Only add if not already present in arguments.
    extend_args_no_exist(
        arguments,
        ("--host", self._worker.ip),
        ("--port", str(port)),
        ("--served-model-name", self._model_instance.model_name),
    )

    # Work out which of the final arguments were added by GPUStack itself.
    injected = self._get_injected_backend_parameters(
        arguments, user_backend_parameters, entrypoint
    )
    return arguments, injected
def _build_ray_configuration(
    self,
    is_leader: bool,
) -> Tuple[List[str], List[str], Optional[List[ContainerPort]]]:
    """
    Build the `ray start` command, its arguments and the ports it uses.

    Ports are taken from the configured Ray port range in this order:

      1. GCS server port (the first port of the range)
      2. Client port (reserved for compatibility, not used anymore,
         see https://github.com/gpustack/gpustack/issues/4171)
      3. Dashboard port
      4. Dashboard gRPC port (no longer used since Ray 2.45.0,
         kept for backward compatibility)
      5. Dashboard agent gRPC port
      6. Dashboard agent listen port
      7. Metrics export port
      8. Node Manager port
      9. Object Manager port
     10. Raylet runtime env agent port
     11. Minimum port number for the worker
     12. Maximum port number for the worker (the last port of the range)

    Args:
        is_leader: True to start a Ray head node, False to join the head
            running on the model instance's main worker.

    Returns:
        A tuple of (command, arguments, ports): the `ray start` command,
        its command-line arguments, and the container ports to declare.
    """
    start, end = network.parse_port_range(self._config.ray_port_range)
    gcs_server_port = start
    # start + 1 is the reserved (unused) client port.
    dashboard_port = start + 2
    dashboard_grpc_port = start + 3
    dashboard_agent_grpc_port = start + 4
    dashboard_agent_listen_port = start + 5
    metrics_export_port = start + 6
    node_manager_port = start + 7
    object_manager_port = start + 8
    raylet_runtime_env_agent_port = start + 9
    worker_port_min = start + 10
    worker_port_max = end

    command = [
        "ray",
        "start",
    ]
    arguments = [
        "--block",
        "--disable-usage-stats",
        "--verbose",
        f"--node-manager-port={node_manager_port}",
        f"--object-manager-port={object_manager_port}",
        f"--runtime-env-agent-port={raylet_runtime_env_agent_port}",
        f"--dashboard-agent-grpc-port={dashboard_agent_grpc_port}",
        f"--dashboard-agent-listen-port={dashboard_agent_listen_port}",
        f"--metrics-export-port={metrics_export_port}",
        f"--min-worker-port={worker_port_min}",
        f"--max-worker-port={worker_port_max}",
        f"--node-ip-address={self._worker.ip}",
    ]
    # Ports declared by every Ray node; the worker port range is not listed.
    ports = [
        ContainerPort(
            internal=port,
        )
        for port in [
            dashboard_grpc_port,
            dashboard_agent_grpc_port,
            dashboard_agent_listen_port,
            metrics_export_port,
            node_manager_port,
            object_manager_port,
            raylet_runtime_env_agent_port,
        ]
    ]

    if is_leader:
        # The head node additionally serves the GCS and the dashboard.
        arguments.extend(
            [
                "--head",
                f"--port={gcs_server_port}",
                f"--dashboard-host={self._worker.ip}",
                f"--dashboard-port={dashboard_port}",
            ]
        )
        ports.extend(
            [
                ContainerPort(
                    internal=port,
                )
                for port in [gcs_server_port, dashboard_port]
            ]
        )
    else:
        # Followers connect to the GCS on the model instance's main worker.
        arguments.extend(
            [
                f"--address={self._model_instance.worker_ip}:{gcs_server_port}",
            ]
        )

    return command, arguments, ports
def get_auto_parallelism_arguments(
    backend_parameters: List[str],
    model_instance: ModelInstance,
    is_distributed: bool,
) -> List[str]:
    """
    Derive default parallelism arguments for vLLM.

    Nothing is derived when the user already specified any tensor, pipeline
    or data parallel size.

    Args:
        backend_parameters: User-specified backend parameters.
        model_instance: Model instance whose placement drives the defaults.
        is_distributed: Whether the instance spans multiple workers.

    Returns:
        Tensor/pipeline parallel arguments for a distributed instance, a
        tensor parallel size equal to the GPU count for a single worker
        with more than one GPU, otherwise an empty list.
    """
    user_parallelism_keys = [
        "tensor-parallel-size",
        "tp",
        "pipeline-parallel-size",
        "pp",
        "data-parallel-size",
        "dp",
    ]
    if find_parameter(backend_parameters, user_parallelism_keys) is not None:
        return []

    if is_distributed:
        # distributed across multiple workers
        tp, pp = cal_distributed_parallelism_arguments(model_instance)
        return [
            "--tensor-parallel-size",
            str(tp),
            "--pipeline-parallel-size",
            str(pp),
        ]

    gpu_indexes = model_instance.gpu_indexes
    if gpu_indexes is None or len(gpu_indexes) <= 1:
        return []

    # single worker with multiple GPUs
    return ["--tensor-parallel-size", str(len(gpu_indexes))]
def get_access_log_arguments(
    backend_parameters: List[str], backend_version: Optional[str] = None
) -> List[str]:
    """
    Get default vLLM access log filter arguments.

    --disable-access-log-for-endpoints was introduced in vLLM 0.16.0, so
    nothing is returned for an unknown or older backend version, nor when
    the user already set the option.

    Returns:
        `["--disable-access-log-for-endpoints", "/metrics"]` or an empty list.
    """
    if not backend_version or compare_versions(backend_version, "0.16.0") < 0:
        return []

    user_filter = find_parameter(
        backend_parameters,
        ["disable-access-log-for-endpoints"],
    )
    if user_filter is None:
        return ["--disable-access-log-for-endpoints", "/metrics"]
    return []
def get_cache_report_arguments(
    backend_parameters: List[str], backend_version: Optional[str] = None
) -> List[str]:
    """
    Auto-inject `--enable-prompt-tokens-details` so vLLM populates
    `usage.prompt_tokens_details.cached_tokens` in OpenAI responses.

    Only injected for vLLM >= v0.9.0.1 — earlier V1 builds silently dropped
    the field (https://github.com/vllm-project/vllm/pull/18149). Nothing is
    injected when the backend version is unknown, or when the lookup of the
    enable/no-enable flag in the user's parameters is truthy.

    Prefix caching itself is the user's responsibility (`--enable-prefix-caching`):
    V1 has it on by default, V0 does not.
    """
    if not backend_version or compare_versions(backend_version, "0.9.0.1") < 0:
        return []

    already_specified = find_bool_parameter(
        backend_parameters,
        ["enable-prompt-tokens-details", "no-enable-prompt-tokens-details"],
    )
    return [] if already_specified else ["--enable-prompt-tokens-details"]
|