base.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. import logging
  2. import os
  3. import sys
  4. import shlex
  5. import json
  6. import threading
  7. from functools import lru_cache
  8. from pathlib import Path
  9. from typing import Dict, Optional, List, Tuple, Union
  10. from abc import ABC, abstractmethod
  11. from transformers import PretrainedConfig
  12. from gpustack_runner import list_backend_runners
  13. from gpustack_runner.runner import BackendVersionedRunner
  14. from gpustack_runtime.deployer import ContainerResources, ContainerMount, ContainerPort
  15. from gpustack_runtime.deployer.__utils__ import compare_versions
  16. from gpustack_runtime.detector import (
  17. ManufacturerEnum,
  18. available_backends,
  19. )
  20. from gpustack_runtime.detector.ascend import get_ascend_cann_variant
  21. from gpustack_runtime import envs as runtime_envs
  22. from gpustack_runtime.envs import (
  23. to_bool,
  24. )
  25. from gpustack_runtime.logging import setup_logging as setup_runtime_logging
  26. from gpustack_runtime.deployer.docker import DockerWorkloadPlan
  27. from gpustack_runtime.deployer import WorkloadPlan
  28. from gpustack.client.generated_clientset import ClientSet
  29. from gpustack.config.config import Config, set_global_config
  30. from gpustack.logging import setup_logging
  31. from gpustack.schemas.inference_backend import InferenceBackend, ContainerEnvConfig
  32. from gpustack.schemas.models import (
  33. BackendEnum,
  34. ModelInstance,
  35. ModelInstanceUpdate,
  36. ModelInstanceStateEnum,
  37. ModelUpdate,
  38. ModelInstanceDeploymentMetadata,
  39. )
  40. from gpustack.schemas.workers import GPUDevicesStatus
  41. from gpustack.server.bus import Event
  42. from gpustack.utils.config import apply_registry_override_to_image
  43. from gpustack.utils.envs import filter_env_vars
  44. from gpustack.utils.hub import get_hf_text_config, get_max_model_len
  45. from gpustack.utils.hub import get_pretrained_config, safe_pretrained_config_from_dict
  46. from gpustack.utils.profiling import time_decorator
  47. from gpustack.utils import platform
  48. from gpustack.utils.runtime import transform_workload_plan
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Module-level lock.
# NOTE(review): no acquisition of this lock is visible in this part of the
# file - confirm where it is used before relying on it.
lock = threading.Lock()
  51. class ModelInstanceStateError(Exception):
  52. pass
  53. # Reference: requirements for `usage.prompt_tokens_details.cached_tokens` to
  54. # appear in OpenAI-compatible responses, per backend.
  55. #
  56. # Tracked in https://github.com/gpustack/gpustack/issues/5189.
  57. #
  58. # vLLM
  59. # ----
  60. # - Prefix caching must be on. V1 enables it by default; V0 requires
  61. # `--enable-prefix-caching`. *User responsibility.*
  62. # - `--enable-prompt-tokens-details` is required to populate the field.
  63. # Broken in V1 prior to v0.9.0.1
  64. # (https://github.com/vllm-project/vllm/pull/18149).
  65. # - GPUStack auto-injects `--enable-prompt-tokens-details` for vLLM
  66. # >= 0.9.0.1. See `gpustack.worker.backends.vllm.get_cache_report_arguments`.
  67. #
  68. # SGLang
  69. # ------
  70. # - RadixAttention prefix caching is on by default; no extra flag.
  71. # - `--enable-cache-report` is required to populate the field (since v0.3.4).
  72. # - GPUStack auto-injects `--enable-cache-report` for SGLang >= 0.3.4. See
  73. # `gpustack.worker.backends.sglang.get_cache_report_arguments`.
  74. #
  75. # Ascend MindIE
  76. # -------------
  77. # - Prefix caching must be on. *User responsibility:* add
  78. # `--enable-prefix-caching` to the model's backend parameters. Conflicts
  79. # with `--rope-scaling` and `--data-parallel-size > 1` (validated by
  80. # `AscendMindIEParameters._validate`).
# - Cache token details require MindIE >= 2.3.0.
# - GPUStack does not auto-inject any cache-related flag for MindIE.
  82. class InferenceServer(ABC):
  83. _model_path: Optional[str] = None
  84. _draft_model_path: Optional[str] = None
  85. """
  86. The absolute path to the model files.
  87. This is set when the model instance state changes to STARTING.
  88. """
  89. _pretrained_config: Optional[PretrainedConfig] = None
  90. """The model configuration, if available."""
  91. _pretrained_config_initialized: bool = False
  92. """Whether pretrained config loading has been attempted."""
  93. _fallback_registry: Optional[str] = None
  94. """The fallback container registry to use if needed."""
    @time_decorator
    def __init__(
        self,
        clientset: ClientSet,
        mi: ModelInstance,
        cfg: Config,
        worker_id: int,
        inference_backend: InferenceBackend,
        fallback_registry: Optional[str] = None,
    ):
        """
        Initialize the server for a model instance and wait for its model files.

        Configures logging and the global config, looks up the worker and the
        model, settles on an inference backend, then watches the model instance
        until `_stop_when_starting` ends the watch.

        Any failure ends the process with exit status 1. For failures other
        than `ModelInstanceStateError`, the model instance is first patched to
        the ERROR state with the failure message (best effort).

        Args:
            clientset:
                The API client used to read and update workers, models and
                model instances.
            mi:
                The model instance to serve.
            cfg:
                The configuration; also installed as the global config.
            worker_id:
                The ID of the worker this server runs on.
            inference_backend:
                The inference backend definition. May be falsy, in which case
                a basic Custom backend is synthesized when the model specifies
                both an image name and a run command.
            fallback_registry:
                The fallback container registry to use if needed.
        """
        setup_logging(debug=cfg.debug)
        setup_runtime_logging()
        set_global_config(cfg)
        try:
            self._clientset = clientset
            self._model_instance = mi
            self._config = cfg
            self._fallback_registry = fallback_registry
            self._worker = self._clientset.workers.get(worker_id)
            if not self._worker:
                raise KeyError(f"Worker {worker_id} not found")
            # Populates self._model, which the checks below depend on.
            self.get_model()
            self.inference_backend = inference_backend
            if (
                not inference_backend
                and self._model.image_name
                and self._model.run_command
            ):
                # Any deployment that directly specifies an image and command is treated as a Custom backend.
                # A basic InferenceBackend object is created to prevent exceptions in subsequent workflows.
                self.inference_backend = InferenceBackend(
                    backend_name=BackendEnum.CUSTOM.value,
                    run_command=self._model.run_command,
                )
            if not self.inference_backend:
                raise KeyError(
                    f"Inference backend {self._model.backend} not specified or not found"
                )
            logger.info("Preparing model files...")
            # Returns once the watch's stop condition (`_stop_when_starting`) is met.
            self._until_model_instance_starting()
            logger.info("Model files are ready.")
        except ModelInstanceStateError:
            # Raised by `_stop_when_starting` when the instance is already in
            # the ERROR state, so exit without patching the state again.
            sys.exit(1)
        except Exception as e:
            error_message = f"Failed to initialize: {e}"
            logger.error(error_message)
            try:
                patch_dict = {
                    "state_message": error_message,
                    "state": ModelInstanceStateEnum.ERROR,
                }
                self._update_model_instance(mi.id, **patch_dict)
            except Exception as ue:
                # Best effort: a failed state update must not hide the original failure.
                logger.error(f"Failed to update model instance: {ue}")
            sys.exit(1)
  150. def _stop_when_starting(self, event: Event) -> bool:
  151. if event.data["state"] == ModelInstanceStateEnum.ERROR:
  152. raise ModelInstanceStateError()
  153. elif event.data["state"] == ModelInstanceStateEnum.STARTING:
  154. self._model_path = str(Path(event.data["resolved_path"]).absolute())
  155. if event.data["draft_model_resolved_path"]:
  156. self._draft_model_path = str(
  157. Path(event.data["draft_model_resolved_path"]).absolute()
  158. )
  159. return True
  160. return False
  161. @abstractmethod
  162. def start(self):
  163. pass
  164. def get_model(self):
  165. model = self._clientset.models.get(id=self._model_instance.model_id)
  166. data_dir = self._config.data_dir
  167. for i, param in enumerate(model.backend_parameters or []):
  168. model.backend_parameters[i] = param.replace("{data_dir}", data_dir)
  169. self._model = model
  170. def _until_model_instance_starting(self):
  171. self._clientset.model_instances.watch(
  172. callback=None,
  173. stop_condition=self._stop_when_starting,
  174. params={"id": self._model_instance.id},
  175. )
  176. def _update_model_instance(self, id: int, **kwargs):
  177. mi_public = self._clientset.model_instances.get(id=id)
  178. mi = ModelInstanceUpdate(**mi_public.model_dump())
  179. for key, value in kwargs.items():
  180. setattr(mi, key, value)
  181. self._clientset.model_instances.update(id=id, model_update=mi)
  182. def _handle_error(self, error: Exception):
  183. """
  184. Handle errors during backend server startup in a unified way.
  185. Updates model instance state and re-raises the original error.
  186. """
  187. cause = getattr(error, "__cause__", None)
  188. cause_text = f": {cause}" if cause else ""
  189. error_message = f"Failed to run {self._model.backend}: {error}{cause_text}"
  190. try:
  191. is_main_worker = self._model_instance.worker_id == self._worker.id
  192. if is_main_worker:
  193. patch_dict = {
  194. "state_message": error_message,
  195. "state": ModelInstanceStateEnum.ERROR,
  196. }
  197. self._update_model_instance(self._model_instance.id, **patch_dict)
  198. else:
  199. # For subordinate workers, update sw.state instead of mi.state
  200. # to avoid race conditions with the main worker's state management.
  201. self._update_subordinate_worker_error(error_message)
  202. except Exception as ue:
  203. logger.error(f"Failed to update model instance: {ue}")
  204. raise error
  205. def _update_subordinate_worker_error(self, error_message: str):
  206. """
  207. Update the subordinate worker's state to ERROR.
  208. Fetches the latest model instance to get the current subordinate worker state,
  209. then updates only this worker's entry.
  210. """
  211. mi_public = self._clientset.model_instances.get(id=self._model_instance.id)
  212. mi = ModelInstanceUpdate(**mi_public.model_dump())
  213. sw_pos = next(
  214. (
  215. i
  216. for i, sw in enumerate(mi.distributed_servers.subordinate_workers)
  217. if sw.worker_id == self._worker.id
  218. ),
  219. )
  220. mi.distributed_servers.subordinate_workers[sw_pos].state = (
  221. ModelInstanceStateEnum.ERROR
  222. )
  223. mi.distributed_servers.subordinate_workers[sw_pos].state_message = error_message
  224. self._clientset.model_instances.update(
  225. id=self._model_instance.id, model_update=mi
  226. )
  227. def _get_deployment_metadata(self) -> ModelInstanceDeploymentMetadata:
  228. """
  229. Get the deployment metadata for the model instance.
  230. Returns:
  231. The deployment metadata.
  232. Raises:
  233. RuntimeError:
  234. If the model instance is not handling by the current worker.
  235. """
  236. deployment_metadata = self._model_instance.get_deployment_metadata(
  237. self._worker.id
  238. )
  239. if not deployment_metadata:
  240. raise RuntimeError(
  241. "Failed to get deployment metadata: model instance is not handling by the current worker"
  242. )
  243. return deployment_metadata
    def _get_pretrained_config(self) -> Optional[PretrainedConfig]:
        """
        Get the pretrained model configuration, if available.

        The result is cached on the instance after the first successful
        attempt, including a None result from the local fallback. When both
        loaders fail nothing is cached, so the next call tries again.

        Returns:
            The pretrained model configuration object, or None if not available.

        Raises:
            RuntimeError:
                If both `get_pretrained_config` and the local config.json
                fallback raise; the message carries both errors.
        """
        if self._pretrained_config_initialized:
            return self._pretrained_config
        # Kept so that it can be reported together with a fallback failure.
        auto_config_error: Optional[Exception] = None
        try:
            pretrained_config = get_pretrained_config(self._model)
            if isinstance(pretrained_config, dict):
                # Ensure we have a PretrainedConfig object, not a dict, for consistency.
                pretrained_config = safe_pretrained_config_from_dict(pretrained_config)
            self._pretrained_config = pretrained_config
            self._pretrained_config_initialized = True
            return pretrained_config
        except Exception as e:
            logger.debug(
                f"Failed to get pretrained config via AutoConfig, falling back to local config.json. Error: {e}"
            )
            auto_config_error = e
        try:
            # May return None (no model path, no config.json, or non-dict JSON);
            # that None is cached as the final answer.
            fallback_config = self._load_pretrained_config_from_local_config_json()
            self._pretrained_config = fallback_config
            self._pretrained_config_initialized = True
            return fallback_config
        except Exception as e:
            raise RuntimeError(
                "Failed to load pretrained config. "
                f"AutoConfig error: {auto_config_error}. "
                f"Local config.json fallback error: {e}."
            ) from e
  277. def _load_pretrained_config_from_local_config_json(
  278. self,
  279. ) -> Optional[PretrainedConfig]:
  280. """
  281. Load PretrainedConfig from local config.json under resolved model path.
  282. """
  283. if not self._model_path:
  284. return None
  285. config_path = os.path.join(self._model_path, "config.json")
  286. if not os.path.isfile(config_path):
  287. return None
  288. with open(config_path, "r", encoding="utf-8") as f:
  289. config_dict = json.load(f)
  290. if isinstance(config_dict, dict):
  291. return safe_pretrained_config_from_dict(config_dict)
  292. return None
  293. def _derive_max_model_len(self, default: Optional[int] = None) -> Optional[int]:
  294. """
  295. Derive max model length from model config.
  296. Returns default value if unavailable.
  297. Args:
  298. default:
  299. The default max model length to return if unable to derive from config.
  300. Returns:
  301. The derived max model length, or the default value if derivation fails.
  302. """
  303. try:
  304. pretrained_config = self._get_pretrained_config()
  305. pretrained_or_hf_text_config = get_hf_text_config(pretrained_config)
  306. return get_max_model_len(pretrained_or_hf_text_config)
  307. except Exception as e:
  308. logger.warning(
  309. f"Failed to derive max model length: {e}, continuing with default"
  310. )
  311. return default
  312. def _get_model_architecture(self) -> List[str]:
  313. """
  314. Get model architecture from model config.
  315. Returns:
  316. A list of model architecture strings.
  317. """
  318. try:
  319. pretrained_config = self._get_pretrained_config()
  320. if pretrained_config and hasattr(pretrained_config, "architectures"):
  321. return pretrained_config.architectures
  322. except Exception as e:
  323. logger.warning(
  324. f"Failed to derive model architecture: {e}, continuing with empty list"
  325. )
  326. return []
  327. def _get_configured_env(self, **kwargs) -> Dict[str, str]:
  328. """
  329. Get the environment variables for the model instance.
  330. Merge the model's env with the system env.
  331. If there are conflicts, the model's env takes precedence.
  332. Returns:
  333. A dictionary of environment variables for the model instance.
  334. """
  335. env = {}
  336. if not runtime_envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT:
  337. env = filter_env_vars(os.environ)
  338. if self._model.env:
  339. env.update(self._model.env)
  340. return env
  341. @lru_cache
  342. def _get_selected_gpu_devices(self) -> GPUDevicesStatus:
  343. """
  344. Get the GPU devices assigned to the model instance.
  345. Returns:
  346. A list of GPU device information assigned to the model instance.
  347. """
  348. minstance = self._model_instance
  349. dservers = minstance.distributed_servers
  350. gpu_type = None
  351. if (
  352. dservers
  353. and dservers.subordinate_workers
  354. and minstance.worker_id != self._worker.id
  355. ):
  356. subworker = next(
  357. (
  358. w
  359. for w in dservers.subordinate_workers
  360. if w.worker_id == self._worker.id
  361. ),
  362. None,
  363. )
  364. gpu_indexes = sorted(subworker.gpu_indexes or [])
  365. gpu_type = subworker.gpu_type
  366. else:
  367. gpu_indexes = sorted(self._model_instance.gpu_indexes or [])
  368. gpu_type = self._model_instance.gpu_type
  369. gpu_devices: GPUDevicesStatus = []
  370. if gpu_indexes and self._worker.status.gpu_devices:
  371. for index in gpu_indexes:
  372. gpu_device = next(
  373. (
  374. d
  375. for d in self._worker.status.gpu_devices
  376. if d.index == index and (gpu_type is None or d.type == gpu_type)
  377. ),
  378. None,
  379. )
  380. if gpu_device:
  381. gpu_devices.append(gpu_device)
  382. return gpu_devices
  383. def _get_device_info(self) -> Tuple[Optional[str], Optional[str], Optional[str]]:
  384. """Get the device information for the serving.
  385. If not found, retrieve from the first device of the worker.
  386. Returns:
  387. A tuple of (vendor, runtime_version, arch_family).
  388. """
  389. gpu_devices = self._get_selected_gpu_devices()
  390. if gpu_devices:
  391. gpu_device = gpu_devices[0]
  392. return (
  393. gpu_device.type,
  394. gpu_device.runtime_version,
  395. gpu_device.arch_family,
  396. )
  397. elif self._worker.status.gpu_devices:
  398. gpu_device = self._worker.status.gpu_devices[0]
  399. return (
  400. gpu_device.type,
  401. gpu_device.runtime_version,
  402. gpu_device.arch_family,
  403. )
  404. return None, None, None
  405. def _get_configured_resources(
  406. self, mount_all_devices: bool = False
  407. ) -> ContainerResources:
  408. """
  409. Get the resource requests for the model instance.
  410. Args:
  411. mount_all_devices:
  412. Whether to mount all available GPU devices.
  413. If true, ignores the GPUs assigned to the model instance and try to mount all available GPUs.
  414. Returns:
  415. A ContainerResources object representing the resource requests for the model instance.
  416. Raises:
  417. If the GPUs assigned to the model instance are of different types.
  418. """
  419. resources = ContainerResources()
  420. gpu_devices = self._get_selected_gpu_devices()
  421. if gpu_devices:
  422. gpu_type = gpu_devices[0].type
  423. for device in gpu_devices[1:]:
  424. if device.type != gpu_type:
  425. raise RuntimeError(
  426. "All GPUs assigned to the model instance must be of the same type."
  427. )
  428. key = runtime_envs.GPUSTACK_RUNTIME_DETECT_BACKEND_MAP_RESOURCE_KEY.get(
  429. gpu_type
  430. )
  431. if key:
  432. resources[key] = (
  433. ",".join(str(d.index) for d in gpu_devices)
  434. if not mount_all_devices
  435. else "all"
  436. )
  437. return resources
  438. def _get_configured_mounts(self) -> List[ContainerMount]:
  439. """
  440. Get the volume mounts for the model instance.
  441. If runtime mirrored deployment is enabled, no mounts will be set up.
  442. Returns:
  443. A list of ContainerMount objects for the model instance.
  444. """
  445. mounts: List[ContainerMount] = []
  446. if (
  447. self._model_path
  448. and not runtime_envs.GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT
  449. ):
  450. model_dir = os.path.dirname(self._model_path)
  451. mounts.append(
  452. ContainerMount(
  453. path=model_dir,
  454. ),
  455. )
  456. return mounts
  457. def _get_configured_ports(self) -> List[ContainerPort]:
  458. """
  459. Get the ports for the model instance.
  460. Returns:
  461. A list of ContainerPort objects for the model instance.
  462. """
  463. return [
  464. ContainerPort(
  465. internal=port,
  466. )
  467. for port in self._model_instance.ports or []
  468. ]
  469. @staticmethod
  470. def _get_container_env_config(env: Dict[str, str]) -> ContainerEnvConfig:
  471. """
  472. Read container configuration from environment variables passed to the container.
  473. Args:
  474. env: The environment variables dictionary passed to the container.
  475. Returns:
  476. A ContainerEnvConfig containing container configuration:
  477. - user: Run as specific UID (int)
  478. - group: Run as specific GID (int)
  479. - shm_size_gib: Shared memory size in GiB (float, default 10.0)
  480. """
  481. config = ContainerEnvConfig()
  482. # Read user ID
  483. uid_str = env.get("GPUSTACK_MODEL_RUNTIME_UID")
  484. if uid_str:
  485. try:
  486. config.user = int(uid_str)
  487. except ValueError:
  488. logger.warning(
  489. f"Invalid GPUSTACK_MODEL_RUNTIME_UID value: {uid_str}, ignoring"
  490. )
  491. # Read group ID
  492. gid_str = env.get("GPUSTACK_MODEL_RUNTIME_GID")
  493. if gid_str:
  494. try:
  495. config.group = int(gid_str)
  496. except ValueError:
  497. logger.warning(
  498. f"Invalid GPUSTACK_MODEL_RUNTIME_GID value: {gid_str}, ignoring"
  499. )
  500. # Read shared memory size in GiB
  501. shm_str = env.get("GPUSTACK_MODEL_RUNTIME_SHM_SIZE_GIB", "10")
  502. try:
  503. config.shm_size_gib = float(shm_str)
  504. except ValueError:
  505. logger.warning(
  506. f"Invalid GPUSTACK_MODEL_RUNTIME_SHM_SIZE_GIB value: {shm_str}, using default 10.0"
  507. )
  508. config.shm_size_gib = 10.0
  509. return config
  510. def _get_serving_port(self) -> int:
  511. """
  512. Get the (main) serving port for the model instance.
  513. Returns:
  514. The (main) serving port for the model instance.
  515. """
  516. return (
  517. self._model_instance.ports[0]
  518. if self._model_instance.ports
  519. else self._model_instance.port
  520. )
    @staticmethod
    def _get_serving_command_script(env: dict[str, str]) -> Optional[str]:
        """
        Get the serving command script for the model instance.

        The script installs the packages listed in `PYPI_PACKAGES_INSTALL`
        (with `uv` when available, otherwise with `pip`), then replaces
        itself with the command passed as its arguments.

        Return None if `GPUSTACK_MODEL_SERVING_COMMAND_SCRIPT_DISABLED` is set
        to a value that `to_bool` treats as true, or if `env` is empty or does
        not contain `PYPI_PACKAGES_INSTALL`.

        Args:
            env:
                The environment variables for the model instance.

        Returns:
            The serving command script for the model instance, or None if not needed.
        """
        # Skip if explicitly disabled.
        if env and to_bool(
            env.get("GPUSTACK_MODEL_SERVING_COMMAND_SCRIPT_DISABLED", "0")
        ):
            return None
        # Skip if no specific envs are set.
        if not env or "PYPI_PACKAGES_INSTALL" not in env:
            return None
        return """#!/bin/sh
#
# Prepare
#
if [ -n "${PYPI_PACKAGES_INSTALL:-}" ]; then
if command -v uv >/dev/null 2>&1; then
echo "Installing additional PyPi packages: ${PYPI_PACKAGES_INSTALL}"
export UV_HTTP_TIMEOUT=500
export UV_NO_CACHE=1
if [ -n "${PIP_INDEX_URL:-}" ]; then
export UV_DEFAULT_INDEX="${PIP_INDEX_URL}"
export UV_INDEX_URL="${PIP_INDEX_URL}"
fi
if [ -n "${PIP_EXTRA_INDEX_URL:-}" ]; then
export UV_INDEX="${PIP_EXTRA_INDEX_URL}"
export UV_EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL}"
fi
uv pip install --system ${PYPI_PACKAGES_INSTALL}
uv pip tree --system
elif command -v pip >/dev/null 2>&1; then
echo "Installing additional PyPi packages: ${PYPI_PACKAGES_INSTALL}"
export PIP_DISABLE_PIP_VERSION_CHECK=1
export PIP_ROOT_USER_ACTION=ignore
export PIP_TIMEOUT=500
export PIP_NO_CACHE_DIR=1
pip install ${PYPI_PACKAGES_INSTALL}
pip freeze
fi
unset PYPI_PACKAGES_INSTALL
fi
#
# Execute
#
exec "$@"
"""
  576. def build_versioned_command_args(
  577. self,
  578. default_args: List[str],
  579. model_path: Optional[str] = None,
  580. port: Optional[int] = None,
  581. ) -> List[str]:
  582. """
  583. Override default startup arguments based on version configuration
  584. when the version uses non-built-in version and defines a custom run_command
  585. Args:
  586. - default_args: The default command argument list (e.g., ["vllm", "serve", "/path/to/model"]).
  587. - model_path: Path used to replace {{model_path}}; if None, fall back to self._model_path.
  588. - port: Port used to replace {{port}}; if None, fall back to self._model_instance.port.
  589. Returns:
  590. The final command argument list used for container execution.
  591. """
  592. # if no version or inference backend is available, return default_args
  593. version = self._model.backend_version
  594. if not version or not self.inference_backend:
  595. return default_args
  596. # Load version configuration
  597. version_config = None
  598. try:
  599. version_config, version = self.inference_backend.get_version_config(version)
  600. except Exception:
  601. version_config = self.inference_backend.version_configs.root.get(version)
  602. # Only perform replacement when the version uses non-built-in version and defines run_command
  603. if (
  604. version_config
  605. and version_config.built_in_frameworks is None
  606. and version_config.run_command
  607. ):
  608. resolved_model_path = (
  609. model_path if model_path is not None else self._model_path
  610. )
  611. resolved_port = port if port is not None else self._model_instance.port
  612. resolved_model_name = self._model_instance.model_name
  613. command = self.inference_backend.replace_command_param(
  614. version=version,
  615. model_path=resolved_model_path,
  616. port=resolved_port,
  617. worker_ip=self._worker.ip,
  618. model_name=resolved_model_name,
  619. command=version_config.run_command,
  620. env=self._model.env,
  621. )
  622. if command:
  623. return shlex.split(command)
  624. # Return original default_args by default
  625. return default_args
  626. @staticmethod
  627. def _get_backend_parameter_start_index(
  628. arguments: List[str],
  629. entrypoint: Optional[List[str]] = None,
  630. ) -> int:
  631. """
  632. Return where backend parameters start in container args.
  633. When a container entrypoint is configured separately, `arguments`
  634. contains only entrypoint arguments, so backend parameters start at 0.
  635. Otherwise, skip the command prefix embedded in `arguments`.
  636. """
  637. if entrypoint:
  638. return 0
  639. if not arguments:
  640. return 0
  641. command = os.path.basename(arguments[0])
  642. if (
  643. len(arguments) >= 3
  644. and command.startswith("python")
  645. and arguments[1] == "-m"
  646. ):
  647. return 3
  648. for index, argument in enumerate(arguments):
  649. if argument.startswith("-"):
  650. return index
  651. return len(arguments)
  652. def _get_injected_backend_parameters(
  653. self,
  654. arguments: List[str],
  655. user_backend_parameters: List[str],
  656. entrypoint: Optional[List[str]] = None,
  657. ) -> List[str]:
  658. """
  659. Derive injected backend parameters from the final command line.
  660. The final command is the source of truth: remove the command prefix (or
  661. separate container entrypoint) and the user-specified backend
  662. parameters, and the remaining backend parameters are injected by
  663. GPUStack.
  664. """
  665. start_index = self._get_backend_parameter_start_index(arguments, entrypoint)
  666. candidates = arguments[start_index:]
  667. if not user_backend_parameters:
  668. return candidates
  669. user_param_len = len(user_backend_parameters)
  670. for start in range(len(candidates) - user_param_len, -1, -1):
  671. end = start + user_param_len
  672. if candidates[start:end] == user_backend_parameters:
  673. return candidates[:start] + candidates[end:]
  674. return candidates
  675. def _get_configured_image(
  676. self,
  677. backend: Optional[str] = None,
  678. ) -> Optional[str]:
  679. """
  680. Resolve the container image to use for the current backend, then apply
  681. registry override once if needed.
  682. See _resolve_image for resolution details.
  683. """
  684. image_name, target_version = self._resolve_image(backend)
  685. if image_name is None:
  686. return None
  687. # Update model backend service version at upper layer if we detected it
  688. if target_version:
  689. self._update_model_backend_service_version(target_version)
  690. return apply_registry_override_to_image(
  691. self._config, image_name, self._fallback_registry
  692. )
  693. def _resolve_image( # noqa: C901
  694. self,
  695. backend: Optional[str] = None,
  696. ) -> (Optional[str], Optional[str]):
  697. """
  698. Resolve the container image to use for the current backend.
  699. This method returns the raw image name without applying any registry
  700. override. Callers should apply overrides as needed.
  701. Precedence:
  702. 1) Explicitly configured image on the model (self._model.image_name)
  703. 2) Prefer image name from the user's config when using custom backend or built-in backend with a custom version
  704. 3) Auto-detected image from gpustack-runner based on device vendor/arch and backend
  705. Return:
  706. image_name, backend_version
  707. """
  708. # 1) Return directly if explicitly provided.
  709. if self._model.image_name:
  710. return self._model.image_name, None
  711. # 2) Configuration takes priority when backend_version is set
  712. if self._model and self.inference_backend:
  713. image_name, target_version = self.inference_backend.get_image_name(
  714. self._model.backend_version
  715. )
  716. if image_name and target_version:
  717. return image_name, target_version
  718. """
  719. Prepare queries for retrieving runners.
  720. """
  721. def get_docker_image(bvr: BackendVersionedRunner) -> str:
  722. return bvr.variants[0].services[0].versions[0].platforms[0].docker_image
  723. backend, runtime_version, arch_family = self._get_device_info()
  724. if not backend:
  725. # Return directly if there is not a valid device.
  726. # GPUStack-Runner does not provide CPU-only platform images.
  727. # To use a CPU-only version, user must configure in `Inference Backend` page.
  728. return None
  729. if backend not in available_backends():
  730. # Return directly if found backend is not within the available backends.
  731. return None
  732. """
  733. Retrieve runners by queries.
  734. For example, the queries of runners is as below.
  735. - backend: cuda
  736. backend_variant: None
  737. service: vllm
  738. service_version: 0.10.0
  739. platform: linux/amd64
  740. - backend: cann
  741. backend_variant: 910b
  742. service: vllm
  743. service_version: 0.10.0
  744. platform: linux/arm64
  745. """
  746. backend_variant = None
  747. service = self._model.backend.lower()
  748. model_service_version = self._model.backend_version
  749. service_version = model_service_version
  750. # Default variant for some backends.
  751. if backend == "cann":
  752. if arch_family:
  753. backend_variant = get_ascend_cann_variant(arch_family)
  754. if not backend_variant:
  755. backend_variant = "910b"
  756. logger.info(
  757. f"_resolve_image query: backend={backend}, backend_variant={backend_variant}, service={service}, service_version={service_version}, platform={platform.system_arch()}"
  758. )
  759. runners = list_backend_runners(
  760. backend=backend,
  761. backend_variant=backend_variant,
  762. service=service,
  763. service_version=model_service_version,
  764. platform=platform.system_arch(),
  765. with_deprecated=model_service_version is not None,
  766. )
  767. if not runners:
  768. # Return directly if there is not a valid runner.
  769. return None, None
  770. """
  771. Pick the appropriate backend version from among the multiple versions.
  772. For example, the content of runners is as below.
  773. [
  774. {
  775. "backend": "cuda",
  776. "versions": [
  777. {
  778. "version": "12.8",
  779. ...
  780. },
  781. {
  782. "version": "12.6",
  783. ...
  784. },
  785. {
  786. "version": "12.4",
  787. ...
  788. }
  789. ]
  790. }
  791. ]
  792. """
  793. backend_versioned_runners = runners[0].versions
  794. # Try to update backend version for server model.
  795. if backend_versioned_runners and len(backend_versioned_runners) > 0:
  796. service_version = _get_service_version_from_versioned_runner(
  797. backend_versioned_runners[0]
  798. )
  799. # Return directly if there is only one versioned backend.
  800. if len(backend_versioned_runners) == 1:
  801. return get_docker_image(backend_versioned_runners[0]), service_version
  802. backend_version = runtime_version
  803. # Iterate all backend versions, and get the one that less or equal to backend version.
  804. # Here, we assume the runners' sequence is ordered and arranged in descending order.
  805. if backend_version:
  806. for backend_versioned_runner in backend_versioned_runners:
  807. if (
  808. compare_versions(backend_versioned_runner.version, backend_version)
  809. <= 0
  810. ):
  811. service_version = _get_service_version_from_versioned_runner(
  812. backend_versioned_runner
  813. )
  814. return get_docker_image(backend_versioned_runner), service_version
  815. # Return the first(latest) backend version of selected runner
  816. # if failed to detect host backend version or no backend version matched.
  817. service_version = _get_service_version_from_versioned_runner(
  818. backend_versioned_runners[0]
  819. )
  820. return get_docker_image(backend_versioned_runners[0]), service_version
  821. def _update_model_backend_service_version(
  822. self, service_version: Optional[str]
  823. ) -> None:
  824. """
  825. Update model backend (service) version back to server if not already set.
  826. This method is extracted from image resolution flow to be called from the upper
  827. layer after the version is detected.
  828. """
  829. if not service_version:
  830. return
  831. try:
  832. if not self._model.backend_version:
  833. self._model.backend_version = service_version
  834. self._clientset.models.update(
  835. self._model.id, ModelUpdate(**self._model.model_dump())
  836. )
  837. if not self._model_instance.backend_version:
  838. self._update_model_instance(
  839. self._model_instance.id, backend_version=service_version
  840. )
  841. except Exception as e:
  842. logger.error(
  843. f"Failed to update model service version {service_version}: {e}"
  844. )
  845. def _flatten_backend_param(self) -> List[str]:
  846. """
  847. Flattens all backend parameter strings into a list of individual tokens.
  848. Each entry in `backend_parameters` may contain one or more whitespace-separated
  849. arguments. This method splits them and returns a single flattened list.
  850. e.g.
  851. self._model.backend_parameters = ["--ctx-size 1024"] -> ["--ctx-size", "1024"]
  852. self._model.backend_parameters = [" --ctx-size=1024"] -> ["--ctx-size=1024"]
  853. self._model.backend_parameters = ["--ctx-size =1024"] -> ["--ctx-size=1024"]
  854. """
  855. result = []
  856. for param in self._model.backend_parameters or []:
  857. # Strip leading/trailing whitespace
  858. param_stripped = param.strip()
  859. if "=" in param_stripped:
  860. # Handle cases like "--foo = bar" or "--foo =bar"
  861. # Split by = and strip whitespace around it
  862. key, value = map(str.strip, param_stripped.split("=", 1))
  863. result.append(f"{key}={value}")
  864. continue
  865. result.extend(shlex.split(param_stripped))
  866. return result
  867. def _transform_workload_plan(
  868. self, workload: WorkloadPlan
  869. ) -> Union[DockerWorkloadPlan, WorkloadPlan]:
  870. """
  871. If the deployer is docker, transform the generic WorkloadPlan to DockerWorkloadPlan,
  872. and fill the pause image and restart image with registry override.
  873. """
  874. return transform_workload_plan(self._config, workload, self._fallback_registry)
  875. def _get_service_version_from_versioned_runner(
  876. backend_versioned_runner: BackendVersionedRunner,
  877. ) -> Optional[str]:
  878. """
  879. Get the service version from the backend versioned runner.
  880. Args:
  881. backend_versioned_runner:
  882. The backend versioned runner.
  883. Returns:
  884. The service version string, or None if not found.
  885. """
  886. try:
  887. return backend_versioned_runner.variants[0].services[0].versions[0].version
  888. except Exception as e:
  889. logger.error(
  890. f"Failed to get service version from backend versioned runner: {e}"
  891. )
  892. return None
  893. def is_ascend_310p(devices: GPUDevicesStatus) -> bool:
  894. """
  895. Check if the model instance is running on VLLM Ascend 310P.
  896. """
  897. return all(
  898. gpu.vendor == ManufacturerEnum.ASCEND.value
  899. and get_ascend_cann_variant(gpu.arch_family) == "310p"
  900. for gpu in devices
  901. )
  902. def is_ascend(devices: GPUDevicesStatus) -> bool:
  903. """
  904. Check if all devices are Ascend.
  905. """
  906. return all(gpu.vendor == ManufacturerEnum.ASCEND.value for gpu in devices)
  907. def cal_distributed_parallelism_arguments(
  908. model_instance: ModelInstance,
  909. ) -> tuple[int, int]:
  910. pp = len(model_instance.distributed_servers.subordinate_workers) + 1
  911. tp = len(model_instance.gpu_indexes) if model_instance.gpu_indexes else 1
  912. uneven_pp = tp
  913. uneven = False
  914. for subordinate_worker in model_instance.distributed_servers.subordinate_workers:
  915. num_gpus = len(subordinate_worker.gpu_indexes)
  916. uneven_pp += num_gpus
  917. if num_gpus != tp:
  918. uneven = True
  919. if uneven:
  920. tp = 1
  921. pp = uneven_pp
  922. logger.warning(
  923. f"The number of GPUs selected for each worker is not equal: {num_gpus} != {tp}, fallback to using pipeline parallelism."
  924. )
  925. return tp, pp