ascend_mindie.py 87 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146
  1. import argparse
  2. import dataclasses
  3. import json
  4. import logging
  5. import os
  6. from functools import lru_cache
  7. from pathlib import Path
  8. from typing import Optional, List, Dict, Any, Tuple
  9. from gpustack_runtime.deployer import (
  10. Container,
  11. ContainerProfileEnum,
  12. ContainerExecution,
  13. ContainerEnv,
  14. WorkloadPlan,
  15. create_workload,
  16. ContainerFile,
  17. ContainerRestartPolicyEnum,
  18. )
  19. from gpustack_runtime.envs import to_bool
  20. from gpustack.schemas.models import ModelInstanceDeploymentMetadata
  21. from gpustack.utils.command import find_parameter, format_backend_parameters
  22. from gpustack.utils.envs import sanitize_env
  23. from gpustack.worker.backends.base import InferenceServer, is_ascend_310p
  24. logger = logging.getLogger(__name__)
  25. @dataclasses.dataclass
  26. class AscendMindIEParameters:
  27. #
  28. # Log config
  29. #
  30. log_level: str = "Info"
  31. #
  32. # Server config
  33. #
  34. max_link_num: int = 1000
  35. token_timeout: int = 600
  36. e2e_timeout: int = 600
  37. openai_support: str = "vllm"
  38. #
  39. # Backend config
  40. #
  41. kv_pool_config: Optional[str] = None
  42. kv_pool_config_parsed: Optional[Dict[str, Any]] = None # store JSON parsed result
  43. #
  44. # Model deploy config
  45. #
  46. max_seq_len: int = 8192
  47. max_input_token_len: int = -1
  48. truncation: bool = False
  49. #
  50. # Model config
  51. #
  52. cpu_mem_size: int = 0
  53. npu_memory_fraction: float = 0.8
  54. trust_remote_code: bool = False
  55. models: Optional[str] = None
  56. models_parsed: Optional[any] = None # store JSON parsed result
  57. async_scheduler_wait_time: int = 120
  58. #
  59. # Schedule config
  60. #
  61. cache_block_size: int = 128
  62. max_prefill_batch_size: int = 50
  63. max_prefill_tokens: int = -1
  64. prefill_time_ms_per_req: int = 150
  65. prefill_policy_type: int = 0
  66. max_batch_size: int = 200
  67. max_iter_times: int = -1
  68. decode_time_ms_per_req: int = 50
  69. decode_policy_type: int = 0
  70. max_preempt_count: int = 0
  71. support_select_batch: bool = False
  72. max_queue_delay_microseconds: int = 5000
  73. max_first_token_wait_time: int = 2500
  74. #
  75. # Extends or Features
  76. #
  77. override_generation_config: Optional[str] = None
  78. override_generation_config_parsed: Optional[any] = None # store JSON parsed result
  79. enforce_eager: bool = False
  80. no_metrics: bool = False
  81. dtype: str = "auto"
  82. rope_scaling: Optional[str] = None
  83. rope_scaling_parsed: Optional[any] = None # store JSON parsed result
  84. rope_theta: Optional[float] = None
  85. enable_split: bool = False
  86. policy_type: int = 0
  87. split_chunk_tokens: int = 512
  88. split_start_batch_size: int = 16
  89. enable_memory_decoding: bool = False
  90. memory_decoding_length: int = 16
  91. memory_decoding_dynamic_algo: bool = False
  92. enable_lookahead: bool = False
  93. lookahead_level: int = 4
  94. lookahead_window: int = 5
  95. lookahead_guess_set_size: int = 5
  96. enable_multi_token_prediction: bool = False
  97. multi_token_prediction_tokens: int = 1
  98. enable_prefix_caching: bool = False
  99. local_world_size: int = -1 # store validation input
  100. world_size: int = -1 # store validation input
  101. pipeline_parallel_size: int = 1
  102. data_parallel_size: int = -1
  103. context_parallel_size: int = -1
  104. tensor_parallel_size: int = -1
  105. sequence_parallel_size: int = -1
  106. moe_expert_parallel_size: int = -1
  107. moe_tensor_parallel_size: int = -1
  108. enable_buffer_response: bool = False
  109. prefill_expected_time_ms: Optional[int] = None
  110. decode_expected_time_ms: Optional[int] = None
  111. def changed_backend_parameters(
  112. self,
  113. baseline: "AscendMindIEParameters",
  114. exclude_names: Optional[set] = None,
  115. ) -> List[str]:
  116. exclude_names = exclude_names or set()
  117. parameters = []
  118. # Add here when the field has no corresponding CLI flag. Cases:
  119. # - `*_parsed`: derived form of another raw input field, not a flag.
  120. # - `local_world_size` / `world_size`: set externally from GPU topology
  121. # via constructor, never registered with argparse.
  122. skipped_fields = {
  123. "kv_pool_config_parsed",
  124. "models_parsed",
  125. "override_generation_config_parsed",
  126. "rope_scaling_parsed",
  127. "local_world_size",
  128. "world_size",
  129. }
  130. # Add here when argparse uses `action='store_true'` (no `--no-foo`).
  131. # Should not add fields using `BooleanOptionalAction` — they support `--no-foo`.
  132. store_true_fields = {
  133. "trust_remote_code",
  134. "memory_decoding_dynamic_algo",
  135. "no_metrics",
  136. "enforce_eager",
  137. }
  138. for field in dataclasses.fields(self):
  139. name = field.name
  140. if name in skipped_fields:
  141. continue
  142. flag_name = name.replace("_", "-")
  143. if flag_name in exclude_names:
  144. continue
  145. value = getattr(self, name)
  146. if value == getattr(baseline, name):
  147. continue
  148. flag = f"--{flag_name}"
  149. if isinstance(value, bool):
  150. if value:
  151. parameters.append(flag)
  152. elif name not in store_true_fields:
  153. parameters.append(f"--no-{flag_name}")
  154. continue
  155. if value is not None:
  156. parameters.extend([flag, str(value)])
  157. return parameters
  158. def from_args_and_envs(
  159. self, args: List[str], envs: Optional[Dict[str, str]] = None
  160. ):
  161. """
  162. Parse parameters from command line arguments and environment variables.
  163. Args:
  164. args:
  165. A list of command line arguments.
  166. envs:
  167. A dictionary of environment variables. Optional.
  168. Raises:
  169. Failed to parse the arguments or invalid argument values will raise.
  170. """
  171. parser = argparse.ArgumentParser(exit_on_error=False, allow_abbrev=False)
  172. #
  173. # Log config
  174. #
  175. parser.add_argument(
  176. "--log-level",
  177. type=str,
  178. default="Info",
  179. choices=['Verbose', 'Info', 'Warning', 'Warn', 'Error', 'Debug'],
  180. help="Log level for MindIE.",
  181. )
  182. #
  183. # Server config
  184. #
  185. parser.add_argument(
  186. "--max-link-num",
  187. type=int,
  188. default=self.max_link_num,
  189. help="Maximum parallel requests",
  190. )
  191. parser.add_argument(
  192. "--token-timeout",
  193. type=int,
  194. default=self.token_timeout,
  195. help="Timeout for a token generation in seconds.",
  196. )
  197. parser.add_argument(
  198. "--e2e-timeout",
  199. type=int,
  200. default=self.e2e_timeout,
  201. help="E2E (from request accepted to inference stopped) timeout in seconds.",
  202. )
  203. parser.add_argument(
  204. "--openai-support",
  205. type=str,
  206. default=self.openai_support,
  207. help="The compatibility mode for OpenAI API.",
  208. )
  209. #
  210. # Backend config
  211. #
  212. parser.add_argument(
  213. "--kv-pool-config",
  214. type=str,
  215. default=self.kv_pool_config,
  216. help="KV pool configuration in JSON format. "
  217. "For example: `{\"backend\":\"<KV pool backend name>\", \"configPath\":\"/path/to/your/config/file\"}`.",
  218. )
  219. #
  220. # Model deploy config
  221. #
  222. parser.add_argument(
  223. "--max-seq-len",
  224. type=int,
  225. default=self.max_seq_len,
  226. help="Model context length. "
  227. "If unspecified, will be automatically derived from the model config.",
  228. )
  229. parser.add_argument(
  230. "--max-input-token-len",
  231. type=int,
  232. default=self.max_input_token_len,
  233. help="Max input token length. "
  234. "If unspecified, will be automatically derived from `--max-seq-len`.",
  235. )
  236. parser.add_argument(
  237. "--truncation",
  238. action=argparse.BooleanOptionalAction,
  239. help="Truncate the input token length, "
  240. "when the length is larger than the minimum between `--max-input-token-len` and `--max-seq-len` - 1.",
  241. )
  242. #
  243. # Model config
  244. #
  245. parser.add_argument(
  246. "--cpu-mem-size",
  247. type=int,
  248. default=self.cpu_mem_size,
  249. help="CPU swap space size (GiB). "
  250. "Works when specified `--max-preempt-count`.",
  251. )
  252. parser.add_argument(
  253. "--npu-memory-fraction",
  254. type=float,
  255. help="The fraction of NPU memory to be used for the model executor, "
  256. "which can range from 0 to 1 (included). "
  257. "For example, a value of 0.5 would imply 50% NPU memory utilization. "
  258. f"If unspecified, will use the default value of {self.npu_memory_fraction}.",
  259. )
  260. parser.add_argument(
  261. "--trust-remote-code",
  262. action='store_true',
  263. help="Trust remote code.",
  264. )
  265. parser.add_argument(
  266. "--models",
  267. type=str,
  268. required=False,
  269. help="Models configuration in JSON format, for certain specific configurations, like Expert Parallelism Implementation Method, Tensor Parallelism LM Header/Output Attention Split.",
  270. )
  271. parser.add_argument(
  272. "--async-scheduler-wait-time",
  273. type=int,
  274. default=self.async_scheduler_wait_time,
  275. help="The wait time (in seconds) for the asynchronous scheduler to start.",
  276. )
  277. #
  278. # Schedule config
  279. #
  280. parser.add_argument(
  281. "--cache-block-size",
  282. type=int,
  283. default=self.cache_block_size,
  284. help="KV cache block size, which must be powers of 2. "
  285. f"If unspecified, will use the default value of {self.cache_block_size}.",
  286. )
  287. parser.add_argument(
  288. "--max-prefill-batch-size",
  289. type=int,
  290. default=self.max_prefill_batch_size,
  291. help="During prefilling stage, the maximum requests can be batched, "
  292. "which must be less than `--max-batch-size`.",
  293. )
  294. parser.add_argument(
  295. "--max-prefill-tokens",
  296. type=int,
  297. default=self.max_prefill_tokens,
  298. help="During each prefill, the total number of all input tokens in the current batch cannot exceed `--max-prefill-tokens`. Default is same as `--max-seq-len`.",
  299. )
  300. parser.add_argument(
  301. "--prefill-time-ms-per-req",
  302. type=int,
  303. default=self.prefill_time_ms_per_req,
  304. help="Compare with --decode-time-ms-per-req to select prefilling or decoding, "
  305. "works with `--support-select-batch`.",
  306. )
  307. parser.add_argument(
  308. "--prefill-policy-type",
  309. type=int,
  310. choices=[0, 1, 2, 3],
  311. default=self.prefill_policy_type,
  312. help="Strategy of prefilling stage. "
  313. "0: FCFS, first come first serving, "
  314. "1: STATE, same as FCFS, "
  315. "2: PRIORITY, priority queue, "
  316. "3: MLFQ, multi-levels feedback queue.",
  317. )
  318. parser.add_argument(
  319. "--max-batch-size",
  320. type=int,
  321. default=self.max_batch_size,
  322. help="During decoding stage, the maximum requests can be batched.",
  323. )
  324. parser.add_argument(
  325. "--max-iter-times",
  326. type=int,
  327. default=self.max_iter_times,
  328. help="Maximum iterations for decoding stage. Default is same as `--max-seq-len`.",
  329. )
  330. parser.add_argument(
  331. "--decode-time-ms-per-req",
  332. type=int,
  333. default=self.decode_time_ms_per_req,
  334. help="Compare with `--prefill-time-ms-per-req` to select prefilling or decoding, "
  335. "works with `--support-select-batch`.",
  336. )
  337. parser.add_argument(
  338. "--decode-policy-type",
  339. type=int,
  340. choices=[0, 1, 2, 3],
  341. default=self.decode_policy_type,
  342. help="Strategy of decoding stage. "
  343. "0: FCFS, first come first serving, "
  344. "1: STATE, process those requests have been preempted or swapped at first, "
  345. "2: PRIORITY, priority queue, "
  346. "3: MLFQ, multi-levels feedback queue.",
  347. )
  348. parser.add_argument(
  349. "--max-preempt-count",
  350. type=int,
  351. default=self.max_preempt_count,
  352. help="Maximum preempt requests during decoding stage, which must be less than `--max-batch-size`.",
  353. )
  354. parser.add_argument(
  355. "--support-select-batch",
  356. action=argparse.BooleanOptionalAction,
  357. help="Enable batch selecting. "
  358. "According to `--prefill-time-ms-per-req` and `--decode-time-ms-per-req`, "
  359. "select the execution priority for this batch. "
  360. "Use `--no-support-select-batch` to disable explicitly.",
  361. )
  362. parser.add_argument(
  363. "--max-queue-delay-microseconds",
  364. type=int,
  365. default=self.max_queue_delay_microseconds,
  366. help="Maximum microseconds of queue waiting.",
  367. )
  368. parser.add_argument(
  369. "--max-first-token-wait-time",
  370. type=int,
  371. default=self.max_first_token_wait_time,
  372. help="Maximum milliseconds to wait for the first token generation.",
  373. )
  374. #
  375. # Extends or Features
  376. #
  377. parser.add_argument(
  378. "--override-generation-config",
  379. type=str,
  380. required=False,
  381. help="Overrides or sets generation config in JSON format. "
  382. "For example: `{\"temperature\": 0.5}`. "
  383. "This will merge into the `generation_config.json` of the model structure.",
  384. )
  385. parser.add_argument(
  386. "--enable-memory-decoding",
  387. action=argparse.BooleanOptionalAction,
  388. help="Enable memory decoding speculation. "
  389. "Use `--no-enable-memory-decoding` to disable explicitly.",
  390. )
  391. parser.add_argument(
  392. "--memory-decoding-length",
  393. type=int,
  394. default=self.memory_decoding_length,
  395. help="Length for memory decoding speculation.",
  396. )
  397. parser.add_argument(
  398. "--memory-decoding-dynamic-algo",
  399. action="store_true",
  400. help="Enable dynamic algorithm for memory decoding speculation.",
  401. )
  402. parser.add_argument(
  403. "--enable-lookahead",
  404. action=argparse.BooleanOptionalAction,
  405. help="Enable lookahead speculation. "
  406. "Use `--no-enable-lookahead` to disable explicitly.",
  407. )
  408. parser.add_argument(
  409. "--lookahead-level",
  410. type=int,
  411. default=self.lookahead_level,
  412. help="Level for lookahead speculation.",
  413. )
  414. parser.add_argument(
  415. "--lookahead-window",
  416. type=int,
  417. default=self.lookahead_window,
  418. help="Window size for lookahead speculation.",
  419. )
  420. parser.add_argument(
  421. "--lookahead-guess-set-size",
  422. type=int,
  423. default=self.lookahead_guess_set_size,
  424. help="Guess set size for lookahead speculation.",
  425. )
  426. parser.add_argument(
  427. "--enable-buffer-response",
  428. action=argparse.BooleanOptionalAction,
  429. help="Enable buffer response. "
  430. "Use `--no-enable-buffer-response` to disable explicitly.",
  431. )
  432. parser.add_argument(
  433. "--prefill-expected-time-ms",
  434. type=int,
  435. required=False,
  436. help="Expected latency (SLO) for Time to First Token (TTFT) in milliseconds.",
  437. )
  438. parser.add_argument(
  439. "--decode-expected-time-ms",
  440. type=int,
  441. required=False,
  442. help="Expected latency (SLO) for Time Per Output Token (TPOT) in milliseconds.",
  443. )
  444. parser.add_argument(
  445. "--enable-split",
  446. action=argparse.BooleanOptionalAction,
  447. help="Enable split fuse, something like chunked prefill. "
  448. "Use `--no-enable-split` to disable explicitly.",
  449. )
  450. parser.add_argument(
  451. "--policy-type",
  452. type=int,
  453. choices=[0, 4, 5, 6, 7],
  454. default=self.policy_type,
  455. help="Strategy of split fuse. "
  456. "- `0`: FCFS, first come first serving, "
  457. "- `4`: SJF, shortest job first, "
  458. "- `5`: LJF, longest job first, "
  459. "- `6`: Skip-Join MLFQ, skip-Join multi-levels feedback queue, "
  460. "- `7`: SJF-MLFQ, shortest job first and multi-levels feedback queue.",
  461. )
  462. parser.add_argument(
  463. "--split-chunk-tokens",
  464. type=int,
  465. default=self.split_chunk_tokens,
  466. help="Tokens size to batch for split fuse. Multiple of 16.",
  467. )
  468. parser.add_argument(
  469. "--split-start-batch-size",
  470. type=int,
  471. default=self.split_start_batch_size,
  472. help="Batch size to start splitting for split fuse.",
  473. )
  474. parser.add_argument(
  475. "--enable-multi-token-prediction",
  476. action=argparse.BooleanOptionalAction,
  477. help="Enable multi-token prediction. "
  478. "Use `--no-enable-multi-token-prediction` to disable explicitly.",
  479. )
  480. parser.add_argument(
  481. "--multi-token-prediction-tokens",
  482. type=int,
  483. default=self.multi_token_prediction_tokens,
  484. help="Number of multi-token prediction tokens. "
  485. "This is only effective when `--enable-multi-token-prediction` is enabled.",
  486. )
  487. parser.add_argument(
  488. "--enable-prefix-caching",
  489. action=argparse.BooleanOptionalAction,
  490. help="Enable prefix caching. "
  491. "Use `--no-enable-prefix-caching` to disable explicitly.",
  492. )
  493. parser.add_argument(
  494. "--no-metrics",
  495. action='store_true',
  496. help="Disable exposing metrics in /metrics router.",
  497. )
  498. parser.add_argument(
  499. "--enforce-eager",
  500. action='store_true',
  501. help="Emit operators in eager mode.",
  502. )
  503. parser.add_argument(
  504. "--dtype",
  505. type=str,
  506. default=self.dtype,
  507. choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
  508. help="Data type for model weights and activations. "
  509. "- `auto`: use the default data type of the model config, "
  510. "- `half`: for FP16, "
  511. "- `float16`: is the same as `half`, "
  512. "- `bfloat16`: for BF16, "
  513. "- `float`: is the shorthand for `float32`, "
  514. "- `float32`: for FP32. ",
  515. )
  516. parser.add_argument(
  517. "--pipeline-parallel-size",
  518. "-pp",
  519. type=int,
  520. default=self.pipeline_parallel_size,
  521. required=False,
  522. help="Number of pipeline parallel groups.",
  523. )
  524. parser.add_argument(
  525. "--data-parallel-size",
  526. "-dp",
  527. type=int,
  528. default=self.data_parallel_size,
  529. required=False,
  530. help="Number of data parallel groups for Attention layers. "
  531. "`-1` means disabling data parallelism, otherwise, must be a power of 2.",
  532. )
  533. parser.add_argument(
  534. "--context-parallel-size",
  535. "-cp",
  536. type=int,
  537. default=self.context_parallel_size,
  538. required=False,
  539. help="Number of context parallel groups for Attention layers."
  540. "`-1` means disabling context parallelism, otherwise, must be power of 2.",
  541. )
  542. parser.add_argument(
  543. "--tensor-parallel-size",
  544. "-tp",
  545. type=int,
  546. default=self.tensor_parallel_size,
  547. required=False,
  548. help="Number of tensor parallel groups for Attention layers."
  549. "`-1` means using world size as tensor parallel size, otherwise, must be a power of 2.",
  550. )
  551. parser.add_argument(
  552. "--sequence-parallel-size",
  553. "-sp",
  554. type=int,
  555. default=self.sequence_parallel_size,
  556. required=False,
  557. help="Number of sequence parallel groups for MLP layers. "
  558. "`-1` means disabling sequence parallelism, otherwise, must be power of 2.",
  559. )
  560. parser.add_argument(
  561. "--moe-expert-parallel-size",
  562. "-moe-ep",
  563. type=int,
  564. default=self.moe_expert_parallel_size,
  565. required=False,
  566. help="Number of expert parallel groups. "
  567. "`-1` means disabling MoE expert parallelism, otherwise, must be power of 2.",
  568. )
  569. parser.add_argument(
  570. "--moe-tensor-parallel-size",
  571. "-moe-tp",
  572. type=int,
  573. default=self.moe_tensor_parallel_size,
  574. required=False,
  575. help="Number of tensor parallel groups for MoE MLP layers. "
  576. "`-1` and means using world size as MoE tensor parallel size, otherwise, must be power of 2. ",
  577. )
  578. parser.add_argument(
  579. "--rope-scaling",
  580. type=str,
  581. required=False,
  582. help="RoPE scaling configuration in JSON format. "
  583. "For example: `{\"type\": \"yarn\", \"factor\" :4.0, \"original_max_position_embeddings\": 32768}`. "
  584. "This will merge into the `config.json` of the model structure.",
  585. )
  586. parser.add_argument(
  587. "--rope-theta",
  588. type=float,
  589. required=False,
  590. help="RoPE theta configuration. "
  591. "This will merge into the `config.json` of the model structure.",
  592. )
  593. if args:
  594. args_parsed, _ = parser.parse_known_args(args=args)
  595. for attr_name in [attr.name for attr in dataclasses.fields(self.__class__)]:
  596. try:
  597. attr_value = getattr(args_parsed, attr_name, None)
  598. if attr_value is not None:
  599. try:
  600. setattr(self, attr_name, attr_value)
  601. except ValueError as e:
  602. # Never reach here, but just in case.
  603. raise argparse.ArgumentTypeError(
  604. f"Invalid value for --{attr_name.replace('_', '-')} {attr_value}"
  605. ) from e
  606. except AttributeError:
  607. # If reach here, that means the field is an internal property,
  608. # which would not register in the argument parser.
  609. pass
  610. if not hasattr(args_parsed, "npu_memory_fraction"):
  611. self._from_envs(envs or {})
  612. else:
  613. self._from_envs(envs or {})
  614. self._default()
  615. self._validate()
  616. def _from_envs(self, envs: Dict[str, str]):
  617. """
  618. Parse parameters from environment variables.
  619. Supported environment variables:
  620. - NPU_MEMORY_FRACTION: The fraction of NPU memory to be used for the model executor.
  621. Args:
  622. envs:
  623. A dictionary of environment variables.
  624. """
  625. allowed_env_attr_mapping = {
  626. "NPU_MEMORY_FRACTION": "npu_memory_fraction",
  627. }
  628. for env_var, attr_name in allowed_env_attr_mapping.items():
  629. if env_var in envs:
  630. try:
  631. attr_type = type(getattr(self, attr_name))
  632. setattr(self, attr_name, attr_type(envs[env_var]))
  633. except ValueError as e:
  634. raise argparse.ArgumentTypeError(
  635. f"Invalid value for {env_var}: {envs[env_var]}"
  636. ) from e
    def _default(self):  # noqa: C901
        """
        Fill in the values that are derived from other parameters.

        Non-positive "unset" markers (`-1` / `0`) are replaced in place:
        token and iteration limits fall back to `max_seq_len`, and the
        parallel sizes and world size are inferred from each other.
        Statement order matters: `max_input_token_len` is resolved before
        `max_prefill_tokens` gets its fallback.

        NOTE(review): the block nesting below was reconstructed from a copy
        of this file that had lost its indentation — confirm against the
        original, in particular where the data/context parallel handling sits
        relative to the pipeline parallel branches.
        """
        # Model deploy config
        if self.max_input_token_len <= 0:
            # Only an explicitly given `max_prefill_tokens` limits the input
            # length here; its own fallback is applied further below.
            if self.max_prefill_tokens > 0:
                self.max_input_token_len = min(
                    self.max_seq_len, self.max_prefill_tokens
                )
            else:
                self.max_input_token_len = self.max_seq_len
        # Model config
        # Clamp the prefill batch size to the decode batch size.
        self.max_prefill_batch_size = min(
            self.max_prefill_batch_size, self.max_batch_size
        )
        # Schedule config
        if self.max_prefill_tokens <= 0:
            self.max_prefill_tokens = self.max_seq_len
        if self.max_iter_times <= 0:
            self.max_iter_times = self.max_seq_len
        # The `--cpu-mem-size` help states it works together with
        # `--max-preempt-count`; without preemption the swap space is reset.
        if self.max_preempt_count == 0 and self.cpu_mem_size > 0:
            self.cpu_mem_size = 0
        # Extends or Features
        # -- Parallelism
        if self.world_size > 0:
            # Base on the world size to infer other parallel sizes.
            #
            if self.tensor_parallel_size < 0:
                if self.pipeline_parallel_size > 1:
                    self.tensor_parallel_size = (
                        self.world_size // self.pipeline_parallel_size
                    )
                else:
                    self.tensor_parallel_size = self.world_size
                    # Data parallelism takes precedence over context parallelism.
                    if self.data_parallel_size > 1:
                        self.tensor_parallel_size //= self.data_parallel_size
                    elif self.context_parallel_size > 1:
                        self.tensor_parallel_size //= self.context_parallel_size
                        self.data_parallel_size = 1
            if self.moe_tensor_parallel_size < 0 and self.pipeline_parallel_size <= 1:
                if self.moe_expert_parallel_size > 1:
                    self.moe_tensor_parallel_size = (
                        self.world_size // self.moe_expert_parallel_size
                    )
                else:
                    self.moe_tensor_parallel_size = self.world_size
        else:
            # Infer the world size from other parallel sizes.
            #
            if self.pipeline_parallel_size > 1:
                if self.tensor_parallel_size < 0:
                    self.tensor_parallel_size = 1
                self.local_world_size = self.tensor_parallel_size
                self.world_size = (
                    self.pipeline_parallel_size * self.tensor_parallel_size
                )
            else:
                # May stay negative when `tensor_parallel_size` is unset and
                # none of the branches below applies.
                self.world_size = self.tensor_parallel_size
                if self.data_parallel_size > 1:
                    if self.tensor_parallel_size < 0:
                        self.tensor_parallel_size = 1
                    if self.local_world_size < 0:
                        self.local_world_size = self.tensor_parallel_size
                    self.world_size = (
                        self.data_parallel_size * self.tensor_parallel_size
                    )
                elif self.context_parallel_size > 1:
                    if self.tensor_parallel_size < 0:
                        self.tensor_parallel_size = 1
                    if self.local_world_size < 0:
                        self.local_world_size = self.tensor_parallel_size
                    self.world_size = (
                        self.context_parallel_size * self.tensor_parallel_size
                    )
                    self.data_parallel_size = 1
                # Evaluated after the branches above, so it overrides the
                # world size they computed when expert parallelism is enabled.
                if self.moe_expert_parallel_size > 1:
                    if self.moe_tensor_parallel_size < 0:
                        self.moe_tensor_parallel_size = 1
                    if self.tensor_parallel_size < 0:
                        self.tensor_parallel_size = self.moe_tensor_parallel_size
                    if self.local_world_size < 0:
                        self.local_world_size = self.tensor_parallel_size
                    self.world_size = (
                        self.moe_expert_parallel_size * self.moe_tensor_parallel_size
                    )
                elif self.moe_tensor_parallel_size < 0:
                    self.moe_tensor_parallel_size = self.world_size

  722. def _validate(self): # noqa: C901
  723. # Server config
  724. if not (1 <= self.max_link_num <= 1000):
  725. raise argparse.ArgumentTypeError(
  726. "--max-link-num must be in the range [1, 1000]"
  727. )
  728. if not (1 <= self.token_timeout <= 3600):
  729. raise argparse.ArgumentTypeError(
  730. "--token-timeout must be in the range [1, 3600]"
  731. )
  732. if not (1 <= self.e2e_timeout <= 3600):
  733. raise argparse.ArgumentTypeError(
  734. "--e2e-timeout must be in the range [1, 3600]"
  735. )
  736. if not self.openai_support:
  737. raise argparse.ArgumentTypeError("--openai-support cannot be empty")
  738. # Backend config
  739. if self.kv_pool_config:
  740. try:
  741. self.kv_pool_config_parsed = json.loads(self.kv_pool_config)
  742. except json.JSONDecodeError as e:
  743. raise argparse.ArgumentTypeError(
  744. f"--kv-pool-config must be a valid JSON string: {self.kv_pool_config}"
  745. ) from e
  746. # Model deploy config
  747. if self.max_seq_len <= 0:
  748. raise argparse.ArgumentTypeError("--max-seq-len must be greater than 0")
  749. if not (0 < self.max_input_token_len <= self.max_seq_len):
  750. raise argparse.ArgumentTypeError(
  751. "--max-input-token-len must be in the range (0, --max-seq-len]"
  752. )
  753. # Model config
  754. if self.cpu_mem_size < 0:
  755. raise argparse.ArgumentTypeError(
  756. "--cpu-mem-size must be greater than or equal to 0"
  757. )
  758. if not (0 < self.npu_memory_fraction <= 1):
  759. raise argparse.ArgumentTypeError(
  760. "--npu-memory-fraction must be in the range (0, 1]"
  761. )
  762. if self.models:
  763. try:
  764. self.models_parsed = json.loads(self.models)
  765. except json.JSONDecodeError as e:
  766. raise argparse.ArgumentTypeError(
  767. f"--models must be a valid JSON string: {self.models}"
  768. ) from e
  769. if not (1 <= self.async_scheduler_wait_time <= 3600):
  770. raise argparse.ArgumentTypeError(
  771. "--async-scheduler-wait-time must be in the range [1, 3600]"
  772. )
  773. # Schedule config
  774. if self.cache_block_size & (self.cache_block_size - 1) != 0:
  775. raise argparse.ArgumentTypeError("--cache-block-size must be powers of 2")
  776. if not (1 <= self.max_prefill_batch_size <= self.max_batch_size):
  777. raise argparse.ArgumentTypeError(
  778. "--max-prefill-batch-size must be in the range [1, --max-batch-size]"
  779. )
  780. if not (0 <= self.prefill_time_ms_per_req <= 1000):
  781. raise argparse.ArgumentTypeError(
  782. "--prefill-time-ms-per-req must be in the range [0, 1000]"
  783. )
  784. if not (1 <= self.max_batch_size <= 5000):
  785. raise argparse.ArgumentTypeError(
  786. "--max-batch-size must be in the range [1, 5000]"
  787. )
  788. if not (
  789. self.max_input_token_len <= self.max_prefill_tokens <= self.max_seq_len
  790. ):
  791. raise argparse.ArgumentTypeError(
  792. "--max-prefill-tokens must be in the range [--max-input-token-len, --max-seq-len]"
  793. )
  794. if not (1 <= self.max_iter_times <= self.max_seq_len):
  795. raise argparse.ArgumentTypeError(
  796. "--max-iter-times must be in the range [1, --max-seq-len]"
  797. )
  798. if not (0 <= self.decode_time_ms_per_req <= 1000):
  799. raise argparse.ArgumentTypeError(
  800. "--decode-time-ms-per-req must be in the range [0, 1000]"
  801. )
  802. if not (0 <= self.max_preempt_count <= self.max_batch_size):
  803. raise argparse.ArgumentTypeError(
  804. "--max-preempt-count must be in the range [0, --max-batch-size]"
  805. )
  806. if not (500 <= self.max_queue_delay_microseconds <= 1000000):
  807. raise argparse.ArgumentTypeError(
  808. "--max-queue-delay-microseconds must be in the range [500, 1000000]"
  809. )
  810. if not (0 <= self.max_first_token_wait_time <= 3600000):
  811. raise argparse.ArgumentTypeError(
  812. "--max-first-token-wait-time must be in the range [0, 3600000]"
  813. )
  814. # Extends or Features
  815. if self.override_generation_config:
  816. try:
  817. self.override_generation_config_parsed = json.loads(
  818. self.override_generation_config
  819. )
  820. except json.JSONDecodeError as e:
  821. raise argparse.ArgumentTypeError(
  822. f"--override-generation-config must be a valid JSON string: {self.override_generation_config}"
  823. ) from e
  824. # -- Extending context size
  825. if self.rope_scaling:
  826. try:
  827. self.rope_scaling_parsed = json.loads(self.rope_scaling)
  828. except json.JSONDecodeError as e:
  829. raise argparse.ArgumentTypeError(
  830. f"--rope-scaling must be a valid JSON string: {self.rope_scaling}"
  831. ) from e
  832. # -- Split fuse
  833. if self.enable_split:
  834. if not (1 <= self.split_chunk_tokens <= 8192):
  835. raise argparse.ArgumentTypeError(
  836. "--split-chunk-tokens must be in the range [1, 8192]"
  837. )
  838. elif self.split_chunk_tokens % 16 != 0:
  839. raise argparse.ArgumentTypeError(
  840. "--split-chunk-tokens must be the multiple of 16"
  841. )
  842. if not (0 <= self.split_start_batch_size <= self.max_batch_size):
  843. raise argparse.ArgumentTypeError(
  844. "--split-start-batch-size must be in the range [0, --max-batch-size]"
  845. )
  846. # -- Parallelism
  847. pp, tp, dp, cp, sp, moe_tp, moe_ep, ws, local_ws = (
  848. self.pipeline_parallel_size,
  849. self.tensor_parallel_size,
  850. self.data_parallel_size,
  851. self.context_parallel_size,
  852. self.sequence_parallel_size,
  853. self.moe_tensor_parallel_size,
  854. self.moe_expert_parallel_size,
  855. self.world_size,
  856. self.local_world_size,
  857. )
  858. if pp <= 0:
  859. raise argparse.ArgumentTypeError(
  860. "--pipeline-parallel-size must be greater than 0"
  861. )
  862. if tp > 0 and tp & (tp - 1) != 0:
  863. raise argparse.ArgumentTypeError(
  864. "--tensor-parallel-size must be the power of 2"
  865. )
  866. if dp > 0 and dp & (dp - 1) != 0:
  867. raise argparse.ArgumentTypeError(
  868. "--data-parallel-size must be the power of 2"
  869. )
  870. if cp > 0 and cp & (cp - 1) != 0:
  871. raise argparse.ArgumentTypeError(
  872. "--context-parallel-size must be the power of 2"
  873. )
  874. if sp > 0 and sp & (sp - 1) != 0:
  875. raise argparse.ArgumentTypeError(
  876. "--sequence-parallel-size must be the power of 2"
  877. )
  878. if moe_tp > 0 and moe_tp & (moe_tp - 1) != 0:
  879. raise argparse.ArgumentTypeError(
  880. "--moe-tensor-parallel-size must be the power of 2"
  881. )
  882. if moe_ep > 0 and moe_ep & (moe_ep - 1) != 0:
  883. raise argparse.ArgumentTypeError(
  884. "--moe-expert-parallel-size must be the power of 2"
  885. )
  886. if pp != 1 and dp != -1:
  887. raise argparse.ArgumentTypeError(
  888. f"--pipeline-parallel-size {pp} "
  889. f"and --data-parallel-size {dp} "
  890. "are incompatible, "
  891. "set --pipeline-parallel-size to 1 or disable data parallelism"
  892. )
  893. if dp > 1 and cp > 1:
  894. raise argparse.ArgumentTypeError(
  895. f"--data-parallel-size {dp} "
  896. f"and --context-parallel-size {cp} "
  897. "are incompatible, "
  898. "set --data-parallel-size to 1 or disable context parallelism"
  899. )
  900. # Check pp * tp == world size if enable pipeline parallelism
  901. if pp > 1:
  902. if 0 < ws != pp * tp:
  903. raise argparse.ArgumentTypeError(
  904. f"--pipeline-parallel-size {pp} "
  905. f"and --tensor-parallel-size {tp} "
  906. f"must be multiples of world size: {ws}"
  907. )
  908. else:
  909. # Check tp == world size or tp <= local world size
  910. if 0 < local_ws < tp and 0 < ws != tp:
  911. raise argparse.ArgumentTypeError(
  912. f"--tensor-parallel-size {tp} "
  913. f"must be less or equal to local world size: {local_ws} "
  914. f"or equal to world size: {ws}"
  915. )
  916. # Check dp * tp == world size if enable data parallelism
  917. if dp > 1:
  918. if 0 < ws != dp * tp:
  919. raise argparse.ArgumentTypeError(
  920. f"--data-parallel-size {dp} "
  921. f"and --tensor-parallel-size {tp} "
  922. f"must be multiples of world size: {ws}"
  923. )
  924. # Check cp * tp == world size if enable context parallelism
  925. elif cp > 1:
  926. if 0 < ws != cp * tp:
  927. raise argparse.ArgumentTypeError(
  928. f"--context-parallel-size {cp} "
  929. f"and --tensor-parallel-size {tp} "
  930. f"must be multiples of world size: {ws}"
  931. )
  932. # Check moe_tp * moe_ep == world size if enable expert parallelism
  933. if moe_ep > 1:
  934. # Check moe_tp == world size or moe_tp <= local world size
  935. if 0 < local_ws < moe_tp and 0 < ws != moe_tp:
  936. raise argparse.ArgumentTypeError(
  937. f"--moe-tensor-parallel-size {moe_tp} "
  938. f"must be less or equal to local world size: {local_ws} "
  939. f"or equal to world size: {ws}"
  940. )
  941. if 0 < ws != moe_ep * moe_tp:
  942. raise argparse.ArgumentTypeError(
  943. f"--moe-expert-parallel-size {moe_ep}"
  944. f"and --moe-tensor-parallel-size {moe_tp} "
  945. f"must be multiples of world size: {ws}"
  946. )
  947. # Otherwise, check moe_tp == world size
  948. else:
  949. if 0 < ws != moe_tp:
  950. raise argparse.ArgumentTypeError(
  951. f"--moe-tensor-parallel-size {moe_tp} "
  952. f"must be equal to world size: {ws}"
  953. )
  954. # Check sp == tp if enable sequence parallelism
  955. if sp > 1:
  956. if sp != tp:
  957. raise argparse.ArgumentTypeError(
  958. f"--sequence-parallel-size {sp} "
  959. f"must be equal to --tensor-parallel-size {tp}"
  960. )
  961. # -- Speculative decoding
  962. if self.enable_memory_decoding:
  963. if not (1 <= self.memory_decoding_length <= 16):
  964. raise argparse.ArgumentTypeError(
  965. "--memory-decoding-length must be in the range [1, 16]"
  966. )
  967. if self.enable_lookahead:
  968. if not (3 <= self.lookahead_level <= 16):
  969. raise argparse.ArgumentTypeError(
  970. "--lookahead-level must be in the range [3, 16]"
  971. )
  972. if not (1 <= self.lookahead_window <= 16):
  973. raise argparse.ArgumentTypeError(
  974. "--lookahead-window must be in the range [1, 16]"
  975. )
  976. if not (1 <= self.lookahead_guess_set_size <= 16):
  977. raise argparse.ArgumentTypeError(
  978. "--lookahead-guess-set-size must be in the range [1, 16]"
  979. )
  980. if self.enable_multi_token_prediction:
  981. if self.multi_token_prediction_tokens <= 0:
  982. raise argparse.ArgumentTypeError(
  983. "--multi-token-prediction-tokens must be greater than 0"
  984. )
  985. # -- Buffer response
  986. if self.enable_buffer_response:
  987. if self.prefill_expected_time_ms is None:
  988. raise argparse.ArgumentTypeError(
  989. "--prefill-expected-time-ms is required when --enable-buffer-response is enabled"
  990. )
  991. elif self.prefill_expected_time_ms <= 0:
  992. raise argparse.ArgumentTypeError(
  993. "--prefill-expected-time-ms must be greater than 0"
  994. )
  995. if self.decode_expected_time_ms is None:
  996. raise argparse.ArgumentTypeError(
  997. "--decode-expected-time-ms is required when --enable-buffer-response is enabled"
  998. )
  999. elif self.decode_expected_time_ms <= 0:
  1000. raise argparse.ArgumentTypeError(
  1001. "--decode-expected-time-ms must be greater than 0"
  1002. )
  1003. # Feature compatibility check
  1004. if self.enable_split:
  1005. if self.enable_memory_decoding or self.enable_lookahead:
  1006. raise argparse.ArgumentTypeError(
  1007. "--enable-memory-decoding and --enable-lookahead are not supported when --enable-split is enabled"
  1008. )
  1009. if self.rope_scaling:
  1010. raise argparse.ArgumentTypeError(
  1011. "--rope-scaling is not supported when --enable-split is enabled"
  1012. )
  1013. if self.enable_memory_decoding:
  1014. if self.enable_lookahead:
  1015. raise argparse.ArgumentTypeError(
  1016. "--enable-lookahead is not supported when --enable-memory-decoding is enabled"
  1017. )
  1018. if self.rope_scaling:
  1019. raise argparse.ArgumentTypeError(
  1020. "--rope-scaling is not supported when --enable-memory-decoding is enabled"
  1021. )
  1022. elif self.enable_lookahead:
  1023. if self.rope_scaling:
  1024. raise argparse.ArgumentTypeError(
  1025. "--rope-scaling is not supported when --enable-lookahead is enabled"
  1026. )
  1027. if self.enable_multi_token_prediction:
  1028. if self.enable_memory_decoding or self.enable_lookahead:
  1029. raise argparse.ArgumentTypeError(
  1030. "--enable-memory-decoding and --enable-lookahead are not supported when --enable-multi-token-prediction is enabled"
  1031. )
  1032. if self.enable_split:
  1033. raise argparse.ArgumentTypeError(
  1034. "--enable-split is not supported when --enable-multi-token-prediction is enabled"
  1035. )
  1036. if self.rope_scaling:
  1037. raise argparse.ArgumentTypeError(
  1038. "--rope-scaling is not supported when --enable-multi-token-prediction is enabled"
  1039. )
  1040. if self.enable_prefix_caching:
  1041. if self.rope_scaling:
  1042. raise argparse.ArgumentTypeError(
  1043. "--rope-scaling is not supported when --enable-prefix-caching is enabled"
  1044. )
  1045. if self.data_parallel_size > 1:
  1046. if self.enable_memory_decoding or self.enable_lookahead:
  1047. raise argparse.ArgumentTypeError(
  1048. "--enable-memory-decoding and --enable-lookahead are not supported when --data-parallel-size > 1"
  1049. )
  1050. if self.enable_split:
  1051. raise argparse.ArgumentTypeError(
  1052. "--enable-split is not supported when --data-parallel-size > 1"
  1053. )
  1054. if self.enable_prefix_caching:
  1055. raise argparse.ArgumentTypeError(
  1056. "--enable-prefix-caching is not supported when --data-parallel-size > 1"
  1057. )
  1058. class AscendMindIEServer(InferenceServer):
  1059. """
  1060. Containerized Ascend MindIE inference server backend using gpustack-runtime.
  1061. This backend preserves all the original logic from AscendMindIEServer but runs
  1062. the final service in a Docker container instead of a subprocess.
  1063. """
  def start(self):
      """
      Run `_start` and route any exception it raises to `_handle_error`.
      """
      try:
          self._start()
      except Exception as startup_error:
          # Nothing is re-raised here; reporting is delegated to the handler.
          self._handle_error(startup_error)
  1069. def _start(self): # noqa: C901
  1070. logger.info(
  1071. f"Starting Ascend MindIE model instance: {self._model_instance.name}"
  1072. )
  1073. # Prepare distributed information.
  1074. dservers = self._model_instance.distributed_servers
  1075. subworkers = (
  1076. dservers.subordinate_workers
  1077. if dservers and dservers.subordinate_workers
  1078. else []
  1079. )
  1080. deployment_metadata = self._get_deployment_metadata()
  1081. # Root path is defined by in Dockerfile ENV
  1082. # https://github.com/gpustack/runner/blob/main/pack/cann/Dockerfile#L273
  1083. root_path = Path("/usr/local/Ascend")
  1084. install_path = root_path.joinpath("mindie", "latest", "mindie-service")
  1085. # Load config,
  1086. # the config includes two parts: environment variables and a JSON configuration file.
  1087. logger.debug("Loading Ascend MindIE config")
  1088. # - Load environment variables.
  1089. env = self._get_configured_env()
  1090. config_files: list[ContainerFile] = []
  1091. # - Load JSON configuration,
  1092. # see https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindiellm/llmdev/mindie_llm0004.html,
  1093. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindieservice/servicedev/mindie_service0285.html.
  1094. config = self._get_mindie_config_json()
  1095. log_config = config.get("LogConfig", {}) # Deprecated since MindIE 2.0.RC1
  1096. server_config = config.get("ServerConfig", {})
  1097. backend_config = config.get("BackendConfig", {})
  1098. model_deploy_config = backend_config.get("ModelDeployConfig", {})
  1099. model_config = model_deploy_config.get("ModelConfig", [{}])[0]
  1100. schedule_config = backend_config.get("ScheduleConfig", {})
  1101. # Mutate config
  1102. logger.debug("Mutating Ascend MindIE config")
  1103. # - Global config
  1104. # -- Pin installation path, which helps to locate other resources.
  1105. env["MIES_INSTALL_PATH"] = str(install_path)
  1106. # -- Enable exposing metircs.
  1107. env["MIES_SERVICE_MONITOR_MODE"] = env.pop("MIES_SERVICE_MONITOR_MODE", "1")
  1108. # -- Enable high performance swapper.
  1109. env["MIES_RECOMPUTE_THRESHOLD"] = env.pop("MIES_RECOMPUTE_THRESHOLD", "0.5")
  1110. # env["MINDIE_LLM_USE_MB_SWAPPER"] = "1" # Atlas 300I Duo needs to unset this.
  1111. env["MINDIE_LLM_RECOMPUTE_THRESHOLD"] = env.pop(
  1112. "MINDIE_LLM_RECOMPUTE_THRESHOLD", "0.5"
  1113. )
  1114. # -- Enforce continues batching.
  1115. env["MINDIE_LLM_CONTINUOUS_BATCHING"] = env.pop(
  1116. "MINDIE_LLM_CONTINUOUS_BATCHING", "1"
  1117. )
  1118. # -- Disable checking files permission.
  1119. env["MINDIE_CHECK_INPUTFILES_PERMISSION"] = "0"
  1120. # -- Enforce using ATB as backend
  1121. env["MINDIE_LLM_FRAMEWORK_BACKEND"] = "ATB"
  1122. # -- Enforce using 80% of GPU memory.
  1123. env["NPU_MEMORY_FRACTION"] = env.pop("NPU_MEMORY_FRACTION", "0.8")
  1124. # -- Disable OpenMP parallelism, speed up model loading.
  1125. env["OMP_NUM_THREADS"] = env.pop("OMP_NUM_THREADS", "1")
  1126. # -- Enable safetensors GPU loading pass-through for faster model loading.
  1127. env["SAFETENSORS_FAST_GPU"] = env.pop("SAFETENSORS_FAST_GPU", "1")
  1128. # -- Improve performance.
  1129. env["MINDIE_ASYNC_SCHEDULING_ENABLE"] = env.pop(
  1130. "MINDIE_ASYNC_SCHEDULING_ENABLE", "1"
  1131. )
  1132. env["TASK_QUEUE_ENABLE"] = env.pop("TASK_QUEUE_ENABLE", "1")
  1133. env["CPU_AFFINITY_CONF"] = env.pop("CPU_AFFINITY_CONF", "1")
  1134. env["ATB_OPERATION_EXECUTE_ASYNC"] = "1"
  1135. env["ATB_LAYER_INTERNAL_TENSOR_REUSE"] = env.pop(
  1136. "ATB_LAYER_INTERNAL_TENSOR_REUSE", "1"
  1137. )
  1138. env["INF_NAN_MODE_ENABLE"] = env.pop("INF_NAN_MODE_ENABLE", "0")
  1139. env["ATB_LLM_ENABLE_AUTO_TRANSPOSE"] = env.pop(
  1140. "ATB_LLM_ENABLE_AUTO_TRANSPOSE", "1"
  1141. )
  1142. env["ATB_CONVERT_NCHW_TO_ND"] = env.pop("ATB_CONVERT_NCHW_TO_ND", "1")
  1143. env["ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE"] = env.pop(
  1144. "ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE", "3"
  1145. )
  1146. env["ATB_WORKSPACE_MEM_ALLOC_GLOBAL"] = env.pop(
  1147. "ATB_WORKSPACE_MEM_ALLOC_GLOBAL", "1"
  1148. )
  1149. env["PYTORCH_NPU_ALLOC_CONF"] = env.pop(
  1150. "PYTORCH_NPU_ALLOC_CONF", "expandable_segments:True"
  1151. )
  1152. # -- Pop conflict configuration items.
  1153. env.pop("NPU_VISIBLE_DEVICES", "")
  1154. env.pop("NPU-VISIBLE-DEVICES", "")
  1155. env.pop("NPU_DEVICE_IDS", "")
  1156. env.pop("ASCEND_LAUNCH_BLOCKING", "")
  1157. env.pop("ASCEND_RT_VISIBLE_DEVICES", "")
  1158. env.pop("MIES_CONTAINER_MANAGEMENT_IP", "")
  1159. env.pop("WORLD_SIZE", "")
  1160. env.pop("RANKTABLEFILE", "")
  1161. env.pop("RANK_TABLE_FILE", "")
  1162. if not deployment_metadata.distributed:
  1163. env.pop("MIES_CONTAINER_IP", "")
  1164. env.pop("HOST_IP", "")
  1165. # - Listening config
  1166. serving_port = self._get_serving_port()
  1167. server_config["ipAddress"] = self._worker.ip
  1168. server_config.pop("managementIpAddress", None)
  1169. server_config["allowAllZeroIpListening"] = True
  1170. server_config["maxLinkNum"] = 1000
  1171. server_config["port"] = serving_port
  1172. server_config["managementPort"] = serving_port
  1173. server_config["metricsPort"] = serving_port
  1174. server_config["httpsEnabled"] = False
  1175. server_config["interCommTLSEnabled"] = False
  1176. # - Device config
  1177. backend_config["interNodeTLSEnabled"] = False
  1178. backend_config["npuDeviceIds"] = [
  1179. # Use logic(count) device indexes as NPU device IDs,
  1180. # which is friendly to virtualized environments.
  1181. list(range(len(self._model_instance.gpu_indexes)))
  1182. ]
  1183. model_config["worldSize"] = len(self._model_instance.gpu_indexes)
  1184. backend_config["multiNodesInferEnabled"] = False
  1185. if deployment_metadata.distributed:
  1186. # Add distributed config if in distributed mode.
  1187. backend_config["multiNodesInferEnabled"] = True
  1188. # During distributed setup,
  1189. # we must get more than one port here,
  1190. # so we use ports[1] for distributed initialization.
  1191. backend_config["multiNodesInferPort"] = self._model_instance.ports[1]
  1192. if deployment_metadata.distributed_follower:
  1193. subworker = subworkers[deployment_metadata.distributed_follower_index]
  1194. # Override device config if is a subordinate worker.
  1195. backend_config["npuDeviceIds"] = [
  1196. # Use logic(count) device indexes as NPU device IDs,
  1197. # which is friendly to virtualized environments.
  1198. list(range(len(subworker.gpu_indexes)))
  1199. ]
  1200. model_config["worldSize"] = len(subworker.gpu_indexes)
  1201. # - Model config
  1202. derived_max_seq_len = self._derive_max_model_len(default=8192)
  1203. user_backend_parameters = self._flatten_backend_param()
  1204. local_world_size = len(self._model_instance.gpu_indexes)
  1205. world_size = local_world_size
  1206. if deployment_metadata.distributed:
  1207. world_size = local_world_size * (len(subworkers) + 1)
  1208. (
  1209. params,
  1210. injected_backend_parameters,
  1211. apply_backend_parameters,
  1212. ) = self._prepare_backend_parameters(
  1213. local_world_size=local_world_size,
  1214. world_size=world_size,
  1215. derived_max_seq_len=derived_max_seq_len,
  1216. user_backend_parameters=user_backend_parameters,
  1217. env=env,
  1218. )
  1219. max_seq_len = params.max_seq_len
  1220. model_deploy_config["maxSeqLen"] = max_seq_len
  1221. model_deploy_config["maxInputTokenLen"] = max_seq_len
  1222. model_deploy_config["truncation"] = False
  1223. schedule_config["maxIterTimes"] = max_seq_len
  1224. schedule_config["maxPrefillTokens"] = max_seq_len
  1225. model_config["modelName"] = self._model.name
  1226. model_config["modelWeightPath"] = self._model_path
  1227. # - Customize config, translate to Ascend MindIE configuration language,
  1228. # see https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindieservice/servicedev/mindie_service0285.html,
  1229. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindieservice/servicedev/mindie_service0300.html,
  1230. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindiellm/llmdev/mindie_llm0302.html,
  1231. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindiellm/llmdev/mindie_llm0424.html,
  1232. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindiellm/llmdev/mindie_llm0009.html,
  1233. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindiellm/llmdev/mindie_llm0300.html,
  1234. # https://www.hiascend.com/document/detail/zh/mindie/20RC1/mindiellm/llmdev/mindie_llm0425.html.
  1235. if apply_backend_parameters:
  1236. # -- Log config
  1237. log_config["logLevel"] = params.log_level
  1238. env["MINDIE_LOG_LEVEL"] = params.log_level.upper()
  1239. # -- Server config
  1240. server_config["maxLinkNum"] = params.max_link_num
  1241. server_config["openAiSupport"] = params.openai_support
  1242. # -- Backend config
  1243. if params.kv_pool_config_parsed:
  1244. backend_config["kvPoolConfig"] = params.kv_pool_config_parsed
  1245. # -- Model deploy config
  1246. model_deploy_config["maxSeqLen"] = params.max_seq_len
  1247. model_deploy_config["maxInputTokenLen"] = params.max_input_token_len
  1248. schedule_config["maxIterTimes"] = params.max_iter_times
  1249. schedule_config["maxPrefillTokens"] = params.max_prefill_tokens
  1250. model_deploy_config["truncation"] = params.truncation
  1251. # -- Model config
  1252. model_config["cpuMemSize"] = params.cpu_mem_size
  1253. env["MIES_USE_MB_SWAPPER"] = "1" if params.cpu_mem_size > 0 else "0"
  1254. env["NPU_MEMORY_FRACTION"] = str(params.npu_memory_fraction)
  1255. model_config["trustRemoteCode"] = params.trust_remote_code
  1256. if params.models_parsed:
  1257. model_config["models"] = params.models_parsed
  1258. model_config["async_scheduler_wait_time"] = params.async_scheduler_wait_time
  1259. # -- Schedule config
  1260. schedule_config["cacheBlockSize"] = params.cache_block_size
  1261. schedule_config["maxPrefillBatchSize"] = params.max_prefill_batch_size
  1262. schedule_config["prefillTimeMsPerReq"] = params.prefill_time_ms_per_req
  1263. schedule_config["prefillPolicyType"] = params.prefill_policy_type
  1264. schedule_config["maxBatchSize"] = params.max_batch_size
  1265. schedule_config["decodeTimeMsPerReq"] = params.decode_time_ms_per_req
  1266. schedule_config["decodePolicyType"] = params.decode_policy_type
  1267. schedule_config["maxPreemptCount"] = params.max_preempt_count
  1268. schedule_config["supportSelectBatch"] = params.support_select_batch
  1269. schedule_config["maxQueueDelayMicroseconds"] = (
  1270. params.max_queue_delay_microseconds
  1271. )
  1272. schedule_config["maxFirstTokenWaitTime"] = params.max_first_token_wait_time
  1273. # -- Extends or Features
  1274. # --- Disable exposing metrics
  1275. if params.no_metrics:
  1276. env["MIES_SERVICE_MONITOR_MODE"] = "0"
  1277. # --- Emitting operators in synchronous way.
  1278. if params.enforce_eager:
  1279. env["MINDIE_ASYNC_SCHEDULING_ENABLE"] = "0"
  1280. env["TASK_QUEUE_ENABLE"] = "0"
  1281. env["ATB_OPERATION_EXECUTE_ASYNC"] = "0"
  1282. env["ASCEND_LAUNCH_BLOCKING"] = "1"
  1283. # --- Mutating model config.
  1284. model_config_path = Path(self._model_path).joinpath("config.json")
  1285. with open(
  1286. model_config_path,
  1287. "r",
  1288. encoding="utf-8",
  1289. ) as f:
  1290. model_path_config = json.load(f)
  1291. # Merge the updated model config with the existing one
  1292. if params.dtype != "auto":
  1293. dtype = params.dtype
  1294. if dtype == "half":
  1295. dtype = "float16"
  1296. elif dtype == "float":
  1297. dtype = "float32"
  1298. model_path_config["torch_dtype"] = dtype
  1299. if params.rope_scaling_parsed:
  1300. rope_scaling = model_path_config.get("rope_scaling")
  1301. if rope_scaling:
  1302. # Merge the updated RoPE scaling config with the existing one
  1303. rope_scaling.update(params.rope_scaling_parsed)
  1304. else:
  1305. # Override the RoPE scaling config
  1306. rope_scaling = params.rope_scaling_parsed
  1307. model_path_config["rope_scaling"] = rope_scaling
  1308. if params.rope_theta:
  1309. model_path_config["rope_theta"] = params.rope_theta
  1310. # Save the mutated model config
  1311. model_config_str = json.dumps(
  1312. model_path_config,
  1313. indent=4,
  1314. ensure_ascii=False,
  1315. )
  1316. config_files.append(
  1317. ContainerFile(
  1318. path=str(model_config_path),
  1319. content=model_config_str,
  1320. mode=0o750,
  1321. ),
  1322. )
  1323. # --- Mutating model generation config
  1324. model_generation_config_path = Path(self._model_path).joinpath(
  1325. "generation_config.json"
  1326. )
  1327. if params.override_generation_config_parsed:
  1328. if model_generation_config_path.exists():
  1329. with open(
  1330. model_generation_config_path,
  1331. "r",
  1332. encoding="utf-8",
  1333. ) as f:
  1334. generation_config = json.load(f)
  1335. # Merge the updated generation config with the existing one
  1336. generation_config.update(params.override_generation_config_parsed)
  1337. else:
  1338. # Override the generation config
  1339. generation_config = params.override_generation_config_parsed
  1340. # Save the new generation config
  1341. model_generation_config_str = json.dumps(
  1342. generation_config,
  1343. indent=4,
  1344. ensure_ascii=False,
  1345. )
  1346. config_files.append(
  1347. ContainerFile(
  1348. path=str(model_generation_config_path),
  1349. content=model_generation_config_str,
  1350. ),
  1351. )
  1352. # --- Split fuse
  1353. if params.enable_split:
  1354. schedule_config["enableSplit"] = True
  1355. schedule_config["templateType"] = "Mix"
  1356. schedule_config["policyType"] = params.policy_type
  1357. schedule_config["splitType"] = False
  1358. schedule_config["splitStartType"] = False
  1359. schedule_config["splitChunkTokens"] = params.split_chunk_tokens
  1360. schedule_config["splitStartBatchSize"] = params.split_start_batch_size
  1361. model_config["plugin_params"] = json.dumps(
  1362. {
  1363. "plugin_type": "splitfuse",
  1364. }
  1365. )
  1366. # --- Speculative decoding
  1367. if params.enable_memory_decoding:
  1368. model_deploy_config["speculationGamma"] = params.memory_decoding_length
  1369. if derived_max_seq_len > max_seq_len == schedule_config["maxIterTimes"]:
  1370. schedule_config["maxIterTimes"] = (
  1371. max_seq_len + params.memory_decoding_length
  1372. )
  1373. model_config["plugin_params"] = json.dumps(
  1374. {
  1375. "plugin_type": "memory_decoding",
  1376. "decoding_length": params.memory_decoding_length,
  1377. "dynamic_algo": params.memory_decoding_dynamic_algo,
  1378. }
  1379. )
  1380. if params.enable_lookahead:
  1381. model_deploy_config["speculationGamma"] = (
  1382. params.lookahead_level - 1
  1383. ) * (params.lookahead_window + params.lookahead_guess_set_size)
  1384. model_config["plugin_params"] = json.dumps(
  1385. {
  1386. "plugin_type": "la",
  1387. "level": params.lookahead_level,
  1388. "window": params.lookahead_window,
  1389. "guess_set_size": params.lookahead_guess_set_size,
  1390. }
  1391. )
  1392. # --- Multi-token prediction
  1393. if params.enable_multi_token_prediction:
  1394. model_config["plugin_params"] = json.dumps(
  1395. {
  1396. "plugin_type": "mtp",
  1397. "num_speculative_tokens": params.multi_token_prediction_tokens,
  1398. }
  1399. )
  1400. # --- Prefix cache
  1401. if params.enable_prefix_caching:
  1402. schedule_config["enablePrefixCache"] = True
  1403. model_config["plugin_params"] = json.dumps(
  1404. {
  1405. "plugin_type": "prefix_cache",
  1406. }
  1407. )
  1408. # --- Parallelism
  1409. if params.pipeline_parallel_size > 1:
  1410. model_config["pp"] = params.pipeline_parallel_size
  1411. model_config["tp"] = params.tensor_parallel_size
  1412. else:
  1413. if params.data_parallel_size > 0:
  1414. model_config["dp"] = params.data_parallel_size
  1415. if params.context_parallel_size > 0:
  1416. model_config["cp"] = params.context_parallel_size
  1417. if params.tensor_parallel_size > 0:
  1418. model_config["tp"] = params.tensor_parallel_size
  1419. model_config["moe_tp"] = params.moe_tensor_parallel_size
  1420. if params.moe_expert_parallel_size > 0:
  1421. model_config["moe_ep"] = params.moe_expert_parallel_size
  1422. model_config["moe_tp"] = params.moe_tensor_parallel_size
  1423. if params.sequence_parallel_size > 0:
  1424. model_config["sp"] = params.sequence_parallel_size
  1425. # --- Asynchronous scheduling
  1426. if params.max_batch_size <= 50:
  1427. env["MINDIE_ASYNC_SCHEDULING_ENABLE"] = "0"
  1428. # --- Buffer response
  1429. if params.enable_buffer_response:
  1430. schedule_config["bufferResponseEnabled"] = True
  1431. schedule_config["prefillExpectedTime"] = params.prefill_expected_time_ms
  1432. schedule_config["decodeExpectedTime"] = params.decode_expected_time_ms
  1433. # Generate rank table file if needed,
  1434. # see https://www.hiascend.com/document/detail/zh/mindie/20RC2/envdeployment/instg/mindie_instg_0027.html,
  1435. # https://www.hiascend.com/forum/thread-0237183374051498211-1-1.html
  1436. if deployment_metadata.distributed:
  1437. server_count = f"{len(subworkers) + 1}"
  1438. server_list = [
  1439. {
  1440. "server_id": self._model_instance.worker_ip,
  1441. "container_ip": self._model_instance.worker_ip,
  1442. "device": [
  1443. {
  1444. # Unlike above npuDeviceIds,
  1445. # here we must use real device indexes as device IDs.
  1446. # I guess Ascend needs to construct the communication topology based on real device IDs,
  1447. # see https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/hccl/hcclug/hcclug_000014.html#ZH-CN_TOPIC_0000002479883061__zh-cn_topic_0000001463640385_section10882094214.
  1448. #
  1449. # Since rank table will in charge of device mapping in distributed mode,
  1450. # the above logic(count) device indexes will not affect distributed deployment,
  1451. # see https://www.hiascend.com/document/detail/zh/mindie/21RC2/mindiellm/llmdev/mindie_llm0004.html#ZH-CN_TOPIC_0000002366997374__section7821428101811.
  1452. "device_id": str(self._model_instance.gpu_indexes[i]),
  1453. "device_ip": self._model_instance.gpu_addresses[i],
  1454. "rank_id": str(i),
  1455. }
  1456. for i in range(len(self._model_instance.gpu_indexes))
  1457. ],
  1458. },
  1459. ]
  1460. for i, sw in enumerate(subworkers):
  1461. server_list.append(
  1462. {
  1463. "server_id": sw.worker_ip,
  1464. "container_ip": sw.worker_ip,
  1465. "device": [
  1466. {
  1467. # Unlike above npuDeviceIds,
  1468. # here we must use real device indexes as device IDs.
  1469. # I guess Ascend needs to construct the communication topology based on real device IDs,
  1470. # see https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/hccl/hcclug/hcclug_000014.html#ZH-CN_TOPIC_0000002479883061__zh-cn_topic_0000001463640385_section10882094214.
  1471. #
  1472. # Since rank table will in charge of device mapping in distributed mode,
  1473. # the above logic(count) device indexes will not affect distributed deployment,
  1474. # see https://www.hiascend.com/document/detail/zh/mindie/21RC2/mindiellm/llmdev/mindie_llm0004.html#ZH-CN_TOPIC_0000002366997374__section7821428101811.
  1475. "device_id": str(sw.gpu_indexes[j]),
  1476. "device_ip": sw.gpu_addresses[j],
  1477. "rank_id": str(j + len(sw.gpu_indexes) * (i + 1)),
  1478. }
  1479. for j in range(len(sw.gpu_indexes))
  1480. ],
  1481. }
  1482. )
  1483. # Save rank table to a JSON file.
  1484. rank_table = {
  1485. "version": "1.0",
  1486. "server_count": server_count,
  1487. "server_list": server_list,
  1488. "status": "completed",
  1489. }
  1490. rank_table_path = install_path.joinpath("conf", "ranktable.json")
  1491. rank_table_str = json.dumps(rank_table, indent=4, ensure_ascii=False)
  1492. config_files.append(
  1493. ContainerFile(
  1494. path=str(rank_table_path),
  1495. content=rank_table_str,
  1496. mode=0o640,
  1497. )
  1498. )
  1499. # - Set environment variables.
  1500. env["WORLD_SIZE"] = str(
  1501. len(self._model_instance.gpu_indexes) * (len(subworkers) + 1)
  1502. )
  1503. env["RANKTABLEFILE"] = str(rank_table_path)
  1504. env["RANK_TABLE_FILE"] = str(rank_table_path)
  1505. env["MIES_CONTAINER_IP"] = env.pop("MIES_CONTAINER_IP", self._worker.ip)
  1506. env["HOST_IP"] = env.pop("HOST_IP", self._worker.ip)
  1507. env["ATB_LLM_HCCL_ENABLE"] = env.pop("ATB_LLM_HCCL_ENABLE", "1")
  1508. env["ATB_LLM_COMM_BACKEND"] = env.pop("ATB_LLM_COMM_BACKEND", "hccl")
  1509. env["HCCL_CONNECT_TIMEOUT"] = env.pop("HCCL_CONNECT_TIMEOUT", "7200")
  1510. env["HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT"] = env.pop(
  1511. "HCCL_RDMA_PCIE_DIRECT_POST_NOSTRICT", "TRUE"
  1512. )
  1513. if not is_ascend_310p(self._get_selected_gpu_devices()):
  1514. env["HCCL_EXEC_TIMEOUT"] = env.pop("HCCL_EXEC_TIMEOUT", "0")
  1515. env["HCCL_OP_EXPANSION_MODE"] = env.pop("HCCL_OP_EXPANSION_MODE", "AIV")
  1516. # NB(thxCode): For deterministic calculation, needs the following environment variables.
  1517. # LCCL_DETERMINISTIC=1
  1518. # ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
  1519. # HCCL_DETERMINISTIC=true
  1520. # ATB_MATMUL_SHUFFLE_K_ENABLE=0
  1521. # ATB_LLM_LCOC_ENABLE=0
  1522. # HCCL_OP_EXPANSION_MODE=""
  1523. logger.info(
  1524. f"With rank table JSON configuration:{os.linesep}{rank_table_str}"
  1525. )
  1526. # Generate JSON configuration file by model instance id.
  1527. config_path = str(install_path.joinpath("conf", "config.json"))
  1528. config_str = json.dumps(config, indent=4, ensure_ascii=False)
  1529. config_files.append(
  1530. ContainerFile(
  1531. path=config_path,
  1532. content=config_str,
  1533. mode=0o640,
  1534. ),
  1535. )
  1536. logger.info(
  1537. f"With JSON configuration(inconsistent input items mean unchangeable):{os.linesep}{config_str}"
  1538. )
  1539. # Indicate the JSON configuration file.
  1540. env["MIES_CONFIG_JSON_PATH"] = str(config_path)
  1541. command = None
  1542. if self.inference_backend:
  1543. command = self.inference_backend.get_container_entrypoint(
  1544. self._model.backend_version
  1545. )
  1546. command_script = self._get_serving_command_script(env)
  1547. command_args = self.build_versioned_command_args(
  1548. [
  1549. str(install_path.joinpath("bin", "mindieservice_daemon")),
  1550. ]
  1551. )
  1552. try:
  1553. self._update_model_instance(
  1554. self._model_instance.id,
  1555. injected_backend_parameters=format_backend_parameters(
  1556. injected_backend_parameters
  1557. )
  1558. or None,
  1559. )
  1560. except Exception as e:
  1561. logger.warning(
  1562. "Failed to persist injected backend parameters for "
  1563. f"{self._model_instance.name}: {e}"
  1564. )
  1565. self._create_workload(
  1566. deployment_metadata=deployment_metadata,
  1567. command=command,
  1568. command_script=command_script,
  1569. command_args=command_args,
  1570. env=env,
  1571. config_files=config_files,
  1572. working_dir=str(install_path.joinpath("bin")),
  1573. )
  1574. def _prepare_backend_parameters(
  1575. self,
  1576. local_world_size: int,
  1577. world_size: int,
  1578. derived_max_seq_len: int,
  1579. user_backend_parameters: List[str],
  1580. env: Dict[str, str],
  1581. ) -> Tuple[AscendMindIEParameters, List[str], bool]:
  1582. baseline_params = AscendMindIEParameters()
  1583. baseline_params.from_args_and_envs([], env)
  1584. implicit_backend_parameters = self._get_implicit_backend_parameters(
  1585. user_backend_parameters,
  1586. derived_max_seq_len,
  1587. )
  1588. params = self._new_parameters(
  1589. local_world_size,
  1590. world_size,
  1591. self._get_effective_max_seq_len(
  1592. user_backend_parameters,
  1593. derived_max_seq_len,
  1594. ),
  1595. )
  1596. effective_backend_parameters = (
  1597. implicit_backend_parameters + user_backend_parameters
  1598. )
  1599. apply_backend_parameters = bool(user_backend_parameters) or (
  1600. find_parameter(implicit_backend_parameters, ["dtype"]) is not None
  1601. )
  1602. if apply_backend_parameters:
  1603. formatted_backend_parameters = os.linesep.join(effective_backend_parameters)
  1604. logger.debug(
  1605. f"Parsing given parameters: {os.linesep}"
  1606. f"{formatted_backend_parameters}"
  1607. )
  1608. params.from_args_and_envs(effective_backend_parameters, env)
  1609. parameters_diff = []
  1610. if apply_backend_parameters:
  1611. parameters_diff = params.changed_backend_parameters(
  1612. baseline_params,
  1613. exclude_names=self._backend_parameter_names(
  1614. implicit_backend_parameters
  1615. ),
  1616. )
  1617. injected_backend_parameters = self._filter_user_defined_parameters(
  1618. implicit_backend_parameters + parameters_diff,
  1619. user_backend_parameters,
  1620. )
  1621. return params, injected_backend_parameters, apply_backend_parameters
  1622. def _get_implicit_backend_parameters(
  1623. self,
  1624. user_backend_parameters: List[str],
  1625. derived_max_seq_len: int,
  1626. ) -> List[str]:
  1627. implicit_backend_parameters = []
  1628. effective_max_seq_len = self._get_effective_max_seq_len(
  1629. user_backend_parameters,
  1630. derived_max_seq_len,
  1631. )
  1632. if effective_max_seq_len != derived_max_seq_len:
  1633. implicit_backend_parameters.extend(
  1634. ["--max-seq-len", str(effective_max_seq_len)]
  1635. )
  1636. # For Ascend 310P, default dtype to float16 before user parameters
  1637. # so users can still override it.
  1638. if (
  1639. is_ascend_310p(self._get_selected_gpu_devices())
  1640. and find_parameter(user_backend_parameters, ["dtype"]) is None
  1641. ):
  1642. implicit_backend_parameters.extend(["--dtype", "float16"])
  1643. return implicit_backend_parameters
  1644. @staticmethod
  1645. def _new_parameters(
  1646. local_world_size: int,
  1647. world_size: int,
  1648. max_seq_len: int,
  1649. ) -> AscendMindIEParameters:
  1650. return AscendMindIEParameters(
  1651. local_world_size=local_world_size,
  1652. world_size=world_size,
  1653. max_seq_len=max_seq_len,
  1654. )
  1655. @staticmethod
  1656. def _filter_user_defined_parameters(
  1657. parameters: List[str],
  1658. user_backend_parameters: List[str],
  1659. ) -> List[str]:
  1660. user_parameter_names = AscendMindIEServer._backend_parameter_names(
  1661. user_backend_parameters
  1662. )
  1663. filtered = []
  1664. index = 0
  1665. while index < len(parameters):
  1666. parameter = parameters[index]
  1667. if not parameter.startswith("-"):
  1668. filtered.append(parameter)
  1669. index += 1
  1670. continue
  1671. if (
  1672. AscendMindIEServer._backend_parameter_name(parameter)
  1673. in user_parameter_names
  1674. ):
  1675. index += 1
  1676. if (
  1677. "=" not in parameter
  1678. and index < len(parameters)
  1679. and not parameters[index].startswith("-")
  1680. ):
  1681. index += 1
  1682. continue
  1683. filtered.append(parameter)
  1684. index += 1
  1685. if (
  1686. "=" not in parameter
  1687. and index < len(parameters)
  1688. and not parameters[index].startswith("-")
  1689. ):
  1690. filtered.append(parameters[index])
  1691. index += 1
  1692. return filtered
  1693. @staticmethod
  1694. def _backend_parameter_names(parameters: List[str]) -> set:
  1695. names = set()
  1696. for parameter in parameters:
  1697. if parameter.startswith("-"):
  1698. names.add(AscendMindIEServer._backend_parameter_name(parameter))
  1699. return names
  1700. @staticmethod
  1701. def _backend_parameter_name(parameter: str) -> str:
  1702. aliases = {
  1703. "pp": "pipeline-parallel-size",
  1704. "dp": "data-parallel-size",
  1705. "cp": "context-parallel-size",
  1706. "tp": "tensor-parallel-size",
  1707. "sp": "sequence-parallel-size",
  1708. "moe-ep": "moe-expert-parallel-size",
  1709. "moe-tp": "moe-tensor-parallel-size",
  1710. }
  1711. boolean_optional_parameters = {
  1712. "truncation",
  1713. "support-select-batch",
  1714. "enable-memory-decoding",
  1715. "enable-lookahead",
  1716. "enable-buffer-response",
  1717. "enable-split",
  1718. "enable-multi-token-prediction",
  1719. "enable-prefix-caching",
  1720. }
  1721. name = parameter.split("=", 1)[0].lstrip("-")
  1722. if name.startswith("no-") and name.removeprefix("no-") in (
  1723. boolean_optional_parameters
  1724. ):
  1725. name = name.removeprefix("no-")
  1726. return aliases.get(name, name)
  1727. @staticmethod
  1728. def _get_effective_max_seq_len(
  1729. user_backend_parameters: List[str],
  1730. derived_max_seq_len: int,
  1731. ) -> int:
  1732. if (
  1733. derived_max_seq_len > 8192
  1734. and find_parameter(user_backend_parameters, ["max-seq-len"]) is None
  1735. ):
  1736. return 8192
  1737. return derived_max_seq_len
  1738. def _create_workload(
  1739. self,
  1740. deployment_metadata: ModelInstanceDeploymentMetadata,
  1741. command: Optional[List[str]],
  1742. command_script: Optional[str],
  1743. command_args: List[str],
  1744. env: Dict[str, str],
  1745. config_files: List[ContainerFile],
  1746. working_dir: Optional[str],
  1747. ):
  1748. image = self._get_configured_image(backend="cann")
  1749. if not image:
  1750. raise ValueError("Failed to get Ascend MindIE backend image")
  1751. # Command script will override the given command,
  1752. # so we need to prepend command to command args.
  1753. if command_script and command:
  1754. command_args = command + command_args
  1755. command = None
  1756. resources = self._get_configured_resources(
  1757. mount_all_devices=deployment_metadata.distributed,
  1758. )
  1759. mounts = self._get_configured_mounts()
  1760. ports = self._get_configured_ports()
  1761. # Read container config from environment variables
  1762. container_config = self._get_container_env_config(env)
  1763. run_container = Container(
  1764. image=image,
  1765. name="default",
  1766. profile=ContainerProfileEnum.RUN,
  1767. restart_policy=ContainerRestartPolicyEnum.NEVER,
  1768. execution=ContainerExecution(
  1769. privileged=True,
  1770. command=command,
  1771. command_script=command_script,
  1772. args=command_args,
  1773. working_dir=working_dir,
  1774. run_as_user=container_config.user,
  1775. run_as_group=container_config.group,
  1776. ),
  1777. envs=[
  1778. ContainerEnv(
  1779. name=name,
  1780. value=value,
  1781. )
  1782. for name, value in env.items()
  1783. ],
  1784. resources=resources,
  1785. mounts=mounts,
  1786. files=config_files,
  1787. ports=ports,
  1788. )
  1789. logger.info(
  1790. f"Creating Ascend MindIE container workload: {deployment_metadata.name}"
  1791. )
  1792. logger.info(
  1793. f"With image: {image}, "
  1794. f"command: [{' '.join(command) if command else ''}], "
  1795. f"arguments: [{' '.join(command_args)}], "
  1796. f"ports: [{','.join([str(port.internal) for port in ports])}], "
  1797. f"envs(inconsistent input items mean unchangeable):{os.linesep}"
  1798. f"{os.linesep.join(f'{k}={v}' for k, v in sorted(sanitize_env(env).items()))}"
  1799. )
  1800. workload_plan = WorkloadPlan(
  1801. name=deployment_metadata.name,
  1802. host_network=True,
  1803. shm_size=int(container_config.shm_size_gib * (1 << 30)),
  1804. containers=[run_container],
  1805. run_as_user=container_config.user,
  1806. run_as_group=container_config.group,
  1807. )
  1808. create_workload(self._transform_workload_plan(workload_plan))
  1809. logger.info(
  1810. f"Created Ascend MindIE container workload: {deployment_metadata.name}"
  1811. )
  1812. @staticmethod
  1813. def _get_serving_command_script(env: dict[str, str]) -> Optional[str]:
  1814. """
  1815. Get serving command script for the MindIE service.
  1816. """
  1817. # Skip if explicitly disabled.
  1818. if env and to_bool(
  1819. env.get("GPUSTACK_MODEL_SERVING_COMMAND_SCRIPT_DISABLED", "0")
  1820. ):
  1821. return None
  1822. return """#!/usr/bin/bash
  1823. #
  1824. # Prepare
  1825. #
  1826. if [ -n "${PYPI_PACKAGES_INSTALL:-}" ]; then
  1827. if command -v uv >/dev/null 2>&1; then
  1828. echo "Installing additional PyPi packages: ${PYPI_PACKAGES_INSTALL}"
  1829. export UV_HTTP_TIMEOUT=500
  1830. export UV_NO_CACHE=1
  1831. if [ -n "${PIP_INDEX_URL:-}" ]; then
  1832. export UV_DEFAULT_INDEX="${PIP_INDEX_URL}"
  1833. export UV_INDEX_URL="${PIP_INDEX_URL}"
  1834. fi
  1835. if [ -n "${PIP_EXTRA_INDEX_URL:-}" ]; then
  1836. export UV_INDEX="${PIP_EXTRA_INDEX_URL}"
  1837. export UV_EXTRA_INDEX_URL="${PIP_EXTRA_INDEX_URL}"
  1838. fi
  1839. uv pip install --system ${PYPI_PACKAGES_INSTALL}
  1840. uv pip tree --system
  1841. elif command -v pip >/dev/null 2>&1; then
  1842. echo "Installing additional PyPi packages: ${PYPI_PACKAGES_INSTALL}"
  1843. export PIP_DISABLE_PIP_VERSION_CHECK=1
  1844. export PIP_ROOT_USER_ACTION=ignore
  1845. export PIP_TIMEOUT=500
  1846. export PIP_NO_CACHE_DIR=1
  1847. pip install ${PYPI_PACKAGES_INSTALL}
  1848. pip freeze
  1849. fi
  1850. unset PYPI_PACKAGES_INSTALL
  1851. fi
  1852. #
  1853. # Execute
  1854. #
  1855. ## Cache Envs Configured by GPUStack
  1856. MINDIE_LOG_LEVEL=${MINDIE_LOG_LEVEL:-INFO}
  1857. MIES_CERTS_LOG_LEVEL=${MIES_CERTS_LOG_LEVEL:-INFO}
  1858. MINDIE_LLM_LOG_LEVEL=${MINDIE_LLM_LOG_LEVEL:-WARN}
  1859. MINDIE_LLM_PYTHON_LOG_LEVEL=${MINDIE_LLM_PYTHON_LOG_LEVEL:-WARN}
  1860. ASCEND_GLOBAL_LOG_LEVEL=${ASCEND_GLOBAL_LOG_LEVEL:-3}
  1861. ASCEND_SLOG_LEVEL=${ASCEND_SLOG_LEVEL:-WARN}
  1862. MINDIE_RT_LOG_LEVEL=${MINDIE_RT_LOG_LEVEL:-3}
  1863. ATB_LOG_LEVEL=${ATB_LOG_LEVEL:-ERROR}
  1864. ASDOPS_LOG_LEVEL=${ASDOPS_LOG_LEVEL:-ERROR}
  1865. OCK_LOG_LEVEL=${OCK_LOG_LEVEL:-ERROR}
  1866. LOG_LEVEL=${LOG_LEVEL:-ERROR}
  1867. TORCH_AIE_LOG_LEVEL=${TORCH_AIE_LOG_LEVEL:-3}
  1868. CANN_HOME=${CANN_HOME:-/usr/local/Ascend}
  1869. ## Activate Ascend Envs
  1870. PYTHON_LIB_PREFIX=$(python3 -c "import sys; print(sys.base_prefix);")
  1871. export LD_LIBRARY_PATH=${PYTHON_LIB_PREFIX}/lib:${PYTHON_LIB_PREFIX}/lib64:${LD_LIBRARY_PATH}
  1872. source ${CANN_HOME}/ascend-toolkit/set_env.sh
  1873. source ${CANN_HOME}/nnal/atb/set_env.sh
  1874. source ${CANN_HOME}/atb-models/set_env.sh
  1875. source ${CANN_HOME}/mindie/set_env.sh
  1876. ## Override Envs Configured by GPUStack
  1877. export MINDIE_LOG_LEVEL=${MINDIE_LOG_LEVEL}
  1878. export MINDIE_LOG_TO_STDOUT=1
  1879. export MINDIE_LOG_TO_FILE=0
  1880. export MIES_CERTS_LOG_LEVEL=${MIES_CERTS_LOG_LEVEL}
  1881. export MIES_CERTS_LOG_TO_STDOUT=1
  1882. export MIES_CERTS_LOG_TO_FILE=0
  1883. export MINDIE_LLM_LOG_LEVEL=${MINDIE_LLM_LOG_LEVEL}
  1884. export MINDIE_LLM_LOG_TO_STDOUT=1
  1885. export MINDIE_LLM_LOG_TO_FILE=0
  1886. export MINDIE_LLM_PYTHON_LOG_LEVEL=${MINDIE_LLM_PYTHON_LOG_LEVEL}
  1887. export MINDIE_LLM_PYTHON_LOG_TO_STDOUT=1
  1888. export MINDIE_LLM_PYTHON_LOG_TO_FILE=0
  1889. export ASCEND_GLOBAL_LOG_LEVEL=${ASCEND_GLOBAL_LOG_LEVEL}
  1890. export ASCEND_GLOBAL_EVENT_ENABLE=0
  1891. export ASCEND_SLOG_LEVEL=${ASCEND_SLOG_LEVEL}
  1892. export ASCEND_SLOG_PRINT_TO_STDOUT=1
  1893. export ASCEND_SLOG_PRINT_TO_FILE=0
  1894. export MINDIE_RT_LOG_LEVEL=${MINDIE_RT_LOG_LEVEL}
  1895. export MINDIE_RT_LOG_PRINT_TO_STDOUT=1
  1896. export MINDIE_RT_LOG_PRINT_TO_FILE=0
  1897. export ATB_LOG_LEVEL=${ATB_LOG_LEVEL}
  1898. export ATB_LOG_TO_STDOUT=1
  1899. export ATB_LOG_TO_FILE=0
  1900. export ATB_STREAM_SYNC_EVERY_KERNEL_ENABLE=0
  1901. export ATB_LOG_TO_FILE_FLUSH=0
  1902. export ASDOPS_LOG_LEVEL=${ASDOPS_LOG_LEVEL}
  1903. export ASDOPS_LOG_TO_STDOUT=1
  1904. export ASDOPS_LOG_TO_FILE=0
  1905. export OCK_LOG_LEVEL=${OCK_LOG_LEVEL}
  1906. export OCK_LOG_TO_STDOUT=1
  1907. export OCK_LOG_TO_FILE=0
  1908. export LOG_LEVEL=${LOG_LEVEL}
  1909. export LOG_TO_STDOUT=1
  1910. export LOG_TO_FILE=0
  1911. export TORCH_AIE_LOG_LEVEL=${TORCH_AIE_LOG_LEVEL}
  1912. export TORCH_AIE_PRINT_TO_STDOUT=1
  1913. export TORCH_AIE_PRINT_TO_FILE=0
  1914. ## Execute the binary preprocessed by GPUStack Runner if exists,
  1915. ## otherwise, execute the original binary.
  1916. if [ -x ${CANN_HOME}/mindie/latest/mindie-service/bin/mindieservice_daemon_ ]; then
  1917. ${CANN_HOME}/mindie/latest/mindie-service/bin/mindieservice_daemon_
  1918. else
  1919. exec "$@"
  1920. fi
  1921. """
  1922. @staticmethod
  1923. @lru_cache
  1924. def _get_mindie_config_json() -> dict[str, Any]:
  1925. config_str = """
  1926. {
  1927. "Version" : "1.0.0",
  1928. "ServerConfig" :
  1929. {
  1930. "ipAddress" : "127.0.0.1",
  1931. "managementIpAddress" : "127.0.0.2",
  1932. "port" : 1025,
  1933. "managementPort" : 1026,
  1934. "metricsPort" : 1027,
  1935. "allowAllZeroIpListening" : false,
  1936. "maxLinkNum" : 1000,
  1937. "httpsEnabled" : true,
  1938. "fullTextEnabled" : false,
  1939. "tlsCaPath" : "security/ca/",
  1940. "tlsCaFile" : ["ca.pem"],
  1941. "tlsCert" : "security/certs/server.pem",
  1942. "tlsPk" : "security/keys/server.key.pem",
  1943. "tlsPkPwd" : "security/pass/key_pwd.txt",
  1944. "tlsCrlPath" : "security/certs/",
  1945. "tlsCrlFiles" : ["server_crl.pem"],
  1946. "managementTlsCaFile" : ["management_ca.pem"],
  1947. "managementTlsCert" : "security/certs/management/server.pem",
  1948. "managementTlsPk" : "security/keys/management/server.key.pem",
  1949. "managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
  1950. "managementTlsCrlPath" : "security/management/certs/",
  1951. "managementTlsCrlFiles" : ["server_crl.pem"],
  1952. "kmcKsfMaster" : "tools/pmt/master/ksfa",
  1953. "kmcKsfStandby" : "tools/pmt/standby/ksfb",
  1954. "inferMode" : "standard",
  1955. "interCommTLSEnabled" : true,
  1956. "interCommPort" : 1121,
  1957. "interCommTlsCaPath" : "security/grpc/ca/",
  1958. "interCommTlsCaFiles" : ["ca.pem"],
  1959. "interCommTlsCert" : "security/grpc/certs/server.pem",
  1960. "interCommPk" : "security/grpc/keys/server.key.pem",
  1961. "interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
  1962. "interCommTlsCrlPath" : "security/grpc/certs/",
  1963. "interCommTlsCrlFiles" : ["server_crl.pem"],
  1964. "openAiSupport" : "vllm",
  1965. "tokenTimeout" : 600,
  1966. "e2eTimeout" : 600,
  1967. "distDPServerEnabled":false,
  1968. "layerwiseDisaggregated" : false,
  1969. "layerwiseDisaggregatedRoleType" : "",
  1970. "layerwiseDisaggregatedMasterIpAddress" : "127.0.0.3",
  1971. "layerwiseDisaggregatedSlaveIpAddress" : ["127.0.0.4"],
  1972. "layerwiseDisaggregatedDataPort" : 10024,
  1973. "layerwiseDisaggregatedCrtlPort" : [10001,10002],
  1974. "HealthCheckConfig" :
  1975. {
  1976. "npuUsageThreshold" : 0
  1977. }
  1978. },
  1979. "BackendConfig" : {
  1980. "backendName" : "mindieservice_llm_engine",
  1981. "modelInstanceNumber" : 1,
  1982. "npuDeviceIds" : [[0,1,2,3]],
  1983. "tokenizerProcessNumber" : 8,
  1984. "multiNodesInferEnabled" : false,
  1985. "multiNodesInferPort" : 1120,
  1986. "interNodeTLSEnabled" : true,
  1987. "interNodeTlsCaPath" : "security/grpc/ca/",
  1988. "interNodeTlsCaFiles" : ["ca.pem"],
  1989. "interNodeTlsCert" : "security/grpc/certs/server.pem",
  1990. "interNodeTlsPk" : "security/grpc/keys/server.key.pem",
  1991. "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
  1992. "interNodeTlsCrlPath" : "security/grpc/certs/",
  1993. "interNodeTlsCrlFiles" : ["server_crl.pem"],
  1994. "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
  1995. "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
  1996. "ModelDeployConfig" :
  1997. {
  1998. "maxSeqLen" : 2560,
  1999. "maxInputTokenLen" : 2048,
  2000. "truncation" : false,
  2001. "ModelConfig" : [
  2002. {
  2003. "modelInstanceType" : "Standard",
  2004. "modelName" : "llama_65b",
  2005. "modelWeightPath" : "/data/atb_testdata/weights/llama1-65b-safetensors",
  2006. "worldSize" : 4,
  2007. "cpuMemSize" : 0,
  2008. "npuMemSize" : -1,
  2009. "backendType" : "atb",
  2010. "trustRemoteCode" : false,
  2011. "async_scheduler_wait_time": 120,
  2012. "kv_trans_timeout": 10,
  2013. "kv_link_timeout": 1080
  2014. }
  2015. ]
  2016. },
  2017. "ScheduleConfig" :
  2018. {
  2019. "templateType" : "Standard",
  2020. "templateName" : "Standard_LLM",
  2021. "cacheBlockSize" : 128,
  2022. "maxPrefillBatchSize" : 50,
  2023. "maxPrefillTokens" : 8192,
  2024. "prefillTimeMsPerReq" : 150,
  2025. "prefillPolicyType" : 0,
  2026. "decodeTimeMsPerReq" : 50,
  2027. "decodePolicyType" : 0,
  2028. "maxBatchSize" : 200,
  2029. "maxIterTimes" : 512,
  2030. "maxPreemptCount" : 0,
  2031. "supportSelectBatch" : false,
  2032. "maxQueueDelayMicroseconds" : 5000,
  2033. "maxFirstTokenWaitTime": 2500
  2034. }
  2035. },
  2036. "LogConfig": {
  2037. "dynamicLogLevel" : "",
  2038. "dynamicLogLevelValidHours" : 2,
  2039. "dynamicLogLevelValidTime" : ""
  2040. },
  2041. "EnableDynamicAdjustTimeoutConfig": false
  2042. }
  2043. """
  2044. return json.loads(config_str)