calculator.py 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466
  1. import argparse
  2. import asyncio
  3. import dataclasses
  4. import json
  5. from enum import Enum
  6. import logging
  7. import os
  8. import subprocess
  9. from dataclasses import dataclass
  10. import time
  11. from typing import List, Optional, Dict, Tuple, Any
  12. from dataclasses_json import dataclass_json
  13. from gpustack.client.worker_filesystem_client import WorkerFilesystemClient
  14. from gpustack.config.config import get_global_config
  15. from gpustack.policies.worker_filters.gpu_matching_filter import GPUMatchingFilter
  16. from gpustack.policies.worker_filters.label_matching_filter import LabelMatchingFilter
  17. from gpustack.policies.worker_filters.local_path_filter import LocalPathFilter
  18. from gpustack.schemas.models import (
  19. BackendEnum,
  20. Model,
  21. SourceEnum,
  22. get_mmproj_filename,
  23. CategoryEnum,
  24. is_audio_model,
  25. )
  26. from gpustack.schemas.workers import Worker
  27. from gpustack.utils.compat_importlib import pkg_resources
  28. from gpustack.utils.convert import parse_duration, safe_int
  29. from gpustack.utils.hub import (
  30. filter_filename,
  31. list_repo,
  32. match_hugging_face_files,
  33. match_model_scope_file_paths,
  34. get_pretrained_config,
  35. read_repo_file_content,
  36. safe_pretrained_config_from_dict,
  37. )
  38. from gpustack.utils import platform
  39. logger = logging.getLogger(__name__)
  40. fetch_file_timeout_in_seconds = 15
  41. class GPUOffloadEnum(str, Enum):
  42. Full = "full"
  43. Partial = "partial"
  44. Disable = "disable"
  45. @dataclass_json
  46. @dataclass
  47. class LayerMemoryEstimate:
  48. uma: int
  49. nonuma: int
  50. handleLayers: Optional[int] = None
  51. @dataclass_json
  52. @dataclass
  53. class MemoryEstimate:
  54. fullOffloaded: bool
  55. ram: LayerMemoryEstimate
  56. vrams: List[LayerMemoryEstimate]
  57. offloadLayers: Optional[int] = None # Not available for diffusion models
  58. def to_log_string(self) -> str:
  59. vram_strings = ', '.join(
  60. [
  61. f"(uma:{vram.uma}, non-uma:{vram.nonuma}, layers:{vram.handleLayers})"
  62. for vram in self.vrams
  63. ]
  64. )
  65. return (
  66. f"layers: {self.offloadLayers}, "
  67. f"{'full offloaded, ' if self.fullOffloaded else ''}"
  68. f"ram: (uma:{self.ram.uma}, non-uma:{self.ram.nonuma}, layers:{self.ram.handleLayers}), "
  69. f"vrams: [{vram_strings}]"
  70. )
  71. @dataclass_json
  72. @dataclass
  73. class Estimate:
  74. items: List[MemoryEstimate]
  75. architecture: str
  76. embeddingOnly: bool = False
  77. imageOnly: bool = False
  78. distributable: bool = False
  79. reranking: bool = False
  80. contextSize: Optional[int] = None
  81. @dataclass_json
  82. @dataclass
  83. class Architecture:
  84. # Describe the model architecture,
  85. # value from "model", "projector", "adapter" and so on.
  86. type: Optional[str] = "model"
  87. # Describe the model architecture name.
  88. architecture: Optional[str] = None
  89. # Describe the clip's projector type,
  90. # only used when type is "projector".
  91. clipProjectorType: Optional[str] = None
  92. # Describe the adapter type,
  93. # only used when type is "adapter".
  94. adapterType: Optional[str] = None
  95. # Describe the diffusion model architecture,
  96. # only used when type is "diffusion".
  97. diffusionArchitecture: Optional[str] = None
  98. # Describe the conditioners of the diffusion model,
  99. # only used when type is "diffusion".
  100. diffusionConditioners: Optional[List[Dict]] = None
  101. # Describe the autoencoder of the diffusion model,
  102. # only used when type is "diffusion".
  103. diffusionAutoencoder: Optional[Dict] = None
  104. def is_deployable(self) -> bool:
  105. """
  106. Check if the model is deployable.
  107. Returns:
  108. bool: True if the model is deployable, False otherwise.
  109. """
  110. if self.type in ["projector", "adapter"] and not self.architecture:
  111. return False
  112. if self.architecture == "diffusion":
  113. return bool(self.diffusionConditioners and self.diffusionAutoencoder)
  114. return True
  115. def __str__(self) -> str:
  116. """
  117. Get a string representation of the architecture.
  118. """
  119. if self.type == "projector":
  120. return f"projector({self.clipProjectorType})"
  121. elif self.type == "adapter":
  122. return f"adapter({self.adapterType})"
  123. else:
  124. if self.architecture == "diffusion":
  125. return f"diffusion model({self.diffusionArchitecture})"
  126. else:
  127. return f"model({self.architecture})"
  128. @dataclass_json
  129. @dataclass
  130. class GGUFParserOutput:
  131. estimate: Estimate
  132. architecture: Optional[Architecture] = None
  133. @dataclass
  134. class ModelResourceClaim:
  135. model: Model
  136. resource_claim_estimate: Estimate
  137. resource_architecture: Optional[Architecture] = None
  138. # overwrite the hash to use in uniquequeue
  139. def __hash__(self):
  140. if self.model.id and self.model.updated_at:
  141. return hash((self.model.id, self.model.updated_at))
  142. return hash(self.model.model_source_index)
  143. def __eq__(self, other):
  144. if isinstance(other, ModelResourceClaim):
  145. return self.__hash__() == other.model.__hash__()
  146. return False
  147. def _get_empty_estimate(n_gpu: int = 1) -> Tuple[Estimate, Architecture]:
  148. empty_layer_memory_estimate = LayerMemoryEstimate(
  149. uma=0, nonuma=0, handleLayers=None
  150. )
  151. memory_estimate = MemoryEstimate(
  152. offloadLayers=999,
  153. fullOffloaded=True,
  154. ram=empty_layer_memory_estimate,
  155. vrams=[empty_layer_memory_estimate for _ in range(n_gpu)],
  156. )
  157. e = Estimate(
  158. items=[memory_estimate],
  159. contextSize=0,
  160. architecture="",
  161. embeddingOnly=False,
  162. imageOnly=False,
  163. distributable=False,
  164. reranking=False,
  165. )
  166. a = Architecture()
  167. return e, a
  168. def _gguf_parser_env(model: Model) -> dict:
  169. env = os.environ.copy()
  170. if model.source == SourceEnum.HUGGING_FACE:
  171. global_config = get_global_config()
  172. if global_config.huggingface_token:
  173. env["HF_TOKEN"] = str(global_config.huggingface_token)
  174. return env
  175. class NoExitArgumentParser(argparse.ArgumentParser):
  176. def error(self, message):
  177. raise argparse.ArgumentError(None, message)
  178. def exit(self, status=0, message=None):
  179. if status != 0:
  180. raise argparse.ArgumentError(None, message or "Unknown error")
  181. # Ignore exit with status 0 (e.g., --help, --version)
  182. return
  183. @dataclass
  184. class GGUFParserCommandMutableParameters:
  185. # NB(thxCode): Partial options are not applied to backend, but to the parser.
  186. # We can receive these options from the backend advanced config.
  187. backend_version: Optional[str] = None
  188. # Estimate
  189. flash_attention: Optional[bool] = None
  190. main_gpu: Optional[int] = None
  191. parallel_size: int = 4
  192. platform_footprint: str = "150,500"
  193. # Estimate/LLaMACpp
  194. batch_size: Optional[int] = None
  195. cache_type_k: Optional[str] = None
  196. cache_type_v: Optional[str] = None
  197. ctx_size: int = 8192
  198. rope_freq_base: Optional[float] = None
  199. rope_freq_scale: Optional[float] = None
  200. rope_scale: Optional[float] = None
  201. rope_scaling: Optional[str] = None
  202. yarn_orig_ctx: Optional[int] = None
  203. override_tensor: Optional[List[str]] = None
  204. gpu_layers_draft: Optional[int] = None
  205. mmap: bool = False
  206. no_kv_offload: Optional[bool] = None
  207. split_mode: Optional[str] = None
  208. ubatch_size: Optional[int] = None
  209. visual_max_image_size: Optional[int] = None
  210. max_projected_cache: int = 10
  211. swa_full: bool = False
  212. # Estimate/StableDiffusionCpp
  213. image_autoencoder_tiling: bool = True
  214. image_batch_count: Optional[int] = None
  215. image_free_compute_memory_immediately: Optional[bool] = None
  216. image_height: Optional[int] = None
  217. image_no_autoencoder_offload: Optional[bool] = None
  218. image_no_conditioner_offload: Optional[bool] = None
  219. image_no_control_net_offload: Optional[bool] = None
  220. image_width: Optional[int] = None
  221. # Load
  222. cache_expiration: str = "0"
  223. skip_cache: Optional[bool] = None
  224. skip_dns_cache: Optional[bool] = None
  225. skip_proxy: Optional[bool] = None
  226. skip_range_download_detect: Optional[bool] = None
  227. skip_tls_verify: Optional[bool] = None
  228. header: Optional[List[str]] = None
  229. def from_args(self, args: List[str]):
  230. parser = NoExitArgumentParser(exit_on_error=False, allow_abbrev=False)
  231. # Default any True arguments here,
  232. # so that they can be set to False later.
  233. parser.set_defaults(image_autoencoder_tiling=True)
  234. # Estimate
  235. parser.add_argument(
  236. "--flash-attention",
  237. "--flash-attn",
  238. "--diffusion-fa",
  239. "-fa",
  240. type=bool,
  241. action=argparse.BooleanOptionalAction, # generated "--no-flash-attention", "--no-flash-attn", "--no-diffusion-fa"
  242. required=False,
  243. )
  244. parser.add_argument(
  245. "--main-gpu",
  246. "-mg",
  247. type=int,
  248. required=False,
  249. )
  250. parser.add_argument(
  251. "--parallel-size",
  252. "--parallel",
  253. "-np",
  254. type=int,
  255. required=False,
  256. )
  257. parser.add_argument(
  258. "--platform-footprint",
  259. type=str,
  260. required=False,
  261. )
  262. # Estimate/LLaMACpp
  263. parser.add_argument(
  264. "--batch-size",
  265. "-b",
  266. type=int,
  267. required=False,
  268. )
  269. parser.add_argument(
  270. "--cache-type-k",
  271. "-ctk",
  272. type=str,
  273. required=False,
  274. )
  275. parser.add_argument(
  276. "--cache-type-v",
  277. "-ctv",
  278. type=str,
  279. required=False,
  280. )
  281. parser.add_argument(
  282. "--ctx-size",
  283. "-c",
  284. type=int,
  285. required=False,
  286. )
  287. parser.add_argument(
  288. "--rope-freq-base",
  289. type=float,
  290. required=False,
  291. )
  292. parser.add_argument(
  293. "--rope-freq-scale",
  294. type=float,
  295. required=False,
  296. )
  297. parser.add_argument(
  298. "--rope-scale",
  299. type=float,
  300. required=False,
  301. )
  302. parser.add_argument(
  303. "--rope-scaling",
  304. type=str,
  305. required=False,
  306. )
  307. parser.add_argument(
  308. "--yarn-orig-ctx",
  309. type=int,
  310. required=False,
  311. )
  312. parser.add_argument(
  313. "--override-tensor",
  314. "-ot",
  315. action='append',
  316. required=False,
  317. )
  318. parser.add_argument(
  319. "--gpu-layers-draft",
  320. "--n-gpu-layers-draft",
  321. "-ngld",
  322. type=int,
  323. required=False,
  324. )
  325. parser.add_argument(
  326. "--mmap",
  327. type=bool,
  328. action=argparse.BooleanOptionalAction, # generated "--no-mmap"
  329. required=False,
  330. )
  331. parser.add_argument(
  332. "--no-kv-offload",
  333. "-nkvo",
  334. action='store_true',
  335. required=False,
  336. )
  337. parser.add_argument(
  338. "--split-mode",
  339. "-sm",
  340. type=str,
  341. required=False,
  342. )
  343. parser.add_argument(
  344. "--ubatch-size",
  345. "-ub",
  346. type=int,
  347. required=False,
  348. )
  349. parser.add_argument(
  350. "--visual-max-image-size",
  351. type=int,
  352. required=False,
  353. )
  354. parser.add_argument(
  355. "--max-projected-cache",
  356. "--visual-max-image-cache",
  357. type=int,
  358. required=False,
  359. )
  360. parser.add_argument(
  361. "--swa-full",
  362. action='store_true',
  363. required=False,
  364. )
  365. # Estimate/StableDiffusionCpp
  366. parser.add_argument(
  367. "--image-autoencoder-tiling",
  368. "--image-vae-tiling",
  369. "--vae-tiling",
  370. action='store_true',
  371. dest="image_autoencoder_tiling",
  372. required=False,
  373. )
  374. parser.add_argument(
  375. "--image-no-autoencoder-tiling",
  376. "--image-no-vae-tiling",
  377. action='store_false',
  378. dest="image_autoencoder_tiling",
  379. required=False,
  380. )
  381. parser.add_argument(
  382. "--image-batch-count",
  383. "--batch-count",
  384. "--image-max-batch",
  385. type=int,
  386. required=False,
  387. )
  388. parser.add_argument(
  389. "--image-free-compute-memory-immediately",
  390. action='store_true',
  391. required=False,
  392. )
  393. parser.add_argument(
  394. "--image-height",
  395. "--height",
  396. "--image-max-height",
  397. type=int,
  398. required=False,
  399. )
  400. parser.add_argument(
  401. "--image-no-autoencoder-offload",
  402. "--vae-on-cpu",
  403. "--image-no-vae-model-offload",
  404. action='store_true',
  405. required=False,
  406. )
  407. parser.add_argument(
  408. "--image-no-conditioner-offload",
  409. "--clip-on-cpu",
  410. "--image-no-text-encoder-model-offload",
  411. action='store_true',
  412. required=False,
  413. )
  414. parser.add_argument(
  415. "--image-no-control-net-offload",
  416. "--control-net-cpu",
  417. "--image-no-control-net-model-offload",
  418. action='store_true',
  419. required=False,
  420. )
  421. parser.add_argument(
  422. "--image-width",
  423. "--width",
  424. "--image-max-width",
  425. type=int,
  426. required=False,
  427. )
  428. # Load
  429. parser.add_argument(
  430. "--cache-expiration",
  431. type=str,
  432. required=False,
  433. )
  434. parser.add_argument(
  435. "--skip-cache",
  436. action='store_true',
  437. required=False,
  438. )
  439. parser.add_argument(
  440. "--skip-dns-cache",
  441. action='store_true',
  442. required=False,
  443. )
  444. parser.add_argument(
  445. "--skip-proxy",
  446. action='store_true',
  447. required=False,
  448. )
  449. parser.add_argument(
  450. "--skip-range-download-detect",
  451. action='store_true',
  452. required=False,
  453. )
  454. parser.add_argument(
  455. "--skip-tls-verify",
  456. action='store_true',
  457. required=False,
  458. )
  459. parser.add_argument(
  460. "--header",
  461. action='append',
  462. required=False,
  463. )
  464. slogger = logger.getChild("gguf_parser_command")
  465. try:
  466. args_parsed = parser.parse_known_args(args=args)
  467. for attr_name in [attr.name for attr in dataclasses.fields(self.__class__)]:
  468. try:
  469. attr_value = getattr(args_parsed[0], attr_name, None)
  470. if attr_value is not None:
  471. try:
  472. setattr(self, attr_name, attr_value)
  473. except ValueError as e:
  474. slogger.warning(
  475. f"Failed to receive mutable parameter {attr_name}: {e}"
  476. )
  477. except AttributeError:
  478. # If reach here, that means the field is an internal property,
  479. # which would not register in the argument parser.
  480. pass
  481. except Exception as e:
  482. slogger.warning(f"Failed to parse mutable parameters: {e}")
  483. def extend_command(self, command: List[str]):
  484. internal_properties = [
  485. "backend_version",
  486. ]
  487. for attr_name in [attr.name for attr in dataclasses.fields(self.__class__)]:
  488. if attr_name in internal_properties:
  489. # Skip internal properties.
  490. continue
  491. attr_value = getattr(self, attr_name, None)
  492. if attr_value is not None:
  493. if isinstance(attr_value, bool):
  494. command.append(
  495. f"--{attr_name.replace('_', '-')}={'true' if attr_value else 'false'}"
  496. )
  497. elif isinstance(attr_value, int):
  498. command.append(f"--{attr_name.replace('_', '-')}={str(attr_value)}")
  499. elif isinstance(attr_value, list):
  500. for sv in attr_value:
  501. command.append(f"--{attr_name.replace('_', '-')}={str(sv)}")
  502. else:
  503. command.append(f"--{attr_name.replace('_', '-')}={str(attr_value)}")
  504. async def _gguf_parser_command(
  505. model: Model, offload: GPUOffloadEnum = GPUOffloadEnum.Full, **kwargs
  506. ):
  507. bin_path = pkg_resources.files("gpustack.third_party.bin.gguf-parser").joinpath(
  508. "gguf-parser" + (".exe" if platform.system() == "windows" else "")
  509. )
  510. # Preset the command with immutable arguments.
  511. command = [
  512. bin_path,
  513. "--skip-tokenizer",
  514. "--skip-metadata",
  515. "--json",
  516. ]
  517. # Extend the command with mutable arguments.
  518. params = GGUFParserCommandMutableParameters(backend_version=model.backend_version)
  519. params.from_args(model.backend_parameters)
  520. params.extend_command(command)
  521. # Extend the command with controlled arguments.
  522. cache_dir = kwargs.get("cache_dir")
  523. if cache_dir:
  524. command.extend(["--cache-path", cache_dir])
  525. if offload == GPUOffloadEnum.Full:
  526. command.extend(["--gpu-layers", "-1"])
  527. elif offload == GPUOffloadEnum.Partial:
  528. command.extend(["--gpu-layers-step", "1"])
  529. elif offload == GPUOffloadEnum.Disable:
  530. command.extend(["--gpu-layers", "0"])
  531. tensor_split = kwargs.get("tensor_split")
  532. if tensor_split:
  533. if all(i < 1024 * 1024 for i in tensor_split):
  534. # user provided
  535. tensor_split_str = ",".join([str(i) for i in tensor_split])
  536. else:
  537. # computed by the system, convert to MiB to prevent overflow
  538. tensor_split_str = ",".join(
  539. [str(int(i / (1024 * 1024))) for i in tensor_split]
  540. )
  541. command.extend(["--tensor-split", tensor_split_str])
  542. rpc = kwargs.get("rpc")
  543. if rpc:
  544. rpc_str = ",".join([v for v in rpc])
  545. command.extend(["--rpc", rpc_str])
  546. source_args = await _gguf_parser_command_args_from_source(
  547. model, cache_expiration=params.cache_expiration, **kwargs
  548. )
  549. command.extend(source_args)
  550. return command
  551. async def _try_parse_on_workers(
  552. model: Model,
  553. workers: List[Worker],
  554. offload: GPUOffloadEnum,
  555. **kwargs,
  556. ) -> ModelResourceClaim:
  557. """
  558. Try to parse GGUF on specified workers concurrently.
  559. Returns:
  560. ModelResourceClaim from the first worker that succeeds.
  561. Raises:
  562. ValueError: If no workers are given, or every worker fails (with per-worker details).
  563. """
  564. if not workers:
  565. raise ValueError(
  566. f"No workers are available to run gguf-parser for model '{model.name}'."
  567. )
  568. # Prepare parameters once
  569. offload_str = offload.value # "full", "partial", "disable"
  570. # Prepare override parameters (only pass necessary ones)
  571. parse_kwargs = {}
  572. for key in ("tensor_split", "rpc"):
  573. if key in kwargs:
  574. parse_kwargs[key] = kwargs[key]
  575. worker_errors: Dict[str, str] = {}
  576. async def try_parse_on_worker(worker: Worker) -> Optional[ModelResourceClaim]:
  577. """Try to parse GGUF on a single worker."""
  578. try:
  579. async with WorkerFilesystemClient() as fs_client:
  580. output_dict = await fs_client.parse_gguf(
  581. worker,
  582. model,
  583. offload=offload_str,
  584. **parse_kwargs,
  585. )
  586. claim = GGUFParserOutput.from_dict(output_dict)
  587. if offload == GPUOffloadEnum.Disable:
  588. clear_vram_claim(claim)
  589. logger.info(
  590. f"Successfully parsed GGUF on worker {worker.name} "
  591. f"for model {model.name}"
  592. )
  593. return ModelResourceClaim(
  594. model=model,
  595. resource_claim_estimate=claim.estimate,
  596. resource_architecture=claim.architecture,
  597. )
  598. except Exception as e:
  599. error_msg = str(e)
  600. logger.info(
  601. f"Failed to parse GGUF on worker {worker.name} for model "
  602. f"{model.name}: {error_msg}"
  603. )
  604. worker_errors[worker.name] = error_msg
  605. return None
  606. # Concurrently try all workers and return the first successful result
  607. tasks = [try_parse_on_worker(worker) for worker in workers]
  608. # Use as_completed to get results as they finish
  609. for completed_task in asyncio.as_completed(tasks):
  610. result = await completed_task
  611. if result:
  612. return result
  613. error_items = list(worker_errors.items())
  614. shown_errors = "; \n".join(f"{name}: {err}" for name, err in error_items[:3])
  615. if len(error_items) > 3:
  616. shown_errors += f" (+{len(error_items) - 3} more)"
  617. raise ValueError(
  618. f"Failed to run gguf-parser on all {len(workers)} available worker(s) "
  619. f"for model '{model.name}'.\n"
  620. f"Per-worker details:\n{shown_errors}"
  621. )
  622. async def get_pretrained_config_with_workers(
  623. model: Model,
  624. workers: Optional[List[Worker]] = None,
  625. trust_remote_code: bool = False,
  626. ) -> Optional[Any]:
  627. """
  628. Unified async entry point for getting pretrained config.
  629. Handles all model sources with appropriate fallback strategies:
  630. - For LOCAL_PATH model which is not available locally, get from workers
  631. - For others, AutoConfig and fallback to config.json
  632. Args:
  633. model: Model to get config for
  634. workers: Available workers (for LOCAL_PATH)
  635. trust_remote_code: Whether to trust remote code
  636. Returns:
  637. PretrainedConfig object or None
  638. Raises:
  639. ValueError: If config is required but cannot be loaded
  640. """
  641. pretrained_config = None
  642. timeout_in_seconds = 15
  643. try:
  644. if model.source == SourceEnum.LOCAL_PATH and not os.path.exists(
  645. model.local_path
  646. ):
  647. logger.info(
  648. f"Model path '{model.local_path}' does not exist in server node. Trying workers..."
  649. )
  650. pretrained_config = await get_pretrained_config_from_workers(
  651. model,
  652. workers,
  653. )
  654. else:
  655. logger.info(f"Trying to get pretrained config from {model.source}")
  656. pretrained_config = await asyncio.wait_for(
  657. asyncio.to_thread(
  658. get_pretrained_config, model, trust_remote_code=trust_remote_code
  659. ),
  660. timeout=timeout_in_seconds,
  661. )
  662. except Exception as e:
  663. if should_fallback_load_config_json(e, model):
  664. config_dict = await asyncio.wait_for(
  665. asyncio.to_thread(
  666. read_repo_file_content,
  667. model,
  668. "config.json",
  669. token=get_global_config().huggingface_token,
  670. ),
  671. timeout=timeout_in_seconds,
  672. )
  673. if config_dict:
  674. return safe_pretrained_config_from_dict(config_dict)
  675. # If config_dict is None for LOCAL_PATH, provide a clearer error message
  676. if model.source == SourceEnum.LOCAL_PATH:
  677. raise ValueError(
  678. f"Model path '{model.local_path}' does not exist or config.json is not found. "
  679. f"Please ensure the model files are available at the specified path."
  680. )
  681. if model.env and model.env.get("GPUSTACK_SKIP_MODEL_EVALUATION"):
  682. return pretrained_config
  683. if model.categories and any(
  684. cat in model.categories
  685. for cat in [CategoryEnum.IMAGE, CategoryEnum.UNKNOWN]
  686. ):
  687. return pretrained_config
  688. raise e
  689. return pretrained_config
  690. def should_fallback_load_config_json(e: Exception, model: Model) -> bool:
  691. """
  692. Determine whether to fallback to loading config.json based on the exception and model.
  693. Args:
  694. e: The exception encountered during loading
  695. model: The model being processed
  696. Returns:
  697. bool: True if should fallback to loading config.json, False otherwise
  698. """
  699. # For LOCAL_PATH models, the path must be a valid directory
  700. if model.source == SourceEnum.LOCAL_PATH and not (
  701. model.local_path and os.path.isdir(model.local_path)
  702. ):
  703. return False
  704. if model.backend == BackendEnum.VLLM and is_audio_model(model):
  705. # TODO(michelia): Qwen3-ASR is currently supported by vLLM but not yet by Hugging Face Transformers.
  706. # Track upstream progress: https://github.com/huggingface/transformers/issues/43837
  707. return True
  708. # For very new HF checkpoints, AutoConfig may fail on an older transformers.
  709. # Falling back to reading config.json avoids requiring a transformers upgrade.
  710. #
  711. # Example upstream error message (may vary by transformers version):
  712. """
  713. The checkpoint you are trying to load has model type `{config_dict['model_type']}`
  714. but Transformers does not recognize this architecture. This could be because of an
  715. issue with the checkpoint, or because your version of Transformers is out of date.
  716. You can update Transformers with the command `pip install --upgrade transformers`.
  717. If this does not work, and the checkpoint is very new, then there may not be a
  718. release version that supports this model yet. In this case, you can get the most
  719. up-to-date code by installing Transformers from source with the command
  720. `pip install git+https://github.com/huggingface/transformers.git`
  721. """
  722. msg = str(e).lower()
  723. if (
  724. "update transformers" in msg
  725. or "pip install --upgrade transformers" in msg
  726. or "does not recognize this architecture" in msg
  727. or "install transformers from source" in msg
  728. ):
  729. return True
  730. # Fallback for backend version specified or import errors
  731. return model.backend_version is not None or isinstance(e, ImportError)
  732. async def check_diffusers_model_index_from_workers(
  733. model: Model,
  734. workers: List[Worker],
  735. ) -> bool:
  736. """
  737. Check if a LOCAL_PATH model is a diffusers model by querying workers.
  738. This function is specifically for LOCAL_PATH models that are not available
  739. locally on the server. It uses the optimized worker query strategy.
  740. Args:
  741. model: Model with source LOCAL_PATH
  742. workers: List of workers to query
  743. Returns:
  744. True if model_index.json contains _diffusers_version, False otherwise
  745. """
  746. if not workers:
  747. return False
  748. # Read model_index.json from workers
  749. try:
  750. data = await read_local_path_file_from_workers(
  751. model, "model_index.json", workers
  752. )
  753. except Exception as e:
  754. logger.info(f"Failed to read model_index.json from workers: {e}")
  755. return False
  756. if data is None:
  757. return False
  758. # Check for _diffusers_version key
  759. if isinstance(data, dict) and "_diffusers_version" in data:
  760. return True
  761. if isinstance(data, list):
  762. for item in data:
  763. if isinstance(item, dict) and "_diffusers_version" in item:
  764. return True
  765. return False
  766. async def read_local_path_file_from_workers( # noqa: C901
  767. model: Model,
  768. file_path: str,
  769. workers: List[Worker],
  770. ) -> Optional[Dict[str, Any]]:
  771. """
  772. Read a file from LOCAL_PATH model by querying workers.
  773. Steps:
  774. 1. Apply filters (GPU selector, label selector) to reduce broadcast scope
  775. 2. Broadcast to filtered workers
  776. Args:
  777. model: Model with source LOCAL_PATH
  778. file_path: Relative path to the file (e.g., "config.json", "model_index.json")
  779. workers: List of workers to query
  780. Returns:
  781. File content as dict if successful
  782. Raises:
  783. ValueError: If the file cannot be read from any worker, with per-worker diagnostic details.
  784. """
  785. if not workers:
  786. raise ValueError(
  787. f"No workers are available to read '{file_path}' for model '{model.name}'. "
  788. )
  789. # Build full file path
  790. fp = os.path.join(model.local_path, file_path)
  791. worker_errors: Dict[str, str] = {}
  792. async def try_read_from_worker(worker: Worker) -> Optional[Dict[str, Any]]:
  793. """Try to read file from a single worker."""
  794. try:
  795. async with WorkerFilesystemClient() as filesystem_client:
  796. logger.info(f"Trying to read {file_path} from worker {worker.name}")
  797. content = await filesystem_client.read_model_config(worker, fp)
  798. if content:
  799. logger.info(
  800. f"Successfully read {file_path} from worker {worker.name}"
  801. )
  802. return content
  803. worker_errors[worker.name] = "file not found or empty"
  804. return None
  805. except Exception as e:
  806. error_msg = str(e)
  807. logger.info(
  808. f"Failed to read {file_path} from worker {worker.name}: {error_msg}"
  809. )
  810. worker_errors[worker.name] = error_msg
  811. return None
  812. # Step 1: Apply filters to reduce broadcast scope
  813. filtered_workers = workers
  814. filter_messages = []
  815. # Apply GPUMatchingFilter
  816. if model.gpu_selector:
  817. gpu_filter = GPUMatchingFilter(model)
  818. filtered_workers, gpu_messages = await gpu_filter.filter(filtered_workers)
  819. filter_messages.extend(gpu_messages)
  820. # Apply LabelMatchingFilter
  821. if model.worker_selector:
  822. label_filter = LabelMatchingFilter(model)
  823. filtered_workers, label_messages = await label_filter.filter(filtered_workers)
  824. filter_messages.extend(label_messages)
  825. # Apply LocalPathFilter for LOCAL_PATH models
  826. local_path_filter = LocalPathFilter(model)
  827. filtered_workers, local_path_messages = await local_path_filter.filter(
  828. filtered_workers
  829. )
  830. filter_messages.extend(local_path_messages)
  831. if filter_messages:
  832. for msg in filter_messages:
  833. logger.info(f"Worker filtering for {file_path} read: {msg}")
  834. # Step 2: Broadcast to filtered workers
  835. if not filtered_workers:
  836. if filter_messages:
  837. shown = filter_messages[:3]
  838. suffix = (
  839. f" (+{len(filter_messages) - 3} more)"
  840. if len(filter_messages) > 3
  841. else ""
  842. )
  843. detail = "; ".join(shown) + suffix
  844. else:
  845. detail = "no workers matched the current filters"
  846. raise ValueError(
  847. f"No workers are available after filtering to read '{file_path}' "
  848. f"for model '{model.name}'. {detail}"
  849. )
  850. logger.info(
  851. f"Broadcasting {file_path} read request to {len(filtered_workers)} filtered workers "
  852. f"(reduced from {len(workers)} total workers)"
  853. )
  854. tasks = [try_read_from_worker(worker) for worker in filtered_workers]
  855. for completed_task in asyncio.as_completed(tasks):
  856. result = await completed_task
  857. if result:
  858. return result
  859. error_items = list(worker_errors.items())
  860. shown_errors = ";\n".join(f"{name}: {err}" for name, err in error_items[:3])
  861. if len(error_items) > 3:
  862. shown_errors += f" (+{len(error_items) - 3} more)"
  863. raise ValueError(
  864. f"Failed to read '{file_path}' from all {len(filtered_workers)} available "
  865. f"worker(s) for model '{model.name}'.\n"
  866. f"Per-worker details:\n{shown_errors}"
  867. )
  868. async def get_pretrained_config_from_workers(
  869. model: Model,
  870. workers: List[Worker],
  871. ) -> Optional[Any]:
  872. """
  873. Get pretrained config from remote workers for LOCAL_PATH models.
  874. Args:
  875. model: Model with source LOCAL_PATH
  876. workers: List of workers to query
  877. Returns:
  878. PretrainedConfig object if successful
  879. Raises:
  880. ValueError: If config.json cannot be read from any worker.
  881. """
  882. config_dict = await read_local_path_file_from_workers(model, "config.json", workers)
  883. return safe_pretrained_config_from_dict(config_dict)
  884. async def _calculate_from_workers( # noqa: C901
  885. model: Model,
  886. workers: List[Worker],
  887. offload: GPUOffloadEnum,
  888. **kwargs,
  889. ) -> ModelResourceClaim:
  890. """
  891. Calculate model resource claim by running gguf-parser on a worker.
  892. Args:
  893. model: Model to calculate the resource claim for.
  894. workers: List of available workers.
  895. offload: GPU offload strategy.
  896. kwargs: Additional arguments to pass to the GGUF parser.
  897. Returns:
  898. ModelResourceClaim from the first worker that succeeds.
  899. Raises:
  900. ValueError: If no worker remains after filtering, or gguf-parser fails on every worker.
  901. """
  902. # Step 1: Apply worker filters before broadcasting
  903. filtered_workers = workers
  904. filter_messages = []
  905. # Apply GPUMatchingFilter
  906. if model.gpu_selector:
  907. from gpustack.policies.worker_filters.gpu_matching_filter import (
  908. GPUMatchingFilter,
  909. )
  910. gpu_filter = GPUMatchingFilter(model)
  911. filtered_workers, gpu_messages = await gpu_filter.filter(filtered_workers)
  912. filter_messages.extend(gpu_messages)
  913. # Apply LabelMatchingFilter
  914. if model.worker_selector:
  915. from gpustack.policies.worker_filters.label_matching_filter import (
  916. LabelMatchingFilter,
  917. )
  918. label_filter = LabelMatchingFilter(model)
  919. filtered_workers, label_messages = await label_filter.filter(filtered_workers)
  920. filter_messages.extend(label_messages)
  921. # Apply LocalPathFilter for LOCAL_PATH models
  922. local_path_filter = LocalPathFilter(model)
  923. filtered_workers, local_path_messages = await local_path_filter.filter(
  924. filtered_workers
  925. )
  926. filter_messages.extend(local_path_messages)
  927. if filter_messages:
  928. for msg in filter_messages:
  929. logger.info(f"Worker filtering for GGUF parsing: {msg}")
  930. # Step 2: Broadcasting to filtered workers
  931. if not filtered_workers:
  932. if filter_messages:
  933. shown = filter_messages[:3]
  934. suffix = (
  935. f" (+{len(filter_messages) - 3} more)"
  936. if len(filter_messages) > 3
  937. else ""
  938. )
  939. detail = "; ".join(shown) + suffix
  940. else:
  941. detail = "no workers matched the current filters"
  942. raise ValueError(
  943. f"No workers are available after filtering to run gguf-parser "
  944. f"for model '{model.name}'. {detail}"
  945. )
  946. logger.info(
  947. f"Broadcasting GGUF parse request to {len(filtered_workers)} filtered workers "
  948. f"(reduced from {len(workers)} total workers) for model {model.name}"
  949. )
  950. return await _try_parse_on_workers(model, filtered_workers, offload, **kwargs)
  951. async def calculate_gguf_model_resource_claim(
  952. model: Model,
  953. offload: GPUOffloadEnum = GPUOffloadEnum.Full,
  954. workers: Optional[List[Worker]] = None,
  955. **kwargs,
  956. ) -> ModelResourceClaim:
  957. """
  958. Calculate the resource claim of the model.
  959. Args:
  960. model: Model to calculate the resource claim for.
  961. offload: GPU offload strategy.
  962. workers: Optional list of available workers for remote parsing.
  963. kwargs: Additional arguments to pass to the GGUF parser.
  964. """
  965. if model.source == SourceEnum.LOCAL_PATH and not os.path.exists(model.local_path):
  966. # Try to calculate on worker if workers are provided
  967. if workers:
  968. logger.info(
  969. f"Model path '{model.local_path}' does not exist on the server node. "
  970. f"Running gguf-parser on workers for model {model.name}..."
  971. )
  972. return await _calculate_from_workers(model, workers, offload, **kwargs)
  973. # Skip the calculation if the model is not available, policies like spread strategy still apply.
  974. # TODO Support user provided resource claim for better scheduling.
  975. e, a = _get_empty_estimate()
  976. tensor_split = kwargs.get("tensor_split")
  977. if tensor_split:
  978. e, a = _get_empty_estimate(n_gpu=len(tensor_split))
  979. return ModelResourceClaim(
  980. model=model,
  981. resource_claim_estimate=e,
  982. resource_architecture=a,
  983. )
  984. command = await _gguf_parser_command(model, offload, **kwargs)
  985. env = _gguf_parser_env(model)
  986. try:
  987. start_time = time.time()
  988. logger.trace(
  989. f"Running parser for model {model.name} with command: {' '.join(map(str, command))}"
  990. )
  991. process = await asyncio.create_subprocess_exec(
  992. *command,
  993. env=env,
  994. stdout=asyncio.subprocess.PIPE,
  995. stderr=asyncio.subprocess.PIPE,
  996. )
  997. stdout, stderr = await process.communicate()
  998. if process.returncode != 0:
  999. raise subprocess.CalledProcessError(
  1000. process.returncode, command, output=stdout, stderr=stderr
  1001. )
  1002. cmd_output = stdout.decode()
  1003. claim: GGUFParserOutput = GGUFParserOutput.from_json(cmd_output)
  1004. latency = time.time() - start_time
  1005. if offload == GPUOffloadEnum.Full:
  1006. logger.trace(
  1007. f"Finished running parser for full offload model instance {model.name}, latency: {latency:.2f}, "
  1008. f"{claim.estimate.items[0].to_log_string()}"
  1009. )
  1010. elif offload == GPUOffloadEnum.Partial:
  1011. logger.trace(
  1012. f"Finished running parser for partial offloading model instance {model.name}, latency: {latency:.2f}, at least: "
  1013. f"{claim.estimate.items[1].to_log_string() if len(claim.estimate.items) > 1 else claim.estimate.items[0].to_log_string()}"
  1014. )
  1015. elif offload == GPUOffloadEnum.Disable:
  1016. logger.trace(
  1017. f"Finished running parser for disabled offloading model instance {model.name}, latency: {latency:.2f}, "
  1018. f"{claim.estimate.items[0].to_log_string()}"
  1019. )
  1020. clear_vram_claim(claim)
  1021. return ModelResourceClaim(
  1022. model=model,
  1023. resource_claim_estimate=claim.estimate,
  1024. resource_architecture=claim.architecture,
  1025. )
  1026. except subprocess.CalledProcessError as e:
  1027. logger.error(
  1028. f"Failed to execute {command}, error: {e}, "
  1029. + f"stderr: {e.stderr.decode()}, "
  1030. + f"stdout: {e.stdout.decode()}"
  1031. )
  1032. raise Exception(e.stderr.decode() if stderr else e.stdout.decode()) from e
  1033. except Exception as e:
  1034. raise Exception(
  1035. f"Failed to parse the output of {command}, error: {e}",
  1036. )
  1037. def clear_vram_claim(claim: GGUFParserOutput):
  1038. for item in claim.estimate.items:
  1039. # gguf-parser provides vram claim when offloadLayers is 0 due to current llama.cpp behavior, but llama-box won't allocate such vram.
  1040. if item.offloadLayers == 0:
  1041. item.vrams = [
  1042. LayerMemoryEstimate(uma=0, nonuma=0, handleLayers=0) for _ in item.vrams
  1043. ]
  1044. async def _gguf_parser_command_args_from_source(model: Model, **kwargs) -> List[str]:
  1045. """
  1046. Get the model url based on the model source.
  1047. Args:
  1048. model: Model to get the url for.
  1049. """
  1050. if model.source not in [
  1051. SourceEnum.HUGGING_FACE,
  1052. SourceEnum.MODEL_SCOPE,
  1053. SourceEnum.LOCAL_PATH,
  1054. ]:
  1055. raise ValueError(f"Unsupported source: {model.source}")
  1056. try:
  1057. if model.source in [SourceEnum.HUGGING_FACE, SourceEnum.MODEL_SCOPE]:
  1058. cache_expiration = kwargs.get("cache_expiration")
  1059. if cache_expiration and cache_expiration != "0":
  1060. cache_expiration = parse_duration(cache_expiration)
  1061. cache_expiration = safe_int(cache_expiration)
  1062. if model.source == SourceEnum.HUGGING_FACE:
  1063. repo_arg, file_arg, mmproj_arg = [
  1064. "--hf-repo",
  1065. "--hf-file",
  1066. "--hf-mmproj-file",
  1067. ]
  1068. repo_id = model.huggingface_repo_id
  1069. file_name = model.huggingface_filename
  1070. else:
  1071. repo_arg, file_arg, mmproj_arg = [
  1072. "--ms-repo",
  1073. "--ms-file",
  1074. "--ms-mmproj-file",
  1075. ]
  1076. repo_id = model.model_scope_model_id
  1077. file_name = model.model_scope_file_path
  1078. args = [repo_arg, repo_id]
  1079. global_config = get_global_config()
  1080. repo_file_infos = await asyncio.wait_for(
  1081. asyncio.to_thread(
  1082. list_repo,
  1083. repo_id,
  1084. model.source,
  1085. global_config.huggingface_token,
  1086. cache_expiration,
  1087. ),
  1088. timeout=fetch_file_timeout_in_seconds,
  1089. )
  1090. repo_files = [file.get("name", "") for file in repo_file_infos]
  1091. model_filename = filter_filename(file_name, repo_files)
  1092. if len(model_filename) == 0:
  1093. raise ValueError(f"File {model_filename} not found in {repo_id}")
  1094. args.extend([file_arg, model_filename[0]])
  1095. mmproj_filename = get_mmproj_filename(model)
  1096. mmproj_filename = filter_filename(mmproj_filename, repo_files)
  1097. if mmproj_filename:
  1098. args.extend([mmproj_arg, mmproj_filename[0]])
  1099. return args
  1100. elif model.source == SourceEnum.LOCAL_PATH:
  1101. return ["--path", model.local_path]
  1102. except asyncio.TimeoutError:
  1103. raise Exception(
  1104. f"Timeout when getting the file for model {model.name or model.readable_source}"
  1105. )
  1106. except Exception as e:
  1107. raise Exception(
  1108. f"Failed to get the file for model {model.name or model.readable_source}, error: {e}"
  1109. )
  1110. def read_model_index_json(path: str) -> dict:
  1111. """
  1112. Read and parse model_index.json from local directory.
  1113. Args:
  1114. path: Directory path containing model_index.json
  1115. Returns:
  1116. Parsed JSON data from model_index.json, or None if not found
  1117. Raises:
  1118. json.JSONDecodeError: If model_index.json is invalid
  1119. PermissionError: If permission denied
  1120. OSError: For other I/O errors
  1121. """
  1122. model_index_path = os.path.join(path, "model_index.json")
  1123. if not os.path.exists(model_index_path):
  1124. return None
  1125. try:
  1126. with open(model_index_path, 'r', encoding='utf-8') as f:
  1127. return json.load(f)
  1128. except PermissionError:
  1129. logger.error(f"Permission denied reading model_index.json: {model_index_path}")
  1130. raise
  1131. except json.JSONDecodeError as e:
  1132. logger.error(f"Failed to parse model_index.json: {e}")
  1133. raise
  1134. except OSError as e:
  1135. logger.error(f"Failed to read model_index.json: {e}")
  1136. raise
  1137. def calculate_llm_model_weight_size(path: str) -> int:
  1138. """
  1139. Calculate total size of LLM model weights in root directory.
  1140. Args:
  1141. path: Directory path to scan
  1142. Returns:
  1143. Total size in bytes of weight files
  1144. Raises:
  1145. FileNotFoundError: If path doesn't exist
  1146. NotADirectoryError: If path is not a directory
  1147. PermissionError: If permission denied
  1148. """
  1149. if not os.path.exists(path):
  1150. raise FileNotFoundError(f"The specified path '{path}' does not exist.")
  1151. if not os.path.isdir(path):
  1152. raise NotADirectoryError(f"The specified path '{path}' is not a directory.")
  1153. weight_file_extensions = (".safetensors", ".bin", ".pt", ".pth")
  1154. total_size = 0
  1155. try:
  1156. with os.scandir(path) as it:
  1157. for entry in it:
  1158. if entry.is_file() and entry.name.endswith(weight_file_extensions):
  1159. total_size += entry.stat().st_size
  1160. except PermissionError:
  1161. logger.error(f"Permission denied when accessing '{path}'.")
  1162. raise
  1163. return total_size
  1164. def calculate_diffusion_model_weight_size(path: str) -> int:
  1165. """
  1166. Calculate total size of diffusion model weights.
  1167. Logic:
  1168. 1. Read model_index.json to get pipeline components
  1169. 2. Scan subdirectories defined in pipeline
  1170. 3. Sum up weight files (.safetensors, .bin, .pt, .pth)
  1171. Args:
  1172. path: Directory path containing model_index.json
  1173. Returns:
  1174. Total size in bytes of weight files
  1175. Raises:
  1176. FileNotFoundError: If model_index.json not found or path doesn't exist
  1177. NotADirectoryError: If path is not a directory
  1178. PermissionError: If permission denied
  1179. json.JSONDecodeError: If model_index.json is invalid
  1180. """
  1181. if not os.path.exists(path):
  1182. raise FileNotFoundError(f"The specified path '{path}' does not exist.")
  1183. if not os.path.isdir(path):
  1184. raise NotADirectoryError(f"The specified path '{path}' is not a directory.")
  1185. weight_file_extensions = (".safetensors", ".bin", ".pt", ".pth")
  1186. # Read pipeline definition
  1187. pipeline_data = read_model_index_json(path)
  1188. if pipeline_data is None:
  1189. raise FileNotFoundError(f"model_index.json not found in {path}")
  1190. if not isinstance(pipeline_data, dict):
  1191. raise TypeError(f"model_index.json in {path} is not a valid JSON object.")
  1192. # Remove metadata keys (starting with _)
  1193. component_dirs = {key for key in pipeline_data.keys() if not key.startswith('_')}
  1194. total_size = 0
  1195. # Scan each component directory
  1196. for component_dir in component_dirs:
  1197. component_path = os.path.join(path, component_dir)
  1198. if not os.path.isdir(component_path):
  1199. continue
  1200. # Scan files in component directory
  1201. try:
  1202. with os.scandir(component_path) as entries:
  1203. for entry in entries:
  1204. if entry.is_file() and entry.name.endswith(weight_file_extensions):
  1205. total_size += entry.stat().st_size
  1206. except PermissionError:
  1207. logger.error(f"Permission denied scanning directory: {component_path}")
  1208. raise
  1209. except OSError as e:
  1210. logger.error(f"Error scanning directory {component_path}: {e}")
  1211. raise
  1212. return total_size
  1213. def calculate_local_model_weight_size(path: str, is_diffusion: bool = False) -> int:
  1214. """
  1215. Calculate model weight size based on model type.
  1216. Unified entry point for calculating model weight sizes.
  1217. Args:
  1218. path: Directory path to scan
  1219. is_diffusion: Whether this is a diffusion model (default: False)
  1220. Returns:
  1221. Total size in bytes of weight files
  1222. Raises:
  1223. FileNotFoundError: If path doesn't exist
  1224. NotADirectoryError: If path is not a directory
  1225. PermissionError: If permission denied
  1226. json.JSONDecodeError: If model_index.json is invalid (diffusion only)
  1227. """
  1228. if is_diffusion:
  1229. return calculate_diffusion_model_weight_size(path)
  1230. else:
  1231. return calculate_llm_model_weight_size(path)
  1232. def hf_model_filename(
  1233. repo_id: str, filename: Optional[str] = None, token: Optional[str] = None
  1234. ) -> Optional[str]:
  1235. if filename is None:
  1236. return None
  1237. else:
  1238. matching_files = match_hugging_face_files(repo_id, filename, None, token)
  1239. if len(matching_files) == 0:
  1240. raise ValueError(f"File {filename} not found in {repo_id}")
  1241. return matching_files[0]
  1242. def hf_mmproj_filename(model: Model, token: Optional[str] = None) -> Optional[str]:
  1243. mmproj_filename = get_mmproj_filename(model)
  1244. matching_files = match_hugging_face_files(
  1245. model.huggingface_repo_id, mmproj_filename, None, token
  1246. )
  1247. if len(matching_files) == 0:
  1248. return None
  1249. matching_files = sorted(matching_files, reverse=True)
  1250. return matching_files[0]
  1251. def model_scope_file_path(model_id: str, file_path: str) -> str:
  1252. file_paths = match_model_scope_file_paths(model_id, file_path)
  1253. if len(file_paths) == 0:
  1254. raise ValueError(f"File {file_path} not found in {model_id}")
  1255. return file_paths[0]
  1256. def model_scope_mmproj_file_path(model: Model) -> Optional[str]:
  1257. mmproj_filename = get_mmproj_filename(model)
  1258. file_paths = match_model_scope_file_paths(
  1259. model.model_scope_model_id, mmproj_filename
  1260. )
  1261. if len(file_paths) == 0:
  1262. return None
  1263. file_paths = sorted(file_paths, reverse=True)
  1264. return file_paths[0]