model-catalog-modelscope.yaml 81 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573
  1. # YAML Variables
  2. .vllm_omni_ascend_stable_version: &vllm_omni_ascend_stable_version "0.14.1"
  3. .vllm_omni_stable_version: &vllm_omni_stable_version "0.16.0"
  4. draft_models:
  5. - name: Qwen3-8B-EAGLE3
  6. algorithm: eagle3
  7. source: model_scope
  8. model_scope_model_id: gpustack/qwen3_8b_eagle3
  9. - name: Qwen3-30B-A3B-EAGLE3
  10. algorithm: eagle3
  11. source: model_scope
  12. model_scope_model_id: gpustack/qwen3_30b_moe_eagle3
  13. - name: Qwen3-235B-A22B-EAGLE3
  14. algorithm: eagle3
  15. source: model_scope
  16. model_scope_model_id: gpustack/Qwen3-235B-A22B-EAGLE3
  17. - name: gpt-oss-120b-EAGLE3
  18. algorithm: eagle3
  19. source: model_scope
  20. model_scope_model_id: gpustack/EAGLE3-gpt-oss-120b-bf16
  21. model_sets:
  22. - name: Qwen3-0.6B
  23. description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  24. home: https://qwenlm.github.io
  25. icon: /static/catalog_icons/qwen.png
  26. size: 0.6
  27. categories:
  28. - llm
  29. capabilities:
  30. - context/128K
  31. - tools
  32. licenses:
  33. - apache-2.0
  34. release_date: "2025-04-19"
  35. specs:
  36. # Ascend NPUs
  37. - mode: throughput
  38. quantization: BF16
  39. gpu_filters:
  40. vendor: ascend
  41. source: model_scope
  42. model_scope_model_id: Qwen/Qwen3-0.6B
  43. backend: MindIE
  44. backend_parameters:
  45. - --max-seq-len=8192
  46. # Other GPUs
  47. - mode: standard
  48. quantization: BF16
  49. source: model_scope
  50. model_scope_model_id: Qwen/Qwen3-0.6B
  51. backend: vLLM
  52. backend_parameters:
  53. - --reasoning-parser=deepseek_r1
  54. - --max-model-len=8192
  55. - name: Qwen3-8B
  56. description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  57. home: https://qwenlm.github.io
  58. icon: /static/catalog_icons/qwen.png
  59. size: 8
  60. categories:
  61. - llm
  62. capabilities:
  63. - context/128K
  64. - tools
  65. licenses:
  66. - apache-2.0
  67. release_date: "2025-04-19"
  68. specs:
  69. # Ascend NPUs
  70. - mode: throughput
  71. quantization: W8A8
  72. gpu_filters:
  73. vendor: ascend
  74. vendor_variant: "910b"
  75. source: model_scope
  76. model_scope_model_id: vllm-ascend/Qwen3-8B-W8A8
  77. backend: MindIE
  78. backend_parameters:
  79. - --enable-prefix-caching
  80. - --max-seq-len=32768
  81. - mode: standard
  82. quantization: BF16
  83. gpu_filters:
  84. vendor: ascend
  85. source: model_scope
  86. model_scope_model_id: Qwen/Qwen3-8B
  87. backend: MindIE
  88. backend_parameters:
  89. - --max-seq-len=32768
  90. # Other GPUs
  91. - mode: throughput
  92. quantization: FP8
  93. source: model_scope
  94. model_scope_model_id: Qwen/Qwen3-8B-FP8
  95. backend: vLLM
  96. backend_parameters:
  97. - --reasoning-parser=deepseek_r1
  98. - --max-model-len=32768
  99. - mode: standard
  100. quantization: BF16
  101. source: model_scope
  102. model_scope_model_id: Qwen/Qwen3-8B
  103. backend: vLLM
  104. backend_parameters:
  105. - --reasoning-parser=deepseek_r1
  106. - --max-model-len=32768
  107. - name: Falcon-H1R-7B
  108. description: Falcon-H1R-7B is a reasoning-specialized language model built on top of Falcon-H1-7B-Base, featuring a Hybrid-Head Language Model (Transformer-SSM) architecture that delivers outstanding performance in mathematics, programming, and instruction following.
  109. home: https://huggingface.co/tiiuae
  110. icon: /static/catalog_icons/tii.png
  111. size: 7
  112. categories:
  113. - llm
  114. capabilities:
  115. - context/256K
  116. licenses:
  117. - falcon-llm-license
  118. release_date: "2026-01-05"
  119. specs:
  120. - mode: standard
  121. quantization: BF16
  122. source: model_scope
  123. model_scope_model_id: tiiuae/Falcon-H1R-7B
  124. backend: vLLM
  125. backend_parameters:
  126. - --reasoning-parser=deepseek_r1
  127. - --max-model-len=65536
  128. - name: Qwen3-14B
  129. description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  130. home: https://qwenlm.github.io
  131. icon: /static/catalog_icons/qwen.png
  132. size: 14
  133. categories:
  134. - llm
  135. capabilities:
  136. - context/128K
  137. - tools
  138. licenses:
  139. - apache-2.0
  140. release_date: "2025-04-19"
  141. specs:
  142. # Ascend NPUs
  143. - mode: throughput
  144. quantization: BF16
  145. gpu_filters:
  146. vendor: ascend
  147. source: model_scope
  148. model_scope_model_id: Qwen/Qwen3-14B
  149. backend: MindIE
  150. backend_parameters:
  151. - --max-seq-len=32768
  152. # Other GPUs
  153. - mode: throughput
  154. quantization: FP8
  155. gpu_filters:
  156. vendor: nvidia
  157. compute_capability: ">=9.0" # Hopper or later
  158. source: model_scope
  159. model_scope_model_id: Qwen/Qwen3-14B-FP8
  160. backend: SGLang
  161. backend_parameters:
  162. - --reasoning-parser=qwen3
  163. - --context-length=32768
  164. - mode: throughput
  165. quantization: FP8
  166. gpu_filters:
  167. vendor: nvidia
  168. compute_capability: "<9.0" # Before Hopper
  169. source: model_scope
  170. model_scope_model_id: Qwen/Qwen3-14B-FP8
  171. backend: vLLM
  172. backend_parameters:
  173. - --reasoning-parser=deepseek_r1
  174. - --max-model-len=32768
  175. - mode: standard
  176. quantization: BF16
  177. source: model_scope
  178. model_scope_model_id: Qwen/Qwen3-14B
  179. backend: vLLM
  180. backend_parameters:
  181. - --reasoning-parser=deepseek_r1
  182. - --max-model-len=32768
  183. - name: Qwen3-32B
  184. description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  185. home: https://qwenlm.github.io
  186. icon: /static/catalog_icons/qwen.png
  187. size: 32
  188. categories:
  189. - llm
  190. capabilities:
  191. - context/128K
  192. - tools
  193. licenses:
  194. - apache-2.0
  195. release_date: "2025-04-19"
  196. specs:
  197. # Ascend NPUs
  198. - mode: throughput
  199. quantization: W8A8
  200. gpu_filters:
  201. vendor: ascend
  202. vendor_variant: "910b"
  203. source: model_scope
  204. model_scope_model_id: vllm-ascend/Qwen3-32B-W8A8
  205. backend: MindIE
  206. backend_parameters:
  207. - --enable-prefix-caching
  208. - --max-seq-len=32768
  209. - mode: standard
  210. quantization: BF16
  211. gpu_filters:
  212. vendor: ascend
  213. source: model_scope
  214. model_scope_model_id: Qwen/Qwen3-32B
  215. backend: MindIE
  216. backend_parameters:
  217. - --max-seq-len=32768
  218. # Other GPUs
  219. - mode: throughput
  220. quantization: FP8
  221. source: model_scope
  222. model_scope_model_id: Qwen/Qwen3-32B-FP8
  223. backend: vLLM
  224. backend_parameters:
  225. - --reasoning-parser=deepseek_r1
  226. - --max-model-len=32768
  227. - mode: standard
  228. quantization: BF16
  229. source: model_scope
  230. model_scope_model_id: Qwen/Qwen3-32B
  231. backend: vLLM
  232. backend_parameters:
  233. - --reasoning-parser=deepseek_r1
  234. - --max-model-len=32768
  235. - name: Qwen3-Coder-Next
  236. description: Qwen3-Coder-Next is a super-efficient coding model with 80B total parameters and 3B activated parameters (MoE architecture). It achieves performance comparable to models with 10-20x more active parameters, excelling at long-horizon reasoning, complex tool usage, and IDE integration.
  237. home: https://qwenlm.github.io
  238. icon: /static/catalog_icons/qwen.png
  239. size: 80
  240. activated_size: 3
  241. categories:
  242. - llm
  243. capabilities:
  244. - context/256K
  245. - tools
  246. licenses:
  247. - apache-2.0
  248. release_date: "2026-02-03"
  249. specs:
  250. - mode: throughput
  251. quantization: FP8
  252. source: model_scope
  253. model_scope_model_id: Qwen/Qwen3-Coder-Next-FP8
  254. backend: vLLM
  255. backend_parameters:
  256. - --max-model-len=65536
  257. - --enable-auto-tool-choice
  258. - --tool-call-parser=qwen3_coder
  259. - mode: standard
  260. quantization: BF16
  261. source: model_scope
  262. model_scope_model_id: Qwen/Qwen3-Coder-Next
  263. backend: vLLM
  264. backend_parameters:
  265. - --max-model-len=65536
  266. - --enable-auto-tool-choice
  267. - --tool-call-parser=qwen3_coder
  268. - name: Qwen3-30B-A3B-Instruct-2507
  269. description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  270. home: https://qwenlm.github.io
  271. icon: /static/catalog_icons/qwen.png
  272. size: 30
  273. activated_size: 3
  274. categories:
  275. - llm
  276. capabilities:
  277. - context/256K
  278. - tools
  279. licenses:
  280. - apache-2.0
  281. release_date: "2025-07-21"
  282. specs:
  283. # Ascend NPUs
  284. - mode: throughput
  285. quantization: BF16
  286. gpu_filters:
  287. vendor: ascend
  288. source: model_scope
  289. model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507
  290. backend: MindIE
  291. backend_parameters:
  292. - --max-seq-len=32768
  293. # Other GPUs
  294. - mode: throughput
  295. quantization: FP8
  296. gpu_filters:
  297. vendor: nvidia
  298. compute_capability: ">=9.0" # Hopper or later
  299. source: model_scope
  300. model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
  301. backend: SGLang
  302. backend_parameters:
  303. - --tool-call-parser=qwen25
  304. - --context-length=32768
  305. - mode: throughput
  306. quantization: FP8
  307. gpu_filters:
  308. vendor: nvidia
  309. compute_capability: "<9.0" # Before Hopper
  310. source: model_scope
  311. model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
  312. backend: vLLM
  313. backend_parameters:
  314. - --tool-call-parser=hermes
  315. - --enable-auto-tool-choice
  316. - --max-model-len=32768
  317. - mode: standard
  318. quantization: BF16
  319. source: model_scope
  320. model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507
  321. backend: vLLM
  322. backend_parameters:
  323. - --tool-call-parser=hermes
  324. - --enable-auto-tool-choice
  325. - --max-model-len=32768
  326. - name: Qwen3-30B-A3B-Thinking-2507
  327. description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  328. home: https://qwenlm.github.io
  329. icon: /static/catalog_icons/qwen.png
  330. size: 30
  331. activated_size: 3
  332. categories:
  333. - llm
  334. capabilities:
  335. - context/256K
  336. - tools
  337. licenses:
  338. - apache-2.0
  339. release_date: "2025-07-21"
  340. specs:
  341. # Ascend NPUs
  342. - mode: throughput
  343. quantization: BF16
  344. gpu_filters:
  345. vendor: ascend
  346. source: model_scope
  347. model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507
  348. backend: MindIE
  349. backend_parameters:
  350. - --max-seq-len=32768
  351. # Other GPUs
  352. - mode: throughput
  353. quantization: FP8
  354. gpu_filters:
  355. vendor: nvidia
  356. compute_capability: ">=9.0" # Hopper or later
  357. source: model_scope
  358. model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
  359. backend: SGLang
  360. backend_parameters:
  361. - --reasoning-parser=deepseek-r1
  362. - --tool-call-parser=qwen25
  363. - --context-length=32768
  364. - mode: throughput
  365. quantization: FP8
  366. gpu_filters:
  367. vendor: nvidia
  368. compute_capability: "<9.0" # Before Hopper
  369. source: model_scope
  370. model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
  371. backend: vLLM
  372. backend_parameters:
  373. - --reasoning-parser=deepseek_r1
  374. - --tool-call-parser=hermes
  375. - --enable-auto-tool-choice
  376. - --max-model-len=32768
  377. - mode: standard
  378. quantization: BF16
  379. source: model_scope
  380. model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507
  381. backend: vLLM
  382. backend_parameters:
  383. - --reasoning-parser=deepseek_r1
  384. - --tool-call-parser=hermes
  385. - --enable-auto-tool-choice
  386. - --max-model-len=32768
  387. - name: Qwen3-235B-A22B-Instruct-2507
  388. description: The updated version of the Qwen3-235B-A22B non-thinking mode.
  389. home: https://qwenlm.github.io
  390. icon: /static/catalog_icons/qwen.png
  391. size: 235
  392. activated_size: 22
  393. categories:
  394. - llm
  395. capabilities:
  396. - context/1M
  397. - tools
  398. licenses:
  399. - apache-2.0
  400. release_date: "2025-07-21"
  401. specs:
  402. # Ascend NPUs
  403. - mode: throughput
  404. quantization: BF16
  405. gpu_filters:
  406. vendor: ascend
  407. source: model_scope
  408. model_scope_model_id: Qwen/Qwen3-235B-A22B-Instruct-2507
  409. backend: MindIE
  410. backend_parameters:
  411. - --max-seq-len=65536
  412. # Other GPUs
  413. - mode: throughput
  414. quantization: FP8
  415. source: model_scope
  416. model_scope_model_id: Qwen/Qwen3-235B-A22B-Instruct-2507-FP8
  417. backend: vLLM
  418. backend_parameters:
  419. - --tool-call-parser=hermes
  420. - --enable-auto-tool-choice
  421. - --max-model-len=65536
  422. - mode: standard
  423. quantization: BF16
  424. source: model_scope
  425. model_scope_model_id: Qwen/Qwen3-235B-A22B-Instruct-2507
  426. backend: vLLM
  427. backend_parameters:
  428. - --tool-call-parser=hermes
  429. - --enable-auto-tool-choice
  430. - --max-model-len=65536
  431. - name: Qwen3-235B-A22B-Thinking-2507
  432. description: The updated version of the Qwen3-235B-A22B thinking mode.
  433. home: https://qwenlm.github.io
  434. icon: /static/catalog_icons/qwen.png
  435. size: 235
  436. activated_size: 22
  437. categories:
  438. - llm
  439. capabilities:
  440. - context/1M
  441. - tools
  442. licenses:
  443. - apache-2.0
  444. release_date: "2025-07-21"
  445. specs:
  446. # Ascend NPUs
  447. - mode: throughput
  448. quantization: BF16
  449. gpu_filters:
  450. vendor: ascend
  451. source: model_scope
  452. model_scope_model_id: Qwen/Qwen3-235B-A22B-Thinking-2507
  453. backend: MindIE
  454. backend_parameters:
  455. - --max-seq-len=65536
  456. # Other GPUs
  457. - mode: throughput
  458. quantization: FP8
  459. source: model_scope
  460. model_scope_model_id: Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
  461. backend: vLLM
  462. backend_parameters:
  463. - --reasoning-parser=deepseek_r1
  464. - --tool-call-parser=hermes
  465. - --enable-auto-tool-choice
  466. - --max-model-len=65536
  467. - mode: standard
  468. quantization: BF16
  469. source: model_scope
  470. model_scope_model_id: Qwen/Qwen3-235B-A22B-Thinking-2507
  471. backend: vLLM
  472. backend_parameters:
  473. - --reasoning-parser=deepseek_r1
  474. - --tool-call-parser=hermes
  475. - --enable-auto-tool-choice
  476. - --max-model-len=65536
  477. - name: Qwen3.5-0.8B
  478. description: Qwen3.5-0.8B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks.
  479. home: https://qwenlm.github.io
  480. icon: /static/catalog_icons/qwen.png
  481. size: 0.8
  482. categories:
  483. - llm
  484. capabilities:
  485. - context/256K
  486. - reasoning
  487. - tools
  488. - vision
  489. licenses:
  490. - apache-2.0
  491. release_date: "2026-03-02"
  492. specs:
  493. # Ascend NPUs
  494. - mode: standard
  495. quantization: BF16
  496. gpu_filters:
  497. vendor: ascend
  498. source: model_scope
  499. model_scope_model_id: Qwen/Qwen3.5-0.8B
  500. backend: SGLang
  501. backend_version: 0.5.9
  502. backend_parameters:
  503. - --context-length=32768
  504. - --disable-radix-cache
  505. - --chunked-prefill-size=4096
  506. - --max-prefill-tokens=4096
  507. - --max-total-tokens=40960
  508. - mode: standard
  509. quantization: BF16
  510. source: model_scope
  511. model_scope_model_id: Qwen/Qwen3.5-0.8B
  512. backend: vLLM
  513. backend_version: 0.17.1
  514. backend_parameters:
  515. - --reasoning-parser=qwen3
  516. - --max-model-len=32768
  517. - name: Qwen3.5-2B
  518. description: Qwen3.5-2B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks.
  519. home: https://qwenlm.github.io
  520. icon: /static/catalog_icons/qwen.png
  521. size: 2
  522. categories:
  523. - llm
  524. capabilities:
  525. - context/256K
  526. - reasoning
  527. - tools
  528. - vision
  529. licenses:
  530. - apache-2.0
  531. release_date: "2026-03-02"
  532. specs:
  533. # Ascend NPUs
  534. - mode: standard
  535. quantization: BF16
  536. gpu_filters:
  537. vendor: ascend
  538. source: model_scope
  539. model_scope_model_id: Qwen/Qwen3.5-2B
  540. backend: SGLang
  541. backend_version: 0.5.9
  542. backend_parameters:
  543. - --context-length=32768
  544. - --disable-radix-cache
  545. - --chunked-prefill-size=4096
  546. - --max-prefill-tokens=4096
  547. - --max-total-tokens=40960
  548. - mode: standard
  549. quantization: BF16
  550. source: model_scope
  551. model_scope_model_id: Qwen/Qwen3.5-2B
  552. backend: vLLM
  553. backend_version: 0.17.1
  554. backend_parameters:
  555. - --reasoning-parser=qwen3
  556. - --max-model-len=32768
  557. - name: Qwen3.5-4B
  558. description: Qwen3.5-4B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks.
  559. home: https://qwenlm.github.io
  560. icon: /static/catalog_icons/qwen.png
  561. size: 4
  562. categories:
  563. - llm
  564. capabilities:
  565. - context/256K
  566. - reasoning
  567. - tools
  568. - vision
  569. licenses:
  570. - apache-2.0
  571. release_date: "2026-03-02"
  572. specs:
  573. # Ascend NPUs
  574. - mode: standard
  575. quantization: BF16
  576. gpu_filters:
  577. vendor: ascend
  578. source: model_scope
  579. model_scope_model_id: Qwen/Qwen3.5-4B
  580. backend: SGLang
  581. backend_version: 0.5.9
  582. backend_parameters:
  583. - --reasoning-parser=qwen3
  584. - --context-length=32768
  585. - --disable-radix-cache
  586. - --chunked-prefill-size=4096
  587. - --max-prefill-tokens=4096
  588. - --max-total-tokens=40960
  589. - mode: standard
  590. quantization: BF16
  591. source: model_scope
  592. model_scope_model_id: Qwen/Qwen3.5-4B
  593. backend: vLLM
  594. backend_version: 0.17.1
  595. backend_parameters:
  596. - --reasoning-parser=qwen3
  597. - --max-model-len=32768
  598. - name: Qwen3.5-9B
  599. description: Qwen3.5-9B is a model from the Qwen family, designed for strong reasoning, coding, and multilingual understanding with competitive performance across a wide range of tasks.
  600. home: https://qwenlm.github.io
  601. icon: /static/catalog_icons/qwen.png
  602. size: 9
  603. categories:
  604. - llm
  605. capabilities:
  606. - context/256K
  607. - reasoning
  608. - tools
  609. - vision
  610. licenses:
  611. - apache-2.0
  612. release_date: "2026-03-02"
  613. specs:
  614. # Ascend NPUs
  615. - mode: standard
  616. quantization: BF16
  617. gpu_filters:
  618. vendor: ascend
  619. source: model_scope
  620. model_scope_model_id: Qwen/Qwen3.5-9B
  621. backend: SGLang
  622. backend_version: 0.5.9
  623. backend_parameters:
  624. - --reasoning-parser=qwen3
  625. - --context-length=32768
  626. - --disable-radix-cache
  627. - --chunked-prefill-size=4096
  628. - --max-prefill-tokens=4096
  629. - --max-total-tokens=40960
  630. - mode: throughput
  631. quantization: BF16
  632. source: model_scope
  633. model_scope_model_id: Qwen/Qwen3.5-9B
  634. backend: vLLM
  635. backend_version: 0.17.1
  636. backend_parameters:
  637. - --reasoning-parser=qwen3
  638. - --max-model-len=32768
  639. - --performance-mode=throughput
  640. - --enable-prefix-caching
  641. - mode: latency
  642. quantization: BF16
  643. source: model_scope
  644. model_scope_model_id: Qwen/Qwen3.5-9B
  645. backend: vLLM
  646. backend_version: 0.17.1
  647. backend_parameters:
  648. - --reasoning-parser=qwen3
  649. - --max-model-len=32768
  650. - --performance-mode=interactivity
  651. - --language-model-only
  652. speculative_config:
  653. enabled: true
  654. algorithm: mtp
  655. num_draft_tokens: 1
  656. - mode: standard
  657. quantization: BF16
  658. source: model_scope
  659. model_scope_model_id: Qwen/Qwen3.5-9B
  660. backend: vLLM
  661. backend_version: 0.17.1
  662. backend_parameters:
  663. - --reasoning-parser=qwen3
  664. - --max-model-len=32768
  665. - name: Qwen3.5-27B
  666. description: Qwen3.5-27B is a model designed for strong reasoning, coding, and multilingual understanding with competitive performance across a wide range of tasks.
  667. home: https://qwenlm.github.io
  668. icon: /static/catalog_icons/qwen.png
  669. size: 27
  670. categories:
  671. - llm
  672. capabilities:
  673. - context/256K
  674. - reasoning
  675. - tools
  676. - vision
  677. licenses:
  678. - apache-2.0
  679. release_date: "2026-02-24"
  680. specs:
  681. # Ascend NPUs
  682. - mode: standard
  683. quantization: BF16
  684. gpu_filters:
  685. vendor: ascend
  686. source: model_scope
  687. model_scope_model_id: Qwen/Qwen3.5-27B
  688. backend: SGLang
  689. backend_version: 0.5.9
  690. backend_parameters:
  691. - --reasoning-parser=qwen3
  692. - --context-length=32768
  693. - --disable-radix-cache
  694. - --chunked-prefill-size=4096
  695. - --max-prefill-tokens=4096
  696. - --max-total-tokens=40960
  697. - mode: standard
  698. quantization: BF16
  699. source: model_scope
  700. model_scope_model_id: Qwen/Qwen3.5-27B
  701. backend: vLLM
  702. backend_version: 0.17.1
  703. backend_parameters:
  704. - --reasoning-parser=qwen3
  705. - --max-model-len=32768
  706. - mode: throughput
  707. quantization: FP8
  708. gpu_filters:
  709. vendor: nvidia
  710. compute_capability: ">=9.0"
  711. source: model_scope
  712. model_scope_model_id: Qwen/Qwen3.5-27B-FP8
  713. backend: vLLM
  714. backend_version: 0.17.1
  715. backend_parameters:
  716. - --reasoning-parser=qwen3
  717. - --max-model-len=32768
  718. - --performance-mode=throughput
  719. - --enable-prefix-caching
  720. - name: Qwen3.5-35B-A3B
  721. description: Qwen3.5-35B-A3B is a 35-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks.
  722. home: https://qwenlm.github.io
  723. icon: /static/catalog_icons/qwen.png
  724. size: 35
  725. activated_size: 3
  726. categories:
  727. - llm
  728. capabilities:
  729. - context/256K
  730. - reasoning
  731. - tools
  732. - vision
  733. licenses:
  734. - apache-2.0
  735. release_date: "2026-02-24"
  736. specs:
  737. # Ascend NPUs
  738. - mode: standard
  739. quantization: BF16
  740. gpu_filters:
  741. vendor: ascend
  742. source: model_scope
  743. model_scope_model_id: Qwen/Qwen3.5-35B-A3B
  744. backend: SGLang
  745. backend_version: 0.5.9
  746. backend_parameters:
  747. - --reasoning-parser=qwen3
  748. - --context-length=32768
  749. - --disable-radix-cache
  750. - --chunked-prefill-size=4096
  751. - --max-prefill-tokens=4096
  752. - --max-total-tokens=40960
  753. - mode: standard
  754. quantization: BF16
  755. source: model_scope
  756. model_scope_model_id: Qwen/Qwen3.5-35B-A3B
  757. backend: vLLM
  758. backend_version: 0.17.1
  759. backend_parameters:
  760. - --reasoning-parser=qwen3
  761. - --max-model-len=32768
  762. - mode: throughput
  763. quantization: FP8
  764. gpu_filters:
  765. vendor: nvidia
  766. compute_capability: ">=9.0"
  767. source: model_scope
  768. model_scope_model_id: Qwen/Qwen3.5-35B-A3B-FP8
  769. backend: vLLM
  770. backend_version: 0.17.1
  771. backend_parameters:
  772. - --reasoning-parser=qwen3
  773. - --max-model-len=32768
  774. - --performance-mode=throughput
  775. - --enable-prefix-caching
  776. - mode: latency
  777. quantization: FP8
  778. gpu_filters:
  779. vendor: nvidia
  780. compute_capability: ">=9.0"
  781. source: model_scope
  782. model_scope_model_id: Qwen/Qwen3.5-35B-A3B-FP8
  783. backend: vLLM
  784. backend_version: 0.17.1
  785. backend_parameters:
  786. - --reasoning-parser=qwen3
  787. - --max-model-len=32768
  788. speculative_config:
  789. enabled: true
  790. algorithm: mtp
  791. num_draft_tokens: 1
  792. - name: Qwen3.5-122B-A10B
  793. description: Qwen3.5-122B-A10B is a 122-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks.
  794. home: https://qwenlm.github.io
  795. icon: /static/catalog_icons/qwen.png
  796. size: 122
  797. activated_size: 10
  798. categories:
  799. - llm
  800. capabilities:
  801. - context/256K
  802. - reasoning
  803. - tools
  804. - vision
  805. licenses:
  806. - apache-2.0
  807. release_date: "2026-02-24"
  808. specs:
  809. # Ascend NPUs
  810. - mode: standard
  811. quantization: BF16
  812. gpu_filters:
  813. vendor: ascend
  814. source: model_scope
  815. model_scope_model_id: Qwen/Qwen3.5-122B-A10B
  816. backend: SGLang
  817. backend_version: 0.5.9
  818. backend_parameters:
  819. - --reasoning-parser=qwen3
  820. - --context-length=32768
  821. - --disable-radix-cache
  822. - --chunked-prefill-size=4096
  823. - --max-prefill-tokens=4096
  824. - --max-total-tokens=40960
  825. - mode: standard
  826. quantization: BF16
  827. source: model_scope
  828. model_scope_model_id: Qwen/Qwen3.5-122B-A10B
  829. backend: vLLM
  830. backend_version: 0.17.1
  831. backend_parameters:
  832. - --reasoning-parser=qwen3
  833. - --max-model-len=32768
  834. - mode: throughput
  835. quantization: FP8
  836. gpu_filters:
  837. vendor: nvidia
  838. compute_capability: ">=9.0"
  839. source: model_scope
  840. model_scope_model_id: Qwen/Qwen3.5-122B-A10B-FP8
  841. backend: vLLM
  842. backend_version: 0.17.1
  843. backend_parameters:
  844. - --reasoning-parser=qwen3
  845. - --max-model-len=32768
  846. - --performance-mode=throughput
  847. - --enable-prefix-caching
  848. - name: Qwen3.5-397B-A17B
  849. description: Qwen3.5-397B-A17B is a flagship MoE-hybrid model that delivers state-of-the-art reasoning and multimodal performance with ultra-efficient inference capabilities.
  850. home: https://qwenlm.github.io
  851. icon: /static/catalog_icons/qwen.png
  852. size: 397
  853. activated_size: 17
  854. categories:
  855. - llm
  856. capabilities:
  857. - context/256K
  858. - reasoning
  859. - tools
  860. - vision
  861. licenses:
  862. - apache-2.0
  863. release_date: "2026-02-16"
  864. specs:
  865. # Ascend NPUs
  866. - mode: standard
  867. quantization: BF16
  868. gpu_filters:
  869. vendor: ascend
  870. source: model_scope
  871. model_scope_model_id: Qwen/Qwen3.5-397B-A17B
  872. backend: SGLang
  873. backend_version: 0.5.9
  874. backend_parameters:
  875. - --reasoning-parser=qwen3
  876. - --context-length=32768
  877. - --disable-radix-cache
  878. - --chunked-prefill-size=4096
  879. - --max-prefill-tokens=4096
  880. - --max-total-tokens=40960
  881. - mode: standard
  882. quantization: BF16
  883. source: model_scope
  884. model_scope_model_id: Qwen/Qwen3.5-397B-A17B
  885. backend: vLLM
  886. backend_version: 0.17.1
  887. backend_parameters:
  888. - --reasoning-parser=qwen3
  889. - --max-model-len=32768
  890. - mode: throughput
  891. quantization: FP8
  892. gpu_filters:
  893. vendor: nvidia
  894. compute_capability: ">=9.0"
  895. source: model_scope
  896. model_scope_model_id: Qwen/Qwen3.5-397B-A17B-FP8
  897. backend: vLLM
  898. backend_version: 0.17.1
  899. backend_parameters:
  900. - --reasoning-parser=qwen3
  901. - --max-model-len=32768
  902. - --performance-mode=throughput
  903. - --enable-prefix-caching
  904. - name: GLM-4.7
  905. description: GLM-4.7 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities.
  906. home: https://z.ai
  907. icon: /static/catalog_icons/zai.png
  908. size: 355
  909. activated_size: 32
  910. categories:
  911. - llm
  912. capabilities:
  913. - context/200K
  914. - reasoning
  915. - tools
  916. licenses:
  917. - mit
  918. release_date: "2025-12-22"
  919. specs:
  920. # TODO: tool-call-parser glm47 not yet available in the latest vLLM/SGLang release
  921. - mode: throughput
  922. quantization: FP8
  923. gpu_filters:
  924. vendor: nvidia
  925. compute_capability: ">=9.0" # Hopper or later
  926. source: model_scope
  927. model_scope_model_id: ZhipuAI/GLM-4.7-FP8
  928. backend: SGLang
  929. backend_parameters:
  930. - --reasoning-parser=glm45
  931. - --context-length=65536
  932. - mode: throughput
  933. quantization: FP8
  934. gpu_filters:
  935. vendor: nvidia
  936. compute_capability: "<9.0" # Before Hopper
  937. source: model_scope
  938. model_scope_model_id: ZhipuAI/GLM-4.7-FP8
  939. backend: vLLM
  940. backend_parameters:
  941. - --reasoning-parser=glm45
  942. - --max-model-len=65536
  943. - mode: standard
  944. quantization: BF16
  945. source: model_scope
  946. model_scope_model_id: ZhipuAI/GLM-4.7
  947. backend: vLLM
  948. backend_parameters:
  949. - --reasoning-parser=glm45
  950. - --max-model-len=65536
  951. - name: GLM-4.6
  952. description: GLM-4.6 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities.
  953. home: https://z.ai
  954. icon: /static/catalog_icons/zai.png
  955. size: 355
  956. activated_size: 32
  957. categories:
  958. - llm
  959. capabilities:
  960. - context/200K
  961. - reasoning
  962. - tools
  963. licenses:
  964. - mit
  965. release_date: "2025-09-30"
  966. specs:
  967. - mode: throughput
  968. quantization: FP8
  969. gpu_filters:
  970. vendor: nvidia
  971. compute_capability: ">=9.0" # Hopper or later
  972. source: model_scope
  973. model_scope_model_id: ZhipuAI/GLM-4.6-FP8
  974. backend: SGLang
  975. backend_parameters:
  976. - --tool-call-parser=glm
  977. - --reasoning-parser=glm45
  978. - --context-length=65536
  979. - mode: throughput
  980. quantization: FP8
  981. gpu_filters:
  982. vendor: nvidia
  983. compute_capability: "<9.0" # Before Hopper
  984. source: model_scope
  985. model_scope_model_id: ZhipuAI/GLM-4.6-FP8
  986. backend: vLLM
  987. backend_parameters:
  988. - --reasoning-parser=glm45
  989. - --tool-call-parser=glm45
  990. - --enable-auto-tool-choice
  991. - --max-model-len=65536
  992. - mode: standard
  993. quantization: BF16
  994. source: model_scope
  995. model_scope_model_id: ZhipuAI/GLM-4.6
  996. backend: vLLM
  997. backend_parameters:
  998. - --reasoning-parser=glm45
  999. - --tool-call-parser=glm45
  1000. - --enable-auto-tool-choice
  1001. - --max-model-len=65536
  1002. - name: gpt-oss-120b
  1003. description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases.
  1004. home: https://openai.com
  1005. icon: /static/catalog_icons/openai.png
  1006. categories:
  1007. - llm
  1008. capabilities:
  1009. - context/128K
  1010. size: 120
  1011. licenses:
  1012. - apache-2.0
  1013. release_date: "2025-08-05"
  1014. specs:
  1015. - mode: throughput
  1016. quantization: "MXFP4"
  1017. source: model_scope
  1018. model_scope_model_id: openai-mirror/gpt-oss-120b
  1019. backend: vLLM
  1020. backend_parameters:
  1021. - --max-model-len=32768
  1022. - --tool-call-parser=openai
  1023. - --enable-auto-tool-choice
  1024. - --async-scheduling
  1025. - mode: standard
  1026. quantization: "MXFP4"
  1027. source: model_scope
  1028. model_scope_model_id: openai-mirror/gpt-oss-120b
  1029. backend: vLLM
  1030. backend_parameters:
  1031. - --max-model-len=32768
  1032. - --tool-call-parser=openai
  1033. - --enable-auto-tool-choice
  1034. - name: gpt-oss-20b
  1035. description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases.
  1036. home: https://openai.com
  1037. icon: /static/catalog_icons/openai.png
  1038. categories:
  1039. - llm
  1040. capabilities:
  1041. - context/128K
  1042. size: 20
  1043. licenses:
  1044. - apache-2.0
  1045. release_date: "2025-08-05"
  1046. specs:
  1047. - mode: throughput
  1048. quantization: "MXFP4"
  1049. source: model_scope
  1050. model_scope_model_id: openai-mirror/gpt-oss-20b
  1051. backend: vLLM
  1052. backend_parameters:
  1053. - --max-model-len=32768
  1054. - --tool-call-parser=openai
  1055. - --enable-auto-tool-choice
  1056. - --async-scheduling
  1057. - mode: standard
  1058. quantization: "MXFP4"
  1059. source: model_scope
  1060. model_scope_model_id: openai-mirror/gpt-oss-20b
  1061. backend: vLLM
  1062. backend_parameters:
  1063. - --max-model-len=32768
  1064. - --tool-call-parser=openai
  1065. - --enable-auto-tool-choice
  1066. - name: Deepseek-R1-0528
  1067. description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro.
  1068. home: https://www.deepseek.com
  1069. icon: /static/catalog_icons/deepseek.png
  1070. categories:
  1071. - llm
  1072. capabilities:
  1073. - context/128K
  1074. size: 671
  1075. licenses:
  1076. - mit
  1077. release_date: "2025-05-28"
  1078. specs:
  1079. - mode: throughput
  1080. quantization: W8A8
  1081. gpu_filters:
  1082. vendor: ascend
  1083. vendor_variant: "910b"
  1084. source: model_scope
  1085. model_scope_model_id: gpustack/DeepSeek-R1-0528-w8a8
  1086. backend: MindIE
  1087. backend_parameters:
  1088. - --max-seq-len=32768
  1089. - --npu-memory-fraction=0.95
  1090. - mode: throughput
  1091. quantization: FP8
  1092. gpu_filters:
  1093. vendor: nvidia
  1094. compute_capability: ">=9.0" # Hopper or later
  1095. source: model_scope
  1096. model_scope_model_id: deepseek-ai/DeepSeek-R1-0528
  1097. backend: SGLang
  1098. backend_parameters:
  1099. - --enable-dp-attention
  1100. - --context-length=32768
  1101. - mode: standard
  1102. quantization: FP8
  1103. source: model_scope
  1104. model_scope_model_id: deepseek-ai/DeepSeek-R1-0528
  1105. backend: vLLM
  1106. backend_parameters:
  1107. - --max-model-len=32768
  1108. - name: DeepSeek-OCR
  1109. description: DeepSeek-OCR is an advanced optical character recognition (OCR) model developed by DeepSeek AI. It is designed to accurately extract text from images and scanned documents.
  1110. home: https://www.deepseek.com
  1111. icon: /static/catalog_icons/deepseek.png
  1112. size: 3
  1113. categories:
  1114. - llm
  1115. licenses:
  1116. - mit
  1117. release_date: "2025-10-20"
  1118. specs:
  1119. - mode: standard
  1120. quantization: "BF16"
  1121. gpu_filters:
  1122. vendor:
  1123. - nvidia
  1124. - amd
  1125. source: model_scope
  1126. model_scope_model_id: deepseek-ai/DeepSeek-OCR
  1127. backend: vLLM
  1128. backend_version: 0.11.2
  1129. backend_parameters:
  1130. - --logits_processors=vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor
  1131. - --no-enable-prefix-caching
  1132. - --mm-processor-cache-gb=0
  1133. - name: PaddleOCR-VL-1.5
  1134. description: PaddleOCR-VL-1.5 is an advanced optical character recognition (OCR) vision-language model developed by PaddlePaddle. It is designed to accurately extract and understand text from images and documents.
  1135. home: https://www.paddleocr.com
  1136. icon: /static/catalog_icons/paddlepaddle.jpeg
  1137. size: 0.9
  1138. categories:
  1139. - llm
  1140. capabilities:
  1141. - vision
  1142. licenses:
  1143. - apache-2.0
  1144. release_date: "2026-01-29"
  1145. specs:
  1146. - mode: standard
  1147. quantization: "BF16"
  1148. source: model_scope
  1149. model_scope_model_id: PaddlePaddle/PaddleOCR-VL-1.5
  1150. backend: vLLM
  1151. backend_parameters:
  1152. - --trust-remote-code
  1153. - --max-num-batched-tokens=16384
  1154. - --no-enable-prefix-caching
  1155. - --mm-processor-cache-gb=0
  1156. - name: LightOnOCR-2-1B
  1157. description: LightOnOCR-2-1B is an efficient end-to-end vision-language model for optical character recognition (OCR), converting documents (PDFs, scans, images) into clean, naturally ordered text. It achieves state-of-the-art performance on OlmOCR-Bench while being significantly faster and more cost-effective than competitors.
  1158. home: https://www.lighton.ai
  1159. icon: /static/catalog_icons/lighton.png
  1160. size: 1
  1161. categories:
  1162. - llm
  1163. capabilities:
  1164. - vision
  1165. licenses:
  1166. - apache-2.0
  1167. release_date: "2026-01-19"
  1168. specs:
  1169. - mode: standard
  1170. quantization: "BF16"
  1171. source: model_scope
  1172. model_scope_model_id: lightonai/LightOnOCR-2-1B
  1173. backend: vLLM
  1174. backend_parameters:
  1175. - '--limit-mm-per-prompt={"image": 1}'
  1176. - --mm-processor-cache-gb=0
  1177. - --no-enable-prefix-caching
  1178. - name: Deepseek-V3.2
  1179. description: 'DeepSeek-V3.2 is a model that balances computational efficiency with strong reasoning and agent capabilities through three technical innovations: DeepSeek Sparse Attention (DSA), a Scalable Reinforcement Learning Framework, and a Large-Scale Agentic Task Synthesis Pipeline.'
  1180. home: https://www.deepseek.com
  1181. icon: /static/catalog_icons/deepseek.png
  1182. categories:
  1183. - llm
  1184. capabilities:
  1185. - context/128K
  1186. size: 685
  1187. licenses:
  1188. - mit
  1189. release_date: "2025-12-01"
  1190. specs:
  1191. - mode: throughput
  1192. quantization: W8A8
  1193. gpu_filters:
  1194. vendor: ascend
  1195. vendor_variant: "910b"
  1196. source: model_scope
  1197. model_scope_model_id: vllm-ascend/DeepSeek-V3.2-W8A8
  1198. backend: vLLM
  1199. backend_version: 0.14.1
  1200. backend_parameters:
  1201. - --max-model-len=65536
  1202. - --gpu-memory-utilization=0.92
  1203. - --no-enable-prefix-caching
  1204. - --trust-remote-code
  1205. - --max-num-seqs=16
  1206. - '--compilation-config={"cudagraph_mode": "FULL_DECODE_ONLY"}'
  1207. - --tensor-parallel-size=8
  1208. - --data-parallel-size=2
  1209. - --data-parallel-size-local=1
  1210. - --enable-expert-parallel
  1211. - --quantization=ascend
  1212. - --tokenizer-mode=deepseek_v32
  1213. - mode: throughput
  1214. quantization: FP8
  1215. gpu_filters:
  1216. vendor: nvidia
  1217. compute_capability: ">=9.0" # Hopper or later
  1218. source: model_scope
  1219. model_scope_model_id: deepseek-ai/DeepSeek-V3.2
  1220. backend: SGLang
  1221. backend_version: 0.5.6.post2
  1222. backend_parameters:
  1223. - --enable-dp-attention
  1224. - --context-length=65536
  1225. - --reasoning-parser=deepseek-v3
  1226. - --tool-call-parser=deepseek_v32
  1227. - --chat-template={data_dir}/chat_templates/tool_chat_template_deepseekv32.jinja
  1228. - mode: standard
  1229. quantization: FP8
  1230. source: model_scope
  1231. model_scope_model_id: deepseek-ai/DeepSeek-V3.2
  1232. backend: vLLM
  1233. backend_version: 0.13.0
  1234. backend_parameters:
  1235. - --max-model-len=65536
  1236. - --tokenizer-mode=deepseek_v32
  1237. - --reasoning-parser=deepseek_v3
  1238. - --tool-call-parser=deepseek_v32
  1239. - --enable-auto-tool-choice
  1240. - name: Deepseek-V3.2-Speciale
  1241. description: This model is the high-compute variant of DeepSeek-V3.2. It surpasses GPT-5 and matches Gemini-3.0-Pro in reasoning, achieving gold-medal-level performance in the 2025 IMO and IOI competitions.
  1242. home: https://www.deepseek.com
  1243. icon: /static/catalog_icons/deepseek.png
  1244. categories:
  1245. - llm
  1246. capabilities:
  1247. - context/128K
  1248. size: 685
  1249. licenses:
  1250. - mit
  1251. release_date: "2025-12-01"
  1252. specs:
  1253. - mode: throughput
  1254. quantization: FP8
  1255. gpu_filters:
  1256. vendor: nvidia
  1257. compute_capability: ">=9.0" # Hopper or later
  1258. source: model_scope
  1259. model_scope_model_id: deepseek-ai/DeepSeek-V3.2-Speciale
  1260. backend: SGLang
  1261. backend_version: 0.5.6.post2
  1262. backend_parameters:
  1263. - --enable-dp-attention
  1264. - --context-length=65536
  1265. - --reasoning-parser=deepseek-v3
  1266. - mode: standard
  1267. quantization: FP8
  1268. source: model_scope
  1269. model_scope_model_id: deepseek-ai/DeepSeek-V3.2-Speciale
  1270. backend: vLLM
  1271. backend_version: 0.13.0
  1272. backend_parameters:
  1273. - --max-model-len=65536
  1274. - --tokenizer-mode=deepseek_v32
  1275. - --reasoning-parser=deepseek_v3
  1276. - name: MiniMax-M2.1
  1277. description: MiniMax-M2.1 is a high-performance agentic model, optimized for robustness in coding, tool use, instruction following, and long-horizon planning. It excels in multilingual software development and complex multi-step workflows.
  1278. home: https://www.minimax.io
  1279. icon: /static/catalog_icons/minimax.png
  1280. size: 230
  1281. activated_size: 10
  1282. categories:
  1283. - llm
  1284. capabilities:
  1285. - context/192K
  1286. - tools
  1287. licenses:
  1288. - modified-mit
  1289. release_date: "2025-12-23"
  1290. specs:
  1291. - mode: standard
  1292. quantization: FP8
  1293. source: model_scope
  1294. model_scope_model_id: MiniMax/MiniMax-M2.1
  1295. backend: vLLM
  1296. backend_parameters:
  1297. - --max-model-len=65536
  1298. - --reasoning-parser=minimax_m2_append_think
  1299. - --tool-call-parser=minimax_m2
  1300. - --enable-auto-tool-choice
  1301. - --trust-remote-code
  1302. - name: MiniMax-M2.5
  1303. description: MiniMax-M2.5 is a powerful MoE (Mixture-of-Experts) model that delivers exceptional performance in logical reasoning, coding, and complex agent tasks through highly efficient inference.
  1304. home: https://www.minimax.io/
  1305. icon: /static/catalog_icons/minimax.png
  1306. size: 230
  1307. activated_size: 10
  1308. categories:
  1309. - llm
  1310. capabilities:
  1311. - context/196K
  1312. - reasoning
  1313. - tools
  1314. licenses:
  1315. - modified-mit
  1316. release_date: "2026-02-12"
  1317. specs:
  1318. - mode: standard
  1319. quantization: BF16
  1320. source: model_scope
  1321. model_scope_model_id: MiniMax/MiniMax-M2.5
  1322. backend: vLLM
  1323. backend_parameters:
  1324. - --max-model-len=65536
  1325. - --reasoning-parser=minimax_m2_append_think
  1326. - --tool-call-parser=minimax_m2
  1327. - --enable-auto-tool-choice
  1328. - --trust-remote-code
  1329. - --enable-expert-parallel
  1330. - name: Kimi-K2.5
  1331. description: Kimi-K2.5 is a multimodal mixture-of-experts model with 1T total parameters and 32B activated parameters. It features native INT4 quantization, vision support, dual operating modes (thinking/instant), agent swarm capabilities, and excels at visual reasoning, coding with vision, and complex tool orchestration.
  1332. home: https://www.moonshot.ai
  1333. icon: /static/catalog_icons/kimi.png
  1334. size: 1
  1335. size_unit: T
  1336. activated_size: 32
  1337. categories:
  1338. - llm
  1339. capabilities:
  1340. - context/256K
  1341. - vision
  1342. - tools
  1343. licenses:
  1344. - modified-mit
  1345. release_date: "2026-01-26"
  1346. specs:
  1347. - mode: standard
  1348. quantization: INT4
  1349. source: model_scope
  1350. model_scope_model_id: MoonshotAI/Kimi-K2.5
  1351. backend: vLLM
  1352. backend_parameters:
  1353. - --max-model-len=65536
  1354. - --mm-encoder-tp-mode=data
  1355. - --tool-call-parser=kimi_k2
  1356. - --reasoning-parser=kimi_k2
  1357. - --trust-remote-code
  1358. - name: Step-3.5-Flash
  1359. description: Step-3.5-Flash is a fast, cost-effective multimodal model with 196B total parameters and 11B active parameters (MoE), optimized for quick inference. Built on StepFun's Step3 architecture, it delivers strong performance across text and vision tasks with efficient token usage.
  1360. home: https://www.stepfun.com
  1361. icon: /static/catalog_icons/stepfun.png
  1362. size: 196
  1363. activated_size: 11
  1364. categories:
  1365. - llm
  1366. capabilities:
  1367. - context/256K
  1368. - tools
  1369. licenses:
  1370. - apache-2.0
  1371. release_date: "2026-02-02"
  1372. specs:
  1373. - mode: throughput
  1374. quantization: FP8
  1375. source: model_scope
  1376. model_scope_model_id: stepfun-ai/Step-3.5-Flash-FP8
  1377. backend: vLLM
  1378. backend_parameters:
  1379. - --max-model-len=65536
  1380. - --disable-cascade-attn
  1381. - --reasoning-parser=step3p5
  1382. - --enable-auto-tool-choice
  1383. - --tool-call-parser=step3p5
  1384. - --trust-remote-code
  1385. - --quantization=fp8
  1386. - mode: standard
  1387. quantization: BF16
  1388. source: model_scope
  1389. model_scope_model_id: stepfun-ai/Step-3.5-Flash
  1390. backend: vLLM
  1391. backend_parameters:
  1392. - --max-model-len=65536
  1393. - --disable-cascade-attn
  1394. - --reasoning-parser=step3p5
  1395. - --enable-auto-tool-choice
  1396. - --tool-call-parser=step3p5
  1397. - --trust-remote-code
  1398. - name: Nanbeige4.1-3B
  1399. description: Nanbeige4.1-3B is a 3B-parameter language model from Nanbeige LLM Lab, optimized for long-context reasoning, agentic tasks, and tool use.
  1400. home: https://modelscope.cn/organization/nanbeige
  1401. icon: /static/catalog_icons/nanbeige.png
  1402. size: 3
  1403. categories:
  1404. - llm
  1405. capabilities:
  1406. - context/256K
  1407. - reasoning
  1408. - tools
  1409. licenses:
  1410. - apache-2.0
  1411. release_date: "2026-02-13"
  1412. specs:
  1413. - mode: standard
  1414. quantization: BF16
  1415. source: model_scope
  1416. model_scope_model_id: nanbeige/Nanbeige4.1-3B
  1417. backend: vLLM
  1418. backend_parameters:
  1419. - --max-model-len=32768
  1420. # Embedding models
  1421. - name: Qwen3-Embedding-0.6B
  1422. description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1423. home: https://qwenlm.github.io
  1424. icon: /static/catalog_icons/qwen.png
  1425. size: 0.6
  1426. categories:
  1427. - embedding
  1428. capabilities:
  1429. - dimensions/1024
  1430. - max_tokens/32K
  1431. licenses:
  1432. - apache-2.0
  1433. release_date: "2025-06-09"
  1434. specs:
  1435. - mode: standard
  1436. quantization: "BF16"
  1437. source: model_scope
  1438. model_scope_model_id: Qwen/Qwen3-Embedding-0.6B
  1439. categories:
  1440. - embedding
  1441. backend: vLLM
  1442. - name: Qwen3-Embedding-4B
  1443. description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1444. home: https://qwenlm.github.io
  1445. icon: /static/catalog_icons/qwen.png
  1446. size: 4
  1447. categories:
  1448. - embedding
  1449. capabilities:
  1450. - dimensions/2560
  1451. - max_tokens/32K
  1452. licenses:
  1453. - apache-2.0
  1454. release_date: "2025-06-09"
  1455. specs:
  1456. - mode: standard
  1457. quantization: "BF16"
  1458. source: model_scope
  1459. model_scope_model_id: Qwen/Qwen3-Embedding-4B
  1460. categories:
  1461. - embedding
  1462. backend: vLLM
  1463. - name: Qwen3-Embedding-8B
  1464. description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1465. home: https://qwenlm.github.io
  1466. icon: /static/catalog_icons/qwen.png
  1467. size: 8
  1468. categories:
  1469. - embedding
  1470. capabilities:
  1471. - dimensions/4096
  1472. - max_tokens/32K
  1473. licenses:
  1474. - apache-2.0
  1475. release_date: "2025-06-09"
  1476. specs:
  1477. - mode: standard
  1478. quantization: "BF16"
  1479. source: model_scope
  1480. model_scope_model_id: Qwen/Qwen3-Embedding-8B
  1481. categories:
  1482. - embedding
  1483. backend: vLLM
  1484. - name: Qwen3-VL-Embedding-2B
  1485. description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning, with support for 30+ languages.
  1486. home: https://qwenlm.github.io
  1487. icon: /static/catalog_icons/qwen.png
  1488. size: 2
  1489. categories:
  1490. - embedding
  1491. capabilities:
  1492. - vision
  1493. - dimensions/2048
  1494. - max_tokens/32K
  1495. licenses:
  1496. - apache-2.0
  1497. release_date: "2026-01-08"
  1498. specs:
  1499. - mode: standard
  1500. quantization: "BF16"
  1501. source: model_scope
  1502. model_scope_model_id: Qwen/Qwen3-VL-Embedding-2B
  1503. categories:
  1504. - embedding
  1505. backend: vLLM
  1506. backend_parameters:
  1507. - --runner=pooling
  1508. - name: Qwen3-VL-Embedding-8B
  1509. description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning, with support for 30+ languages.
  1510. home: https://qwenlm.github.io
  1511. icon: /static/catalog_icons/qwen.png
  1512. size: 8
  1513. categories:
  1514. - embedding
  1515. capabilities:
  1516. - vision
  1517. - dimensions/4096
  1518. - max_tokens/32K
  1519. licenses:
  1520. - apache-2.0
  1521. release_date: "2026-01-08"
  1522. specs:
  1523. - mode: standard
  1524. quantization: "BF16"
  1525. source: model_scope
  1526. model_scope_model_id: Qwen/Qwen3-VL-Embedding-8B
  1527. categories:
  1528. - embedding
  1529. backend: vLLM
  1530. backend_parameters:
  1531. - --runner=pooling
  1532. - name: BGE-M3
  1533. description: BGE-M3 is a new model from BAAI distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.
  1534. home: https://bge-model.com
  1535. icon: /static/catalog_icons/bge_logo.jpeg
  1536. categories:
  1537. - embedding
  1538. capabilities:
  1539. - dimensions/1024
  1540. - max_tokens/8192
  1541. size: 567
  1542. size_unit: M
  1543. licenses:
  1544. - mit
  1545. release_date: "2024-01-28"
  1546. specs:
  1547. - mode: standard
  1548. quantization: "BF16"
  1549. source: model_scope
  1550. model_scope_model_id: BAAI/BGE-M3
  1551. categories:
  1552. - embedding
  1553. backend: vLLM
  1554. - name: BGE-Large-ZH-V1.5
  1555. description: BGE is short for BAAI general embedding. This is a Chinese text embedding model with a more reasonable similarity distribution.
  1556. home: https://bge-model.com
  1557. icon: /static/catalog_icons/bge_logo.jpeg
  1558. categories:
  1559. - embedding
  1560. capabilities:
  1561. - dimensions/1024
  1562. - max_tokens/512
  1563. size: 335
  1564. size_unit: M
  1565. licenses:
  1566. - mit
  1567. release_date: "2023-09-12"
  1568. specs:
  1569. - mode: standard
  1570. quantization: "BF16"
  1571. source: model_scope
  1572. model_scope_model_id: BAAI/bge-large-zh-v1.5
  1573. categories:
  1574. - embedding
  1575. backend: vLLM
  1576. - name: BGE-Large-EN-V1.5
  1577. description: BGE is short for BAAI general embedding. This is an English text embedding model with a more reasonable similarity distribution.
  1578. home: https://bge-model.com
  1579. icon: /static/catalog_icons/bge_logo.jpeg
  1580. categories:
  1581. - embedding
  1582. capabilities:
  1583. - dimensions/1024
  1584. - max_tokens/512
  1585. size: 335
  1586. size_unit: M
  1587. licenses:
  1588. - mit
  1589. release_date: "2023-09-12"
  1590. specs:
  1591. - mode: standard
  1592. quantization: "BF16"
  1593. source: model_scope
  1594. model_scope_model_id: BAAI/bge-large-en-v1.5
  1595. categories:
  1596. - embedding
  1597. backend: vLLM
  1598. - name: Nomic-Embed-Text-V1.5
  1599. description: Nomic-embed-text is a large context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.
  1600. home: https://nomic.ai
  1601. icon: /static/catalog_icons/nomic.png
  1602. categories:
  1603. - embedding
  1604. capabilities:
  1605. - dimensions/768
  1606. - max_tokens/8192
  1607. size: 137
  1608. size_unit: M
  1609. licenses:
  1610. - apache-2.0
  1611. release_date: "2024-02-14"
  1612. specs:
  1613. - mode: standard
  1614. quantization: "BF16"
  1615. source: model_scope
  1616. model_scope_model_id: nomic-ai/nomic-embed-text-v1.5
  1617. categories:
  1618. - embedding
  1619. backend: vLLM
  1620. backend_parameters:
  1621. - --trust-remote-code
  1622. - name: Jina-Embeddings-V3
  1623. description: jina-embeddings-v3 is a multilingual multi-task text embedding model designed for a variety of NLP applications. Based on the Jina-XLM-RoBERTa architecture, this model supports Rotary Position Embeddings to handle long input sequences up to 8192 tokens.
  1624. home: https://jina.ai
  1625. icon: /static/catalog_icons/jina.png
  1626. categories:
  1627. - embedding
  1628. capabilities:
  1629. - dimensions/1024
  1630. - max_tokens/8192
  1631. size: 570
  1632. size_unit: M
  1633. licenses:
  1634. - cc-by-nc-4.0
  1635. release_date: "2024-09-18"
  1636. specs:
  1637. - mode: standard
  1638. quantization: "BF16"
  1639. source: model_scope
  1640. model_scope_model_id: jinaai/jina-embeddings-v3
  1641. categories:
  1642. - embedding
  1643. backend: vLLM
  1644. backend_parameters:
  1645. - --trust-remote-code
  1646. # Reranker models
  1647. - name: Qwen3-Reranker-0.6B
  1648. description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1649. home: https://qwenlm.github.io
  1650. icon: /static/catalog_icons/qwen.png
  1651. size: 0.6
  1652. categories:
  1653. - reranker
  1654. capabilities:
  1655. - max_tokens/32K
  1656. licenses:
  1657. - apache-2.0
  1658. release_date: "2025-06-09"
  1659. specs:
  1660. - mode: standard
  1661. quantization: BF16
  1662. source: model_scope
  1663. model_scope_model_id: Qwen/Qwen3-Reranker-0.6B
  1664. categories:
  1665. - reranker
  1666. backend: vLLM
  1667. backend_parameters:
  1668. - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1669. - name: Qwen3-Reranker-4B
  1670. description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1671. home: https://qwenlm.github.io
  1672. icon: /static/catalog_icons/qwen.png
  1673. size: 4
  1674. categories:
  1675. - reranker
  1676. capabilities:
  1677. - max_tokens/32K
  1678. licenses:
  1679. - apache-2.0
  1680. release_date: "2025-06-09"
  1681. specs:
  1682. - mode: standard
  1683. quantization: BF16
  1684. source: model_scope
  1685. model_scope_model_id: Qwen/Qwen3-Reranker-4B
  1686. categories:
  1687. - reranker
  1688. backend: vLLM
  1689. backend_parameters:
  1690. - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1691. - name: Qwen3-Reranker-8B
  1692. description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1693. home: https://qwenlm.github.io
  1694. icon: /static/catalog_icons/qwen.png
  1695. size: 8
  1696. categories:
  1697. - reranker
  1698. capabilities:
  1699. - max_tokens/32K
  1700. licenses:
  1701. - apache-2.0
  1702. release_date: "2025-06-09"
  1703. specs:
  1704. - mode: standard
  1705. quantization: BF16
  1706. source: model_scope
  1707. model_scope_model_id: Qwen/Qwen3-Reranker-8B
  1708. categories:
  1709. - reranker
  1710. backend: vLLM
  1711. backend_parameters:
  1712. - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1713. - name: Qwen3-VL-Reranker-2B
  1714. description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers.
  1715. home: https://qwenlm.github.io
  1716. icon: /static/catalog_icons/qwen.png
  1717. size: 2
  1718. categories:
  1719. - reranker
  1720. capabilities:
  1721. - vision
  1722. - max_tokens/32K
  1723. licenses:
  1724. - apache-2.0
  1725. release_date: "2026-01-08"
  1726. specs:
  1727. - mode: standard
  1728. quantization: BF16
  1729. source: model_scope
  1730. model_scope_model_id: Qwen/Qwen3-VL-Reranker-2B
  1731. categories:
  1732. - reranker
  1733. backend: vLLM
  1734. backend_parameters:
  1735. - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1736. - name: Qwen3-VL-Reranker-8B
  1737. description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers, with the 8B model showing particularly strong results.
  1738. home: https://qwenlm.github.io
  1739. icon: /static/catalog_icons/qwen.png
  1740. size: 8
  1741. categories:
  1742. - reranker
  1743. capabilities:
  1744. - vision
  1745. - max_tokens/32K
  1746. licenses:
  1747. - apache-2.0
  1748. release_date: "2026-01-08"
  1749. specs:
  1750. - mode: standard
  1751. quantization: BF16
  1752. source: model_scope
  1753. model_scope_model_id: Qwen/Qwen3-VL-Reranker-8B
  1754. categories:
  1755. - reranker
  1756. backend: vLLM
  1757. backend_parameters:
  1758. - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1759. - name: BGE-Reranker-V2-M3
  1760. description: BGE-Reranker-V2-M3 is a reranker model from BAAI.
  1761. home: https://bge-model.com
  1762. icon: /static/catalog_icons/bge_logo.jpeg
  1763. categories:
  1764. - reranker
  1765. size: 568
  1766. size_unit: M
  1767. licenses:
  1768. - apache-2.0
  1769. release_date: "2024-03-19"
  1770. specs:
  1771. - mode: standard
  1772. quantization: "BF16"
  1773. source: model_scope
  1774. model_scope_model_id: BAAI/bge-reranker-v2-m3
  1775. categories:
  1776. - reranker
  1777. backend: vLLM
  1778. - name: Jina-Reranker-M0
  1779. description: Jina-Reranker-M0 is a multilingual multimodal document reranker model with 2.4B parameters. It accepts a query alongside visually rich documents and outputs ranked documents by relevance. Supports 29 languages and multimodal content including text, figures, tables, and infographics.
  1780. home: https://jina.ai
  1781. icon: /static/catalog_icons/jina.png
  1782. size: 2.4
  1783. categories:
  1784. - reranker
  1785. capabilities:
  1786. - max_tokens/10K
  1787. - vision
  1788. licenses:
  1789. - cc-by-nc-4.0
  1790. release_date: "2025-04-08"
  1791. specs:
  1792. - mode: standard
  1793. quantization: "BF16"
  1794. source: model_scope
  1795. model_scope_model_id: jinaai/jina-reranker-m0
  1796. backend: vLLM
  1797. # Image models
  1798. - name: FLUX.1-dev
  1799. description: FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.
  1800. home: https://blackforestlabs.ai
  1801. icon: /static/catalog_icons/blackforestlabs.png
  1802. size: 12
  1803. categories:
  1804. - image
  1805. licenses:
  1806. - flux-1-dev-non-commercial-license
  1807. release_date: "2024-08-02"
  1808. specs:
  1809. - mode: standard
  1810. quantization: "BF16"
  1811. gpu_filters:
  1812. vendor: nvidia
  1813. source: model_scope
  1814. model_scope_model_id: black-forest-labs/FLUX.1-dev
  1815. backend: SGLang
  1816. backend_version: 0.5.6.post2
  1817. env:
  1818. GPUSTACK_MODEL_VRAM_CLAIM: "37580963840" # 35 GiB, observed empirically
  1819. - name: FLUX.2-klein-4B
  1820. description: FLUX.2-klein-4B is a 4 billion parameter image generation model from Black Forest Labs.
  1821. home: https://blackforestlabs.ai
  1822. icon: /static/catalog_icons/blackforestlabs.png
  1823. size: 4
  1824. categories:
  1825. - image
  1826. licenses:
  1827. - apache-2.0
  1828. release_date: "2026-01-15"
  1829. .base_spec: &flux_2_klein_4b_base_spec
  1830. mode: standard
  1831. quantization: "BF16"
  1832. source: model_scope
  1833. model_scope_model_id: black-forest-labs/FLUX.2-klein-4B
  1834. backend: vLLM
  1835. backend_parameters:
  1836. - --omni
  1837. specs:
  1838. - <<: *flux_2_klein_4b_base_spec
  1839. gpu_filters:
  1840. vendor: ascend
  1841. backend_version: *vllm_omni_ascend_stable_version
  1842. - <<: *flux_2_klein_4b_base_spec
  1843. backend_version: *vllm_omni_stable_version
  1844. - name: FLUX.2-klein-9B
  1845. description: FLUX.2-klein-9B is a 9 billion parameter image generation model from Black Forest Labs.
  1846. home: https://blackforestlabs.ai
  1847. icon: /static/catalog_icons/blackforestlabs.png
  1848. size: 9
  1849. categories:
  1850. - image
  1851. licenses:
  1852. - apache-2.0
  1853. release_date: "2026-01-15"
  1854. .base_spec: &flux_2_klein_9b_base_spec
  1855. mode: standard
  1856. quantization: "BF16"
  1857. source: model_scope
  1858. model_scope_model_id: black-forest-labs/FLUX.2-klein-9B
  1859. backend: vLLM
  1860. backend_parameters:
  1861. - --omni
  1862. specs:
  1863. - <<: *flux_2_klein_9b_base_spec
  1864. gpu_filters:
  1865. vendor: ascend
  1866. backend_version: *vllm_omni_ascend_stable_version
  1867. - <<: *flux_2_klein_9b_base_spec
  1868. backend_version: *vllm_omni_stable_version
  1869. - name: Qwen-Image
  1870. description: Qwen-Image is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing.
  1871. home: https://qwen.ai
  1872. icon: /static/catalog_icons/qwen.png
  1873. size: 20
  1874. categories:
  1875. - image
  1876. licenses:
  1877. - apache-2.0
  1878. release_date: "2025-08-04"
  1879. .base_spec: &qwen_image_base_spec
  1880. mode: standard
  1881. quantization: "BF16"
  1882. source: model_scope
  1883. model_scope_model_id: Qwen/Qwen-Image
  1884. backend: vLLM
  1885. backend_parameters:
  1886. - --omni
  1887. specs:
  1888. - <<: *qwen_image_base_spec
  1889. gpu_filters:
  1890. vendor: ascend
  1891. backend_version: *vllm_omni_ascend_stable_version
  1892. - <<: *qwen_image_base_spec
  1893. backend_version: *vllm_omni_stable_version
  1894. - name: Qwen-Image-Edit
  1895. description: Built upon the 20B Qwen-Image model, Qwen-Image-Edit successfully extends Qwen-Image's unique text rendering capabilities to image editing tasks, enabling precise text editing.
  1896. home: https://qwen.ai
  1897. icon: /static/catalog_icons/qwen.png
  1898. size: 20
  1899. categories:
  1900. - image
  1901. licenses:
  1902. - apache-2.0
  1903. release_date: "2025-08-19"
  1904. specs:
  1905. - mode: standard
  1906. quantization: "BF16"
  1907. gpu_filters:
  1908. vendor: nvidia
  1909. source: model_scope
  1910. model_scope_model_id: Qwen/Qwen-Image-Edit
  1911. backend: SGLang
  1912. backend_version: 0.5.6.post2
  1913. - name: Qwen-Image-2512
  1914. description: Qwen-Image-2512 is the December update of Qwen-Image's text-to-image foundational model, delivering enhanced image generation capabilities.
  1915. home: https://qwen.ai
  1916. icon: /static/catalog_icons/qwen.png
  1917. size: 20
  1918. categories:
  1919. - image
  1920. licenses:
  1921. - apache-2.0
  1922. release_date: "2025-12-30"
  1923. .base_spec: &qwen_image_2512_base_spec
  1924. mode: standard
  1925. quantization: "BF16"
  1926. source: model_scope
  1927. model_scope_model_id: Qwen/Qwen-Image-2512
  1928. backend: vLLM
  1929. backend_parameters:
  1930. - --omni
  1931. specs:
  1932. - <<: *qwen_image_2512_base_spec
  1933. gpu_filters:
  1934. vendor: ascend
  1935. backend_version: *vllm_omni_ascend_stable_version
  1936. - <<: *qwen_image_2512_base_spec
  1937. backend_version: *vllm_omni_stable_version
  1938. - name: Z-Image
  1939. description: Z-Image is the foundation model of the Z-Image family, engineered for good quality, robust generative diversity, broad stylistic coverage, and precise prompt adherence.
  1940. home: https://qwen.ai
  1941. icon: /static/catalog_icons/qwen.png
  1942. size: 6
  1943. categories:
  1944. - image
  1945. licenses:
  1946. - apache-2.0
  1947. release_date: "2026-01-28"
  1948. .base_spec: &z_image_base_spec
  1949. mode: standard
  1950. quantization: "BF16"
  1951. source: model_scope
  1952. model_scope_model_id: Tongyi-MAI/Z-Image
  1953. backend: vLLM
  1954. backend_parameters:
  1955. - --omni
  1956. specs:
  1957. - <<: *z_image_base_spec
  1958. gpu_filters:
  1959. vendor: ascend
  1960. backend_version: *vllm_omni_ascend_stable_version
  1961. - <<: *z_image_base_spec
  1962. backend_version: *vllm_omni_stable_version
  1963. - name: Z-Image-Turbo
  1964. description: Z-Image is a powerful and highly efficient image generation model with 6B parameters.
  1965. home: https://qwen.ai
  1966. icon: /static/catalog_icons/qwen.png
  1967. size: 6
  1968. categories:
  1969. - image
  1970. licenses:
  1971. - apache-2.0
  1972. release_date: "2025-11-27"
  1973. .base_spec: &z_image_turbo_base_spec
  1974. mode: standard
  1975. quantization: "BF16"
  1976. source: model_scope
  1977. model_scope_model_id: Tongyi-MAI/Z-Image-Turbo
  1978. backend: vLLM
  1979. backend_parameters:
  1980. - --omni
  1981. env:
  1982. GPUSTACK_MODEL_VRAM_CLAIM: "24696061952" # 23 GiB observed. Weight file size is 33 GiB in F32 while vLLM loads in BF16.
  1983. specs:
  1984. - <<: *z_image_turbo_base_spec
  1985. gpu_filters:
  1986. vendor: ascend
  1987. backend_version: *vllm_omni_ascend_stable_version
  1988. - <<: *z_image_turbo_base_spec
  1989. backend_version: *vllm_omni_stable_version
  1990. - name: Qwen3-VL-8B-Instruct
  1991. description: Qwen3-VL-8B-Instruct is a vision-language model that delivers comprehensive upgrades across text understanding, visual perception, and reasoning capabilities, supporting image/video/text unified understanding.
  1992. home: https://qwen.ai
  1993. icon: /static/catalog_icons/qwen.png
  1994. size: 8
  1995. categories:
  1996. - llm
  1997. capabilities:
  1998. - context/1M
  1999. - vision
  2000. licenses:
  2001. - apache-2.0
  2002. release_date: "2025-10-15"
  2003. specs:
  2004. - mode: standard
  2005. quantization: BF16
  2006. source: model_scope
  2007. model_scope_model_id: Qwen/Qwen3-VL-8B-Instruct
  2008. backend: vLLM
  2009. backend_parameters:
  2010. - --max-model-len=65536
  2011. - name: Qwen3-VL-8B-Thinking
  2012. description: Qwen3-VL-8B-Thinking is a vision-language model that delivers comprehensive upgrades across text understanding, visual perception, and reasoning capabilities, supporting image/video/text unified understanding with thinking mode.
  2013. home: https://qwen.ai
  2014. icon: /static/catalog_icons/qwen.png
  2015. size: 8
  2016. categories:
  2017. - llm
  2018. capabilities:
  2019. - context/1M
  2020. - vision
  2021. licenses:
  2022. - apache-2.0
  2023. release_date: "2025-10-15"
  2024. specs:
  2025. - mode: standard
  2026. quantization: BF16
  2027. source: model_scope
  2028. model_scope_model_id: Qwen/Qwen3-VL-8B-Thinking
  2029. backend: vLLM
  2030. backend_parameters:
  2031. - --max-model-len=65536
  2032. - name: Qwen3-VL-32B-Instruct
  2033. description: Qwen3-VL-32B-Instruct is a vision-language model featuring superior visual intelligence, enhanced spatial awareness capabilities, and OCR functionality.
  2034. home: https://qwen.ai
  2035. icon: /static/catalog_icons/qwen.png
  2036. size: 32
  2037. categories:
  2038. - llm
  2039. capabilities:
  2040. - context/1M
  2041. - vision
  2042. licenses:
  2043. - apache-2.0
  2044. release_date: "2025-10-21"
  2045. specs:
  2046. - mode: standard
  2047. quantization: BF16
  2048. source: model_scope
  2049. model_scope_model_id: Qwen/Qwen3-VL-32B-Instruct
  2050. backend: vLLM
  2051. backend_parameters:
  2052. - --max-model-len=65536
  2053. - name: Qwen3-VL-32B-Thinking
  2054. description: Qwen3-VL-32B-Thinking is a vision-language model featuring superior visual intelligence, enhanced spatial awareness capabilities, and OCR functionality with thinking mode.
  2055. home: https://qwen.ai
  2056. icon: /static/catalog_icons/qwen.png
  2057. size: 32
  2058. categories:
  2059. - llm
  2060. capabilities:
  2061. - context/1M
  2062. - vision
  2063. licenses:
  2064. - apache-2.0
  2065. release_date: "2025-10-21"
  2066. specs:
  2067. - mode: standard
  2068. quantization: BF16
  2069. source: model_scope
  2070. model_scope_model_id: Qwen/Qwen3-VL-32B-Thinking
  2071. backend: vLLM
  2072. backend_parameters:
  2073. - --max-model-len=65536
  2074. - name: Qwen3-VL-30B-A3B-Instruct
  2075. description: Qwen3-VL-30B-A3B-Instruct is a mixture-of-experts vision-language model with 30B total parameters and 3B active parameters, featuring advanced spatial perception, 2D and 3D grounding.
  2076. home: https://qwen.ai
  2077. icon: /static/catalog_icons/qwen.png
  2078. size: 30
  2079. activated_size: 3
  2080. categories:
  2081. - llm
  2082. capabilities:
  2083. - context/1M
  2084. - vision
  2085. licenses:
  2086. - apache-2.0
  2087. release_date: "2025-10-05"
  2088. specs:
  2089. - mode: standard
  2090. quantization: BF16
  2091. source: model_scope
  2092. model_scope_model_id: Qwen/Qwen3-VL-30B-A3B-Instruct
  2093. backend: vLLM
  2094. backend_parameters:
  2095. - --max-model-len=65536
  2096. - name: Qwen3-VL-30B-A3B-Thinking
  2097. description: Qwen3-VL-30B-A3B-Thinking is a mixture-of-experts vision-language model with 30B total parameters and 3B active parameters, featuring advanced spatial perception, 2D and 3D grounding with thinking mode.
  2098. home: https://qwen.ai
  2099. icon: /static/catalog_icons/qwen.png
  2100. size: 30
  2101. activated_size: 3
  2102. categories:
  2103. - llm
  2104. capabilities:
  2105. - context/1M
  2106. - vision
  2107. licenses:
  2108. - apache-2.0
  2109. release_date: "2025-10-05"
  2110. specs:
  2111. - mode: standard
  2112. quantization: BF16
  2113. source: model_scope
  2114. model_scope_model_id: Qwen/Qwen3-VL-30B-A3B-Thinking
  2115. backend: vLLM
  2116. backend_parameters:
  2117. - --max-model-len=65536
  2118. - name: Qwen3-VL-235B-A22B-Instruct
  2119. description: Qwen3-VL-235B-A22B-Instruct is the largest vision-language model in the Qwen3-VL series with 235B total parameters and 22B active parameters, featuring state-of-the-art visual understanding and reasoning capabilities.
  2120. home: https://qwen.ai
  2121. icon: /static/catalog_icons/qwen.png
  2122. size: 235
  2123. activated_size: 22
  2124. categories:
  2125. - llm
  2126. capabilities:
  2127. - context/1M
  2128. - vision
  2129. licenses:
  2130. - apache-2.0
  2131. release_date: "2025-09-23"
  2132. specs:
  2133. - mode: standard
  2134. quantization: BF16
  2135. source: model_scope
  2136. model_scope_model_id: Qwen/Qwen3-VL-235B-A22B-Instruct
  2137. backend: vLLM
  2138. backend_parameters:
  2139. - --max-model-len=65536
  2140. - name: Qwen3-VL-235B-A22B-Thinking
  2141. description: Qwen3-VL-235B-A22B-Thinking is the largest vision-language model in the Qwen3-VL series with 235B total parameters and 22B active parameters, featuring state-of-the-art visual understanding and reasoning capabilities with thinking mode.
  2142. home: https://qwen.ai
  2143. icon: /static/catalog_icons/qwen.png
  2144. size: 235
  2145. activated_size: 22
  2146. categories:
  2147. - llm
  2148. capabilities:
  2149. - context/1M
  2150. - vision
  2151. licenses:
  2152. - apache-2.0
  2153. release_date: "2025-09-23"
  2154. specs:
  2155. - mode: standard
  2156. quantization: BF16
  2157. source: model_scope
  2158. model_scope_model_id: Qwen/Qwen3-VL-235B-A22B-Thinking
  2159. backend: vLLM
  2160. backend_parameters:
  2161. - --max-model-len=65536
  2162. # Audio models
  2163. - name: CosyVoice2-0.5B
  2164. description: CosyVoice2-0.5B is a speech generation model. It supports multilingual speech synthesis with high naturalness and expressiveness.
  2165. home: https://github.com/FunAudioLLM
  2166. icon: /static/catalog_icons/FunAudioLLM.png
  2167. size: 0.5
  2168. categories:
  2169. - text_to_speech
  2170. licenses:
  2171. - apache-2.0
  2172. release_date: "2024-12-01"
  2173. specs:
  2174. - mode: standard
  2175. quantization: FP16
  2176. source: model_scope
  2177. model_scope_model_id: gpustack/CosyVoice2-0.5B
  2178. backend: VoxBox
  2179. env:
  2180. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2181. - name: CosyVoice-300M
  2182. description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba.
  2183. home: https://github.com/FunAudioLLM
  2184. icon: /static/catalog_icons/FunAudioLLM.png
  2185. size: 300
  2186. size_unit: M
  2187. categories:
  2188. - text_to_speech
  2189. licenses:
  2190. - apache-2.0
  2191. release_date: "2024-07-05"
  2192. specs:
  2193. - mode: standard
  2194. quantization: FP16
  2195. source: model_scope
  2196. model_scope_model_id: gpustack/CosyVoice-300M
  2197. backend: VoxBox
  2198. env:
  2199. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2200. - name: CosyVoice-300M-SFT
  2201. description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba.
  2202. home: https://github.com/FunAudioLLM
  2203. icon: /static/catalog_icons/FunAudioLLM.png
  2204. size: 300
  2205. size_unit: M
  2206. categories:
  2207. - text_to_speech
  2208. licenses:
  2209. - apache-2.0
  2210. release_date: "2024-07-05"
  2211. specs:
  2212. - mode: standard
  2213. quantization: FP16
  2214. source: model_scope
  2215. model_scope_model_id: iic/CosyVoice-300M-SFT
  2216. backend: VoxBox
  2217. env:
  2218. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2219. - name: CosyVoice-300M-Instruct
  2220. description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba.
  2221. home: https://github.com/FunAudioLLM
  2222. icon: /static/catalog_icons/FunAudioLLM.png
  2223. size: 300
  2224. size_unit: M
  2225. categories:
  2226. - text_to_speech
  2227. licenses:
  2228. - apache-2.0
  2229. release_date: "2024-07-05"
  2230. specs:
  2231. - mode: standard
  2232. quantization: FP16
  2233. source: model_scope
  2234. model_scope_model_id: gpustack/CosyVoice-300M-Instruct
  2235. backend: VoxBox
  2236. env:
  2237. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2238. - name: Faster-Whisper-Large-V3
  2239. description: Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper Robust Speech Recognition via Large-Scale Weak Supervision by Alec Radford et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. This is the conversion of openai/whisper-large-v3 to the CTranslate2 model format.
  2240. home: https://huggingface.co/Systran
  2241. icon: /static/catalog_icons/systran.png
  2242. size: 1.55
  2243. categories:
  2244. - speech_to_text
  2245. licenses:
  2246. - mit
  2247. release_date: "2023-11-23"
  2248. specs:
  2249. - mode: standard
  2250. quantization: FP16
  2251. source: model_scope
  2252. model_scope_model_id: gpustack/faster-whisper-large-v3
  2253. backend: VoxBox
  2254. env:
  2255. GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, per OpenAI Whisper large reference VRAM.
  2256. - name: Faster-Whisper-Medium
  2257. description: Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. This is the conversion of openai/whisper-medium to the CTranslate2 model format.
  2258. home: https://huggingface.co/Systran
  2259. icon: /static/catalog_icons/systran.png
  2260. size: 769
  2261. size_unit: M
  2262. categories:
  2263. - speech_to_text
  2264. licenses:
  2265. - mit
  2266. release_date: "2023-03-23"
  2267. specs:
  2268. - mode: standard
  2269. quantization: FP16
  2270. source: model_scope
  2271. model_scope_model_id: gpustack/faster-whisper-medium
  2272. backend: VoxBox
  2273. env:
  2274. GPUSTACK_MODEL_VRAM_CLAIM: "5368709120" # 5 GiB, per OpenAI Whisper medium reference VRAM.
  2275. - name: Faster-Whisper-Small
  2276. description: Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. This is the conversion of openai/whisper-small to the CTranslate2 model format.
  2277. home: https://huggingface.co/Systran
  2278. icon: /static/catalog_icons/systran.png
  2279. size: 244
  2280. size_unit: M
  2281. categories:
  2282. - speech_to_text
  2283. licenses:
  2284. - mit
  2285. release_date: "2023-03-23"
  2286. specs:
  2287. - mode: standard
  2288. quantization: FP16
  2289. source: model_scope
  2290. model_scope_model_id: gpustack/faster-whisper-small
  2291. backend: VoxBox
  2292. env:
  2293. GPUSTACK_MODEL_VRAM_CLAIM: "2147483648" # 2 GiB, per OpenAI Whisper small reference VRAM.
  2294. - name: Whisper-Large-V3-Turbo
  2295. description: Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3. In other words, it's the exact same model, except that the number of decoding layers has been reduced from 32 to 4. As a result, the model is way faster, at the expense of a minor quality degradation.
  2296. home: https://openai.com
  2297. icon: /static/catalog_icons/openai.png
  2298. size: 809
  2299. size_unit: M
  2300. categories:
  2301. - speech_to_text
  2302. licenses:
  2303. - mit
  2304. release_date: "2024-10-01"
  2305. specs:
  2306. - mode: standard
  2307. quantization: BF16
  2308. source: model_scope
  2309. model_scope_model_id: openai/whisper-large-v3-turbo
  2310. backend: vLLM
  2311. - name: Whisper-Large-V3
  2312. description: Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation. Trained on 5M hours of labeled data, Whisper large-v3 demonstrates strong ability to generalise to many datasets and domains in a zero-shot setting.
  2313. home: https://openai.com
  2314. icon: /static/catalog_icons/openai.png
  2315. size: 1.55
  2316. categories:
  2317. - speech_to_text
  2318. licenses:
  2319. - mit
  2320. release_date: "2023-11-06"
  2321. specs:
  2322. - mode: standard
  2323. quantization: BF16
  2324. source: model_scope
  2325. model_scope_model_id: openai/whisper-large-v3
  2326. backend: vLLM
  2327. env:
  2328. GPUSTACK_MODEL_VRAM_CLAIM: "4294967296" # 4 GiB. The repo stores weight files in multiple formats so explicitly set VRAM claim to avoid over-allocation.
  2329. - name: Voxtral-Mini-3B-2507
  2330. description: Voxtral-Mini-3B-2507 is a speech-to-text model from Mistral AI, designed for automatic speech recognition with high accuracy and efficiency.
  2331. home: https://mistral.ai
  2332. icon: /static/catalog_icons/mistral.png
  2333. size: 3
  2334. categories:
  2335. - speech_to_text
  2336. licenses:
  2337. - apache-2.0
  2338. release_date: "2025-07-18"
  2339. specs:
  2340. - mode: standard
  2341. quantization: BF16
  2342. source: model_scope
  2343. model_scope_model_id: mistralai/Voxtral-Mini-3B-2507
  2344. backend: vLLM
  2345. - name: Granite-Speech-3.3-2B
  2346. description: Granite-Speech-3.3-2B is a speech-to-text model from IBM, part of the Granite series, designed for automatic speech recognition with strong multilingual capabilities.
  2347. home: https://www.ibm.com
  2348. icon: /static/catalog_icons/ibm.png
  2349. size: 2
  2350. categories:
  2351. - speech_to_text
  2352. licenses:
  2353. - apache-2.0
  2354. release_date: "2025-06-19"
  2355. specs:
  2356. - mode: standard
  2357. quantization: BF16
  2358. source: model_scope
  2359. model_scope_model_id: ibm-granite/granite-speech-3.3-2b
  2360. backend: vLLM
  2361. - name: Granite-Speech-3.3-8B
  2362. description: Granite-Speech-3.3-8B is a speech-to-text model from IBM, part of the Granite series, designed for automatic speech recognition with enhanced accuracy and multilingual support.
  2363. home: https://www.ibm.com
  2364. icon: /static/catalog_icons/ibm.png
  2365. size: 8
  2366. categories:
  2367. - speech_to_text
  2368. licenses:
  2369. - apache-2.0
  2370. release_date: "2025-06-19"
  2371. specs:
  2372. - mode: standard
  2373. quantization: BF16
  2374. source: model_scope
  2375. model_scope_model_id: ibm-granite/granite-speech-3.3-8b
  2376. backend: vLLM
  2377. - name: Qwen3-ASR-1.7B
  2378. description: Qwen3-ASR-1.7B supports language identification and ASR for 52 languages and dialects. It leverages large-scale speech training data and the strong audio understanding capability of its foundation model, Qwen3-Omni.
  2379. home: https://qwen.ai
  2380. icon: /static/catalog_icons/qwen.png
  2381. size: 1.7
  2382. categories:
  2383. - speech_to_text
  2384. licenses:
  2385. - apache-2.0
  2386. release_date: "2026-01-29"
  2387. specs:
  2388. - mode: standard
  2389. quantization: BF16
  2390. source: model_scope
  2391. model_scope_model_id: Qwen/Qwen3-ASR-1.7B
  2392. backend: vLLM
  2393. categories:
  2394. - speech_to_text
  2395. - name: Qwen3-ASR-0.6B
  2396. description: Qwen3-ASR-0.6B supports language identification and ASR for 52 languages and dialects. It leverages large-scale speech training data and the strong audio understanding capability of its foundation model, Qwen3-Omni.
  2397. home: https://qwen.ai
  2398. icon: /static/catalog_icons/qwen.png
  2399. size: 0.6
  2400. categories:
  2401. - speech_to_text
  2402. licenses:
  2403. - apache-2.0
  2404. release_date: "2026-01-29"
  2405. specs:
  2406. - mode: standard
  2407. quantization: BF16
  2408. source: model_scope
  2409. model_scope_model_id: Qwen/Qwen3-ASR-0.6B
  2410. backend: vLLM
  2411. categories:
  2412. - speech_to_text
  2413. - name: Dia-1.6B
  2414. description: Dia is a text-to-speech model created by Nari Labs. Dia directly generates highly realistic dialogue from a transcript. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal communications like laughter, coughing, clearing throat, etc.
  2415. home: https://narilabs.org
  2416. icon: /static/catalog_icons/narilabs.png
  2417. size: 1.6
  2418. categories:
  2419. - text_to_speech
  2420. licenses:
  2421. - apache-2.0
  2422. release_date: "2025-04-21"
  2423. specs:
  2424. - mode: standard
  2425. quantization: FP32
  2426. source: model_scope
  2427. model_scope_model_id: nari-labs/Dia-1.6B
  2428. backend: VoxBox
  2429. env:
  2430. GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, Dia model empirical estimate.
  2431. - name: Qwen3-TTS-12Hz-1.7B-Base
  2432. description: Qwen3-TTS-12Hz-1.7B-Base is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, built on the 12Hz speech tokenizer.
  2433. home: https://qwen.ai
  2434. icon: /static/catalog_icons/qwen.png
  2435. size: 1.7
  2436. categories:
  2437. - text_to_speech
  2438. licenses:
  2439. - apache-2.0
  2440. release_date: "2026-01-22"
  2441. .base_spec: &qwen3_tts_12hz_1_7b_base_base_spec
  2442. mode: standard
  2443. quantization: "BF16"
  2444. source: model_scope
  2445. model_scope_model_id: Qwen/Qwen3-TTS-12Hz-1.7B-Base
  2446. backend: vLLM
  2447. backend_parameters:
  2448. - --omni
  2449. specs:
  2450. - <<: *qwen3_tts_12hz_1_7b_base_base_spec
  2451. gpu_filters:
  2452. vendor: ascend
  2453. backend_version: *vllm_omni_ascend_stable_version
  2454. - <<: *qwen3_tts_12hz_1_7b_base_base_spec
  2455. backend_version: *vllm_omni_stable_version
  2456. - name: Qwen3-TTS-12Hz-1.7B-CustomVoice
  2457. description: Qwen3-TTS-12Hz-1.7B-CustomVoice is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting custom voice cloning and built on the 12Hz speech tokenizer.
  2458. home: https://qwen.ai
  2459. icon: /static/catalog_icons/qwen.png
  2460. size: 1.7
  2461. categories:
  2462. - text_to_speech
  2463. licenses:
  2464. - apache-2.0
  2465. release_date: "2026-01-22"
  2466. .base_spec: &qwen3_tts_12hz_1_7b_customvoice_base_spec
  2467. mode: standard
  2468. quantization: "BF16"
  2469. source: model_scope
  2470. model_scope_model_id: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
  2471. backend: vLLM
  2472. backend_parameters:
  2473. - --omni
  2474. specs:
  2475. - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec
  2476. gpu_filters:
  2477. vendor: ascend
  2478. backend_version: *vllm_omni_ascend_stable_version
  2479. - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec
  2480. backend_version: *vllm_omni_stable_version
  2481. - name: Qwen3-TTS-12Hz-1.7B-VoiceDesign
  2482. description: Qwen3-TTS-12Hz-1.7B-VoiceDesign is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, built on the 12Hz speech tokenizer and supporting voice design.
  2483. home: https://qwen.ai
  2484. icon: /static/catalog_icons/qwen.png
  2485. size: 1.7
  2486. categories:
  2487. - text_to_speech
  2488. licenses:
  2489. - apache-2.0
  2490. release_date: "2026-01-22"
  2491. .base_spec: &qwen3_tts_12hz_1_7b_voicedesign_base_spec
  2492. mode: standard
  2493. quantization: "BF16"
  2494. source: model_scope
  2495. model_scope_model_id: Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign
  2496. backend: vLLM
  2497. backend_parameters:
  2498. - --omni
  2499. specs:
  2500. - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec
  2501. gpu_filters:
  2502. vendor: ascend
  2503. backend_version: *vllm_omni_ascend_stable_version
  2504. - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec
  2505. backend_version: *vllm_omni_stable_version
  2506. - name: Qwen3-TTS-12Hz-0.6B-Base
  2507. description: Qwen3-TTS-12Hz-0.6B-Base is a text-to-speech model from the Qwen3-TTS series with 0.6B parameters, built on the 12Hz speech tokenizer.
  2508. home: https://qwen.ai
  2509. icon: /static/catalog_icons/qwen.png
  2510. size: 0.6
  2511. categories:
  2512. - text_to_speech
  2513. licenses:
  2514. - apache-2.0
  2515. release_date: "2026-01-22"
  2516. .base_spec: &qwen3_tts_12hz_0_6b_base_base_spec
  2517. mode: standard
  2518. quantization: "BF16"
  2519. source: model_scope
  2520. model_scope_model_id: Qwen/Qwen3-TTS-12Hz-0.6B-Base
  2521. backend: vLLM
  2522. backend_parameters:
  2523. - --omni
  2524. specs:
  2525. - <<: *qwen3_tts_12hz_0_6b_base_base_spec
  2526. gpu_filters:
  2527. vendor: ascend
  2528. backend_version: *vllm_omni_ascend_stable_version
  2529. - <<: *qwen3_tts_12hz_0_6b_base_base_spec
  2530. backend_version: *vllm_omni_stable_version
  2531. - name: Qwen3-TTS-12Hz-0.6B-CustomVoice
  2532. description: Qwen3-TTS-12Hz-0.6B-CustomVoice is a text-to-speech model from the Qwen3-TTS series with 0.6B parameters, built on the 12Hz speech tokenizer and supporting custom voice cloning.
  2533. home: https://qwen.ai
  2534. icon: /static/catalog_icons/qwen.png
  2535. size: 0.6
  2536. categories:
  2537. - text_to_speech
  2538. licenses:
  2539. - apache-2.0
  2540. release_date: "2026-01-22"
  2541. .base_spec: &qwen3_tts_12hz_0_6b_customvoice_base_spec
  2542. mode: standard
  2543. quantization: "BF16"
  2544. source: model_scope
  2545. model_scope_model_id: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
  2546. backend: vLLM
  2547. backend_parameters:
  2548. - --omni
  2549. specs:
  2550. - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec
  2551. gpu_filters:
  2552. vendor: ascend
  2553. backend_version: *vllm_omni_ascend_stable_version
  2554. - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec
  2555. backend_version: *vllm_omni_stable_version
  2556. - name: SenseVoice-Small
  2557. description: SenseVoice is a speech foundation model with multiple speech understanding capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and audio event detection (AED).
  2558. home: https://github.com/FunAudioLLM
  2559. icon: /static/catalog_icons/FunAudioLLM.png
  2560. categories:
  2561. - speech_to_text
  2562. licenses:
  2563. - apache-2.0
  2564. release_date: "2024-07-31"
  2565. specs:
  2566. - mode: standard
  2567. quantization: FP16
  2568. source: model_scope
  2569. model_scope_model_id: iic/SenseVoiceSmall
  2570. backend: VoxBox
  2571. env:
  2572. GPUSTACK_MODEL_VRAM_CLAIM: "12884901888" # 12 GiB; the actual requirement depends on the audio length. This value works for ~10 minutes of audio input.