model-catalog.yaml 79 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521
  1. # YAML Variables
  2. .vllm_omni_ascend_stable_version: &vllm_omni_ascend_stable_version "0.14.1"
  3. .vllm_omni_stable_version: &vllm_omni_stable_version "0.16.0"
  4. draft_models:
  5. - name: Qwen3-8B-EAGLE3
  6. algorithm: eagle3
  7. source: huggingface
  8. huggingface_repo_id: Tengyunw/qwen3_8b_eagle3
  9. - name: Qwen3-30B-A3B-EAGLE3
  10. algorithm: eagle3
  11. source: huggingface
  12. huggingface_repo_id: Tengyunw/qwen3_30b_moe_eagle3
  13. - name: Qwen3-235B-A22B-EAGLE3
  14. algorithm: eagle3
  15. source: huggingface
  16. huggingface_repo_id: lmsys/Qwen3-235B-A22B-EAGLE3
  17. - name: gpt-oss-120b-EAGLE3
  18. algorithm: eagle3
  19. source: huggingface
  20. huggingface_repo_id: lmsys/EAGLE3-gpt-oss-120b-bf16
  21. model_sets:
  22. - name: Qwen3-0.6B
  23. description: Qwen3 is a family of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  24. home: https://qwenlm.github.io
  25. icon: /static/catalog_icons/qwen.png
  26. size: 0.6
  27. categories:
  28. - llm
  29. capabilities:
  30. - context/128K
  31. - tools
  32. licenses:
  33. - apache-2.0
  34. release_date: "2025-04-19"
  35. specs:
  36. # Ascend NPUs
  37. - mode: throughput
  38. quantization: BF16
  39. gpu_filters:
  40. vendor: ascend
  41. source: huggingface
  42. huggingface_repo_id: Qwen/Qwen3-0.6B
  43. backend: MindIE
  44. backend_parameters:
  45. - --max-seq-len=8192
  46. # Other GPUs
  47. - mode: standard
  48. quantization: BF16
  49. source: huggingface
  50. huggingface_repo_id: Qwen/Qwen3-0.6B
  51. backend: vLLM
  52. backend_parameters:
  53. - --reasoning-parser=deepseek_r1
  54. - --max-model-len=8192
  55. - name: Qwen3-8B
  56. description: Qwen3 is a family of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  57. home: https://qwenlm.github.io
  58. icon: /static/catalog_icons/qwen.png
  59. size: 8
  60. categories:
  61. - llm
  62. capabilities:
  63. - context/128K
  64. - tools
  65. licenses:
  66. - apache-2.0
  67. release_date: "2025-04-19"
  68. specs:
  69. # Ascend NPUs
  70. - mode: throughput
  71. quantization: BF16
  72. gpu_filters:
  73. vendor: ascend
  74. source: huggingface
  75. huggingface_repo_id: Qwen/Qwen3-8B
  76. backend: MindIE
  77. backend_parameters:
  78. - --max-seq-len=32768
  79. # Other GPUs
  80. - mode: throughput
  81. quantization: FP8
  82. source: huggingface
  83. huggingface_repo_id: Qwen/Qwen3-8B-FP8
  84. backend: vLLM
  85. backend_parameters:
  86. - --reasoning-parser=deepseek_r1
  87. - --max-model-len=32768
  88. - mode: standard
  89. quantization: BF16
  90. source: huggingface
  91. huggingface_repo_id: Qwen/Qwen3-8B
  92. backend: vLLM
  93. backend_parameters:
  94. - --reasoning-parser=deepseek_r1
  95. - --max-model-len=32768
  96. - name: Falcon-H1R-7B
  97. description: Falcon-H1R-7B is a reasoning-specialized language model built on top of Falcon-H1-7B-Base, featuring a Hybrid-Head Language Model (Transformer-SSM) architecture that delivers outstanding performance in mathematics, programming, and instruction following.
  98. home: https://huggingface.co/tiiuae
  99. icon: /static/catalog_icons/tii.png
  100. size: 7
  101. categories:
  102. - llm
  103. capabilities:
  104. - context/256K
  105. licenses:
  106. - falcon-llm-license
  107. release_date: "2026-01-05"
  108. specs:
  109. - mode: standard
  110. quantization: BF16
  111. source: huggingface
  112. huggingface_repo_id: tiiuae/Falcon-H1R-7B
  113. backend: vLLM
  114. backend_parameters:
  115. - --reasoning-parser=deepseek_r1
  116. - --max-model-len=65536
  117. - name: Qwen3-14B
  118. description: Qwen3 is a family of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  119. home: https://qwenlm.github.io
  120. icon: /static/catalog_icons/qwen.png
  121. size: 14
  122. categories:
  123. - llm
  124. capabilities:
  125. - context/128K
  126. - tools
  127. licenses:
  128. - apache-2.0
  129. release_date: "2025-04-19"
  130. specs:
  131. # Ascend NPUs
  132. - mode: throughput
  133. quantization: BF16
  134. gpu_filters:
  135. vendor: ascend
  136. source: huggingface
  137. huggingface_repo_id: Qwen/Qwen3-14B
  138. backend: MindIE
  139. backend_parameters:
  140. - --max-seq-len=32768
  141. # Other GPUs
  142. - mode: throughput
  143. quantization: FP8
  144. gpu_filters:
  145. vendor: nvidia
  146. compute_capability: ">=9.0" # Hopper or later
  147. source: huggingface
  148. huggingface_repo_id: Qwen/Qwen3-14B-FP8
  149. backend: SGLang
  150. backend_parameters:
  151. - --reasoning-parser=qwen3
  152. - --context-length=32768
  153. - mode: throughput
  154. quantization: FP8
  155. gpu_filters:
  156. vendor: nvidia
  157. compute_capability: "<9.0" # Before Hopper
  158. source: huggingface
  159. huggingface_repo_id: Qwen/Qwen3-14B-FP8
  160. backend: vLLM
  161. backend_parameters:
  162. - --reasoning-parser=deepseek_r1
  163. - --max-model-len=32768
  164. - mode: standard
  165. quantization: BF16
  166. source: huggingface
  167. huggingface_repo_id: Qwen/Qwen3-14B
  168. backend: vLLM
  169. backend_parameters:
  170. - --reasoning-parser=deepseek_r1
  171. - --max-model-len=32768
  172. - name: Qwen3-32B
  173. description: Qwen3 is a family of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  174. home: https://qwenlm.github.io
  175. icon: /static/catalog_icons/qwen.png
  176. size: 32
  177. categories:
  178. - llm
  179. capabilities:
  180. - context/128K
  181. - tools
  182. licenses:
  183. - apache-2.0
  184. release_date: "2025-04-19"
  185. specs:
  186. # Ascend NPUs
  187. - mode: throughput
  188. quantization: BF16
  189. gpu_filters:
  190. vendor: ascend
  191. source: huggingface
  192. huggingface_repo_id: Qwen/Qwen3-32B
  193. backend: MindIE
  194. backend_parameters:
  195. - --max-seq-len=32768
  196. # Other GPUs
  197. - mode: throughput
  198. quantization: FP8
  199. source: huggingface
  200. huggingface_repo_id: Qwen/Qwen3-32B-FP8
  201. backend: vLLM
  202. backend_parameters:
  203. - --reasoning-parser=deepseek_r1
  204. - --max-model-len=32768
  205. - mode: standard
  206. quantization: BF16
  207. source: huggingface
  208. huggingface_repo_id: Qwen/Qwen3-32B
  209. backend: vLLM
  210. backend_parameters:
  211. - --reasoning-parser=deepseek_r1
  212. - --max-model-len=32768
  213. - name: Qwen3-Coder-Next
  214. description: Qwen3-Coder-Next is a super-efficient coding model with 80B total parameters and 3B activated parameters (MoE architecture). It achieves performance comparable to models with 10-20x more active parameters, excelling at long-horizon reasoning, complex tool usage, and IDE integration.
  215. home: https://qwenlm.github.io
  216. icon: /static/catalog_icons/qwen.png
  217. size: 80
  218. activated_size: 3
  219. categories:
  220. - llm
  221. capabilities:
  222. - context/256K
  223. - tools
  224. licenses:
  225. - apache-2.0
  226. release_date: "2026-02-03"
  227. specs:
  228. - mode: throughput
  229. quantization: FP8
  230. source: huggingface
  231. huggingface_repo_id: Qwen/Qwen3-Coder-Next-FP8
  232. backend: vLLM
  233. backend_parameters:
  234. - --max-model-len=65536
  235. - --enable-auto-tool-choice
  236. - --tool-call-parser=qwen3_coder
  237. - mode: standard
  238. quantization: BF16
  239. source: huggingface
  240. huggingface_repo_id: Qwen/Qwen3-Coder-Next
  241. backend: vLLM
  242. backend_parameters:
  243. - --max-model-len=65536
  244. - --enable-auto-tool-choice
  245. - --tool-call-parser=qwen3_coder
  246. - name: Qwen3-30B-A3B-Instruct-2507
  247. description: Qwen3 is a family of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  248. home: https://qwenlm.github.io
  249. icon: /static/catalog_icons/qwen.png
  250. size: 30
  251. activated_size: 3
  252. categories:
  253. - llm
  254. capabilities:
  255. - context/256K
  256. - tools
  257. licenses:
  258. - apache-2.0
  259. release_date: "2025-07-21"
  260. specs:
  261. # Ascend NPUs
  262. - mode: throughput
  263. quantization: BF16
  264. gpu_filters:
  265. vendor: ascend
  266. source: huggingface
  267. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507
  268. backend: MindIE
  269. backend_parameters:
  270. - --max-seq-len=32768
  271. # Other GPUs
  272. - mode: throughput
  273. quantization: FP8
  274. gpu_filters:
  275. vendor: nvidia
  276. compute_capability: ">=9.0" # Hopper or later
  277. source: huggingface
  278. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
  279. backend: SGLang
  280. backend_parameters:
  281. - --tool-call-parser=qwen25
  282. - --context-length=32768
  283. - mode: throughput
  284. quantization: FP8
  285. gpu_filters:
  286. vendor: nvidia
  287. compute_capability: "<9.0" # Before Hopper
  288. source: huggingface
  289. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
  290. backend: vLLM
  291. backend_parameters:
  292. - --tool-call-parser=hermes
  293. - --enable-auto-tool-choice
  294. - --max-model-len=32768
  295. - mode: standard
  296. quantization: BF16
  297. source: huggingface
  298. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507
  299. backend: vLLM
  300. backend_parameters:
  301. - --tool-call-parser=hermes
  302. - --enable-auto-tool-choice
  303. - --max-model-len=32768
  304. - name: Qwen3-30B-A3B-Thinking-2507
  305. description: Qwen3 is a family of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support.
  306. home: https://qwenlm.github.io
  307. icon: /static/catalog_icons/qwen.png
  308. size: 30
  309. activated_size: 3
  310. categories:
  311. - llm
  312. capabilities:
  313. - context/256K
  314. - tools
  315. licenses:
  316. - apache-2.0
  317. release_date: "2025-07-21"
  318. specs:
  319. # Ascend NPUs
  320. - mode: throughput
  321. quantization: BF16
  322. gpu_filters:
  323. vendor: ascend
  324. source: huggingface
  325. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507
  326. backend: MindIE
  327. backend_parameters:
  328. - --max-seq-len=32768
  329. # Other GPUs
  330. - mode: throughput
  331. quantization: FP8
  332. gpu_filters:
  333. vendor: nvidia
  334. compute_capability: ">=9.0" # Hopper or later
  335. source: huggingface
  336. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
  337. backend: SGLang
  338. backend_parameters:
  339. - --reasoning-parser=deepseek-r1
  340. - --tool-call-parser=qwen25
  341. - --context-length=32768
  342. - mode: throughput
  343. quantization: FP8
  344. gpu_filters:
  345. vendor: nvidia
  346. compute_capability: "<9.0" # Before Hopper
  347. source: huggingface
  348. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
  349. backend: vLLM
  350. backend_parameters:
  351. - --reasoning-parser=deepseek_r1
  352. - --tool-call-parser=hermes
  353. - --enable-auto-tool-choice
  354. - --max-model-len=32768
  355. - mode: standard
  356. quantization: BF16
  357. source: huggingface
  358. huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507
  359. backend: vLLM
  360. backend_parameters:
  361. - --reasoning-parser=deepseek_r1
  362. - --tool-call-parser=hermes
  363. - --enable-auto-tool-choice
  364. - --max-model-len=32768
  365. - name: Qwen3-235B-A22B-Instruct-2507
  366. description: The updated version of the Qwen3-235B-A22B non-thinking mode.
  367. home: https://qwenlm.github.io
  368. icon: /static/catalog_icons/qwen.png
  369. size: 235
  370. activated_size: 22
  371. categories:
  372. - llm
  373. capabilities:
  374. - context/1M
  375. - tools
  376. licenses:
  377. - apache-2.0
  378. release_date: "2025-07-21"
  379. specs:
  380. # Ascend NPUs
  381. - mode: throughput
  382. quantization: BF16
  383. gpu_filters:
  384. vendor: ascend
  385. source: huggingface
  386. huggingface_repo_id: Qwen/Qwen3-235B-A22B-Instruct-2507
  387. backend: MindIE
  388. backend_parameters:
  389. - --max-seq-len=65536
  390. # Other GPUs
  391. - mode: throughput
  392. quantization: FP8
  393. source: huggingface
  394. huggingface_repo_id: Qwen/Qwen3-235B-A22B-Instruct-2507-FP8
  395. backend: vLLM
  396. backend_parameters:
  397. - --tool-call-parser=hermes
  398. - --enable-auto-tool-choice
  399. - --max-model-len=65536
  400. - mode: standard
  401. quantization: BF16
  402. source: huggingface
  403. huggingface_repo_id: Qwen/Qwen3-235B-A22B-Instruct-2507
  404. backend: vLLM
  405. backend_parameters:
  406. - --tool-call-parser=hermes
  407. - --enable-auto-tool-choice
  408. - --max-model-len=65536
  409. - name: Qwen3-235B-A22B-Thinking-2507
  410. description: The updated version of the Qwen3-235B-A22B thinking mode.
  411. home: https://qwenlm.github.io
  412. icon: /static/catalog_icons/qwen.png
  413. size: 235
  414. activated_size: 22
  415. categories:
  416. - llm
  417. capabilities:
  418. - context/1M
  419. - tools
  420. licenses:
  421. - apache-2.0
  422. release_date: "2025-07-21"
  423. specs:
  424. # Ascend NPUs
  425. - mode: throughput
  426. quantization: BF16
  427. gpu_filters:
  428. vendor: ascend
  429. source: huggingface
  430. huggingface_repo_id: Qwen/Qwen3-235B-A22B-Thinking-2507
  431. backend: MindIE
  432. backend_parameters:
  433. - --max-seq-len=65536
  434. # Other GPUs
  435. - mode: throughput
  436. quantization: FP8
  437. source: huggingface
  438. huggingface_repo_id: Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
  439. backend: vLLM
  440. backend_parameters:
  441. - --reasoning-parser=deepseek_r1
  442. - --tool-call-parser=hermes
  443. - --enable-auto-tool-choice
  444. - --max-model-len=65536
  445. - mode: standard
  446. quantization: BF16
  447. source: huggingface
  448. huggingface_repo_id: Qwen/Qwen3-235B-A22B-Thinking-2507
  449. backend: vLLM
  450. backend_parameters:
  451. - --reasoning-parser=deepseek_r1
  452. - --tool-call-parser=hermes
  453. - --enable-auto-tool-choice
  454. - --max-model-len=65536
  455. - name: Qwen3.5-0.8B
  456. description: Qwen3.5-0.8B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks.
  457. home: https://qwenlm.github.io
  458. icon: /static/catalog_icons/qwen.png
  459. size: 0.8
  460. categories:
  461. - llm
  462. capabilities:
  463. - context/256K
  464. - reasoning
  465. - tools
  466. - vision
  467. licenses:
  468. - apache-2.0
  469. release_date: "2026-03-02"
  470. specs:
  471. # Ascend NPUs
  472. - mode: standard
  473. quantization: BF16
  474. gpu_filters:
  475. vendor: ascend
  476. source: huggingface
  477. huggingface_repo_id: Qwen/Qwen3.5-0.8B
  478. backend: SGLang
  479. backend_version: 0.5.9
  480. backend_parameters:
  481. - --context-length=32768
  482. - --disable-radix-cache
  483. - --chunked-prefill-size=4096
  484. - --max-prefill-tokens=4096
  485. - --max-total-tokens=40960
  486. - mode: standard
  487. quantization: BF16
  488. source: huggingface
  489. huggingface_repo_id: Qwen/Qwen3.5-0.8B
  490. backend: vLLM
  491. backend_version: 0.17.1
  492. backend_parameters:
  493. - --reasoning-parser=qwen3
  494. - --max-model-len=32768
  495. - name: Qwen3.5-2B
  496. description: Qwen3.5-2B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks.
  497. home: https://qwenlm.github.io
  498. icon: /static/catalog_icons/qwen.png
  499. size: 2
  500. categories:
  501. - llm
  502. capabilities:
  503. - context/256K
  504. - reasoning
  505. - tools
  506. - vision
  507. licenses:
  508. - apache-2.0
  509. release_date: "2026-03-02"
  510. specs:
  511. # Ascend NPUs
  512. - mode: standard
  513. quantization: BF16
  514. gpu_filters:
  515. vendor: ascend
  516. source: huggingface
  517. huggingface_repo_id: Qwen/Qwen3.5-2B
  518. backend: SGLang
  519. backend_version: 0.5.9
  520. backend_parameters:
  521. - --context-length=32768
  522. - --disable-radix-cache
  523. - --chunked-prefill-size=4096
  524. - --max-prefill-tokens=4096
  525. - --max-total-tokens=40960
  526. - mode: standard
  527. quantization: BF16
  528. source: huggingface
  529. huggingface_repo_id: Qwen/Qwen3.5-2B
  530. backend: vLLM
  531. backend_version: 0.17.1
  532. backend_parameters:
  533. - --reasoning-parser=qwen3
  534. - --max-model-len=32768
  535. - name: Qwen3.5-4B
  536. description: Qwen3.5-4B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks.
  537. home: https://qwenlm.github.io
  538. icon: /static/catalog_icons/qwen.png
  539. size: 4
  540. categories:
  541. - llm
  542. capabilities:
  543. - context/256K
  544. - reasoning
  545. - tools
  546. - vision
  547. licenses:
  548. - apache-2.0
  549. release_date: "2026-03-02"
  550. specs:
  551. # Ascend NPUs
  552. - mode: standard
  553. quantization: BF16
  554. gpu_filters:
  555. vendor: ascend
  556. source: huggingface
  557. huggingface_repo_id: Qwen/Qwen3.5-4B
  558. backend: SGLang
  559. backend_version: 0.5.9
  560. backend_parameters:
  561. - --reasoning-parser=qwen3
  562. - --context-length=32768
  563. - --disable-radix-cache
  564. - --chunked-prefill-size=4096
  565. - --max-prefill-tokens=4096
  566. - --max-total-tokens=40960
  567. - mode: standard
  568. quantization: BF16
  569. source: huggingface
  570. huggingface_repo_id: Qwen/Qwen3.5-4B
  571. backend: vLLM
  572. backend_version: 0.17.1
  573. backend_parameters:
  574. - --reasoning-parser=qwen3
  575. - --max-model-len=32768
  576. - name: Qwen3.5-9B
  577. description: Qwen3.5-9B is a model from the Qwen family, designed for strong reasoning, coding, and multilingual understanding with competitive performance across a wide range of tasks.
  578. home: https://qwenlm.github.io
  579. icon: /static/catalog_icons/qwen.png
  580. size: 9
  581. categories:
  582. - llm
  583. capabilities:
  584. - context/256K
  585. - reasoning
  586. - tools
  587. - vision
  588. licenses:
  589. - apache-2.0
  590. release_date: "2026-03-02"
  591. specs:
  592. # Ascend NPUs
  593. - mode: standard
  594. quantization: BF16
  595. gpu_filters:
  596. vendor: ascend
  597. source: huggingface
  598. huggingface_repo_id: Qwen/Qwen3.5-9B
  599. backend: SGLang
  600. backend_version: 0.5.9
  601. backend_parameters:
  602. - --reasoning-parser=qwen3
  603. - --context-length=32768
  604. - --disable-radix-cache
  605. - --chunked-prefill-size=4096
  606. - --max-prefill-tokens=4096
  607. - --max-total-tokens=40960
  608. - mode: throughput
  609. quantization: BF16
  610. source: huggingface
  611. huggingface_repo_id: Qwen/Qwen3.5-9B
  612. backend: vLLM
  613. backend_version: 0.17.1
  614. backend_parameters:
  615. - --reasoning-parser=qwen3
  616. - --max-model-len=32768
  617. - --performance-mode=throughput
  618. - --enable-prefix-caching
  619. - mode: latency
  620. quantization: BF16
  621. source: huggingface
  622. huggingface_repo_id: Qwen/Qwen3.5-9B
  623. backend: vLLM
  624. backend_version: 0.17.1
  625. backend_parameters:
  626. - --reasoning-parser=qwen3
  627. - --max-model-len=32768
  628. - --performance-mode=interactivity
  629. - --language-model-only
  630. speculative_config:
  631. enabled: true
  632. algorithm: mtp
  633. num_draft_tokens: 1
  634. - mode: standard
  635. quantization: BF16
  636. source: huggingface
  637. huggingface_repo_id: Qwen/Qwen3.5-9B
  638. backend: vLLM
  639. backend_version: 0.17.1
  640. backend_parameters:
  641. - --reasoning-parser=qwen3
  642. - --max-model-len=32768
  643. - name: Qwen3.5-27B
  644. description: Qwen3.5-27B is a model designed for strong reasoning, coding, and multilingual understanding with competitive performance across a wide range of tasks.
  645. home: https://qwenlm.github.io
  646. icon: /static/catalog_icons/qwen.png
  647. size: 27
  648. categories:
  649. - llm
  650. capabilities:
  651. - context/256K
  652. - reasoning
  653. - tools
  654. - vision
  655. licenses:
  656. - apache-2.0
  657. release_date: "2026-02-24"
  658. specs:
  659. # Ascend NPUs
  660. - mode: standard
  661. quantization: BF16
  662. gpu_filters:
  663. vendor: ascend
  664. source: huggingface
  665. huggingface_repo_id: Qwen/Qwen3.5-27B
  666. backend: SGLang
  667. backend_version: 0.5.9
  668. backend_parameters:
  669. - --reasoning-parser=qwen3
  670. - --context-length=32768
  671. - --disable-radix-cache
  672. - --chunked-prefill-size=4096
  673. - --max-prefill-tokens=4096
  674. - --max-total-tokens=40960
  675. - mode: standard
  676. quantization: BF16
  677. source: huggingface
  678. huggingface_repo_id: Qwen/Qwen3.5-27B
  679. backend: vLLM
  680. backend_version: 0.17.1
  681. backend_parameters:
  682. - --reasoning-parser=qwen3
  683. - --max-model-len=32768
  684. - mode: throughput
  685. quantization: FP8
  686. gpu_filters:
  687. vendor: nvidia
  688. compute_capability: ">=9.0"
  689. source: huggingface
  690. huggingface_repo_id: Qwen/Qwen3.5-27B-FP8
  691. backend: vLLM
  692. backend_version: 0.17.1
  693. backend_parameters:
  694. - --reasoning-parser=qwen3
  695. - --max-model-len=32768
  696. - --performance-mode=throughput
  697. - --enable-prefix-caching
  698. - name: Qwen3.5-35B-A3B
  699. description: Qwen3.5-35B-A3B is a 35-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks.
  700. home: https://qwenlm.github.io
  701. icon: /static/catalog_icons/qwen.png
  702. size: 35
  703. activated_size: 3
  704. categories:
  705. - llm
  706. capabilities:
  707. - context/256K
  708. - reasoning
  709. - tools
  710. - vision
  711. licenses:
  712. - apache-2.0
  713. release_date: "2026-02-24"
  714. specs:
  715. # Ascend NPUs
  716. - mode: standard
  717. quantization: BF16
  718. gpu_filters:
  719. vendor: ascend
  720. source: huggingface
  721. huggingface_repo_id: Qwen/Qwen3.5-35B-A3B
  722. backend: SGLang
  723. backend_version: 0.5.9
  724. backend_parameters:
  725. - --reasoning-parser=qwen3
  726. - --context-length=32768
  727. - --disable-radix-cache
  728. - --chunked-prefill-size=4096
  729. - --max-prefill-tokens=4096
  730. - --max-total-tokens=40960
  731. - mode: standard
  732. quantization: BF16
  733. source: huggingface
  734. huggingface_repo_id: Qwen/Qwen3.5-35B-A3B
  735. backend: vLLM
  736. backend_version: 0.17.1
  737. backend_parameters:
  738. - --reasoning-parser=qwen3
  739. - --max-model-len=32768
  740. - mode: throughput
  741. quantization: FP8
  742. gpu_filters:
  743. vendor: nvidia
  744. compute_capability: ">=9.0"
  745. source: huggingface
  746. huggingface_repo_id: Qwen/Qwen3.5-35B-A3B-FP8
  747. backend: vLLM
  748. backend_version: 0.17.1
  749. backend_parameters:
  750. - --reasoning-parser=qwen3
  751. - --max-model-len=32768
  752. - --performance-mode=throughput
  753. - --enable-prefix-caching
  754. - mode: latency
  755. quantization: FP8
  756. gpu_filters:
  757. vendor: nvidia
  758. compute_capability: ">=9.0"
  759. source: huggingface
  760. huggingface_repo_id: Qwen/Qwen3.5-35B-A3B-FP8
  761. backend: vLLM
  762. backend_version: 0.17.1
  763. backend_parameters:
  764. - --reasoning-parser=qwen3
  765. - --max-model-len=32768
  766. - --performance-mode=interactivity
  767. speculative_config:
  768. enabled: true
  769. algorithm: mtp
  770. num_draft_tokens: 1
  771. - name: Qwen3.5-122B-A10B
  772. description: Qwen3.5-122B-A10B is a 122-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks.
  773. home: https://qwenlm.github.io
  774. icon: /static/catalog_icons/qwen.png
  775. size: 122
  776. activated_size: 10
  777. categories:
  778. - llm
  779. capabilities:
  780. - context/256K
  781. - reasoning
  782. - tools
  783. - vision
  784. licenses:
  785. - apache-2.0
  786. release_date: "2026-02-24"
  787. specs:
  788. # Ascend NPUs
  789. - mode: standard
  790. quantization: BF16
  791. gpu_filters:
  792. vendor: ascend
  793. source: huggingface
  794. huggingface_repo_id: Qwen/Qwen3.5-122B-A10B
  795. backend: SGLang
  796. backend_version: 0.5.9
  797. backend_parameters:
  798. - --reasoning-parser=qwen3
  799. - --context-length=32768
  800. - --disable-radix-cache
  801. - --chunked-prefill-size=4096
  802. - --max-prefill-tokens=4096
  803. - --max-total-tokens=40960
  804. - mode: standard
  805. quantization: BF16
  806. source: huggingface
  807. huggingface_repo_id: Qwen/Qwen3.5-122B-A10B
  808. backend: vLLM
  809. backend_version: 0.17.1
  810. backend_parameters:
  811. - --reasoning-parser=qwen3
  812. - --max-model-len=32768
  813. - mode: throughput
  814. quantization: FP8
  815. gpu_filters:
  816. vendor: nvidia
  817. compute_capability: ">=9.0"
  818. source: huggingface
  819. huggingface_repo_id: Qwen/Qwen3.5-122B-A10B-FP8
  820. backend: vLLM
  821. backend_version: 0.17.1
  822. backend_parameters:
  823. - --reasoning-parser=qwen3
  824. - --max-model-len=32768
  825. - --performance-mode=throughput
  826. - --enable-prefix-caching
  827. - name: Qwen3.5-397B-A17B
  828. description: Qwen3.5-397B-A17B is a flagship MoE-hybrid model that delivers state-of-the-art reasoning and multimodal performance with ultra-efficient inference capabilities.
  829. home: https://qwenlm.github.io
  830. icon: /static/catalog_icons/qwen.png
  831. size: 397
  832. activated_size: 17
  833. categories:
  834. - llm
  835. capabilities:
  836. - context/256K
  837. - reasoning
  838. - tools
  839. - vision
  840. licenses:
  841. - apache-2.0
  842. release_date: "2026-02-16"
  843. specs:
  844. # Ascend NPUs
  845. - mode: standard
  846. quantization: BF16
  847. gpu_filters:
  848. vendor: ascend
  849. source: huggingface
  850. huggingface_repo_id: Qwen/Qwen3.5-397B-A17B
  851. backend: SGLang
  852. backend_version: 0.5.9
  853. backend_parameters:
  854. - --reasoning-parser=qwen3
  855. - --context-length=32768
  856. - --disable-radix-cache
  857. - --chunked-prefill-size=4096
  858. - --max-prefill-tokens=4096
  859. - --max-total-tokens=40960
  860. - mode: standard
  861. quantization: BF16
  862. source: huggingface
  863. huggingface_repo_id: Qwen/Qwen3.5-397B-A17B
  864. backend: vLLM
  865. backend_version: 0.17.1
  866. backend_parameters:
  867. - --reasoning-parser=qwen3
  868. - --max-model-len=32768
  869. - mode: throughput
  870. quantization: FP8
  871. gpu_filters:
  872. vendor: nvidia
  873. compute_capability: ">=9.0"
  874. source: huggingface
  875. huggingface_repo_id: Qwen/Qwen3.5-397B-A17B-FP8
  876. backend: vLLM
  877. backend_version: 0.17.1
  878. backend_parameters:
  879. - --reasoning-parser=qwen3
  880. - --max-model-len=32768
  881. - --performance-mode=throughput
  882. - --enable-prefix-caching
  883. - name: GLM-4.7
  884. description: GLM-4.7 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities.
  885. home: https://z.ai
  886. icon: /static/catalog_icons/zai.png
  887. size: 355
  888. activated_size: 32
  889. categories:
  890. - llm
  891. capabilities:
  892. - context/1M
  893. - reasoning
  894. - tools
  895. licenses:
  896. - mit
  897. release_date: "2025-12-22"
  898. specs:
  899. # TODO: tool-call-parser glm47 not yet available in the latest vLLM/SGLang release
  900. - mode: throughput
  901. quantization: FP8
  902. gpu_filters:
  903. vendor: nvidia
  904. compute_capability: ">=9.0" # Hopper or later
  905. source: huggingface
  906. huggingface_repo_id: zai-org/GLM-4.7-FP8
  907. backend: SGLang
  908. backend_parameters:
  909. - --reasoning-parser=glm45
  910. - --context-length=65536
  911. - mode: throughput
  912. quantization: FP8
  913. gpu_filters:
  914. vendor: nvidia
  915. compute_capability: "<9.0" # Before Hopper
  916. source: huggingface
  917. huggingface_repo_id: zai-org/GLM-4.7-FP8
  918. backend: vLLM
  919. backend_parameters:
  920. - --reasoning-parser=glm45
  921. - --max-model-len=65536
  922. - mode: standard
  923. quantization: BF16
  924. source: huggingface
  925. huggingface_repo_id: zai-org/GLM-4.7
  926. backend: vLLM
  927. backend_parameters:
  928. - --reasoning-parser=glm45
  929. - --max-model-len=65536
  930. - name: GLM-4.6
  931. description: GLM-4.6 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities.
  932. home: https://z.ai
  933. icon: /static/catalog_icons/zai.png
  934. size: 355
  935. activated_size: 32
  936. categories:
  937. - llm
  938. capabilities:
  939. - context/1M
  940. - reasoning
  941. - tools
  942. licenses:
  943. - mit
  944. release_date: "2025-09-30"
  945. specs:
  946. - mode: throughput
  947. quantization: FP8
  948. gpu_filters:
  949. vendor: nvidia
  950. compute_capability: ">=9.0" # Hopper or later
  951. source: huggingface
  952. huggingface_repo_id: zai-org/GLM-4.6-FP8
  953. backend: SGLang
  954. backend_parameters:
  955. - --tool-call-parser=glm
  956. - --reasoning-parser=glm45
  957. - --context-length=65536
  958. - mode: throughput
  959. quantization: FP8
  960. gpu_filters:
  961. vendor: nvidia
  962. compute_capability: "<9.0" # Before Hopper
  963. source: huggingface
  964. huggingface_repo_id: zai-org/GLM-4.6-FP8
  965. backend: vLLM
  966. backend_parameters:
  967. - --reasoning-parser=glm45
  968. - --tool-call-parser=glm45
  969. - --enable-auto-tool-choice
  970. - --max-model-len=65536
  971. - mode: standard
  972. quantization: BF16
  973. source: huggingface
  974. huggingface_repo_id: zai-org/GLM-4.6
  975. backend: vLLM
  976. backend_parameters:
  977. - --reasoning-parser=glm45
  978. - --tool-call-parser=glm45
  979. - --enable-auto-tool-choice
  980. - --max-model-len=65536
  981. - name: gpt-oss-120b
  982. description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases.
  983. home: https://openai.com
  984. icon: /static/catalog_icons/openai.png
  985. categories:
  986. - llm
  987. capabilities:
  988. - context/128K
  989. size: 120
  990. licenses:
  991. - apache-2.0
  992. release_date: "2025-08-05"
  993. specs:
  994. - mode: throughput
  995. quantization: "MXFP4"
  996. source: huggingface
  997. huggingface_repo_id: openai/gpt-oss-120b
  998. backend: vLLM
  999. backend_parameters:
  1000. - --max-model-len=32768
  1001. - --tool-call-parser=openai
  1002. - --enable-auto-tool-choice
  1003. - --async-scheduling
  1004. - mode: standard
  1005. quantization: "MXFP4"
  1006. source: huggingface
  1007. huggingface_repo_id: openai/gpt-oss-120b
  1008. backend: vLLM
  1009. backend_parameters:
  1010. - --max-model-len=32768
  1011. - --tool-call-parser=openai
  1012. - --enable-auto-tool-choice
  1013. - name: gpt-oss-20b
  1014. description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases.
  1015. home: https://openai.com
  1016. icon: /static/catalog_icons/openai.png
  1017. categories:
  1018. - llm
  1019. capabilities:
  1020. - context/128K
  1021. size: 20
  1022. licenses:
  1023. - apache-2.0
  1024. release_date: "2025-08-05"
  1025. specs:
  1026. - mode: throughput
  1027. quantization: "MXFP4"
  1028. source: huggingface
  1029. huggingface_repo_id: openai/gpt-oss-20b
  1030. backend: vLLM
  1031. backend_parameters:
  1032. - --max-model-len=32768
  1033. - --tool-call-parser=openai
  1034. - --enable-auto-tool-choice
  1035. - --async-scheduling
  1036. - mode: standard
  1037. quantization: "MXFP4"
  1038. source: huggingface
  1039. huggingface_repo_id: openai/gpt-oss-20b
  1040. backend: vLLM
  1041. backend_parameters:
  1042. - --max-model-len=32768
  1043. - --tool-call-parser=openai
  1044. - --enable-auto-tool-choice
  1045. - name: Deepseek-R1-0528
  1046. description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro.
  1047. home: https://www.deepseek.com
  1048. icon: /static/catalog_icons/deepseek.png
  1049. categories:
  1050. - llm
  1051. capabilities:
  1052. - context/128K
  1053. size: 671
  1054. licenses:
  1055. - mit
  1056. release_date: "2025-05-28"
  1057. specs:
  1058. - mode: throughput
  1059. quantization: FP8
  1060. gpu_filters:
  1061. vendor: nvidia
  1062. compute_capability: ">=9.0" # Hopper or later
  1063. source: huggingface
  1064. huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528
  1065. backend: SGLang
  1066. backend_parameters:
  1067. - --enable-dp-attention
  1068. - --context-length=32768
  1069. - mode: standard
  1070. quantization: FP8
  1071. source: huggingface
  1072. huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528
  1073. backend: vLLM
  1074. backend_parameters:
  1075. - --max-model-len=32768
  1076. - name: DeepSeek-OCR
  1077. description: DeepSeek-OCR is an advanced optical character recognition (OCR) model developed by DeepSeek AI. It is designed to accurately extract text from images and scanned documents.
  1078. home: https://www.deepseek.com
  1079. icon: /static/catalog_icons/deepseek.png
  1080. size: 3
  1081. categories:
  1082. - llm
  1083. licenses:
  1084. - mit
  1085. release_date: "2025-10-20"
  1086. specs:
  1087. - mode: standard
  1088. quantization: "BF16"
  1089. gpu_filters:
  1090. vendor:
  1091. - nvidia
  1092. - amd
  1093. source: huggingface
  1094. huggingface_repo_id: deepseek-ai/DeepSeek-OCR
  1095. backend: vLLM
  1096. backend_version: 0.11.2
  1097. backend_parameters:
  1098. - --logits_processors=vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor
  1099. - --no-enable-prefix-caching
  1100. - --mm-processor-cache-gb=0
  1101. - name: PaddleOCR-VL-1.5
  1102. description: PaddleOCR-VL-1.5 is an advanced optical character recognition (OCR) vision-language model developed by PaddlePaddle. It is designed to accurately extract and understand text from images and documents.
  1103. home: https://www.paddleocr.com
  1104. icon: /static/catalog_icons/paddlepaddle.jpeg
  1105. size: 0.9
  1106. categories:
  1107. - llm
  1108. capabilities:
  1109. - vision
  1110. licenses:
  1111. - apache-2.0
  1112. release_date: "2026-01-29"
  1113. specs:
  1114. - mode: standard
  1115. quantization: "BF16"
  1116. source: huggingface
  1117. huggingface_repo_id: PaddlePaddle/PaddleOCR-VL-1.5
  1118. backend: vLLM
  1119. backend_parameters:
  1120. - --trust-remote-code
  1121. - --max-num-batched-tokens=16384
  1122. - --no-enable-prefix-caching
  1123. - --mm-processor-cache-gb=0
  1124. - name: LightOnOCR-2-1B
  1125. description: LightOnOCR-2-1B is an efficient end-to-end vision-language model for optical character recognition (OCR), converting documents (PDFs, scans, images) into clean, naturally ordered text. It achieves state-of-the-art performance on OlmOCR-Bench while being significantly faster and more cost-effective than competitors.
  1126. home: https://www.lighton.ai
  1127. icon: /static/catalog_icons/lighton.png
  1128. size: 1
  1129. categories:
  1130. - llm
  1131. capabilities:
  1132. - vision
  1133. licenses:
  1134. - apache-2.0
  1135. release_date: "2026-01-19"
  1136. specs:
  1137. - mode: standard
  1138. quantization: "BF16"
  1139. source: huggingface
  1140. huggingface_repo_id: lightonai/LightOnOCR-2-1B
  1141. backend: vLLM
  1142. backend_parameters:
  1143. - '--limit-mm-per-prompt={"image": 1}'
  1144. - --mm-processor-cache-gb=0
  1145. - --no-enable-prefix-caching
  1146. - name: Deepseek-V3.2
  1147. description: 'DeepSeek-V3.2 is a model that balances computational efficiency with strong reasoning and agent capabilities through three technical innovations: DeepSeek Sparse Attention (DSA), a Scalable Reinforcement Learning Framework, and a Large-Scale Agentic Task Synthesis Pipeline.'
  1148. home: https://www.deepseek.com
  1149. icon: /static/catalog_icons/deepseek.png
  1150. categories:
  1151. - llm
  1152. capabilities:
  1153. - context/128K
  1154. size: 685
  1155. licenses:
  1156. - mit
  1157. release_date: "2025-12-01"
  1158. specs:
  1159. - mode: throughput
  1160. quantization: FP8
  1161. gpu_filters:
  1162. vendor: nvidia
  1163. compute_capability: ">=9.0" # Hopper or later
  1164. source: huggingface
  1165. huggingface_repo_id: deepseek-ai/DeepSeek-V3.2
  1166. backend: SGLang
  1167. backend_version: 0.5.6.post2
  1168. backend_parameters:
  1169. - --enable-dp-attention
  1170. - --context-length=65536
  1171. - --reasoning-parser=deepseek-v3
  1172. - --tool-call-parser=deepseek_v32
  1173. - --chat-template={data_dir}/chat_templates/tool_chat_template_deepseekv32.jinja
  1174. - mode: standard
  1175. quantization: FP8
  1176. source: huggingface
  1177. huggingface_repo_id: deepseek-ai/DeepSeek-V3.2
  1178. backend: vLLM
  1179. backend_version: 0.13.0
  1180. backend_parameters:
  1181. - --max-model-len=65536
  1182. - --tokenizer-mode=deepseek_v32
  1183. - --reasoning-parser=deepseek_v3
  1184. - --tool-call-parser=deepseek_v32
  1185. - --enable-auto-tool-choice
  1186. - name: Deepseek-V3.2-Speciale
  1187. description: This model, the high-compute variant of DeepSeek-V3.2, surpasses GPT-5 and matches Gemini-3.0-Pro in reasoning, achieving gold-medal-level performance in the 2025 IMO and IOI competitions.
  1188. home: https://www.deepseek.com
  1189. icon: /static/catalog_icons/deepseek.png
  1190. categories:
  1191. - llm
  1192. capabilities:
  1193. - context/128K
  1194. size: 685
  1195. licenses:
  1196. - mit
  1197. release_date: "2025-12-01"
  1198. specs:
  1199. - mode: throughput
  1200. quantization: FP8
  1201. gpu_filters:
  1202. vendor: nvidia
  1203. compute_capability: ">=9.0" # Hopper or later
  1204. source: huggingface
  1205. huggingface_repo_id: deepseek-ai/DeepSeek-V3.2-Speciale
  1206. backend: SGLang
  1207. backend_version: 0.5.6.post2
  1208. backend_parameters:
  1209. - --enable-dp-attention
  1210. - --context-length=65536
  1211. - --reasoning-parser=deepseek-v3
  1212. - mode: standard
  1213. quantization: FP8
  1214. source: huggingface
  1215. huggingface_repo_id: deepseek-ai/DeepSeek-V3.2-Speciale
  1216. backend: vLLM
  1217. backend_version: 0.13.0
  1218. backend_parameters:
  1219. - --max-model-len=65536
  1220. - --tokenizer-mode=deepseek_v32
  1221. - --reasoning-parser=deepseek_v3
  1222. - name: MiniMax-M2.1
  1223. description: MiniMax-M2.1 is a high-performance agentic model, optimized for robustness in coding, tool use, instruction following, and long-horizon planning. It excels in multilingual software development and complex multi-step workflows.
  1224. home: https://www.minimax.io
  1225. icon: /static/catalog_icons/minimax.png
  1226. size: 230
  1227. activated_size: 10
  1228. categories:
  1229. - llm
  1230. capabilities:
  1231. - context/192K
  1232. - tools
  1233. licenses:
  1234. - modified-mit
  1235. release_date: "2025-12-23"
  1236. specs:
  1237. - mode: standard
  1238. quantization: FP8
  1239. source: huggingface
  1240. huggingface_repo_id: MiniMaxAI/MiniMax-M2.1
  1241. backend: vLLM
  1242. backend_parameters:
  1243. - --max-model-len=65536
  1244. - --reasoning-parser=minimax_m2_append_think
  1245. - --tool-call-parser=minimax_m2
  1246. - --enable-auto-tool-choice
  1247. - --trust-remote-code
  1248. - name: MiniMax-M2.5
  1249. description: MiniMax-M2.5 is a powerful MoE (Mixture-of-Experts) model that delivers exceptional performance in logical reasoning, coding, and complex agent tasks through highly efficient inference.
  1250. home: https://www.minimax.io/
  1251. icon: /static/catalog_icons/minimax.png
  1252. size: 230
  1253. activated_size: 10
  1254. categories:
  1255. - llm
  1256. capabilities:
  1257. - context/196K
  1258. - reasoning
  1259. - tools
  1260. licenses:
  1261. - modified-mit
  1262. release_date: "2026-02-12"
  1263. specs:
  1264. - mode: standard
  1265. quantization: BF16
  1266. source: huggingface
  1267. huggingface_repo_id: MiniMaxAI/MiniMax-M2.5
  1268. backend: vLLM
  1269. backend_parameters:
  1270. - --max-model-len=65536
  1271. - --reasoning-parser=minimax_m2_append_think
  1272. - --tool-call-parser=minimax_m2
  1273. - --enable-auto-tool-choice
  1274. - --trust-remote-code
  1275. - --enable-expert-parallel
  1276. - name: Kimi-K2.5
  1277. description: Kimi-K2.5 is a multimodal mixture-of-experts model with 1T total parameters and 32B activated parameters. It features native INT4 quantization, vision support, dual operating modes (thinking/instant), agent swarm capabilities, and excels at visual reasoning, coding with vision, and complex tool orchestration.
  1278. home: https://www.moonshot.ai
  1279. icon: /static/catalog_icons/kimi.png
  1280. size: 1
  1281. size_unit: T
  1282. activated_size: 32
  1283. categories:
  1284. - llm
  1285. capabilities:
  1286. - context/256K
  1287. - vision
  1288. - tools
  1289. licenses:
  1290. - modified-mit
  1291. release_date: "2026-01-26"
  1292. specs:
  1293. - mode: standard
  1294. quantization: INT4
  1295. source: huggingface
  1296. huggingface_repo_id: moonshotai/Kimi-K2.5
  1297. backend: vLLM
  1298. backend_parameters:
  1299. - --max-model-len=65536
  1300. - --mm-encoder-tp-mode=data
  1301. - --tool-call-parser=kimi_k2
  1302. - --reasoning-parser=kimi_k2
  1303. - --trust-remote-code
  1304. - name: Step-3.5-Flash
  1305. description: Step-3.5-Flash is a fast, cost-effective multimodal model with 196B total parameters and 11B active parameters (MoE), optimized for quick inference. Built on StepFun's Step3 architecture, it delivers strong performance across text and vision tasks with efficient token usage.
  1306. home: https://www.stepfun.com
  1307. icon: /static/catalog_icons/stepfun.png
  1308. size: 196
  1309. activated_size: 11
  1310. categories:
  1311. - llm
  1312. capabilities:
  1313. - context/256K
  1314. - tools
  1315. licenses:
  1316. - apache-2.0
  1317. release_date: "2026-02-02"
  1318. specs:
  1319. - mode: throughput
  1320. quantization: FP8
  1321. source: huggingface
  1322. huggingface_repo_id: stepfun-ai/Step-3.5-Flash-FP8
  1323. backend: vLLM
  1324. backend_parameters:
  1325. - --max-model-len=65536
  1326. - --disable-cascade-attn
  1327. - --reasoning-parser=step3p5
  1328. - --enable-auto-tool-choice
  1329. - --tool-call-parser=step3p5
  1330. - --trust-remote-code
  1331. - --quantization=fp8
  1332. - mode: standard
  1333. quantization: BF16
  1334. source: huggingface
  1335. huggingface_repo_id: stepfun-ai/Step-3.5-Flash
  1336. backend: vLLM
  1337. backend_parameters:
  1338. - --max-model-len=65536
  1339. - --disable-cascade-attn
  1340. - --reasoning-parser=step3p5
  1341. - --enable-auto-tool-choice
  1342. - --tool-call-parser=step3p5
  1343. - --trust-remote-code
  1344. - name: Nanbeige4.1-3B
  1345. description: Nanbeige4.1-3B is a 3B-parameter language model from Nanbeige LLM Lab, optimized for long-context reasoning, agentic tasks, and tool use.
  1346. home: https://huggingface.co/Nanbeige
  1347. icon: /static/catalog_icons/nanbeige.png
  1348. size: 3
  1349. categories:
  1350. - llm
  1351. capabilities:
  1352. - context/256K
  1353. - reasoning
  1354. - tools
  1355. licenses:
  1356. - apache-2.0
  1357. release_date: "2026-02-13"
  1358. specs:
  1359. - mode: standard
  1360. quantization: BF16
  1361. source: huggingface
  1362. huggingface_repo_id: Nanbeige/Nanbeige4.1-3B
  1363. backend: vLLM
  1364. backend_parameters:
  1365. - --max-model-len=32768
  1366. # Embedding models
  1367. - name: Qwen3-Embedding-0.6B
  1368. description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1369. home: https://qwenlm.github.io
  1370. icon: /static/catalog_icons/qwen.png
  1371. size: 0.6
  1372. categories:
  1373. - embedding
  1374. capabilities:
  1375. - dimensions/1024
  1376. - max_tokens/32K
  1377. licenses:
  1378. - apache-2.0
  1379. release_date: "2025-06-09"
  1380. specs:
  1381. - mode: standard
  1382. quantization: "BF16"
  1383. source: huggingface
  1384. huggingface_repo_id: Qwen/Qwen3-Embedding-0.6B
  1385. categories:
  1386. - embedding
  1387. backend: vLLM
  1388. - name: Qwen3-Embedding-4B
  1389. description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1390. home: https://qwenlm.github.io
  1391. icon: /static/catalog_icons/qwen.png
  1392. size: 4
  1393. categories:
  1394. - embedding
  1395. capabilities:
  1396. - dimensions/2560
  1397. - max_tokens/32K
  1398. licenses:
  1399. - apache-2.0
  1400. release_date: "2025-06-09"
  1401. specs:
  1402. - mode: standard
  1403. quantization: "BF16"
  1404. source: huggingface
  1405. huggingface_repo_id: Qwen/Qwen3-Embedding-4B
  1406. categories:
  1407. - embedding
  1408. backend: vLLM
  1409. - name: Qwen3-Embedding-8B
  1410. description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1411. home: https://qwenlm.github.io
  1412. icon: /static/catalog_icons/qwen.png
  1413. size: 8
  1414. categories:
  1415. - embedding
  1416. capabilities:
  1417. - dimensions/4096
  1418. - max_tokens/32K
  1419. licenses:
  1420. - apache-2.0
  1421. release_date: "2025-06-09"
  1422. specs:
  1423. - mode: standard
  1424. quantization: "BF16"
  1425. source: huggingface
  1426. huggingface_repo_id: Qwen/Qwen3-Embedding-8B
  1427. categories:
  1428. - embedding
  1429. backend: vLLM
  1430. - name: Qwen3-VL-Embedding-2B
  1431. description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning across 30+ languages.
  1432. home: https://qwenlm.github.io
  1433. icon: /static/catalog_icons/qwen.png
  1434. size: 2
  1435. categories:
  1436. - embedding
  1437. capabilities:
  1438. - vision
  1439. - dimensions/2048
  1440. - max_tokens/32K
  1441. licenses:
  1442. - apache-2.0
  1443. release_date: "2026-01-08"
  1444. specs:
  1445. - mode: standard
  1446. quantization: "BF16"
  1447. source: huggingface
  1448. huggingface_repo_id: Qwen/Qwen3-VL-Embedding-2B
  1449. categories:
  1450. - embedding
  1451. backend: vLLM
  1452. backend_parameters:
  1453. - --runner=pooling
  1454. - name: Qwen3-VL-Embedding-8B
  1455. description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning across 30+ languages.
  1456. home: https://qwenlm.github.io
  1457. icon: /static/catalog_icons/qwen.png
  1458. size: 8
  1459. categories:
  1460. - embedding
  1461. capabilities:
  1462. - vision
  1463. - dimensions/4096
  1464. - max_tokens/32K
  1465. licenses:
  1466. - apache-2.0
  1467. release_date: "2026-01-08"
  1468. specs:
  1469. - mode: standard
  1470. quantization: "BF16"
  1471. source: huggingface
  1472. huggingface_repo_id: Qwen/Qwen3-VL-Embedding-8B
  1473. categories:
  1474. - embedding
  1475. backend: vLLM
  1476. backend_parameters:
  1477. - --runner=pooling
  1478. - name: BGE-M3
  1479. description: BGE-M3 is a new model from BAAI distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.
  1480. home: https://bge-model.com
  1481. icon: /static/catalog_icons/bge_logo.jpeg
  1482. categories:
  1483. - embedding
  1484. capabilities:
  1485. - dimensions/1024
  1486. - max_tokens/8192
  1487. size: 567
  1488. size_unit: M
  1489. licenses:
  1490. - mit
  1491. release_date: "2024-01-28"
  1492. specs:
  1493. - mode: standard
  1494. quantization: "BF16"
  1495. source: huggingface
  1496. huggingface_repo_id: BAAI/bge-m3
  1497. categories:
  1498. - embedding
  1499. backend: vLLM
  1500. - name: BGE-Large-ZH-V1.5
  1501. description: BGE is short for BAAI general embedding. This is a Chinese text embedding model with a more reasonable similarity distribution.
  1502. home: https://bge-model.com
  1503. icon: /static/catalog_icons/bge_logo.jpeg
  1504. categories:
  1505. - embedding
  1506. capabilities:
  1507. - dimensions/1024
  1508. - max_tokens/512
  1509. size: 335
  1510. size_unit: M
  1511. licenses:
  1512. - mit
  1513. release_date: "2023-09-12"
  1514. specs:
  1515. - mode: standard
  1516. quantization: "BF16"
  1517. source: huggingface
  1518. huggingface_repo_id: BAAI/bge-large-zh-v1.5
  1519. categories:
  1520. - embedding
  1521. backend: vLLM
  1522. - name: BGE-Large-EN-V1.5
  1523. description: BGE is short for BAAI general embedding. This is an English text embedding model with a more reasonable similarity distribution.
  1524. home: https://bge-model.com
  1525. icon: /static/catalog_icons/bge_logo.jpeg
  1526. categories:
  1527. - embedding
  1528. capabilities:
  1529. - dimensions/1024
  1530. - max_tokens/512
  1531. size: 335
  1532. size_unit: M
  1533. licenses:
  1534. - mit
  1535. release_date: "2023-09-12"
  1536. specs:
  1537. - mode: standard
  1538. quantization: "BF16"
  1539. source: huggingface
  1540. huggingface_repo_id: BAAI/bge-large-en-v1.5
  1541. categories:
  1542. - embedding
  1543. backend: vLLM
  1544. - name: Nomic-Embed-Text-V1.5
  1545. description: Nomic-embed-text is a large context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.
  1546. home: https://nomic.ai
  1547. icon: /static/catalog_icons/nomic.png
  1548. categories:
  1549. - embedding
  1550. capabilities:
  1551. - dimensions/768
  1552. - max_tokens/8192
  1553. size: 137
  1554. size_unit: M
  1555. licenses:
  1556. - apache-2.0
  1557. release_date: "2024-02-14"
  1558. specs:
  1559. - mode: standard
  1560. quantization: "BF16"
  1561. source: huggingface
  1562. huggingface_repo_id: nomic-ai/nomic-embed-text-v1.5
  1563. categories:
  1564. - embedding
  1565. backend: vLLM
  1566. backend_parameters:
  1567. - --trust-remote-code
  1568. - name: Jina-Embeddings-V3
  1569. description: jina-embeddings-v3 is a multilingual multi-task text embedding model designed for a variety of NLP applications. Based on the Jina-XLM-RoBERTa architecture, this model supports Rotary Position Embeddings to handle long input sequences up to 8192 tokens.
  1570. home: https://jina.ai
  1571. icon: /static/catalog_icons/jina.png
  1572. categories:
  1573. - embedding
  1574. capabilities:
  1575. - dimensions/1024
  1576. - max_tokens/8192
  1577. size: 570
  1578. size_unit: M
  1579. licenses:
  1580. - cc-by-nc-4.0
  1581. release_date: "2024-09-18"
  1582. specs:
  1583. - mode: standard
  1584. quantization: "BF16"
  1585. source: huggingface
  1586. huggingface_repo_id: jinaai/jina-embeddings-v3
  1587. categories:
  1588. - embedding
  1589. backend: vLLM
  1590. backend_parameters:
  1591. - --trust-remote-code
  1592. # Reranker models
  1593. - name: Qwen3-Reranker-0.6B
  1594. description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1595. home: https://qwenlm.github.io
  1596. icon: /static/catalog_icons/qwen.png
  1597. size: 0.6
  1598. categories:
  1599. - reranker
  1600. capabilities:
  1601. - max_tokens/32K
  1602. licenses:
  1603. - apache-2.0
  1604. release_date: "2025-06-09"
  1605. specs:
  1606. - mode: standard
  1607. quantization: "BF16"
  1608. source: huggingface
  1609. huggingface_repo_id: Qwen/Qwen3-Reranker-0.6B
  1610. categories:
  1611. - reranker
  1612. backend: vLLM
  1613. backend_parameters:
  1614. - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1615. - name: Qwen3-Reranker-4B
  1616. description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1617. home: https://qwenlm.github.io
  1618. icon: /static/catalog_icons/qwen.png
  1619. size: 4
  1620. categories:
  1621. - reranker
  1622. capabilities:
  1623. - max_tokens/32K
  1624. licenses:
  1625. - apache-2.0
  1626. release_date: "2025-06-09"
  1627. specs:
  1628. - mode: standard
  1629. quantization: "BF16"
  1630. source: huggingface
  1631. huggingface_repo_id: Qwen/Qwen3-Reranker-4B
  1632. categories:
  1633. - reranker
  1634. env:
  1635. GPUSTACK_APPLY_QWEN3_RERANKER_TEMPLATES: "true"
  1636. backend: vLLM
  1637. backend_parameters:
  1638. - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1639. - name: Qwen3-Reranker-8B
  1640. description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  1641. home: https://qwenlm.github.io
  1642. icon: /static/catalog_icons/qwen.png
  1643. size: 8
  1644. categories:
  1645. - reranker
  1646. capabilities:
  1647. - max_tokens/32K
  1648. licenses:
  1649. - apache-2.0
  1650. release_date: "2025-06-09"
  1651. specs:
  1652. - mode: standard
  1653. quantization: "BF16"
  1654. source: huggingface
  1655. huggingface_repo_id: Qwen/Qwen3-Reranker-8B
  1656. categories:
  1657. - reranker
  1658. backend: vLLM
  1659. backend_parameters:
  1660. - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1661. - name: Qwen3-VL-Reranker-2B
  1662. description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers.
  1663. home: https://qwenlm.github.io
  1664. icon: /static/catalog_icons/qwen.png
  1665. size: 2
  1666. categories:
  1667. - reranker
  1668. capabilities:
  1669. - vision
  1670. - max_tokens/32K
  1671. licenses:
  1672. - apache-2.0
  1673. release_date: "2026-01-08"
  1674. specs:
  1675. - mode: standard
  1676. quantization: "BF16"
  1677. source: huggingface
  1678. huggingface_repo_id: Qwen/Qwen3-VL-Reranker-2B
  1679. categories:
  1680. - reranker
  1681. backend: vLLM
  1682. backend_parameters:
  1683. - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1684. - name: Qwen3-VL-Reranker-8B
  1685. description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers, with the 8B model showing particularly strong results.
  1686. home: https://qwenlm.github.io
  1687. icon: /static/catalog_icons/qwen.png
  1688. size: 8
  1689. categories:
  1690. - reranker
  1691. capabilities:
  1692. - vision
  1693. - max_tokens/32K
  1694. licenses:
  1695. - apache-2.0
  1696. release_date: "2026-01-08"
  1697. specs:
  1698. - mode: standard
  1699. quantization: "BF16"
  1700. source: huggingface
  1701. huggingface_repo_id: Qwen/Qwen3-VL-Reranker-8B
  1702. categories:
  1703. - reranker
  1704. backend: vLLM
  1705. backend_parameters:
  1706. - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
  1707. - name: BGE-Reranker-V2-M3
  1708. description: BGE-Reranker-V2-M3 is a reranker model from BAAI.
  1709. home: https://bge-model.com
  1710. icon: /static/catalog_icons/bge_logo.jpeg
  1711. categories:
  1712. - reranker
  1713. size: 568
  1714. size_unit: M
  1715. licenses:
  1716. - apache-2.0
  1717. release_date: "2024-03-19"
  1718. specs:
  1719. - mode: standard
  1720. quantization: "BF16"
  1721. source: huggingface
  1722. huggingface_repo_id: BAAI/bge-reranker-v2-m3
  1723. categories:
  1724. - reranker
  1725. backend: vLLM
  1726. - name: Jina-Reranker-M0
  1727. description: Jina-Reranker-M0 is a multilingual multimodal document reranker model with 2.4B parameters. It accepts a query alongside visually rich documents and outputs ranked documents by relevance. Supports 29 languages and multimodal content including text, figures, tables, and infographics.
  1728. home: https://jina.ai
  1729. icon: /static/catalog_icons/jina.png
  1730. size: 2.4
  1731. categories:
  1732. - reranker
  1733. capabilities:
  1734. - max_tokens/10K
  1735. - vision
  1736. licenses:
  1737. - cc-by-nc-4.0
  1738. release_date: "2025-04-08"
  1739. specs:
  1740. - mode: standard
  1741. quantization: "BF16"
  1742. source: huggingface
  1743. huggingface_repo_id: jinaai/jina-reranker-m0
  1744. backend: vLLM
  1745. # Image models
  1746. - name: FLUX.1-dev
  1747. description: FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.
  1748. home: https://blackforestlabs.ai
  1749. icon: /static/catalog_icons/blackforestlabs.png
  1750. size: 12
  1751. categories:
  1752. - image
  1753. licenses:
  1754. - flux-1-dev-non-commercial-license
  1755. release_date: "2024-08-02"
  1756. specs:
  1757. - mode: standard
  1758. quantization: "BF16"
  1759. gpu_filters:
  1760. vendor: nvidia
  1761. source: huggingface
  1762. huggingface_repo_id: black-forest-labs/FLUX.1-dev
  1763. backend: SGLang
  1764. backend_version: 0.5.6.post2
  1765. env:
  1766. GPUSTACK_MODEL_VRAM_CLAIM: "37580963840" # 35 GiB, observed empirically
  1767. - name: FLUX.2-klein-4B
  1768. description: FLUX.2-klein-4B is a 4 billion parameter image generation model from Black Forest Labs.
  1769. home: https://blackforestlabs.ai
  1770. icon: /static/catalog_icons/blackforestlabs.png
  1771. size: 4
  1772. categories:
  1773. - image
  1774. licenses:
  1775. - apache-2.0
  1776. release_date: "2026-01-15"
  1777. .base_spec: &flux_2_klein_4b_base_spec
  1778. mode: standard
  1779. quantization: "BF16"
  1780. source: huggingface
  1781. huggingface_repo_id: black-forest-labs/FLUX.2-klein-4B
  1782. backend: vLLM
  1783. backend_parameters:
  1784. - --omni
  1785. specs:
  1786. - <<: *flux_2_klein_4b_base_spec
  1787. gpu_filters:
  1788. vendor: ascend
  1789. backend_version: *vllm_omni_ascend_stable_version
  1790. - <<: *flux_2_klein_4b_base_spec
  1791. backend_version: *vllm_omni_stable_version
  1792. - name: FLUX.2-klein-9B
  1793. description: FLUX.2-klein-9B is a 9 billion parameter image generation model from Black Forest Labs.
  1794. home: https://blackforestlabs.ai
  1795. icon: /static/catalog_icons/blackforestlabs.png
  1796. size: 9
  1797. categories:
  1798. - image
  1799. licenses:
  1800. - apache-2.0
  1801. release_date: "2026-01-15"
  1802. .base_spec: &flux_2_klein_9b_base_spec
  1803. mode: standard
  1804. quantization: "BF16"
  1805. source: huggingface
  1806. huggingface_repo_id: black-forest-labs/FLUX.2-klein-9B
  1807. backend: vLLM
  1808. backend_parameters:
  1809. - --omni
  1810. specs:
  1811. - <<: *flux_2_klein_9b_base_spec
  1812. gpu_filters:
  1813. vendor: ascend
  1814. backend_version: *vllm_omni_ascend_stable_version
  1815. - <<: *flux_2_klein_9b_base_spec
  1816. backend_version: *vllm_omni_stable_version
  1817. - name: Qwen-Image
  1818. description: Qwen-Image is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing.
  1819. home: https://qwen.ai
  1820. icon: /static/catalog_icons/qwen.png
  1821. size: 20
  1822. categories:
  1823. - image
  1824. licenses:
  1825. - apache-2.0
  1826. release_date: "2025-08-04"
  1827. .base_spec: &qwen_image_base_spec
  1828. mode: standard
  1829. quantization: "BF16"
  1830. source: huggingface
  1831. huggingface_repo_id: Qwen/Qwen-Image
  1832. backend: vLLM
  1833. backend_parameters:
  1834. - --omni
  1835. specs:
  1836. - <<: *qwen_image_base_spec
  1837. gpu_filters:
  1838. vendor: ascend
  1839. backend_version: *vllm_omni_ascend_stable_version
  1840. - <<: *qwen_image_base_spec
  1841. backend_version: *vllm_omni_stable_version
  1842. - name: Qwen-Image-Edit
  1843. description: Built upon the 20B Qwen-Image model, Qwen-Image-Edit successfully extends Qwen-Image's unique text rendering capabilities to image editing tasks, enabling precise text editing.
  1844. home: https://qwen.ai
  1845. icon: /static/catalog_icons/qwen.png
  1846. size: 20
  1847. categories:
  1848. - image
  1849. licenses:
  1850. - apache-2.0
  1851. release_date: "2025-08-19"
  1852. specs:
  1853. - mode: standard
  1854. quantization: "BF16"
  1855. gpu_filters:
  1856. vendor: nvidia
  1857. source: huggingface
  1858. huggingface_repo_id: Qwen/Qwen-Image-Edit
  1859. backend: SGLang
  1860. backend_version: 0.5.6.post2
  1861. - name: Qwen-Image-2512
  1862. description: Qwen-Image-2512 is the December update of Qwen-Image's text-to-image foundational model, delivering enhanced image generation capabilities.
  1863. home: https://qwen.ai
  1864. icon: /static/catalog_icons/qwen.png
  1865. size: 20
  1866. categories:
  1867. - image
  1868. licenses:
  1869. - apache-2.0
  1870. release_date: "2025-12-30"
  1871. .base_spec: &qwen_image_2512_base_spec
  1872. mode: standard
  1873. quantization: "BF16"
  1874. source: huggingface
  1875. huggingface_repo_id: Qwen/Qwen-Image-2512
  1876. backend: vLLM
  1877. backend_parameters:
  1878. - --omni
  1879. specs:
  1880. - <<: *qwen_image_2512_base_spec
  1881. gpu_filters:
  1882. vendor: ascend
  1883. backend_version: *vllm_omni_ascend_stable_version
  1884. - <<: *qwen_image_2512_base_spec
  1885. backend_version: *vllm_omni_stable_version
  1886. - name: Z-Image
  1887. description: Z-Image is the foundation model of the Z-Image family, engineered for good quality, robust generative diversity, broad stylistic coverage, and precise prompt adherence.
  1888. home: https://qwen.ai
  1889. icon: /static/catalog_icons/qwen.png
  1890. size: 6
  1891. categories:
  1892. - image
  1893. licenses:
  1894. - apache-2.0
  1895. release_date: "2026-01-28"
  1896. .base_spec: &z_image_base_spec
  1897. mode: standard
  1898. quantization: "BF16"
  1899. source: huggingface
  1900. huggingface_repo_id: Tongyi-MAI/Z-Image
  1901. backend: vLLM
  1902. backend_parameters:
  1903. - --omni
  1904. specs:
  1905. - <<: *z_image_base_spec
  1906. gpu_filters:
  1907. vendor: ascend
  1908. backend_version: *vllm_omni_ascend_stable_version
  1909. - <<: *z_image_base_spec
  1910. backend_version: *vllm_omni_stable_version
  1911. - name: Z-Image-Turbo
  1912. description: Z-Image is a powerful and highly efficient image generation model with 6B parameters.
  1913. home: https://qwen.ai
  1914. icon: /static/catalog_icons/qwen.png
  1915. size: 6
  1916. categories:
  1917. - image
  1918. licenses:
  1919. - apache-2.0
  1920. release_date: "2025-11-27"
  1921. .base_spec: &z_image_turbo_base_spec
  1922. mode: standard
  1923. quantization: "BF16"
  1924. source: huggingface
  1925. huggingface_repo_id: Tongyi-MAI/Z-Image-Turbo
  1926. backend: vLLM
  1927. backend_parameters:
  1928. - --omni
  1929. env:
  1930. GPUSTACK_MODEL_VRAM_CLAIM: "24696061952" # 23 GiB observed. Weight file size is 33 GiB in F32 while vLLM loads in BF16.
  1931. specs:
  1932. - <<: *z_image_turbo_base_spec
  1933. gpu_filters:
  1934. vendor: ascend
  1935. backend_version: *vllm_omni_ascend_stable_version
  1936. - <<: *z_image_turbo_base_spec
  1937. backend_version: *vllm_omni_stable_version
  1938. - name: Qwen3-VL-8B-Instruct
  1939. description: Qwen3-VL-8B-Instruct is a vision-language model that delivers comprehensive upgrades across text understanding, visual perception, and reasoning capabilities, supporting image/video/text unified understanding.
  1940. home: https://qwen.ai
  1941. icon: /static/catalog_icons/qwen.png
  1942. size: 8
  1943. categories:
  1944. - llm
  1945. capabilities:
  1946. - context/1M
  1947. - vision
  1948. licenses:
  1949. - apache-2.0
  1950. release_date: "2025-10-15"
  1951. specs:
  1952. - mode: standard
  1953. quantization: BF16
  1954. source: huggingface
  1955. huggingface_repo_id: Qwen/Qwen3-VL-8B-Instruct
  1956. backend: vLLM
  1957. backend_parameters:
  1958. - --max-model-len=65536
  1959. - name: Qwen3-VL-8B-Thinking
  1960. description: Qwen3-VL-8B-Thinking is a vision-language model that delivers comprehensive upgrades across text understanding, visual perception, and reasoning capabilities, supporting image/video/text unified understanding with thinking mode.
  1961. home: https://qwen.ai
  1962. icon: /static/catalog_icons/qwen.png
  1963. size: 8
  1964. categories:
  1965. - llm
  1966. capabilities:
  1967. - context/1M
  1968. - vision
  1969. licenses:
  1970. - apache-2.0
  1971. release_date: "2025-10-15"
  1972. specs:
  1973. - mode: standard
  1974. quantization: BF16
  1975. source: huggingface
  1976. huggingface_repo_id: Qwen/Qwen3-VL-8B-Thinking
  1977. backend: vLLM
  1978. backend_parameters:
  1979. - --max-model-len=65536
  1980. - name: Qwen3-VL-32B-Instruct
  1981. description: Qwen3-VL-32B-Instruct is a vision-language model featuring superior visual intelligence, enhanced spatial awareness capabilities, and OCR functionality.
  1982. home: https://qwen.ai
  1983. icon: /static/catalog_icons/qwen.png
  1984. size: 32
  1985. categories:
  1986. - llm
  1987. capabilities:
  1988. - context/1M
  1989. - vision
  1990. licenses:
  1991. - apache-2.0
  1992. release_date: "2025-10-21"
  1993. specs:
  1994. - mode: standard
  1995. quantization: BF16
  1996. source: huggingface
  1997. huggingface_repo_id: Qwen/Qwen3-VL-32B-Instruct
  1998. backend: vLLM
  1999. backend_parameters:
  2000. - --max-model-len=65536
  2001. - name: Qwen3-VL-32B-Thinking
  2002. description: Qwen3-VL-32B-Thinking is a vision-language model featuring superior visual intelligence, enhanced spatial awareness capabilities, and OCR functionality with thinking mode.
  2003. home: https://qwen.ai
  2004. icon: /static/catalog_icons/qwen.png
  2005. size: 32
  2006. categories:
  2007. - llm
  2008. capabilities:
  2009. - context/1M
  2010. - vision
  2011. licenses:
  2012. - apache-2.0
  2013. release_date: "2025-10-21"
  2014. specs:
  2015. - mode: standard
  2016. quantization: BF16
  2017. source: huggingface
  2018. huggingface_repo_id: Qwen/Qwen3-VL-32B-Thinking
  2019. backend: vLLM
  2020. backend_parameters:
  2021. - --max-model-len=65536
  2022. - name: Qwen3-VL-30B-A3B-Instruct
  2023. description: Qwen3-VL-30B-A3B-Instruct is a mixture-of-experts vision-language model with 30B total parameters and 3B active parameters, featuring advanced spatial perception, 2D and 3D grounding.
  2024. home: https://qwen.ai
  2025. icon: /static/catalog_icons/qwen.png
  2026. size: 30
  2027. activated_size: 3
  2028. categories:
  2029. - llm
  2030. capabilities:
  2031. - context/1M
  2032. - vision
  2033. licenses:
  2034. - apache-2.0
  2035. release_date: "2025-10-05"
  2036. specs:
  2037. - mode: standard
  2038. quantization: BF16
  2039. source: huggingface
  2040. huggingface_repo_id: Qwen/Qwen3-VL-30B-A3B-Instruct
  2041. backend: vLLM
  2042. backend_parameters:
  2043. - --max-model-len=65536
  2044. - name: Qwen3-VL-30B-A3B-Thinking
  2045. description: Qwen3-VL-30B-A3B-Thinking is a mixture-of-experts vision-language model with 30B total parameters and 3B active parameters, featuring advanced spatial perception, 2D and 3D grounding with thinking mode.
  2046. home: https://qwen.ai
  2047. icon: /static/catalog_icons/qwen.png
  2048. size: 30
  2049. activated_size: 3
  2050. categories:
  2051. - llm
  2052. capabilities:
  2053. - context/1M
  2054. - vision
  2055. licenses:
  2056. - apache-2.0
  2057. release_date: "2025-10-05"
  2058. specs:
  2059. - mode: standard
  2060. quantization: BF16
  2061. source: huggingface
  2062. huggingface_repo_id: Qwen/Qwen3-VL-30B-A3B-Thinking
  2063. backend: vLLM
  2064. backend_parameters:
  2065. - --max-model-len=65536
  2066. - name: Qwen3-VL-235B-A22B-Instruct
  2067. description: Qwen3-VL-235B-A22B-Instruct is the largest vision-language model in the Qwen3-VL series with 235B total parameters and 22B active parameters, featuring state-of-the-art visual understanding and reasoning capabilities.
  2068. home: https://qwen.ai
  2069. icon: /static/catalog_icons/qwen.png
  2070. size: 235
  2071. activated_size: 22
  2072. categories:
  2073. - llm
  2074. capabilities:
  2075. - context/1M
  2076. - vision
  2077. licenses:
  2078. - apache-2.0
  2079. release_date: "2025-09-23"
  2080. specs:
  2081. - mode: standard
  2082. quantization: BF16
  2083. source: huggingface
  2084. huggingface_repo_id: Qwen/Qwen3-VL-235B-A22B-Instruct
  2085. backend: vLLM
  2086. backend_parameters:
  2087. - --max-model-len=65536
  2088. - name: Qwen3-VL-235B-A22B-Thinking
  2089. description: Qwen3-VL-235B-A22B-Thinking is the largest vision-language model in the Qwen3-VL series with 235B total parameters and 22B active parameters, featuring state-of-the-art visual understanding and reasoning capabilities with thinking mode.
  2090. home: https://qwen.ai
  2091. icon: /static/catalog_icons/qwen.png
  2092. size: 235
  2093. activated_size: 22
  2094. categories:
  2095. - llm
  2096. capabilities:
  2097. - context/1M
  2098. - vision
  2099. licenses:
  2100. - apache-2.0
  2101. release_date: "2025-09-23"
  2102. specs:
  2103. - mode: standard
  2104. quantization: BF16
  2105. source: huggingface
  2106. huggingface_repo_id: Qwen/Qwen3-VL-235B-A22B-Thinking
  2107. backend: vLLM
  2108. backend_parameters:
  2109. - --max-model-len=65536
  2110. # Audio models
  2111. - name: CosyVoice2-0.5B
  2112. description: CosyVoice2-0.5B is a speech generation model. It supports multilingual speech synthesis with high naturalness and expressiveness.
  2113. home: https://github.com/FunAudioLLM
  2114. icon: /static/catalog_icons/FunAudioLLM.png
  2115. size: 0.5
  2116. categories:
  2117. - text_to_speech
  2118. licenses:
  2119. - apache-2.0
  2120. release_date: "2024-12-01"
  2121. specs:
  2122. - mode: standard
  2123. quantization: FP16
  2124. source: huggingface
  2125. huggingface_repo_id: gpustack/CosyVoice2-0.5B
  2126. backend: VoxBox
  2127. env:
  2128. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2129. - name: CosyVoice-300M
  2130. description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba.
  2131. home: https://github.com/FunAudioLLM
  2132. icon: /static/catalog_icons/FunAudioLLM.png
  2133. size: 300
  2134. size_unit: M
  2135. categories:
  2136. - text_to_speech
  2137. licenses:
  2138. - apache-2.0
  2139. release_date: "2024-07-05"
  2140. specs:
  2141. - mode: standard
  2142. quantization: FP16
  2143. source: huggingface
  2144. huggingface_repo_id: gpustack/CosyVoice-300M
  2145. backend: VoxBox
  2146. env:
  2147. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2148. - name: CosyVoice-300M-SFT
  2149. description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba.
  2150. home: https://github.com/FunAudioLLM
  2151. icon: /static/catalog_icons/FunAudioLLM.png
  2152. size: 300
  2153. size_unit: M
  2154. categories:
  2155. - text_to_speech
  2156. licenses:
  2157. - apache-2.0
  2158. release_date: "2024-07-05"
  2159. specs:
  2160. - mode: standard
  2161. quantization: FP16
  2162. source: huggingface
  2163. huggingface_repo_id: gpustack/CosyVoice-300M-SFT
  2164. backend: VoxBox
  2165. env:
  2166. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2167. - name: CosyVoice-300M-Instruct
  2168. description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba.
  2169. home: https://github.com/FunAudioLLM
  2170. icon: /static/catalog_icons/FunAudioLLM.png
  2171. size: 300
  2172. size_unit: M
  2173. categories:
  2174. - text_to_speech
  2175. licenses:
  2176. - apache-2.0
  2177. release_date: "2024-07-05"
  2178. specs:
  2179. - mode: standard
  2180. quantization: FP16
  2181. source: huggingface
  2182. huggingface_repo_id: gpustack/CosyVoice-300M-Instruct
  2183. backend: VoxBox
  2184. env:
  2185. GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
  2186. - name: Faster-Whisper-Large-V3
  2187. description: Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper Robust Speech Recognition via Large-Scale Weak Supervision by Alec Radford et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. This is the conversion of openai/whisper-large-v3 to the CTranslate2 model format.
  2188. home: https://huggingface.co/Systran
  2189. icon: /static/catalog_icons/systran.png
  2190. size: 1.55
  2191. categories:
  2192. - speech_to_text
  2193. licenses:
  2194. - mit
  2195. release_date: "2023-11-23"
  2196. specs:
  2197. - mode: standard
  2198. quantization: FP16
  2199. source: huggingface
  2200. huggingface_repo_id: Systran/faster-whisper-large-v3
  2201. backend: VoxBox
  2202. env:
  2203. GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, per OpenAI Whisper large reference VRAM.
  2204. - name: Faster-Whisper-Medium
  2205. description: Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. This is the conversion of openai/whisper-medium to the CTranslate2 model format.
  2206. home: https://huggingface.co/Systran
  2207. icon: /static/catalog_icons/systran.png
  2208. size: 769
  2209. size_unit: M
  2210. categories:
  2211. - speech_to_text
  2212. licenses:
  2213. - mit
  2214. release_date: "2023-03-23"
  2215. specs:
  2216. - mode: standard
  2217. quantization: FP16
  2218. source: huggingface
  2219. huggingface_repo_id: Systran/faster-whisper-medium
  2220. backend: VoxBox
  2221. env:
  2222. GPUSTACK_MODEL_VRAM_CLAIM: "5368709120" # 5 GiB, per OpenAI Whisper medium reference VRAM.
  2223. - name: Faster-Whisper-Small
  2224. description: Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. This is the conversion of openai/whisper-small to the CTranslate2 model format.
  2225. home: https://huggingface.co/Systran
  2226. icon: /static/catalog_icons/systran.png
  2227. size: 244
  2228. size_unit: M
  2229. categories:
  2230. - speech_to_text
  2231. licenses:
  2232. - mit
  2233. release_date: "2023-03-23"
  2234. specs:
  2235. - mode: standard
  2236. quantization: FP16
  2237. source: huggingface
  2238. huggingface_repo_id: Systran/faster-whisper-small
  2239. backend: VoxBox
  2240. env:
  2241. GPUSTACK_MODEL_VRAM_CLAIM: "2147483648" # 2 GiB, per OpenAI Whisper small reference VRAM.
  2242. - name: Whisper-Large-V3-Turbo
  2243. description: Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3. In other words, it's the exact same model, except that the number of decoding layers has been reduced from 32 to 4. As a result, the model is way faster, at the expense of a minor quality degradation.
  2244. home: https://openai.com
  2245. icon: /static/catalog_icons/openai.png
  2246. size: 809
  2247. size_unit: M
  2248. categories:
  2249. - speech_to_text
  2250. licenses:
  2251. - mit
  2252. release_date: "2024-10-01"
  2253. specs:
  2254. - mode: standard
  2255. quantization: BF16
  2256. source: huggingface
  2257. huggingface_repo_id: openai/whisper-large-v3-turbo
  2258. backend: vLLM
  2259. - name: Whisper-Large-V3
  2260. description: Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation. Trained on 5M hours of labeled data, Whisper large-v3 demonstrates strong ability to generalise to many datasets and domains in a zero-shot setting.
  2261. home: https://openai.com
  2262. icon: /static/catalog_icons/openai.png
  2263. size: 1.55
  2264. categories:
  2265. - speech_to_text
  2266. licenses:
  2267. - mit
  2268. release_date: "2023-11-06"
  2269. specs:
  2270. - mode: standard
  2271. quantization: BF16
  2272. source: huggingface
  2273. huggingface_repo_id: openai/whisper-large-v3
  2274. backend: vLLM
  2275. env:
  2276. GPUSTACK_MODEL_VRAM_CLAIM: "4294967296" # 4 GiB. The repo stores weight files in multiple formats so explicitly set VRAM claim to avoid over-allocation.
  2277. - name: Voxtral-Mini-3B-2507
  2278. description: Voxtral-Mini-3B-2507 is a speech-to-text model from Mistral AI, designed for automatic speech recognition with high accuracy and efficiency.
  2279. home: https://mistral.ai
  2280. icon: /static/catalog_icons/mistral.png
  2281. size: 3
  2282. categories:
  2283. - speech_to_text
  2284. licenses:
  2285. - apache-2.0
  2286. release_date: "2025-07-18"
  2287. specs:
  2288. - mode: standard
  2289. quantization: BF16
  2290. source: huggingface
  2291. huggingface_repo_id: mistralai/Voxtral-Mini-3B-2507
  2292. backend: vLLM
  2293. - name: Granite-Speech-3.3-2B
  2294. description: Granite-Speech-3.3-2B is a speech-to-text model from IBM, part of the Granite series, designed for automatic speech recognition with strong multilingual capabilities.
  2295. home: https://www.ibm.com
  2296. icon: /static/catalog_icons/ibm.png
  2297. size: 2
  2298. categories:
  2299. - speech_to_text
  2300. licenses:
  2301. - apache-2.0
  2302. release_date: "2025-06-19"
  2303. specs:
  2304. - mode: standard
  2305. quantization: BF16
  2306. source: huggingface
  2307. huggingface_repo_id: ibm-granite/granite-speech-3.3-2b
  2308. backend: vLLM
  2309. - name: Granite-Speech-3.3-8B
  2310. description: Granite-Speech-3.3-8B is a speech-to-text model from IBM, part of the Granite series, designed for automatic speech recognition with enhanced accuracy and multilingual support.
  2311. home: https://www.ibm.com
  2312. icon: /static/catalog_icons/ibm.png
  2313. size: 8
  2314. categories:
  2315. - speech_to_text
  2316. licenses:
  2317. - apache-2.0
  2318. release_date: "2025-06-19"
  2319. specs:
  2320. - mode: standard
  2321. quantization: BF16
  2322. source: huggingface
  2323. huggingface_repo_id: ibm-granite/granite-speech-3.3-8b
  2324. backend: vLLM
  2325. - name: Qwen3-ASR-1.7B
  2326. description: Qwen3-ASR-1.7B supports language identification and ASR for 52 languages and dialects. It leverages large-scale speech training data and the strong audio understanding capability of its foundation model, Qwen3-Omni.
  2327. home: https://qwen.ai
  2328. icon: /static/catalog_icons/qwen.png
  2329. size: 1.7
  2330. categories:
  2331. - speech_to_text
  2332. licenses:
  2333. - apache-2.0
  2334. release_date: "2026-01-29"
  2335. specs:
  2336. - mode: standard
  2337. quantization: BF16
  2338. source: huggingface
  2339. huggingface_repo_id: Qwen/Qwen3-ASR-1.7B
  2340. backend: vLLM
  2341. categories:
  2342. - speech_to_text
  2343. - name: Qwen3-ASR-0.6B
  2344. description: Qwen3-ASR-0.6B supports language identification and ASR for 52 languages and dialects. It leverages large-scale speech training data and the strong audio understanding capability of its foundation model, Qwen3-Omni.
  2345. home: https://qwen.ai
  2346. icon: /static/catalog_icons/qwen.png
  2347. size: 0.6
  2348. categories:
  2349. - speech_to_text
  2350. licenses:
  2351. - apache-2.0
  2352. release_date: "2026-01-29"
  2353. specs:
  2354. - mode: standard
  2355. quantization: BF16
  2356. source: huggingface
  2357. huggingface_repo_id: Qwen/Qwen3-ASR-0.6B
  2358. backend: vLLM
  2359. categories:
  2360. - speech_to_text
  2361. - name: Dia-1.6B
  2362. description: Dia is a text-to-speech model created by Nari Labs. Dia directly generates highly realistic dialogue from a transcript. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal communications like laughter, coughing, clearing throat, etc.
  2363. home: https://narilabs.org
  2364. icon: /static/catalog_icons/narilabs.png
  2365. size: 1.6
  2366. categories:
  2367. - text_to_speech
  2368. licenses:
  2369. - apache-2.0
  2370. release_date: "2025-04-21"
  2371. specs:
  2372. - mode: standard
  2373. quantization: FP32
  2374. source: huggingface
  2375. huggingface_repo_id: nari-labs/Dia-1.6B
  2376. backend: VoxBox
  2377. env:
  2378. GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, Dia model empirical estimate.
  2379. - name: Qwen3-TTS-12Hz-1.7B-Base
  2380. description: Qwen3-TTS-12Hz-1.7B-Base is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting 12kHz audio generation.
  2381. home: https://qwen.ai
  2382. icon: /static/catalog_icons/qwen.png
  2383. size: 1.7
  2384. categories:
  2385. - text_to_speech
  2386. licenses:
  2387. - apache-2.0
  2388. release_date: "2026-01-22"
  2389. .base_spec: &qwen3_tts_12hz_1_7b_base_base_spec
  2390. mode: standard
  2391. quantization: "BF16"
  2392. source: huggingface
  2393. huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-1.7B-Base
  2394. backend: vLLM
  2395. backend_parameters:
  2396. - --omni
  2397. specs:
  2398. - <<: *qwen3_tts_12hz_1_7b_base_base_spec
  2399. gpu_filters:
  2400. vendor: ascend
  2401. backend_version: *vllm_omni_ascend_stable_version
  2402. - <<: *qwen3_tts_12hz_1_7b_base_base_spec
  2403. backend_version: *vllm_omni_stable_version
  2404. - name: Qwen3-TTS-12Hz-1.7B-CustomVoice
  2405. description: Qwen3-TTS-12Hz-1.7B-CustomVoice is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting custom voice cloning and 12kHz audio generation.
  2406. home: https://qwen.ai
  2407. icon: /static/catalog_icons/qwen.png
  2408. size: 1.7
  2409. categories:
  2410. - text_to_speech
  2411. licenses:
  2412. - apache-2.0
  2413. release_date: "2026-01-22"
  2414. .base_spec: &qwen3_tts_12hz_1_7b_customvoice_base_spec
  2415. mode: standard
  2416. quantization: "BF16"
  2417. source: huggingface
  2418. huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
  2419. backend: vLLM
  2420. backend_parameters:
  2421. - --omni
  2422. specs:
  2423. - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec
  2424. gpu_filters:
  2425. vendor: ascend
  2426. backend_version: *vllm_omni_ascend_stable_version
  2427. - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec
  2428. backend_version: *vllm_omni_stable_version
  2429. - name: Qwen3-TTS-12Hz-1.7B-VoiceDesign
  2430. description: Qwen3-TTS-12Hz-1.7B-VoiceDesign is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting voice design and 12kHz audio generation.
  2431. home: https://qwen.ai
  2432. icon: /static/catalog_icons/qwen.png
  2433. size: 1.7
  2434. categories:
  2435. - text_to_speech
  2436. licenses:
  2437. - apache-2.0
  2438. release_date: "2026-01-22"
  2439. .base_spec: &qwen3_tts_12hz_1_7b_voicedesign_base_spec
  2440. mode: standard
  2441. quantization: "BF16"
  2442. source: huggingface
  2443. huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign
  2444. backend: vLLM
  2445. backend_parameters:
  2446. - --omni
  2447. specs:
  2448. - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec
  2449. gpu_filters:
  2450. vendor: ascend
  2451. backend_version: *vllm_omni_ascend_stable_version
  2452. - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec
  2453. backend_version: *vllm_omni_stable_version
  2454. - name: Qwen3-TTS-12Hz-0.6B-Base
  2455. description: Qwen3-TTS-12Hz-0.6B-Base is a text-to-speech model from the Qwen3-TTS series with 0.6B parameters, supporting 12kHz audio generation.
  2456. home: https://qwen.ai
  2457. icon: /static/catalog_icons/qwen.png
  2458. size: 0.6
  2459. categories:
  2460. - text_to_speech
  2461. licenses:
  2462. - apache-2.0
  2463. release_date: "2026-01-22"
  2464. .base_spec: &qwen3_tts_12hz_0_6b_base_base_spec
  2465. mode: standard
  2466. quantization: "BF16"
  2467. source: huggingface
  2468. huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-0.6B-Base
  2469. backend: vLLM
  2470. backend_parameters:
  2471. - --omni
  2472. specs:
  2473. - <<: *qwen3_tts_12hz_0_6b_base_base_spec
  2474. gpu_filters:
  2475. vendor: ascend
  2476. backend_version: *vllm_omni_ascend_stable_version
  2477. - <<: *qwen3_tts_12hz_0_6b_base_base_spec
  2478. backend_version: *vllm_omni_stable_version
  2479. - name: Qwen3-TTS-12Hz-0.6B-CustomVoice
  2480. description: Qwen3-TTS-12Hz-0.6B-CustomVoice is a text-to-speech model from the Qwen3-TTS series with 0.6B parameters, supporting custom voice cloning and built on the 12Hz speech tokenizer.
  2481. home: https://qwen.ai
  2482. icon: /static/catalog_icons/qwen.png
  2483. size: 0.6
  2484. categories:
  2485. - text_to_speech
  2486. licenses:
  2487. - apache-2.0
  2488. release_date: "2026-01-22"
  2489. .base_spec: &qwen3_tts_12hz_0_6b_customvoice_base_spec
  2490. mode: standard
  2491. quantization: "BF16"
  2492. source: huggingface
  2493. huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
  2494. backend: vLLM
  2495. backend_parameters:
  2496. - --omni
  2497. specs:
  2498. - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec
  2499. gpu_filters:
  2500. vendor: ascend
  2501. backend_version: *vllm_omni_ascend_stable_version
  2502. - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec
  2503. backend_version: *vllm_omni_stable_version
  2504. - name: SenseVoice-Small
  2505. description: SenseVoice is a speech foundation model with multiple speech understanding capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and audio event detection (AED).
  2506. home: https://github.com/FunAudioLLM
  2507. icon: /static/catalog_icons/FunAudioLLM.png
  2508. categories:
  2509. - speech_to_text
  2510. licenses:
  2511. - apache-2.0
  2512. release_date: "2024-07-31"
  2513. specs:
  2514. - mode: standard
  2515. quantization: FP16
  2516. source: huggingface
  2517. huggingface_repo_id: FunAudioLLM/SenseVoiceSmall
  2518. backend: VoxBox
  2519. env:
  2520. GPUSTACK_MODEL_VRAM_CLAIM: "12884901888" # 12 GiB. The VRAM actually needed depends on the audio length; this value works for ~10 minutes of audio input.