# Maas2-group / maas-base
# YAML Variables
# Dot-prefixed top-level keys that define YAML anchors holding version strings
# (by their names, vLLM-Omni versions for Ascend and for other devices).
# NOTE(review): no alias to either anchor appears in this part of the file;
# presumably they are referenced further down - confirm before removing.
.vllm_omni_ascend_stable_version: &vllm_omni_ascend_stable_version "0.14.1"
.vllm_omni_stable_version: &vllm_omni_stable_version "0.16.0"

# Draft models. Every entry listed here uses the eagle3 algorithm and is
# fetched from a Hugging Face repository.
# NOTE(review): no spec in this part of the file refers to these entries by
# name; presumably the catalog consumer pairs them with target models - confirm.
draft_models:
- name: Qwen3-8B-EAGLE3
  algorithm: eagle3
  source: huggingface
  huggingface_repo_id: Tengyunw/qwen3_8b_eagle3
- name: Qwen3-30B-A3B-EAGLE3
  algorithm: eagle3
  source: huggingface
  huggingface_repo_id: Tengyunw/qwen3_30b_moe_eagle3
- name: Qwen3-235B-A22B-EAGLE3
  algorithm: eagle3
  source: huggingface
  huggingface_repo_id: lmsys/Qwen3-235B-A22B-EAGLE3
- name: gpt-oss-120b-EAGLE3
  algorithm: eagle3
  source: huggingface
  huggingface_repo_id: lmsys/EAGLE3-gpt-oss-120b-bf16
model_sets:
# Qwen3-0.6B: one MindIE spec restricted to Ascend devices and one vLLM spec
# without gpu_filters; both pass a length limit of 8192 to their backend.
- name: Qwen3-0.6B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3 is a family of large language models in Qwen series, offering a
    comprehensive suite of dense and mixture-of-experts (MoE) models. Built
    upon extensive training, Qwen3 delivers groundbreaking advancements in
    reasoning, instruction-following, agent capabilities, and multilingual
    support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 0.6
  categories:
    - llm
  capabilities:
    # NOTE(review): the Qwen3-0.6B model card appears to list a 32K context
    # window - confirm that 128K is intended for this size.
    - context/128K
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-04-19"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-0.6B
      backend: MindIE
      backend_parameters:
        - --max-seq-len=8192
    # Other GPUs
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-0.6B
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=8192
# Qwen3-8B: a BF16 MindIE spec restricted to Ascend devices, followed by two
# vLLM specs without gpu_filters (FP8 checkpoint for throughput, BF16 for
# standard). All three pass a length limit of 32768.
- name: Qwen3-8B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3 is a family of large language models in Qwen series, offering a
    comprehensive suite of dense and mixture-of-experts (MoE) models. Built
    upon extensive training, Qwen3 delivers groundbreaking advancements in
    reasoning, instruction-following, agent capabilities, and multilingual
    support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories:
    - llm
  capabilities:
    - context/128K
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-04-19"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-8B
      backend: MindIE
      backend_parameters:
        - --max-seq-len=32768
    # Other GPUs
    - mode: throughput
      quantization: FP8
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-8B-FP8
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=32768
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-8B
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=32768
# Falcon-H1R-7B: a single BF16 vLLM spec without gpu_filters.
- name: Falcon-H1R-7B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Falcon-H1R-7B is a reasoning-specialized language model built on top of
    Falcon-H1-7B-Base, featuring a Hybrid-Head Language Model (Transformer-SSM)
    architecture that delivers outstanding performance in mathematics,
    programming, and instruction following.
  home: https://huggingface.co/tiiuae
  icon: /static/catalog_icons/tii.png
  size: 7
  categories:
    - llm
  capabilities:
    - context/256K
  licenses:
    - falcon-llm-license
  release_date: "2026-01-05"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: tiiuae/Falcon-H1R-7B
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=65536
# Qwen3-14B: the FP8 checkpoint is split by NVIDIA compute capability
# (SGLang for >=9.0, vLLM for <9.0); the BF16 checkpoint has an Ascend/MindIE
# spec and a vLLM spec without gpu_filters.
- name: Qwen3-14B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3 is a family of large language models in Qwen series, offering a
    comprehensive suite of dense and mixture-of-experts (MoE) models. Built
    upon extensive training, Qwen3 delivers groundbreaking advancements in
    reasoning, instruction-following, agent capabilities, and multilingual
    support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 14
  categories:
    - llm
  capabilities:
    - context/128K
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-04-19"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-14B
      backend: MindIE
      backend_parameters:
        - --max-seq-len=32768
    # Other GPUs
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-14B-FP8
      backend: SGLang
      # The SGLang spec uses different flag and parser names than the vLLM
      # specs below (qwen3 / --context-length vs deepseek_r1 / --max-model-len).
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: "<9.0" # Before Hopper
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-14B-FP8
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=32768
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-14B
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=32768
# Qwen3-32B: a BF16 MindIE spec restricted to Ascend devices, followed by two
# vLLM specs without gpu_filters (FP8 checkpoint for throughput, BF16 for
# standard). All three pass a length limit of 32768.
- name: Qwen3-32B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3 is a family of large language models in Qwen series, offering a
    comprehensive suite of dense and mixture-of-experts (MoE) models. Built
    upon extensive training, Qwen3 delivers groundbreaking advancements in
    reasoning, instruction-following, agent capabilities, and multilingual
    support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 32
  categories:
    - llm
  capabilities:
    - context/128K
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-04-19"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-32B
      backend: MindIE
      backend_parameters:
        - --max-seq-len=32768
    # Other GPUs
    - mode: throughput
      quantization: FP8
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-32B-FP8
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=32768
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-32B
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --max-model-len=32768
# Qwen3-Coder-Next: two vLLM specs without gpu_filters (FP8 checkpoint for
# throughput, BF16 for standard) that share the same backend parameters.
- name: Qwen3-Coder-Next
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3-Coder-Next is a super-efficient coding model with 80B total
    parameters and 3B activated parameters (MoE architecture). It achieves
    performance comparable to models with 10-20x more active parameters,
    excelling at long-horizon reasoning, complex tool usage, and IDE
    integration.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 80
  activated_size: 3
  categories:
    - llm
  capabilities:
    - context/256K
    - tools
  licenses:
    - apache-2.0
  release_date: "2026-02-03"
  specs:
    - mode: throughput
      quantization: FP8
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Coder-Next-FP8
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --enable-auto-tool-choice
        - --tool-call-parser=qwen3_coder
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Coder-Next
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --enable-auto-tool-choice
        - --tool-call-parser=qwen3_coder
# Qwen3-30B-A3B-Instruct-2507: the FP8 checkpoint is split by NVIDIA compute
# capability (SGLang for >=9.0, vLLM for <9.0); the BF16 checkpoint has an
# Ascend/MindIE spec and a vLLM spec without gpu_filters.
- name: Qwen3-30B-A3B-Instruct-2507
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3 is a family of large language models in Qwen series, offering a
    comprehensive suite of dense and mixture-of-experts (MoE) models. Built
    upon extensive training, Qwen3 delivers groundbreaking advancements in
    reasoning, instruction-following, agent capabilities, and multilingual
    support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 30
  activated_size: 3
  categories:
    - llm
  capabilities:
    - context/256K
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-07-21"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507
      backend: MindIE
      backend_parameters:
        - --max-seq-len=32768
    # Other GPUs
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
      backend: SGLang
      # The SGLang spec names its tool-call parser qwen25, while the vLLM
      # specs below use hermes together with --enable-auto-tool-choice.
      backend_parameters:
        - --tool-call-parser=qwen25
        - --context-length=32768
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: "<9.0" # Before Hopper
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8
      backend: vLLM
      backend_parameters:
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=32768
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Instruct-2507
      backend: vLLM
      backend_parameters:
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=32768
# Qwen3-30B-A3B-Thinking-2507: the FP8 checkpoint is split by NVIDIA compute
# capability (SGLang for >=9.0, vLLM for <9.0); the BF16 checkpoint has an
# Ascend/MindIE spec and a vLLM spec without gpu_filters.
- name: Qwen3-30B-A3B-Thinking-2507
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3 is a family of large language models in Qwen series, offering a
    comprehensive suite of dense and mixture-of-experts (MoE) models. Built
    upon extensive training, Qwen3 delivers groundbreaking advancements in
    reasoning, instruction-following, agent capabilities, and multilingual
    support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 30
  activated_size: 3
  categories:
    - llm
  capabilities:
    - context/256K
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-07-21"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507
      backend: MindIE
      backend_parameters:
        - --max-seq-len=32768
    # Other GPUs
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
      backend: SGLang
      # NOTE(review): the parser is spelled with a hyphen here (deepseek-r1)
      # but with an underscore in the vLLM specs below (deepseek_r1);
      # presumably each backend has its own parser names - verify when editing.
      backend_parameters:
        - --reasoning-parser=deepseek-r1
        - --tool-call-parser=qwen25
        - --context-length=32768
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: "<9.0" # Before Hopper
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=32768
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-30B-A3B-Thinking-2507
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=32768
# Qwen3-235B-A22B-Instruct-2507: a BF16 MindIE spec restricted to Ascend
# devices, followed by two vLLM specs without gpu_filters (FP8 checkpoint for
# throughput, BF16 for standard). All three pass a length limit of 65536.
- name: Qwen3-235B-A22B-Instruct-2507
  description: The updated version of the Qwen3-235B-A22B non-thinking mode.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 235
  activated_size: 22
  categories:
    - llm
  capabilities:
    - context/1M
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-07-21"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-235B-A22B-Instruct-2507
      backend: MindIE
      backend_parameters:
        - --max-seq-len=65536
    # Other GPUs
    - mode: throughput
      quantization: FP8
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-235B-A22B-Instruct-2507-FP8
      backend: vLLM
      # No --reasoning-parser is passed in this entry's vLLM specs; only the
      # tool-calling flags and the length limit are set.
      backend_parameters:
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=65536
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-235B-A22B-Instruct-2507
      backend: vLLM
      backend_parameters:
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=65536
# Qwen3-235B-A22B-Thinking-2507: a BF16 MindIE spec restricted to Ascend
# devices, followed by two vLLM specs without gpu_filters (FP8 checkpoint for
# throughput, BF16 for standard). All three pass a length limit of 65536.
- name: Qwen3-235B-A22B-Thinking-2507
  description: The updated version of the Qwen3-235B-A22B thinking mode.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 235
  activated_size: 22
  categories:
    - llm
  capabilities:
    - context/1M
    - tools
  licenses:
    - apache-2.0
  release_date: "2025-07-21"
  specs:
    # Ascend NPUs
    - mode: throughput
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-235B-A22B-Thinking-2507
      backend: MindIE
      backend_parameters:
        - --max-seq-len=65536
    # Other GPUs
    - mode: throughput
      quantization: FP8
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-235B-A22B-Thinking-2507-FP8
      backend: vLLM
      # Both vLLM specs pass a reasoning parser in addition to the
      # tool-calling flags and the length limit.
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=65536
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-235B-A22B-Thinking-2507
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=deepseek_r1
        - --tool-call-parser=hermes
        - --enable-auto-tool-choice
        - --max-model-len=65536
# Qwen3.5-0.8B: an SGLang spec restricted to Ascend devices and a vLLM spec
# without gpu_filters, both for the BF16 checkpoint. Each spec pins a backend
# version; the versions are quoted so they always load as strings.
- name: Qwen3.5-0.8B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3.5-0.8B is a compact language model from the Qwen family, designed
    for efficient reasoning, coding, and multilingual understanding across
    diverse tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 0.8
  categories:
    - llm
  capabilities:
    - context/256K
    - reasoning
    - tools
    - vision
  licenses:
    - apache-2.0
  release_date: "2026-03-02"
  specs:
    # Ascend NPUs
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-0.8B
      backend: SGLang
      backend_version: "0.5.9"
      # NOTE(review): no --reasoning-parser here, unlike this entry's vLLM
      # spec and the Ascend specs of Qwen3.5-4B and larger - confirm this is
      # intentional.
      backend_parameters:
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-0.8B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
# Qwen3.5-2B: an SGLang spec restricted to Ascend devices and a vLLM spec
# without gpu_filters, both for the BF16 checkpoint. Each spec pins a backend
# version; the versions are quoted so they always load as strings.
- name: Qwen3.5-2B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3.5-2B is a compact language model from the Qwen family, designed
    for efficient reasoning, coding, and multilingual understanding across
    diverse tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 2
  categories:
    - llm
  capabilities:
    - context/256K
    - reasoning
    - tools
    - vision
  licenses:
    - apache-2.0
  release_date: "2026-03-02"
  specs:
    # Ascend NPUs
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-2B
      backend: SGLang
      backend_version: "0.5.9"
      # NOTE(review): no --reasoning-parser here, unlike this entry's vLLM
      # spec and the Ascend specs of Qwen3.5-4B and larger - confirm this is
      # intentional.
      backend_parameters:
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-2B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
# Qwen3.5-4B: an SGLang spec restricted to Ascend devices and a vLLM spec
# without gpu_filters, both for the BF16 checkpoint. Each spec pins a backend
# version; the versions are quoted so they always load as strings.
- name: Qwen3.5-4B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3.5-4B is a compact language model from the Qwen family, designed
    for efficient reasoning, coding, and multilingual understanding across
    diverse tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 4
  categories:
    - llm
  capabilities:
    - context/256K
    - reasoning
    - tools
    - vision
  licenses:
    - apache-2.0
  release_date: "2026-03-02"
  specs:
    # Ascend NPUs
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-4B
      backend: SGLang
      backend_version: "0.5.9"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-4B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
# Qwen3.5-9B: an SGLang spec restricted to Ascend devices plus three vLLM
# specs without gpu_filters (throughput, latency, standard), all for the BF16
# checkpoint. Only the latency spec carries a speculative_config.
# Backend versions are quoted so they always load as strings.
- name: Qwen3.5-9B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3.5-9B is a model from the Qwen family, designed for strong reasoning,
    coding, and multilingual understanding with competitive performance across
    a wide range of tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 9
  categories:
    - llm
  capabilities:
    - context/256K
    - reasoning
    - tools
    - vision
  licenses:
    - apache-2.0
  release_date: "2026-03-02"
  specs:
    # Ascend NPUs
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-9B
      backend: SGLang
      backend_version: "0.5.9"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    - mode: throughput
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-9B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=throughput
        - --enable-prefix-caching
    - mode: latency
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-9B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=interactivity
        - --language-model-only
      # mtp speculation with a single draft token.
      speculative_config:
        enabled: true
        algorithm: mtp
        num_draft_tokens: 1
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-9B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
# Qwen3.5-27B: an SGLang spec restricted to Ascend devices, a vLLM BF16 spec
# without gpu_filters, and a vLLM FP8 throughput spec restricted to NVIDIA
# compute capability >=9.0.
# Backend versions are quoted so they always load as strings.
- name: Qwen3.5-27B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3.5-27B is a model designed for strong reasoning, coding, and
    multilingual understanding with competitive performance across a wide
    range of tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 27
  categories:
    - llm
  capabilities:
    - context/256K
    - reasoning
    - tools
    - vision
  licenses:
    - apache-2.0
  release_date: "2026-02-24"
  specs:
    # Ascend NPUs
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-27B
      backend: SGLang
      backend_version: "0.5.9"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-27B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-27B-FP8
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=throughput
        - --enable-prefix-caching
# Qwen3.5-35B-A3B: an SGLang spec restricted to Ascend devices, a vLLM BF16
# spec without gpu_filters, and two vLLM FP8 specs (throughput, latency)
# restricted to NVIDIA compute capability >=9.0. Only the latency spec carries
# a speculative_config.
# Backend versions are quoted so they always load as strings.
- name: Qwen3.5-35B-A3B
  # Folded scalar: parses to the same single-line string as before.
  description: >-
    Qwen3.5-35B-A3B is a 35-billion-parameter open-source large language model
    from the Qwen family, designed for strong reasoning, code generation, and
    multilingual understanding across diverse tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 35
  activated_size: 3
  categories:
    - llm
  capabilities:
    - context/256K
    - reasoning
    - tools
    - vision
  licenses:
    - apache-2.0
  release_date: "2026-02-24"
  specs:
    # Ascend NPUs
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-35B-A3B
      backend: SGLang
      backend_version: "0.5.9"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-35B-A3B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-35B-A3B-FP8
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=throughput
        - --enable-prefix-caching
    - mode: latency
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-35B-A3B-FP8
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=interactivity
      # mtp speculation with a single draft token.
      speculative_config:
        enabled: true
        algorithm: mtp
        num_draft_tokens: 1
# Qwen3.5-122B-A10B: one SGLang spec for Ascend plus two vLLM specs
# (BF16 standard, FP8 throughput).
- name: Qwen3.5-122B-A10B
  description: Qwen3.5-122B-A10B is a 122-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 122
  activated_size: 10
  categories: [llm]
  capabilities: [context/256K, reasoning, tools, vision]
  licenses: [apache-2.0]
  release_date: "2026-02-24"
  specs:
    # Ascend NPUs: BF16 weights served by SGLang.
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-122B-A10B
      backend: SGLang
      backend_version: "0.5.9"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    # No GPU filter: BF16 weights served by vLLM.
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-122B-A10B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
    # NVIDIA, compute capability >= 9.0: FP8 checkpoint, throughput mode.
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-122B-A10B-FP8
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=throughput
        - --enable-prefix-caching
# Qwen3.5-397B-A17B: one SGLang spec for Ascend plus two vLLM specs
# (BF16 standard, FP8 throughput).
- name: Qwen3.5-397B-A17B
  description: Qwen3.5-397B-A17B is a flagship MoE-hybrid model that delivers state-of-the-art reasoning and multimodal performance with ultra-efficient inference capabilities.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 397
  activated_size: 17
  categories: [llm]
  capabilities: [context/256K, reasoning, tools, vision]
  licenses: [apache-2.0]
  release_date: "2026-02-16"
  specs:
    # Ascend NPUs: BF16 weights served by SGLang.
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: ascend
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-397B-A17B
      backend: SGLang
      backend_version: "0.5.9"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --context-length=32768
        - --disable-radix-cache
        - --chunked-prefill-size=4096
        - --max-prefill-tokens=4096
        - --max-total-tokens=40960
    # No GPU filter: BF16 weights served by vLLM.
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-397B-A17B
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
    # NVIDIA, compute capability >= 9.0: FP8 checkpoint, throughput mode.
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3.5-397B-A17B-FP8
      backend: vLLM
      backend_version: "0.17.1"
      backend_parameters:
        - --reasoning-parser=qwen3
        - --max-model-len=32768
        - --performance-mode=throughput
        - --enable-prefix-caching
# GLM-4.7: FP8 on SGLang (Hopper or later), FP8 on vLLM (pre-Hopper),
# and a BF16 vLLM fallback with no GPU filter.
- name: GLM-4.7
  description: GLM-4.7 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities.
  home: https://z.ai
  icon: /static/catalog_icons/zai.png
  size: 355
  activated_size: 32
  categories:
    - llm
  capabilities:
    # The published context window for GLM-4.7 is 200K tokens, not 1M.
    - context/200K
    - reasoning
    - tools
  licenses:
    - mit
  release_date: "2025-12-22"
  specs:
    # TODO: tool-call-parser glm47 not yet available in the latest vLLM/SGLang release
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: zai-org/GLM-4.7-FP8
      backend: SGLang
      backend_parameters:
        - --reasoning-parser=glm45
        - --context-length=65536
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: "<9.0" # Before Hopper
      source: huggingface
      huggingface_repo_id: zai-org/GLM-4.7-FP8
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=glm45
        - --max-model-len=65536
    # BF16 weights, no GPU filter.
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: zai-org/GLM-4.7
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=glm45
        - --max-model-len=65536
# GLM-4.6: FP8 on SGLang (Hopper or later), FP8 on vLLM (pre-Hopper),
# and a BF16 vLLM fallback with no GPU filter.
- name: GLM-4.6
  description: GLM-4.6 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities.
  home: https://z.ai
  icon: /static/catalog_icons/zai.png
  size: 355
  activated_size: 32
  categories:
    - llm
  capabilities:
    # The GLM-4.6 model card states a 200K-token context window, not 1M.
    - context/200K
    - reasoning
    - tools
  licenses:
    - mit
  release_date: "2025-09-30"
  specs:
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: zai-org/GLM-4.6-FP8
      backend: SGLang
      backend_parameters:
        - --tool-call-parser=glm
        - --reasoning-parser=glm45
        - --context-length=65536
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: "<9.0" # Before Hopper
      source: huggingface
      huggingface_repo_id: zai-org/GLM-4.6-FP8
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=glm45
        - --tool-call-parser=glm45
        - --enable-auto-tool-choice
        - --max-model-len=65536
    # BF16 weights, no GPU filter.
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: zai-org/GLM-4.6
      backend: vLLM
      backend_parameters:
        - --reasoning-parser=glm45
        - --tool-call-parser=glm45
        - --enable-auto-tool-choice
        - --max-model-len=65536
# gpt-oss-120b: two vLLM specs on the MXFP4 checkpoint; the throughput
# spec differs from the standard one only by --async-scheduling.
- name: gpt-oss-120b
  description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases.
  home: https://openai.com
  icon: /static/catalog_icons/openai.png
  size: 120
  categories: [llm]
  capabilities: [context/128K]
  licenses: [apache-2.0]
  release_date: "2025-08-05"
  specs:
    - mode: throughput
      quantization: MXFP4
      source: huggingface
      huggingface_repo_id: openai/gpt-oss-120b
      backend: vLLM
      backend_parameters:
        - --max-model-len=32768
        - --tool-call-parser=openai
        - --enable-auto-tool-choice
        - --async-scheduling
    - mode: standard
      quantization: MXFP4
      source: huggingface
      huggingface_repo_id: openai/gpt-oss-120b
      backend: vLLM
      backend_parameters:
        - --max-model-len=32768
        - --tool-call-parser=openai
        - --enable-auto-tool-choice
# gpt-oss-20b: two vLLM specs on the MXFP4 checkpoint; the throughput
# spec differs from the standard one only by --async-scheduling.
- name: gpt-oss-20b
  description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases.
  home: https://openai.com
  icon: /static/catalog_icons/openai.png
  size: 20
  categories: [llm]
  capabilities: [context/128K]
  licenses: [apache-2.0]
  release_date: "2025-08-05"
  specs:
    - mode: throughput
      quantization: MXFP4
      source: huggingface
      huggingface_repo_id: openai/gpt-oss-20b
      backend: vLLM
      backend_parameters:
        - --max-model-len=32768
        - --tool-call-parser=openai
        - --enable-auto-tool-choice
        - --async-scheduling
    - mode: standard
      quantization: MXFP4
      source: huggingface
      huggingface_repo_id: openai/gpt-oss-20b
      backend: vLLM
      backend_parameters:
        - --max-model-len=32768
        - --tool-call-parser=openai
        - --enable-auto-tool-choice
# Deepseek-R1-0528: FP8 on SGLang for Hopper-or-later NVIDIA GPUs, with
# an unfiltered vLLM spec on the same repository.
- name: Deepseek-R1-0528
  description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro.
  home: https://www.deepseek.com
  icon: /static/catalog_icons/deepseek.png
  size: 671
  categories: [llm]
  capabilities: [context/128K]
  licenses: [mit]
  release_date: "2025-05-28"
  specs:
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528
      backend: SGLang
      backend_parameters:
        - --enable-dp-attention
        - --context-length=32768
    - mode: standard
      quantization: FP8
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528
      backend: vLLM
      backend_parameters:
        - --max-model-len=32768
# DeepSeek-OCR: single BF16 vLLM spec restricted to NVIDIA and AMD GPUs.
- name: DeepSeek-OCR
  description: DeepSeek-OCR is an advanced optical character recognition (OCR) model developed by DeepSeek AI. It is designed to accurately extract text from images and scanned documents.
  home: https://www.deepseek.com
  icon: /static/catalog_icons/deepseek.png
  size: 3
  categories: [llm]
  licenses: [mit]
  release_date: "2025-10-20"
  specs:
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: [nvidia, amd]
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-OCR
      backend: vLLM
      backend_version: "0.11.2"
      backend_parameters:
        - --logits_processors=vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor
        - --no-enable-prefix-caching
        - --mm-processor-cache-gb=0
# PaddleOCR-VL-1.5: single BF16 vLLM spec, no GPU filter.
- name: PaddleOCR-VL-1.5
  description: PaddleOCR-VL-1.5 is an advanced optical character recognition (OCR) vision-language model developed by PaddlePaddle. It is designed to accurately extract and understand text from images and documents.
  home: https://www.paddleocr.com
  icon: /static/catalog_icons/paddlepaddle.jpeg
  size: 0.9
  categories: [llm]
  capabilities: [vision]
  licenses: [apache-2.0]
  release_date: "2026-01-29"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: PaddlePaddle/PaddleOCR-VL-1.5
      backend: vLLM
      backend_parameters:
        - --trust-remote-code
        - --max-num-batched-tokens=16384
        - --no-enable-prefix-caching
        - --mm-processor-cache-gb=0
# LightOnOCR-2-1B: single BF16 vLLM spec, no GPU filter.
- name: LightOnOCR-2-1B
  description: LightOnOCR-2-1B is an efficient end-to-end vision-language model for optical character recognition (OCR), converting documents (PDFs, scans, images) into clean, naturally ordered text. It achieves state-of-the-art performance on OlmOCR-Bench while being significantly faster and more cost-effective than competitors.
  home: https://www.lighton.ai
  icon: /static/catalog_icons/lighton.png
  size: 1
  categories: [llm]
  capabilities: [vision]
  licenses: [apache-2.0]
  release_date: "2026-01-19"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: lightonai/LightOnOCR-2-1B
      backend: vLLM
      backend_parameters:
        # Quoted because the value embeds a JSON object.
        - '--limit-mm-per-prompt={"image": 1}'
        - --mm-processor-cache-gb=0
        - --no-enable-prefix-caching
# Deepseek-V3.2: FP8 on SGLang for Hopper-or-later NVIDIA GPUs, with an
# unfiltered vLLM spec on the same repository. Parser names differ per
# backend, so the two specs are not interchangeable.
- name: Deepseek-V3.2
  description: 'DeepSeek-V3.2 is a model that balances computational efficiency with strong reasoning and agent capabilities through three technical innovations: DeepSeek Sparse Attention (DSA), Scalable Reinforcement Learning Framework, Large-Scale Agentic Task Synthesis Pipeline.'
  home: https://www.deepseek.com
  icon: /static/catalog_icons/deepseek.png
  categories:
    - llm
  capabilities:
    - context/128K
  size: 685
  licenses:
    - mit
  release_date: "2025-12-01"
  specs:
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-V3.2
      backend: SGLang
      backend_version: 0.5.6.post2
      backend_parameters:
        - --enable-dp-attention
        - --context-length=65536
        - --reasoning-parser=deepseek-v3
        # SGLang registers this parser as `deepseekv32` (no underscore);
        # `deepseek_v32` is the vLLM spelling and is not an SGLang choice.
        - --tool-call-parser=deepseekv32
        - --chat-template={data_dir}/chat_templates/tool_chat_template_deepseekv32.jinja
    - mode: standard
      quantization: FP8
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-V3.2
      backend: vLLM
      backend_version: 0.13.0
      backend_parameters:
        - --max-model-len=65536
        - --tokenizer-mode=deepseek_v32
        - --reasoning-parser=deepseek_v3
        - --tool-call-parser=deepseek_v32
        - --enable-auto-tool-choice
# Deepseek-V3.2-Speciale: FP8 on SGLang for Hopper-or-later NVIDIA GPUs,
# with an unfiltered vLLM spec. Neither spec configures a tool-call parser.
- name: Deepseek-V3.2-Speciale
  description: This model is the high-compute variant of DeepSeek-V3.2, surpasses GPT-5 and matches Gemini-3.0-Pro in reasoning, achieving gold-medal level performance in the 2025 IMO and IOI competitions.
  home: https://www.deepseek.com
  icon: /static/catalog_icons/deepseek.png
  size: 685
  categories: [llm]
  capabilities: [context/128K]
  licenses: [mit]
  release_date: "2025-12-01"
  specs:
    - mode: throughput
      quantization: FP8
      gpu_filters:
        vendor: nvidia
        compute_capability: ">=9.0" # Hopper or later
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-V3.2-Speciale
      backend: SGLang
      backend_version: "0.5.6.post2"
      backend_parameters:
        - --enable-dp-attention
        - --context-length=65536
        - --reasoning-parser=deepseek-v3
    - mode: standard
      quantization: FP8
      source: huggingface
      huggingface_repo_id: deepseek-ai/DeepSeek-V3.2-Speciale
      backend: vLLM
      backend_version: "0.13.0"
      backend_parameters:
        - --max-model-len=65536
        - --tokenizer-mode=deepseek_v32
        - --reasoning-parser=deepseek_v3
# MiniMax-M2.1: single FP8 vLLM spec, no GPU filter.
- name: MiniMax-M2.1
  description: MiniMax-M2.1 is a high-performance agentic model, optimized for robustness in coding, tool use, instruction following, and long-horizon planning. It excels in multilingual software development and complex multi-step workflows.
  home: https://www.minimax.io
  icon: /static/catalog_icons/minimax.png
  size: 230
  activated_size: 10
  categories: [llm]
  capabilities: [context/192K, tools]
  licenses: [modified-mit]
  release_date: "2025-12-23"
  specs:
    - mode: standard
      quantization: FP8
      source: huggingface
      huggingface_repo_id: MiniMaxAI/MiniMax-M2.1
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --reasoning-parser=minimax_m2_append_think
        - --tool-call-parser=minimax_m2
        - --enable-auto-tool-choice
        - --trust-remote-code
# MiniMax-M2.5: single BF16 vLLM spec with expert parallelism enabled.
- name: MiniMax-M2.5
  description: MiniMax-M2.5 is a powerful MoE (Mixture-of-Experts) model that delivers exceptional performance in logical reasoning, coding, and complex agent tasks through highly efficient inference.
  home: https://www.minimax.io/
  icon: /static/catalog_icons/minimax.png
  size: 230
  activated_size: 10
  categories: [llm]
  capabilities: [context/196K, reasoning, tools]
  licenses: [modified-mit]
  release_date: "2026-02-12"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: MiniMaxAI/MiniMax-M2.5
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --reasoning-parser=minimax_m2_append_think
        - --tool-call-parser=minimax_m2
        - --enable-auto-tool-choice
        - --trust-remote-code
        - --enable-expert-parallel
# Kimi-K2.5: single INT4 vLLM spec, no GPU filter. Size is expressed in
# trillions via size_unit.
- name: Kimi-K2.5
  description: Kimi-K2.5 is a multimodal mixture-of-experts model with 1T total parameters and 32B activated parameters. It features native INT4 quantization, vision support, dual operating modes (thinking/instant), agent swarm capabilities, and excels at visual reasoning, coding with vision, and complex tool orchestration.
  home: https://www.moonshot.ai
  icon: /static/catalog_icons/kimi.png
  size: 1
  size_unit: T
  activated_size: 32
  categories:
    - llm
  capabilities:
    - context/256K
    - vision
    - tools
  licenses:
    - modified-mit
  release_date: "2026-01-26"
  specs:
    - mode: standard
      quantization: INT4
      source: huggingface
      huggingface_repo_id: moonshotai/Kimi-K2.5
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --mm-encoder-tp-mode=data
        - --tool-call-parser=kimi_k2
        # vLLM serves `tool_choice: auto` requests only when this flag
        # accompanies --tool-call-parser; every other vLLM spec in this
        # catalog that sets a tool parser passes it as well.
        - --enable-auto-tool-choice
        - --reasoning-parser=kimi_k2
        - --trust-remote-code
# Step-3.5-Flash: two vLLM specs, FP8 (throughput) and BF16 (standard).
# The FP8 spec uses the -FP8 repository and adds --quantization=fp8.
- name: Step-3.5-Flash
  description: Step-3.5-Flash is a fast, cost-effective multimodal model with 196B total parameters and 11B active parameters (MoE), optimized for quick inference. Built on StepFun's Step3 architecture, it delivers strong performance across text and vision tasks with efficient token usage.
  home: https://www.stepfun.com
  icon: /static/catalog_icons/stepfun.png
  size: 196
  activated_size: 11
  categories: [llm]
  capabilities: [context/256K, tools]
  licenses: [apache-2.0]
  release_date: "2026-02-02"
  specs:
    - mode: throughput
      quantization: FP8
      source: huggingface
      huggingface_repo_id: stepfun-ai/Step-3.5-Flash-FP8
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --disable-cascade-attn
        - --reasoning-parser=step3p5
        - --enable-auto-tool-choice
        - --tool-call-parser=step3p5
        - --trust-remote-code
        - --quantization=fp8
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: stepfun-ai/Step-3.5-Flash
      backend: vLLM
      backend_parameters:
        - --max-model-len=65536
        - --disable-cascade-attn
        - --reasoning-parser=step3p5
        - --enable-auto-tool-choice
        - --tool-call-parser=step3p5
        - --trust-remote-code
# Nanbeige4.1-3B: single BF16 vLLM spec, no GPU filter.
- name: Nanbeige4.1-3B
  description: Nanbeige4.1-3B is a 3B-parameter language model from Nanbeige LLM Lab, optimized for long-context reasoning, agentic tasks, and tool use.
  home: https://huggingface.co/Nanbeige
  icon: /static/catalog_icons/nanbeige.png
  size: 3
  categories: [llm]
  capabilities: [context/256K, reasoning, tools]
  licenses: [apache-2.0]
  release_date: "2026-02-13"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Nanbeige/Nanbeige4.1-3B
      backend: vLLM
      backend_parameters:
        - --max-model-len=32768
# Embedding models
# Qwen3-Embedding-0.6B: single BF16 vLLM spec tagged as an embedding model.
- name: Qwen3-Embedding-0.6B
  description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 0.6
  categories:
    - embedding
  capabilities:
    # The 0.6B variant's maximum embedding dimension is 1024 per the
    # Qwen3-Embedding model card; 4096 applies only to the 8B variant.
    - dimensions/1024
    - max_tokens/32K
  licenses:
    - apache-2.0
  release_date: "2025-06-09"
  specs:
    - mode: standard
      quantization: "BF16"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Embedding-0.6B
      categories:
        - embedding
      backend: vLLM
# Qwen3-Embedding-4B: single BF16 vLLM spec tagged as an embedding model.
- name: Qwen3-Embedding-4B
  description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 4
  categories:
    - embedding
  capabilities:
    # The 4B variant's maximum embedding dimension is 2560 per the
    # Qwen3-Embedding model card; 4096 applies only to the 8B variant.
    - dimensions/2560
    - max_tokens/32K
  licenses:
    - apache-2.0
  release_date: "2025-06-09"
  specs:
    - mode: standard
      quantization: "BF16"
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Embedding-4B
      categories:
        - embedding
      backend: vLLM
# Qwen3-Embedding-8B: single BF16 vLLM spec tagged as an embedding model.
- name: Qwen3-Embedding-8B
  description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories: [embedding]
  capabilities: [dimensions/4096, max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2025-06-09"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Embedding-8B
      categories: [embedding]
      backend: vLLM
# Qwen3-VL-Embedding-2B: single BF16 vLLM spec, started with the pooling runner.
- name: Qwen3-VL-Embedding-2B
  description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning with 30+ languages support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 2
  categories: [embedding]
  capabilities: [vision, dimensions/2048, max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2026-01-08"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-Embedding-2B
      categories: [embedding]
      backend: vLLM
      backend_parameters:
        - --runner=pooling
# Qwen3-VL-Embedding-8B: single BF16 vLLM spec, started with the pooling runner.
- name: Qwen3-VL-Embedding-8B
  description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning with 30+ languages support.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories: [embedding]
  capabilities: [vision, dimensions/4096, max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2026-01-08"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-Embedding-8B
      categories: [embedding]
      backend: vLLM
      backend_parameters:
        - --runner=pooling
# BGE-M3: single BF16 vLLM spec; size is expressed in millions via size_unit.
- name: BGE-M3
  description: BGE-M3 is a new model from BAAI distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity.
  home: https://bge-model.com
  icon: /static/catalog_icons/bge_logo.jpeg
  size: 567
  size_unit: M
  categories: [embedding]
  capabilities: [dimensions/1024, max_tokens/8192]
  licenses: [mit]
  release_date: "2024-01-28"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: BAAI/bge-m3
      categories: [embedding]
      backend: vLLM
# BGE-Large-ZH-V1.5: single BF16 vLLM spec; size is expressed in millions.
- name: BGE-Large-ZH-V1.5
  description: BGE is short for BAAI general embedding. This is a Chinese text embedding model with more reasonable similarity distribution.
  home: https://bge-model.com
  icon: /static/catalog_icons/bge_logo.jpeg
  size: 335
  size_unit: M
  categories: [embedding]
  capabilities: [dimensions/1024, max_tokens/512]
  licenses: [mit]
  release_date: "2023-09-12"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: BAAI/bge-large-zh-v1.5
      categories: [embedding]
      backend: vLLM
# BGE-Large-EN-V1.5: single BF16 vLLM spec; size is expressed in millions.
- name: BGE-Large-EN-V1.5
  description: BGE is short for BAAI general embedding. This is an English text embedding model with more reasonable similarity distribution.
  home: https://bge-model.com
  icon: /static/catalog_icons/bge_logo.jpeg
  size: 335
  size_unit: M
  categories: [embedding]
  capabilities: [dimensions/1024, max_tokens/512]
  licenses: [mit]
  release_date: "2023-09-12"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: BAAI/bge-large-en-v1.5
      categories: [embedding]
      backend: vLLM
# Nomic-Embed-Text-V1.5: single BF16 vLLM spec that passes --trust-remote-code.
- name: Nomic-Embed-Text-V1.5
  description: Nomic-embed-text is a large context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.
  home: https://nomic.ai
  icon: /static/catalog_icons/nomic.png
  size: 137
  size_unit: M
  categories: [embedding]
  capabilities: [dimensions/768, max_tokens/8192]
  licenses: [apache-2.0]
  release_date: "2024-02-14"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: nomic-ai/nomic-embed-text-v1.5
      categories: [embedding]
      backend: vLLM
      backend_parameters:
        - --trust-remote-code
# Jina-Embeddings-V3: single BF16 vLLM spec that passes --trust-remote-code.
- name: Jina-Embeddings-V3
  description: jina-embeddings-v3 is a multilingual multi-task text embedding model designed for a variety of NLP applications. Based on the Jina-XLM-RoBERTa architecture, this model supports Rotary Position Embeddings to handle long input sequences up to 8192 tokens.
  home: https://jina.ai
  icon: /static/catalog_icons/jina.png
  size: 570
  size_unit: M
  categories: [embedding]
  capabilities: [dimensions/1024, max_tokens/8192]
  licenses: [cc-by-nc-4.0]
  release_date: "2024-09-18"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: jinaai/jina-embeddings-v3
      categories: [embedding]
      backend: vLLM
      backend_parameters:
        - --trust-remote-code
# Reranker models
# Qwen3-Reranker-0.6B: single BF16 vLLM spec; hf_overrides selects the
# Qwen3ForSequenceClassification architecture.
# NOTE(review): the sibling Qwen3-Reranker-4B entry also sets
# env GPUSTACK_APPLY_QWEN3_RERANKER_TEMPLATES: "true"; this entry does
# not. Confirm whether the omission is intentional.
- name: Qwen3-Reranker-0.6B
  description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 0.6
  categories: [reranker]
  capabilities: [max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2025-06-09"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Reranker-0.6B
      categories: [reranker]
      backend: vLLM
      backend_parameters:
        - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
# Qwen3-Reranker-4B: single BF16 vLLM spec; hf_overrides selects the
# Qwen3ForSequenceClassification architecture.
# NOTE(review): this is the only Qwen3-Reranker entry that sets
# GPUSTACK_APPLY_QWEN3_RERANKER_TEMPLATES; the 0.6B and 8B siblings do
# not. Confirm which of the two configurations is intended.
- name: Qwen3-Reranker-4B
  description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 4
  categories: [reranker]
  capabilities: [max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2025-06-09"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Reranker-4B
      categories: [reranker]
      env:
        # Kept as a quoted string so it is not parsed as a YAML boolean.
        GPUSTACK_APPLY_QWEN3_RERANKER_TEMPLATES: "true"
      backend: vLLM
      backend_parameters:
        - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
# Qwen3-Reranker-8B: single BF16 vLLM spec; hf_overrides selects the
# Qwen3ForSequenceClassification architecture.
# NOTE(review): the sibling Qwen3-Reranker-4B entry also sets
# env GPUSTACK_APPLY_QWEN3_RERANKER_TEMPLATES: "true"; this entry does
# not. Confirm whether the omission is intentional.
- name: Qwen3-Reranker-8B
  description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories: [reranker]
  capabilities: [max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2025-06-09"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-Reranker-8B
      categories: [reranker]
      backend: vLLM
      backend_parameters:
        - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
# Qwen3-VL-Reranker-2B: single BF16 vLLM spec; hf_overrides selects the
# Qwen3VLForSequenceClassification architecture.
- name: Qwen3-VL-Reranker-2B
  description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 2
  categories: [reranker]
  capabilities: [vision, max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2026-01-08"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-Reranker-2B
      categories: [reranker]
      backend: vLLM
      backend_parameters:
        - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
# Qwen3-VL-Reranker-8B: single BF16 vLLM spec; hf_overrides selects the
# Qwen3VLForSequenceClassification architecture.
- name: Qwen3-VL-Reranker-8B
  description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers, with the 8B model showing particularly strong results.
  home: https://qwenlm.github.io
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories: [reranker]
  capabilities: [vision, max_tokens/32K]
  licenses: [apache-2.0]
  release_date: "2026-01-08"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-Reranker-8B
      categories: [reranker]
      backend: vLLM
      backend_parameters:
        - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}'
# BGE-Reranker-V2-M3: single BF16 vLLM spec; size is expressed in millions.
- name: BGE-Reranker-V2-M3
  description: BGE-Reranker-V2-M3 is a reranker model from BAAI.
  home: https://bge-model.com
  icon: /static/catalog_icons/bge_logo.jpeg
  size: 568
  size_unit: M
  categories: [reranker]
  licenses: [apache-2.0]
  release_date: "2024-03-19"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: BAAI/bge-reranker-v2-m3
      categories: [reranker]
      backend: vLLM
# Jina-Reranker-M0: single BF16 vLLM spec, no GPU filter.
- name: Jina-Reranker-M0
  description: Jina-Reranker-M0 is a multilingual multimodal document reranker model with 2.4B parameters. It accepts a query alongside visually rich documents and outputs ranked documents by relevance. Supports 29 languages and multimodal content including text, figures, tables, and infographics.
  home: https://jina.ai
  icon: /static/catalog_icons/jina.png
  size: 2.4
  categories:
    - reranker
  capabilities:
    - max_tokens/10K
    - vision
  licenses:
    - cc-by-nc-4.0
  release_date: "2025-04-08"
  specs:
    - mode: standard
      quantization: "BF16"
      source: huggingface
      huggingface_repo_id: jinaai/jina-reranker-m0
      # Spec-level category added to match every other embedding and
      # reranker spec in this catalog, which all repeat the set-level
      # category here.
      categories:
        - reranker
      backend: vLLM
# Image models
# FLUX.1-dev: single BF16 SGLang spec restricted to NVIDIA GPUs, with an
# explicit VRAM claim supplied through the environment.
- name: FLUX.1-dev
  description: FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.
  home: https://blackforestlabs.ai
  icon: /static/catalog_icons/blackforestlabs.png
  size: 12
  categories: [image]
  licenses: [flux-1-dev-non-commercial-license]
  release_date: "2024-08-02"
  specs:
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: nvidia
      source: huggingface
      huggingface_repo_id: black-forest-labs/FLUX.1-dev
      backend: SGLang
      backend_version: "0.5.6.post2"
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "37580963840" # 35 GiB, observed empirically
# FLUX.2-klein-4B: image entry whose two specs share one base mapping and
# differ only in GPU filter and backend version.
- name: FLUX.2-klein-4B
  description: >-
    FLUX.2-klein-4B is a 4 billion parameter image generation model from Black
    Forest Labs.
  home: https://blackforestlabs.ai
  icon: /static/catalog_icons/blackforestlabs.png
  size: 4
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2026-01-15"
  # Fields common to every spec below; they are pulled in through the `<<`
  # merge key. How the loader treats the `.base_spec` key itself is not visible
  # in this section of the file.
  .base_spec: &flux_2_klein_4b_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: black-forest-labs/FLUX.2-klein-4B
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # Spec filtered to the ascend vendor; its backend_version comes from the
    # vllm_omni_ascend_stable_version anchor defined outside this section.
    - <<: *flux_2_klein_4b_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Spec without a GPU filter, using the vllm_omni_stable_version anchor.
    - <<: *flux_2_klein_4b_base_spec
      backend_version: *vllm_omni_stable_version
# FLUX.2-klein-9B: image entry with an ascend-filtered spec and an unfiltered
# spec, both built from the same base mapping.
- name: FLUX.2-klein-9B
  description: >-
    FLUX.2-klein-9B is a 9 billion parameter image generation model from Black
    Forest Labs.
  home: https://blackforestlabs.ai
  icon: /static/catalog_icons/blackforestlabs.png
  size: 9
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2026-01-15"
  # Shared spec fields, merged into each item of `specs` via `<<`.
  .base_spec: &flux_2_klein_9b_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: black-forest-labs/FLUX.2-klein-9B
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend vendor: backend_version is the alias
    # vllm_omni_ascend_stable_version (anchor defined elsewhere in the file).
    - <<: *flux_2_klein_9b_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # No GPU filter: backend_version is the alias vllm_omni_stable_version.
    - <<: *flux_2_klein_9b_base_spec
      backend_version: *vllm_omni_stable_version
# Qwen-Image: image entry served by vLLM with the --omni parameter; one spec
# is filtered to the ascend vendor, the other has no GPU filter.
- name: Qwen-Image
  description: >-
    Qwen-Image is an image generation foundation model in the Qwen series that
    achieves significant advances in complex text rendering and precise image
    editing.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 20
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2025-08-04"
  # Base mapping reused by both specs through the `<<` merge key.
  .base_spec: &qwen_image_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen-Image
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # Both backend_version aliases below refer to anchors that are defined
    # outside this section of the file.
    - <<: *qwen_image_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    - <<: *qwen_image_base_spec
      backend_version: *vllm_omni_stable_version
# Qwen-Image-Edit: image entry with a single BF16 spec on the SGLang backend,
# filtered to the nvidia GPU vendor.
- name: Qwen-Image-Edit
  description: >-
    Built upon the 20B Qwen-Image model, Qwen-Image-Edit successfully extends
    Qwen-Image's unique text rendering capabilities to image editing tasks,
    enabling precise text editing.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 20
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2025-08-19"
  specs:
    - mode: standard
      quantization: BF16
      gpu_filters:
        vendor: nvidia
      source: huggingface
      huggingface_repo_id: Qwen/Qwen-Image-Edit
      backend: SGLang
      # Quoted so the version can only ever be read as a string.
      backend_version: "0.5.6.post2"
# Qwen-Image-2512: image entry with two vLLM (--omni) specs derived from one
# shared base mapping.
- name: Qwen-Image-2512
  description: >-
    Qwen-Image-2512 is the December update of Qwen-Image's text-to-image
    foundational model, delivering enhanced image generation capabilities.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 20
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2025-12-30"
  # Spec fields shared by both items under `specs` (merged with `<<`).
  .base_spec: &qwen_image_2512_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen-Image-2512
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *qwen_image_2512_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *qwen_image_2512_base_spec
      backend_version: *vllm_omni_stable_version
# Z-Image: image entry served by vLLM with --omni from the Hugging Face repo
# Tongyi-MAI/Z-Image.
# NOTE(review): home and icon point at Qwen while the repo lives under
# Tongyi-MAI -- confirm the branding is intended.
- name: Z-Image
  description: >-
    Z-Image is the foundation model of the Z-Image family, engineered for good
    quality, robust generative diversity, broad stylistic coverage, and precise
    prompt adherence.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 6
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2026-01-28"
  # Base mapping merged into each spec below through `<<`.
  .base_spec: &z_image_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Tongyi-MAI/Z-Image
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # The two specs differ only in the ascend GPU filter and in which
    # backend_version alias (anchored elsewhere in the file) they use.
    - <<: *z_image_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    - <<: *z_image_base_spec
      backend_version: *vllm_omni_stable_version
# Z-Image-Turbo: image entry served by vLLM with --omni; the shared base
# mapping also carries an explicit VRAM claim.
# NOTE(review): the description names "Z-Image" rather than "Z-Image-Turbo",
# and home/icon point at Qwen while the repo lives under Tongyi-MAI -- confirm.
- name: Z-Image-Turbo
  description: >-
    Z-Image is a powerful and highly efficient image generation model with 6B
    parameters.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 6
  categories:
    - image
  licenses:
    - apache-2.0
  release_date: "2025-11-27"
  # Shared spec fields, including env, merged into both specs through `<<`.
  .base_spec: &z_image_turbo_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Tongyi-MAI/Z-Image-Turbo
    backend: vLLM
    backend_parameters:
      - --omni
    env:
      GPUSTACK_MODEL_VRAM_CLAIM: "24696061952" # 23 GiB observed. Weight file size is 33 GiB in F32 while vLLM loads in BF16.
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *z_image_turbo_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *z_image_turbo_base_spec
      backend_version: *vllm_omni_stable_version
# Vision-language LLM entries start here: although they follow the image
# models, they are listed under category `llm` with the `vision` capability.
# Qwen3-VL-8B-Instruct: one BF16 spec on vLLM with --max-model-len=65536.
- name: Qwen3-VL-8B-Instruct
  description: >-
    Qwen3-VL-8B-Instruct is a vision-language model that delivers
    comprehensive upgrades across text understanding, visual perception, and
    reasoning capabilities, supporting image/video/text unified understanding.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-10-15"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-8B-Instruct
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-8B-Thinking: llm entry with the vision capability; one BF16 spec
# on vLLM with --max-model-len=65536.
- name: Qwen3-VL-8B-Thinking
  description: >-
    Qwen3-VL-8B-Thinking is a vision-language model that delivers
    comprehensive upgrades across text understanding, visual perception, and
    reasoning capabilities, supporting image/video/text unified understanding
    with thinking mode.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 8
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-10-15"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-8B-Thinking
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-32B-Instruct: llm entry with the vision capability; one BF16 spec
# on vLLM with --max-model-len=65536.
- name: Qwen3-VL-32B-Instruct
  description: >-
    Qwen3-VL-32B-Instruct is a vision-language model featuring superior visual
    intelligence, enhanced spatial awareness capabilities, and OCR
    functionality.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 32
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-10-21"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-32B-Instruct
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-32B-Thinking: llm entry with the vision capability; one BF16 spec
# on vLLM with --max-model-len=65536.
- name: Qwen3-VL-32B-Thinking
  description: >-
    Qwen3-VL-32B-Thinking is a vision-language model featuring superior visual
    intelligence, enhanced spatial awareness capabilities, and OCR
    functionality with thinking mode.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 32
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-10-21"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-32B-Thinking
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-30B-A3B-Instruct: llm entry with the vision capability that sets
# both size (30) and activated_size (3); one BF16 spec on vLLM.
- name: Qwen3-VL-30B-A3B-Instruct
  description: >-
    Qwen3-VL-30B-A3B-Instruct is a mixture-of-experts vision-language model
    with 30B total parameters and 3B active parameters, featuring advanced
    spatial perception, 2D and 3D grounding.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 30
  activated_size: 3
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-10-05"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-30B-A3B-Instruct
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-30B-A3B-Thinking: llm entry with the vision capability that sets
# both size (30) and activated_size (3); one BF16 spec on vLLM.
- name: Qwen3-VL-30B-A3B-Thinking
  description: >-
    Qwen3-VL-30B-A3B-Thinking is a mixture-of-experts vision-language model
    with 30B total parameters and 3B active parameters, featuring advanced
    spatial perception, 2D and 3D grounding with thinking mode.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 30
  activated_size: 3
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-10-05"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-30B-A3B-Thinking
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-235B-A22B-Instruct: llm entry with the vision capability that sets
# both size (235) and activated_size (22); one BF16 spec on vLLM.
- name: Qwen3-VL-235B-A22B-Instruct
  description: >-
    Qwen3-VL-235B-A22B-Instruct is the largest vision-language model in the
    Qwen3-VL series with 235B total parameters and 22B active parameters,
    featuring state-of-the-art visual understanding and reasoning capabilities.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 235
  activated_size: 22
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-09-23"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-235B-A22B-Instruct
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Qwen3-VL-235B-A22B-Thinking: llm entry with the vision capability that sets
# both size (235) and activated_size (22); one BF16 spec on vLLM.
- name: Qwen3-VL-235B-A22B-Thinking
  description: >-
    Qwen3-VL-235B-A22B-Thinking is the largest vision-language model in the
    Qwen3-VL series with 235B total parameters and 22B active parameters,
    featuring state-of-the-art visual understanding and reasoning capabilities
    with thinking mode.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 235
  activated_size: 22
  categories:
    - llm
  capabilities:
    - context/1M
    - vision
  licenses:
    - apache-2.0
  release_date: "2025-09-23"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-VL-235B-A22B-Thinking
      backend: vLLM
      # The spec passes a 65536 max model length even though the entry
      # advertises context/1M.
      backend_parameters:
        - --max-model-len=65536
# Audio models
# CosyVoice2-0.5B: text_to_speech entry with one FP16 spec on the VoxBox
# backend, pulled from the gpustack/CosyVoice2-0.5B Hugging Face repo.
- name: CosyVoice2-0.5B
  description: >-
    CosyVoice2-0.5B is a speech generation model. It supports multilingual
    speech synthesis with high naturalness and expressiveness.
  home: https://github.com/FunAudioLLM
  icon: /static/catalog_icons/FunAudioLLM.png
  size: 0.5
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2024-12-01"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: gpustack/CosyVoice2-0.5B
      backend: VoxBox
      # Fixed VRAM claim shared by every CosyVoice entry in this section.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
# CosyVoice-300M: text_to_speech entry with one FP16 spec on the VoxBox
# backend, pulled from the gpustack/CosyVoice-300M Hugging Face repo.
- name: CosyVoice-300M
  description: >-
    CosyVoice is a multi-lingual large voice generation model developed by
    Alibaba.
  home: https://github.com/FunAudioLLM
  icon: /static/catalog_icons/FunAudioLLM.png
  # size_unit presumably scales size (300 M) -- confirm against the schema.
  size: 300
  size_unit: M
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2024-07-05"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: gpustack/CosyVoice-300M
      backend: VoxBox
      # Fixed VRAM claim shared by every CosyVoice entry in this section.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
# CosyVoice-300M-SFT: text_to_speech entry with one FP16 spec on the VoxBox
# backend, pulled from the gpustack/CosyVoice-300M-SFT Hugging Face repo.
- name: CosyVoice-300M-SFT
  description: >-
    CosyVoice is a multi-lingual large voice generation model developed by
    Alibaba.
  home: https://github.com/FunAudioLLM
  icon: /static/catalog_icons/FunAudioLLM.png
  # size_unit presumably scales size (300 M) -- confirm against the schema.
  size: 300
  size_unit: M
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2024-07-05"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: gpustack/CosyVoice-300M-SFT
      backend: VoxBox
      # Fixed VRAM claim shared by every CosyVoice entry in this section.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
# CosyVoice-300M-Instruct: text_to_speech entry with one FP16 spec on the
# VoxBox backend, pulled from the gpustack/CosyVoice-300M-Instruct repo.
- name: CosyVoice-300M-Instruct
  description: >-
    CosyVoice is a multi-lingual large voice generation model developed by
    Alibaba.
  home: https://github.com/FunAudioLLM
  icon: /static/catalog_icons/FunAudioLLM.png
  # size_unit presumably scales size (300 M) -- confirm against the schema.
  size: 300
  size_unit: M
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2024-07-05"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: gpustack/CosyVoice-300M-Instruct
      backend: VoxBox
      # Fixed VRAM claim shared by every CosyVoice entry in this section.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate.
# Faster-Whisper-Large-V3: speech_to_text entry with one FP16 spec on the
# VoxBox backend, pulled from Systran/faster-whisper-large-v3.
- name: Faster-Whisper-Large-V3
  description: >-
    Whisper is a state-of-the-art model for automatic speech recognition (ASR)
    and speech translation, proposed in the paper Robust Speech Recognition via
    Large-Scale Weak Supervision by Alec Radford et al. from OpenAI. Trained on
    >5M hours of labeled data, Whisper demonstrates a strong ability to
    generalise to many datasets and domains in a zero-shot setting. This is the
    conversion of openai/whisper-large-v3 to the CTranslate2 model format.
  home: https://huggingface.co/Systran
  icon: /static/catalog_icons/systran.png
  size: 1.55
  categories:
    - speech_to_text
  licenses:
    - mit
  release_date: "2023-11-23"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: Systran/faster-whisper-large-v3
      backend: VoxBox
      # Explicit VRAM claim; the rationale is in the inline comment.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, per OpenAI Whisper large reference VRAM.
# Faster-Whisper-Medium: speech_to_text entry with one FP16 spec on the
# VoxBox backend, pulled from Systran/faster-whisper-medium.
- name: Faster-Whisper-Medium
  description: >-
    Whisper is a pre-trained model for automatic speech recognition (ASR) and
    speech translation. Trained on 680k hours of labelled data, Whisper models
    demonstrate a strong ability to generalise to many datasets and domains
    without the need for fine-tuning. This is the conversion of
    openai/whisper-medium to the CTranslate2 model format.
  home: https://huggingface.co/Systran
  icon: /static/catalog_icons/systran.png
  # size_unit presumably scales size (769 M) -- confirm against the schema.
  size: 769
  size_unit: M
  categories:
    - speech_to_text
  licenses:
    - mit
  release_date: "2023-03-23"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: Systran/faster-whisper-medium
      backend: VoxBox
      # Explicit VRAM claim; the rationale is in the inline comment.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "5368709120" # 5 GiB, per OpenAI Whisper medium reference VRAM.
# Faster-Whisper-Small: speech_to_text entry with one FP16 spec on the VoxBox
# backend, pulled from Systran/faster-whisper-small.
- name: Faster-Whisper-Small
  description: >-
    Whisper is a pre-trained model for automatic speech recognition (ASR) and
    speech translation. Trained on 680k hours of labelled data, Whisper models
    demonstrate a strong ability to generalise to many datasets and domains
    without the need for fine-tuning. This is the conversion of
    openai/whisper-small to the CTranslate2 model format.
  home: https://huggingface.co/Systran
  icon: /static/catalog_icons/systran.png
  # size_unit presumably scales size (244 M) -- confirm against the schema.
  size: 244
  size_unit: M
  categories:
    - speech_to_text
  licenses:
    - mit
  release_date: "2023-03-23"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: Systran/faster-whisper-small
      backend: VoxBox
      # Explicit VRAM claim; the rationale is in the inline comment.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "2147483648" # 2 GiB, per OpenAI Whisper small reference VRAM.
# Whisper-Large-V3-Turbo: speech_to_text entry with one BF16 spec on the vLLM
# backend, pulled from openai/whisper-large-v3-turbo.
- name: Whisper-Large-V3-Turbo
  description: >-
    Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3.
    In other words, it's the exact same model, except that the number of
    decoding layers have reduced from 32 to 4. As a result, the model is way
    faster, at the expense of a minor quality degradation.
  home: https://openai.com
  icon: /static/catalog_icons/openai.png
  # size_unit presumably scales size (809 M) -- confirm against the schema.
  size: 809
  size_unit: M
  categories:
    - speech_to_text
  licenses:
    - mit
  release_date: "2024-10-01"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: openai/whisper-large-v3-turbo
      backend: vLLM
# Whisper-Large-V3: speech_to_text entry with one BF16 spec on the vLLM
# backend, pulled from openai/whisper-large-v3, with an explicit VRAM claim.
- name: Whisper-Large-V3
  description: >-
    Whisper is a state-of-the-art model for automatic speech recognition (ASR)
    and speech translation. Trained on 5M hours of labeled data, Whisper
    large-v3 demonstrates strong ability to generalise to many datasets and
    domains in a zero-shot setting.
  home: https://openai.com
  icon: /static/catalog_icons/openai.png
  size: 1.55
  categories:
    - speech_to_text
  licenses:
    - mit
  release_date: "2023-11-06"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: openai/whisper-large-v3
      backend: vLLM
      # Explicit VRAM claim; the rationale is in the inline comment.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "4294967296" # 4 GiB. The repo stores weight files in multiple formats so explicitly set VRAM claim to avoid over-allocation.
# Voxtral-Mini-3B-2507: speech_to_text entry with one BF16 spec on the vLLM
# backend, pulled from mistralai/Voxtral-Mini-3B-2507.
- name: Voxtral-Mini-3B-2507
  description: >-
    Voxtral-Mini-3B-2507 is a speech-to-text model from Mistral AI, designed
    for automatic speech recognition with high accuracy and efficiency.
  home: https://mistral.ai
  icon: /static/catalog_icons/mistral.png
  size: 3
  categories:
    - speech_to_text
  licenses:
    - apache-2.0
  release_date: "2025-07-18"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: mistralai/Voxtral-Mini-3B-2507
      backend: vLLM
# Granite-Speech-3.3-2B: speech_to_text entry with one BF16 spec on the vLLM
# backend, pulled from ibm-granite/granite-speech-3.3-2b.
- name: Granite-Speech-3.3-2B
  description: >-
    Granite-Speech-3.3-2B is a speech-to-text model from IBM, part of the
    Granite series, designed for automatic speech recognition with strong
    multilingual capabilities.
  home: https://www.ibm.com
  icon: /static/catalog_icons/ibm.png
  size: 2
  categories:
    - speech_to_text
  licenses:
    - apache-2.0
  release_date: "2025-06-19"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: ibm-granite/granite-speech-3.3-2b
      backend: vLLM
# Granite-Speech-3.3-8B: speech_to_text entry with one BF16 spec on the vLLM
# backend, pulled from ibm-granite/granite-speech-3.3-8b.
- name: Granite-Speech-3.3-8B
  description: >-
    Granite-Speech-3.3-8B is a speech-to-text model from IBM, part of the
    Granite series, designed for automatic speech recognition with enhanced
    accuracy and multilingual support.
  home: https://www.ibm.com
  icon: /static/catalog_icons/ibm.png
  size: 8
  categories:
    - speech_to_text
  licenses:
    - apache-2.0
  release_date: "2025-06-19"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: ibm-granite/granite-speech-3.3-8b
      backend: vLLM
# Qwen3-ASR-1.7B: speech_to_text entry with one BF16 spec on the vLLM backend,
# pulled from Qwen/Qwen3-ASR-1.7B.
- name: Qwen3-ASR-1.7B
  # Description fixed for subject-verb agreement ("supports", not "support").
  description: >-
    Qwen3-ASR-1.7B supports language identification and ASR for 52 languages
    and dialects. It leverages large-scale speech training data and the strong
    audio understanding capability of its foundation model, Qwen3-Omni.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 1.7
  categories:
    - speech_to_text
  licenses:
    - apache-2.0
  release_date: "2026-01-29"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-ASR-1.7B
      backend: vLLM
      # The category is repeated on the spec itself, not only on the entry.
      categories:
        - speech_to_text
# Qwen3-ASR-0.6B: speech_to_text entry with one BF16 spec on the vLLM backend,
# pulled from Qwen/Qwen3-ASR-0.6B.
- name: Qwen3-ASR-0.6B
  # Description fixed for subject-verb agreement ("supports", not "support").
  description: >-
    Qwen3-ASR-0.6B supports language identification and ASR for 52 languages
    and dialects. It leverages large-scale speech training data and the strong
    audio understanding capability of its foundation model, Qwen3-Omni.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 0.6
  categories:
    - speech_to_text
  licenses:
    - apache-2.0
  release_date: "2026-01-29"
  specs:
    - mode: standard
      quantization: BF16
      source: huggingface
      huggingface_repo_id: Qwen/Qwen3-ASR-0.6B
      backend: vLLM
      # The category is repeated on the spec itself, not only on the entry.
      categories:
        - speech_to_text
# Dia-1.6B: text_to_speech entry with one FP32 spec on the VoxBox backend,
# pulled from nari-labs/Dia-1.6B, with an explicit VRAM claim.
- name: Dia-1.6B
  description: >-
    Dia is a text-to-speech model created by Nari Labs. Dia directly generates
    highly realistic dialogue from a transcript. You can condition the output
    on audio, enabling emotion and tone control. The model can also produce
    nonverbal communications like laughter, coughing, clearing throat, etc.
  home: https://narilabs.org
  icon: /static/catalog_icons/narilabs.png
  size: 1.6
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2025-04-21"
  specs:
    - mode: standard
      quantization: FP32
      source: huggingface
      huggingface_repo_id: nari-labs/Dia-1.6B
      backend: VoxBox
      # Explicit VRAM claim; the rationale is in the inline comment.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, Dia model empirical estimate.
# Qwen3-TTS-12Hz-1.7B-Base: text_to_speech entry served by vLLM with --omni;
# two specs are built from one shared base mapping.
# NOTE(review): the model name says 12Hz while the description says 12kHz --
# confirm which is intended.
- name: Qwen3-TTS-12Hz-1.7B-Base
  description: >-
    Qwen3-TTS-12Hz-1.7B-Base is a text-to-speech model from the Qwen3-TTS
    series with 1.7B parameters, supporting 12kHz audio generation.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 1.7
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2026-01-22"
  # Spec fields shared by both items under `specs` (merged with `<<`).
  .base_spec: &qwen3_tts_12hz_1_7b_base_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-1.7B-Base
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_1_7b_base_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_1_7b_base_base_spec
      backend_version: *vllm_omni_stable_version
# Qwen3-TTS-12Hz-1.7B-CustomVoice: text_to_speech entry served by vLLM with
# --omni; two specs are built from one shared base mapping.
# NOTE(review): the model name says 12Hz while the description says 12kHz --
# confirm which is intended.
- name: Qwen3-TTS-12Hz-1.7B-CustomVoice
  description: >-
    Qwen3-TTS-12Hz-1.7B-CustomVoice is a text-to-speech model from the
    Qwen3-TTS series with 1.7B parameters, supporting custom voice cloning and
    12kHz audio generation.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 1.7
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2026-01-22"
  # Spec fields shared by both items under `specs` (merged with `<<`).
  .base_spec: &qwen3_tts_12hz_1_7b_customvoice_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec
      backend_version: *vllm_omni_stable_version
# Qwen3-TTS-12Hz-1.7B-VoiceDesign: text_to_speech entry served by vLLM with
# --omni; two specs are built from one shared base mapping.
# NOTE(review): the model name says 12Hz while the description says 12kHz --
# confirm which is intended.
- name: Qwen3-TTS-12Hz-1.7B-VoiceDesign
  description: >-
    Qwen3-TTS-12Hz-1.7B-VoiceDesign is a text-to-speech model from the
    Qwen3-TTS series with 1.7B parameters, supporting voice design and 12kHz
    audio generation.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 1.7
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2026-01-22"
  # Spec fields shared by both items under `specs` (merged with `<<`).
  .base_spec: &qwen3_tts_12hz_1_7b_voicedesign_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec
      backend_version: *vllm_omni_stable_version
# Qwen3-TTS-12Hz-0.6B-Base: text_to_speech entry served by vLLM with --omni;
# two specs are built from one shared base mapping.
# NOTE(review): the model name says 12Hz while the description says 12kHz --
# confirm which is intended.
- name: Qwen3-TTS-12Hz-0.6B-Base
  description: >-
    Qwen3-TTS-12Hz-0.6B-Base is a text-to-speech model from the Qwen3-TTS
    series with 0.6B parameters, supporting 12kHz audio generation.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 0.6
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2026-01-22"
  # Spec fields shared by both items under `specs` (merged with `<<`).
  .base_spec: &qwen3_tts_12hz_0_6b_base_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-0.6B-Base
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_0_6b_base_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_0_6b_base_base_spec
      backend_version: *vllm_omni_stable_version
# Qwen3-TTS-12Hz-0.6B-CustomVoice: text_to_speech entry served by vLLM with
# --omni; two specs are built from one shared base mapping.
# NOTE(review): the model name says 12Hz while the description says 12kHz --
# confirm which is intended.
- name: Qwen3-TTS-12Hz-0.6B-CustomVoice
  description: >-
    Qwen3-TTS-12Hz-0.6B-CustomVoice is a text-to-speech model from the
    Qwen3-TTS series with 0.6B parameters, supporting custom voice cloning and
    12kHz audio generation.
  home: https://qwen.ai
  icon: /static/catalog_icons/qwen.png
  size: 0.6
  categories:
    - text_to_speech
  licenses:
    - apache-2.0
  release_date: "2026-01-22"
  # Spec fields shared by both items under `specs` (merged with `<<`).
  .base_spec: &qwen3_tts_12hz_0_6b_customvoice_base_spec
    mode: standard
    quantization: BF16
    source: huggingface
    huggingface_repo_id: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
    backend: vLLM
    backend_parameters:
      - --omni
  specs:
    # ascend-filtered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec
      gpu_filters:
        vendor: ascend
      backend_version: *vllm_omni_ascend_stable_version
    # Unfiltered spec; version alias is anchored outside this section.
    - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec
      backend_version: *vllm_omni_stable_version
# SenseVoice-Small: speech_to_text entry with one FP16 spec on the VoxBox
# backend, pulled from FunAudioLLM/SenseVoiceSmall.
# NOTE(review): this entry has no `size` field, unlike the other entries in
# this section -- confirm the omission is intended.
- name: SenseVoice-Small
  description: SenseVoice is a speech foundation model with multiple speech understanding capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and audio event detection (AED).
  home: https://github.com/FunAudioLLM
  icon: /static/catalog_icons/FunAudioLLM.png
  categories:
    - speech_to_text
  licenses:
    - apache-2.0
  release_date: "2024-07-31"
  specs:
    - mode: standard
      quantization: FP16
      source: huggingface
      huggingface_repo_id: FunAudioLLM/SenseVoiceSmall
      backend: VoxBox
      # Explicit VRAM claim; per the inline comment it is sized for roughly
      # 10 minutes of audio input.
      env:
        GPUSTACK_MODEL_VRAM_CLAIM: "12884901888" # 12 GiB, it depends on the audio length. This value works for ~10 minutes audio input.