# YAML Variables .vllm_omni_ascend_stable_version: &vllm_omni_ascend_stable_version "0.14.1" .vllm_omni_stable_version: &vllm_omni_stable_version "0.16.0" draft_models: - name: Qwen3-8B-EAGLE3 algorithm: eagle3 source: model_scope model_scope_model_id: gpustack/qwen3_8b_eagle3 - name: Qwen3-30B-A3B-EAGLE3 algorithm: eagle3 source: model_scope model_scope_model_id: gpustack/qwen3_30b_moe_eagle3 - name: Qwen3-235B-A22B-EAGLE3 algorithm: eagle3 source: model_scope model_scope_model_id: gpustack/Qwen3-235B-A22B-EAGLE3 - name: gpt-oss-120b-EAGLE3 algorithm: eagle3 source: model_scope model_scope_model_id: gpustack/EAGLE3-gpt-oss-120b-bf16 model_sets: - name: Qwen3-0.6B description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 0.6 categories: - llm capabilities: - context/128K - tools licenses: - apache-2.0 release_date: "2025-04-19" specs: # Ascend NPUs - mode: throughput quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-0.6B backend: MindIE backend_parameters: - --max-seq-len=8192 # Other GPUs - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-0.6B backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=8192 - name: Qwen3-8B description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 8 categories: - llm capabilities: - context/128K - tools licenses: - apache-2.0 release_date: "2025-04-19" specs: # Ascend NPUs - mode: throughput quantization: W8A8 gpu_filters: vendor: ascend vendor_variant: "910b" source: model_scope model_scope_model_id: vllm-ascend/Qwen3-8B-W8A8 backend: MindIE backend_parameters: - --enable-prefix-caching - --max-seq-len=32768 - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-8B backend: MindIE backend_parameters: - --max-seq-len=32768 # Other GPUs - mode: throughput quantization: FP8 source: model_scope model_scope_model_id: Qwen/Qwen3-8B-FP8 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=32768 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-8B backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=32768 - name: Falcon-H1R-7B description: Falcon-H1R-7B is a reasoning-specialized language model built on top of Falcon-H1-7B-Base, featuring a Hybrid-Head Language Model (Transformer-SSM) architecture that delivers outstanding performance in mathematics, programming, and instruction following. home: https://huggingface.co/tiiuae icon: /static/catalog_icons/tii.png size: 7 categories: - llm capabilities: - context/256K licenses: - falcon-llm-license release_date: "2026-01-05" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: tiiuae/Falcon-H1R-7B backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=65536 - name: Qwen3-14B description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. 
Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 14 categories: - llm capabilities: - context/128K - tools licenses: - apache-2.0 release_date: "2025-04-19" specs: # Ascend NPUs - mode: throughput quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-14B backend: MindIE backend_parameters: - --max-seq-len=32768 # Other GPUs - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: Qwen/Qwen3-14B-FP8 backend: SGLang backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: "<9.0" # Before Hopper source: model_scope model_scope_model_id: Qwen/Qwen3-14B-FP8 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=32768 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-14B backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=32768 - name: Qwen3-32B description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 32 categories: - llm capabilities: - context/128K - tools licenses: - apache-2.0 release_date: "2025-04-19" specs: # Ascend NPUs - mode: throughput quantization: W8A8 gpu_filters: vendor: ascend vendor_variant: "910b" source: model_scope model_scope_model_id: vllm-ascend/Qwen3-32B-W8A8 backend: MindIE backend_parameters: - --enable-prefix-caching - --max-seq-len=32768 - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-32B backend: MindIE backend_parameters: - --max-seq-len=32768 # Other GPUs - mode: throughput quantization: FP8 source: model_scope model_scope_model_id: Qwen/Qwen3-32B-FP8 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=32768 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-32B backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --max-model-len=32768 - name: Qwen3-Coder-Next description: Qwen3-Coder-Next is a super-efficient coding model with 80B total parameters and 3B activated parameters (MoE architecture). It achieves performance comparable to models with 10-20x more active parameters, excelling at long-horizon reasoning, complex tool usage, and IDE integration. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 80 activated_size: 3 categories: - llm capabilities: - context/256K - tools licenses: - apache-2.0 release_date: "2026-02-03" specs: - mode: throughput quantization: FP8 source: model_scope model_scope_model_id: Qwen/Qwen3-Coder-Next-FP8 backend: vLLM backend_parameters: - --max-model-len=65536 - --enable-auto-tool-choice - --tool-call-parser=qwen3_coder - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-Coder-Next backend: vLLM backend_parameters: - --max-model-len=65536 - --enable-auto-tool-choice - --tool-call-parser=qwen3_coder - name: Qwen3-30B-A3B-Instruct-2507 description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 30 activated_size: 3 categories: - llm capabilities: - context/256K - tools licenses: - apache-2.0 release_date: "2025-07-21" specs: # Ascend NPUs - mode: throughput quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507 backend: MindIE backend_parameters: - --max-seq-len=32768 # Other GPUs - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 backend: SGLang backend_parameters: - --tool-call-parser=qwen25 - --context-length=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: "<9.0" # Before Hopper source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507-FP8 backend: vLLM backend_parameters: - --tool-call-parser=hermes - --enable-auto-tool-choice - 
--max-model-len=32768 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Instruct-2507 backend: vLLM backend_parameters: - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=32768 - name: Qwen3-30B-A3B-Thinking-2507 description: Qwen3 is a family of large language models in Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models. Built upon extensive training, Qwen3 delivers groundbreaking advancements in reasoning, instruction-following, agent capabilities, and multilingual support. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 30 activated_size: 3 categories: - llm capabilities: - context/256K - tools licenses: - apache-2.0 release_date: "2025-07-21" specs: # Ascend NPUs - mode: throughput quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507 backend: MindIE backend_parameters: - --max-seq-len=32768 # Other GPUs - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 backend: SGLang backend_parameters: - --reasoning-parser=deepseek-r1 - --tool-call-parser=qwen25 - --context-length=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: "<9.0" # Before Hopper source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507-FP8 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=32768 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-30B-A3B-Thinking-2507 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=32768 - name: Qwen3-235B-A22B-Instruct-2507 description: The updated version of 
the Qwen3-235B-A22B non-thinking mode. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 235 activated_size: 22 categories: - llm capabilities: - context/1M - tools licenses: - apache-2.0 release_date: "2025-07-21" specs: # Ascend NPUs - mode: throughput quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-235B-A22B-Instruct-2507 backend: MindIE backend_parameters: - --max-seq-len=65536 # Other GPUs - mode: throughput quantization: FP8 source: model_scope model_scope_model_id: Qwen/Qwen3-235B-A22B-Instruct-2507-FP8 backend: vLLM backend_parameters: - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=65536 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-235B-A22B-Instruct-2507 backend: vLLM backend_parameters: - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=65536 - name: Qwen3-235B-A22B-Thinking-2507 description: The updated version of the Qwen3-235B-A22B thinking mode. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 235 activated_size: 22 categories: - llm capabilities: - context/1M - tools licenses: - apache-2.0 release_date: "2025-07-21" specs: # Ascend NPUs - mode: throughput quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3-235B-A22B-Thinking-2507 backend: MindIE backend_parameters: - --max-seq-len=65536 # Other GPUs - mode: throughput quantization: FP8 source: model_scope model_scope_model_id: Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=65536 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-235B-A22B-Thinking-2507 backend: vLLM backend_parameters: - --reasoning-parser=deepseek_r1 - --tool-call-parser=hermes - --enable-auto-tool-choice - --max-model-len=65536 - name: Qwen3.5-0.8B description: Qwen3.5-0.8B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 0.8 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-03-02" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-0.8B backend: SGLang backend_version: 0.5.9 backend_parameters: - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-0.8B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - name: Qwen3.5-2B description: Qwen3.5-2B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 2 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-03-02" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-2B backend: SGLang backend_version: 0.5.9 backend_parameters: - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-2B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - name: Qwen3.5-4B description: Qwen3.5-4B is a compact language model from the Qwen family, designed for efficient reasoning, coding, and multilingual understanding across diverse tasks. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 4 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-03-02" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-4B backend: SGLang backend_version: 0.5.9 backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-4B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - name: Qwen3.5-9B description: Qwen3.5-9B is a model from the Qwen family, designed for strong reasoning, coding, and multilingual understanding with competitive performance across a wide range of tasks. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 9 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-03-02" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-9B backend: SGLang backend_version: 0.5.9 backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: throughput quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-9B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - --performance-mode=throughput - --enable-prefix-caching - mode: latency quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-9B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - 
--max-model-len=32768 - --performance-mode=interactivity - --language-model-only speculative_config: enabled: true algorithm: mtp num_draft_tokens: 1 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-9B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - name: Qwen3.5-27B description: Qwen3.5-27B is a model designed for strong reasoning, coding, and multilingual understanding with competitive performance across a wide range of tasks. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 27 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-02-24" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-27B backend: SGLang backend_version: 0.5.9 backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-27B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" source: model_scope model_scope_model_id: Qwen/Qwen3.5-27B-FP8 backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - --performance-mode=throughput - --enable-prefix-caching - name: Qwen3.5-35B-A3B description: Qwen3.5-35B-A3B is a 35-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 35 activated_size: 3 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-02-24" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-35B-A3B backend: SGLang backend_version: 0.5.9 backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-35B-A3B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" source: model_scope model_scope_model_id: Qwen/Qwen3.5-35B-A3B-FP8 backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - --performance-mode=throughput - --enable-prefix-caching - mode: latency quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" source: model_scope model_scope_model_id: Qwen/Qwen3.5-35B-A3B-FP8 backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 speculative_config: enabled: true algorithm: mtp num_draft_tokens: 1 - name: Qwen3.5-122B-A10B description: Qwen3.5-122B-A10B is a 122-billion-parameter open-source large language model from the Qwen family, designed for strong reasoning, code generation, and multilingual understanding across diverse tasks. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 122 activated_size: 10 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-02-24" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-122B-A10B backend: SGLang backend_version: 0.5.9 backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-122B-A10B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" source: model_scope model_scope_model_id: Qwen/Qwen3.5-122B-A10B-FP8 backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - --performance-mode=throughput - --enable-prefix-caching - name: Qwen3.5-397B-A17B description: Qwen3.5-397B-A17B is a flagship MoE-hybrid model that delivers state-of-the-art reasoning and multimodal performance with ultra-efficient inference capabilities. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 397 activated_size: 17 categories: - llm capabilities: - context/256K - reasoning - tools - vision licenses: - apache-2.0 release_date: "2026-02-16" specs: # Ascend NPUs - mode: standard quantization: BF16 gpu_filters: vendor: ascend source: model_scope model_scope_model_id: Qwen/Qwen3.5-397B-A17B backend: SGLang backend_version: 0.5.9 backend_parameters: - --reasoning-parser=qwen3 - --context-length=32768 - --disable-radix-cache - --chunked-prefill-size=4096 - --max-prefill-tokens=4096 - --max-total-tokens=40960 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3.5-397B-A17B backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" source: model_scope model_scope_model_id: Qwen/Qwen3.5-397B-A17B-FP8 backend: vLLM backend_version: 0.17.1 backend_parameters: - --reasoning-parser=qwen3 - --max-model-len=32768 - --performance-mode=throughput - --enable-prefix-caching - name: GLM-4.7 description: GLM-4.7 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities. 
home: https://z.ai icon: /static/catalog_icons/zai.png size: 355 activated_size: 32 categories: - llm capabilities: - context/1M - reasoning - tools licenses: - mit release_date: "2025-12-22" specs: # TODO: tool-call-parser glm47 not yet available in the latest vLLM/SGLang release - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: ZhipuAI/GLM-4.7-FP8 backend: SGLang backend_parameters: - --reasoning-parser=glm45 - --context-length=65536 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: "<9.0" # Before Hopper source: model_scope model_scope_model_id: ZhipuAI/GLM-4.7-FP8 backend: vLLM backend_parameters: - --reasoning-parser=glm45 - --max-model-len=65536 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: ZhipuAI/GLM-4.7 backend: vLLM backend_parameters: - --reasoning-parser=glm45 - --max-model-len=65536 - name: GLM-4.6 description: GLM-4.6 is a large language model developed by Zhipu AI, featuring advanced agentic, reasoning, and coding capabilities. 
home: https://z.ai icon: /static/catalog_icons/zai.png size: 355 activated_size: 32 categories: - llm capabilities: - context/1M - reasoning - tools licenses: - mit release_date: "2025-09-30" specs: - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: ZhipuAI/GLM-4.6-FP8 backend: SGLang backend_parameters: - --tool-call-parser=glm - --reasoning-parser=glm45 - --context-length=65536 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: "<9.0" # Before Hopper source: model_scope model_scope_model_id: ZhipuAI/GLM-4.6-FP8 backend: vLLM backend_parameters: - --reasoning-parser=glm45 - --tool-call-parser=glm45 - --enable-auto-tool-choice - --max-model-len=65536 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: ZhipuAI/GLM-4.6 backend: vLLM backend_parameters: - --reasoning-parser=glm45 - --tool-call-parser=glm45 - --enable-auto-tool-choice - --max-model-len=65536 - name: gpt-oss-120b description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases. 
home: https://openai.com icon: /static/catalog_icons/openai.png categories: - llm capabilities: - context/128K size: 120 licenses: - apache-2.0 release_date: "2025-08-05" specs: - mode: throughput quantization: "MXFP4" source: model_scope model_scope_model_id: openai-mirror/gpt-oss-120b backend: vLLM backend_parameters: - --max-model-len=32768 - --tool-call-parser=openai - --enable-auto-tool-choice - --async-scheduling - mode: standard quantization: "MXFP4" source: model_scope model_scope_model_id: openai-mirror/gpt-oss-120b backend: vLLM backend_parameters: - --max-model-len=32768 - --tool-call-parser=openai - --enable-auto-tool-choice - name: gpt-oss-20b description: The gpt-oss series is OpenAI's family of open-weight models, designed for powerful reasoning, agentic tasks, and versatile developer use cases. home: https://openai.com icon: /static/catalog_icons/openai.png categories: - llm capabilities: - context/128K size: 20 licenses: - apache-2.0 release_date: "2025-08-05" specs: - mode: throughput quantization: "MXFP4" source: model_scope model_scope_model_id: openai-mirror/gpt-oss-20b backend: vLLM backend_parameters: - --max-model-len=32768 - --tool-call-parser=openai - --enable-auto-tool-choice - --async-scheduling - mode: standard quantization: "MXFP4" source: model_scope model_scope_model_id: openai-mirror/gpt-oss-20b backend: vLLM backend_parameters: - --max-model-len=32768 - --tool-call-parser=openai - --enable-auto-tool-choice - name: Deepseek-R1-0528 description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro. 
home: https://www.deepseek.com icon: /static/catalog_icons/deepseek.png categories: - llm capabilities: - context/128K size: 671 licenses: - mit release_date: "2025-05-28" specs: - mode: throughput quantization: W8A8 gpu_filters: vendor: ascend vendor_variant: "910b" source: model_scope model_scope_model_id: gpustack/DeepSeek-R1-0528-w8a8 backend: MindIE backend_parameters: - --max-seq-len=32768 - --npu-memory-fraction=0.95 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-R1-0528 backend: SGLang backend_parameters: - --enable-dp-attention - --context-length=32768 - mode: standard quantization: FP8 source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-R1-0528 backend: vLLM backend_parameters: - --max-model-len=32768 - name: DeepSeek-OCR description: DeepSeek-OCR is an advanced optical character recognition (OCR) model developed by DeepSeek AI. It is designed to accurately extract text from images and scanned documents. home: https://www.deepseek.com icon: /static/catalog_icons/deepseek.png size: 3 categories: - llm licenses: - mit release_date: "2025-10-20" specs: - mode: standard quantization: "BF16" gpu_filters: vendor: - nvidia - amd source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-OCR backend: vLLM backend_version: 0.11.2 backend_parameters: - --logits_processors=vllm.model_executor.models.deepseek_ocr:NGramPerReqLogitsProcessor - --no-enable-prefix-caching - --mm-processor-cache-gb=0 - name: PaddleOCR-VL-1.5 description: PaddleOCR-VL-1.5 is an advanced optical character recognition (OCR) vision-language model developed by PaddlePaddle. It is designed to accurately extract and understand text from images and documents. 
home: https://www.paddleocr.com icon: /static/catalog_icons/paddlepaddle.jpeg size: 0.9 categories: - llm capabilities: - vision licenses: - apache-2.0 release_date: "2026-01-29" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: PaddlePaddle/PaddleOCR-VL-1.5 backend: vLLM backend_parameters: - --trust-remote-code - --max-num-batched-tokens=16384 - --no-enable-prefix-caching - --mm-processor-cache-gb=0 - name: LightOnOCR-2-1B description: LightOnOCR-2-1B is an efficient end-to-end vision-language model for optical character recognition (OCR), converting documents (PDFs, scans, images) into clean, naturally ordered text. It achieves state-of-the-art performance on OlmOCR-Bench while being significantly faster and more cost-effective than competitors. home: https://www.lighton.ai icon: /static/catalog_icons/lighton.png size: 1 categories: - llm capabilities: - vision licenses: - apache-2.0 release_date: "2026-01-19" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: lightonai/LightOnOCR-2-1B backend: vLLM backend_parameters: - '--limit-mm-per-prompt={"image": 1}' - --mm-processor-cache-gb=0 - --no-enable-prefix-caching - name: Deepseek-V3.2 description: 'DeepSeek-V3.2 is a model that balances computational efficiency with strong reasoning and agent capabilities through three technical innovations: DeepSeek Sparse Attention (DSA), Scalable Reinforcement Learning Framework, Large-Scale Agentic Task Synthesis Pipeline.' 
home: https://www.deepseek.com icon: /static/catalog_icons/deepseek.png categories: - llm capabilities: - context/128K size: 685 licenses: - mit release_date: "2025-12-01" specs: - mode: throughput quantization: W8A8 gpu_filters: vendor: ascend vendor_variant: "910b" source: model_scope model_scope_model_id: vllm-ascend/DeepSeek-V3.2-W8A8 backend: vLLM backend_version: 0.14.1 backend_parameters: - --max-model-len=65536 - --gpu-memory-utilization=0.92 - --no-enable-prefix-caching - --trust-remote-code - --max-num-seqs=16 - '--compilation-config={"cudagraph_mode": "FULL_DECODE_ONLY"}' - --tensor-parallel-size=8 - --data-parallel-size=2 - --data-parallel-size-local=1 - --enable-expert-parallel - --quantization=ascend - --tokenizer-mode=deepseek_v32 - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-V3.2 backend: SGLang backend_version: 0.5.6.post2 backend_parameters: - --enable-dp-attention - --context-length=65536 - --reasoning-parser=deepseek-v3 - --tool-call-parser=deepseek_v32 - --chat-template={data_dir}/chat_templates/tool_chat_template_deepseekv32.jinja - mode: standard quantization: FP8 source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-V3.2 backend: vLLM backend_version: 0.13.0 backend_parameters: - --max-model-len=65536 - --tokenizer-mode=deepseek_v32 - --reasoning-parser=deepseek_v3 - --tool-call-parser=deepseek_v32 - --enable-auto-tool-choice - name: Deepseek-V3.2-Speciale description: This model is the high-compute variant of DeepSeek-V3.2, surpasses GPT-5 and matches Gemini-3.0-Pro in reasoning, achieving gold-medal level performance in the 2025 IMO and IOI competitions. 
home: https://www.deepseek.com icon: /static/catalog_icons/deepseek.png categories: - llm capabilities: - context/128K size: 685 licenses: - mit release_date: "2025-12-01" specs: - mode: throughput quantization: FP8 gpu_filters: vendor: nvidia compute_capability: ">=9.0" # Hopper or later source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-V3.2-Speciale backend: SGLang backend_version: 0.5.6.post2 backend_parameters: - --enable-dp-attention - --context-length=65536 - --reasoning-parser=deepseek-v3 - mode: standard quantization: FP8 source: model_scope model_scope_model_id: deepseek-ai/DeepSeek-V3.2-Speciale backend: vLLM backend_version: 0.13.0 backend_parameters: - --max-model-len=65536 - --tokenizer-mode=deepseek_v32 - --reasoning-parser=deepseek_v3 - name: MiniMax-M2.1 description: MiniMax-M2.1 is a high-performance agentic model, optimized for robustness in coding, tool use, instruction following, and long-horizon planning. It excels in multilingual software development and complex multi-step workflows. home: https://www.minimax.io icon: /static/catalog_icons/minimax.png size: 230 activated_size: 10 categories: - llm capabilities: - context/192K - tools licenses: - modified-mit release_date: "2025-12-23" specs: - mode: standard quantization: FP8 source: model_scope model_scope_model_id: MiniMax/MiniMax-M2.1 backend: vLLM backend_parameters: - --max-model-len=65536 - --reasoning-parser=minimax_m2_append_think - --tool-call-parser=minimax_m2 - --enable-auto-tool-choice - --trust-remote-code - name: MiniMax-M2.5 description: MiniMax-M2.5 is a powerful MoE (Mixture-of-Experts) model that delivers exceptional performance in logical reasoning, coding, and complex agent tasks through highly efficient inference. 
home: https://www.minimax.io icon: /static/catalog_icons/minimax.png size: 230 activated_size: 10 categories: - llm capabilities: - context/196K - reasoning - tools licenses: - modified-mit release_date: "2026-02-12" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: MiniMax/MiniMax-M2.5 backend: vLLM backend_parameters: - --max-model-len=65536 - --reasoning-parser=minimax_m2_append_think - --tool-call-parser=minimax_m2 - --enable-auto-tool-choice - --trust-remote-code - --enable-expert-parallel - name: Kimi-K2.5 description: Kimi-K2.5 is a multimodal mixture-of-experts model with 1T total parameters and 32B activated parameters. It features native INT4 quantization, vision support, dual operating modes (thinking/instant), agent swarm capabilities, and excels at visual reasoning, coding with vision, and complex tool orchestration. home: https://www.moonshot.ai icon: /static/catalog_icons/kimi.png size: 1 size_unit: T activated_size: 32 categories: - llm capabilities: - context/256K - vision - tools licenses: - modified-mit release_date: "2026-01-26" specs: - mode: standard quantization: INT4 source: model_scope model_scope_model_id: MoonshotAI/Kimi-K2.5 backend: vLLM backend_parameters: - --max-model-len=65536 - --mm-encoder-tp-mode=data - --tool-call-parser=kimi_k2 - --enable-auto-tool-choice - --reasoning-parser=kimi_k2 - --trust-remote-code - name: Step-3.5-Flash description: Step-3.5-Flash is a fast, cost-effective multimodal model with 196B total parameters and 11B active parameters (MoE), optimized for quick inference. Built on StepFun's Step3 architecture, it delivers strong performance across text and vision tasks with efficient token usage.
home: https://www.stepfun.com icon: /static/catalog_icons/stepfun.png size: 196 activated_size: 11 categories: - llm capabilities: - context/256K - tools licenses: - apache-2.0 release_date: "2026-02-02" specs: - mode: throughput quantization: FP8 source: model_scope model_scope_model_id: stepfun-ai/Step-3.5-Flash-FP8 backend: vLLM backend_parameters: - --max-model-len=65536 - --disable-cascade-attn - --reasoning-parser=step3p5 - --enable-auto-tool-choice - --tool-call-parser=step3p5 - --trust-remote-code - --quantization=fp8 - mode: standard quantization: BF16 source: model_scope model_scope_model_id: stepfun-ai/Step-3.5-Flash backend: vLLM backend_parameters: - --max-model-len=65536 - --disable-cascade-attn - --reasoning-parser=step3p5 - --enable-auto-tool-choice - --tool-call-parser=step3p5 - --trust-remote-code - name: Nanbeige4.1-3B description: Nanbeige4.1-3B is a 3B-parameter language model from Nanbeige LLM Lab, optimized for long-context reasoning, agentic tasks, and tool use. home: https://modelscope.cn/organization/nanbeige icon: /static/catalog_icons/nanbeige.png size: 3 categories: - llm capabilities: - context/256K - reasoning - tools licenses: - apache-2.0 release_date: "2026-02-13" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: nanbeige/Nanbeige4.1-3B backend: vLLM backend_parameters: - --max-model-len=32768 # Embedding models - name: Qwen3-Embedding-0.6B description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 0.6 categories: - embedding capabilities: - dimensions/1024 - max_tokens/32K licenses: - apache-2.0 release_date: "2025-06-09" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-Embedding-0.6B categories: - embedding backend: vLLM - name: Qwen3-Embedding-4B description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 4 categories: - embedding capabilities: - dimensions/2560 - max_tokens/32K licenses: - apache-2.0 release_date: "2025-06-09" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-Embedding-4B categories: - embedding backend: vLLM - name: Qwen3-Embedding-8B description: Qwen3-Embedding is a multilingual embedding model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 8 categories: - embedding capabilities: - dimensions/4096 - max_tokens/32K licenses: - apache-2.0 release_date: "2025-06-09" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-Embedding-8B categories: - embedding backend: vLLM - name: Qwen3-VL-Embedding-2B description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning with 30+ languages support.
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 2 categories: - embedding capabilities: - vision - dimensions/2048 - max_tokens/32K licenses: - apache-2.0 release_date: "2026-01-08" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-VL-Embedding-2B categories: - embedding backend: vLLM backend_parameters: - --runner=pooling - name: Qwen3-VL-Embedding-8B description: Qwen3-VL-Embedding is a multimodal embedding model series optimized for multimodal retrieval, clustering, and classification. It supports image-text retrieval and unified multimodal representation learning with 30+ languages support. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 8 categories: - embedding capabilities: - vision - dimensions/4096 - max_tokens/32K licenses: - apache-2.0 release_date: "2026-01-08" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-VL-Embedding-8B categories: - embedding backend: vLLM backend_parameters: - --runner=pooling - name: BGE-M3 description: BGE-M3 is a new model from BAAI distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity. home: https://bge-model.com icon: /static/catalog_icons/bge_logo.jpeg categories: - embedding capabilities: - dimensions/1024 - max_tokens/8192 size: 567 size_unit: M licenses: - mit release_date: "2024-01-28" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: BAAI/BGE-M3 categories: - embedding backend: vLLM - name: BGE-Large-ZH-V1.5 description: BGE is short for BAAI general embedding. This is a Chinese text embedding model with more reasonable similarity distribution. 
home: https://bge-model.com icon: /static/catalog_icons/bge_logo.jpeg categories: - embedding capabilities: - dimensions/1024 - max_tokens/512 size: 335 size_unit: M licenses: - mit release_date: "2023-09-12" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: BAAI/bge-large-zh-v1.5 categories: - embedding backend: vLLM - name: BGE-Large-EN-V1.5 description: BGE is short for BAAI general embedding. This is an English text embedding model with more reasonable similarity distribution. home: https://bge-model.com icon: /static/catalog_icons/bge_logo.jpeg categories: - embedding capabilities: - dimensions/1024 - max_tokens/512 size: 335 size_unit: M licenses: - mit release_date: "2023-09-12" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: BAAI/bge-large-en-v1.5 categories: - embedding backend: vLLM - name: Nomic-Embed-Text-V1.5 description: Nomic-embed-text is a large context length text encoder that surpasses OpenAI text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks. home: https://nomic.ai icon: /static/catalog_icons/nomic.png categories: - embedding capabilities: - dimensions/768 - max_tokens/8192 size: 137 size_unit: M licenses: - apache-2.0 release_date: "2024-02-14" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: nomic-ai/nomic-embed-text-v1.5 categories: - embedding backend: vLLM backend_parameters: - --trust-remote-code - name: Jina-Embeddings-V3 description: jina-embeddings-v3 is a multilingual multi-task text embedding model designed for a variety of NLP applications. Based on the Jina-XLM-RoBERTa architecture, this model supports Rotary Position Embeddings to handle long input sequences up to 8192 tokens. 
home: https://jina.ai icon: /static/catalog_icons/jina.png categories: - embedding capabilities: - dimensions/1024 - max_tokens/8192 size: 570 size_unit: M licenses: - cc-by-nc-4.0 release_date: "2024-09-18" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: jinaai/jina-embeddings-v3 categories: - embedding backend: vLLM backend_parameters: - --trust-remote-code # Reranker models - name: Qwen3-Reranker-0.6B description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 0.6 categories: - reranker capabilities: - max_tokens/32K licenses: - apache-2.0 release_date: "2025-06-09" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-Reranker-0.6B categories: - reranker backend: vLLM backend_parameters: - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}' - name: Qwen3-Reranker-4B description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 4 categories: - reranker capabilities: - max_tokens/32K licenses: - apache-2.0 release_date: "2025-06-09" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-Reranker-4B categories: - reranker backend: vLLM backend_parameters: - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}' - name: Qwen3-Reranker-8B description: Qwen3-Reranker is a multilingual text reranking model series optimized for retrieval, clustering, classification, and bitext mining. It supports 100+ languages, with flexible vector dimensions and instruction tuning. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 8 categories: - reranker capabilities: - max_tokens/32K licenses: - apache-2.0 release_date: "2025-06-09" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-Reranker-8B categories: - reranker backend: vLLM backend_parameters: - '--hf_overrides={"architectures":["Qwen3ForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}' - name: Qwen3-VL-Reranker-2B description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers. 
home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 2 categories: - reranker capabilities: - vision - max_tokens/32K licenses: - apache-2.0 release_date: "2026-01-08" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-Reranker-2B categories: - reranker backend: vLLM backend_parameters: - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}' - name: Qwen3-VL-Reranker-8B description: Qwen3-VL-Reranker is a multimodal text reranking model series optimized for multimodal retrieval, clustering, classification, and bitext mining. It consistently outperforms the base embedding model and baseline rerankers, with the 8B model showing particularly strong results. home: https://qwenlm.github.io icon: /static/catalog_icons/qwen.png size: 8 categories: - reranker capabilities: - vision - max_tokens/32K licenses: - apache-2.0 release_date: "2026-01-08" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-Reranker-8B categories: - reranker backend: vLLM backend_parameters: - '--hf_overrides={"architectures":["Qwen3VLForSequenceClassification"],"classifier_from_token":["no","yes"],"is_original_qwen3_reranker":true}' - name: BGE-Reranker-V2-M3 description: BGE-Reranker-V2-M3 is a reranker model from BAAI. home: https://bge-model.com icon: /static/catalog_icons/bge_logo.jpeg categories: - reranker size: 568 size_unit: M licenses: - apache-2.0 release_date: "2024-03-19" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: BAAI/bge-reranker-v2-m3 categories: - reranker backend: vLLM - name: Jina-Reranker-M0 description: Jina-Reranker-M0 is a multilingual multimodal document reranker model with 2.4B parameters. It accepts a query alongside visually rich documents and outputs ranked documents by relevance. 
Supports 29 languages and multimodal content including text, figures, tables, and infographics. home: https://jina.ai icon: /static/catalog_icons/jina.png size: 2.4 categories: - reranker capabilities: - max_tokens/10K - vision licenses: - cc-by-nc-4.0 release_date: "2025-04-08" specs: - mode: standard quantization: "BF16" source: model_scope model_scope_model_id: jinaai/jina-reranker-m0 backend: vLLM # Image models - name: FLUX.1-dev description: FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. home: https://blackforestlabs.ai icon: /static/catalog_icons/blackforestlabs.png size: 12 categories: - image licenses: - flux-1-dev-non-commercial-license release_date: "2024-08-02" specs: - mode: standard quantization: "BF16" gpu_filters: vendor: nvidia source: model_scope model_scope_model_id: black-forest-labs/FLUX.1-dev backend: SGLang backend_version: 0.5.6.post2 env: GPUSTACK_MODEL_VRAM_CLAIM: "37580963840" # 35 GiB, observed empirically - name: FLUX.2-klein-4B description: FLUX.2-klein-4B is a 4 billion parameter image generation model from Black Forest Labs. home: https://blackforestlabs.ai icon: /static/catalog_icons/blackforestlabs.png size: 4 categories: - image licenses: - apache-2.0 release_date: "2026-01-15" .base_spec: &flux_2_klein_4b_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: black-forest-labs/FLUX.2-klein-4B backend: vLLM backend_parameters: - --omni specs: - <<: *flux_2_klein_4b_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *flux_2_klein_4b_base_spec backend_version: *vllm_omni_stable_version - name: FLUX.2-klein-9B description: FLUX.2-klein-9B is a 9 billion parameter image generation model from Black Forest Labs. 
home: https://blackforestlabs.ai icon: /static/catalog_icons/blackforestlabs.png size: 9 categories: - image licenses: - apache-2.0 release_date: "2026-01-15" .base_spec: &flux_2_klein_9b_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: black-forest-labs/FLUX.2-klein-9B backend: vLLM backend_parameters: - --omni specs: - <<: *flux_2_klein_9b_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *flux_2_klein_9b_base_spec backend_version: *vllm_omni_stable_version - name: Qwen-Image description: Qwen-Image is an image generation foundation model in the Qwen series that achieves significant advances in complex text rendering and precise image editing. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 20 categories: - image licenses: - apache-2.0 release_date: "2025-08-04" .base_spec: &qwen_image_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen-Image backend: vLLM backend_parameters: - --omni specs: - <<: *qwen_image_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen_image_base_spec backend_version: *vllm_omni_stable_version - name: Qwen-Image-Edit description: Built upon the 20B Qwen-Image model, Qwen-Image-Edit successfully extends Qwen-Image's unique text rendering capabilities to image editing tasks, enabling precise text editing. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 20 categories: - image licenses: - apache-2.0 release_date: "2025-08-19" specs: - mode: standard quantization: "BF16" gpu_filters: vendor: nvidia source: model_scope model_scope_model_id: Qwen/Qwen-Image-Edit backend: SGLang backend_version: 0.5.6.post2 - name: Qwen-Image-2512 description: Qwen-Image-2512 is the December update of Qwen-Image's text-to-image foundational model, delivering enhanced image generation capabilities. 
home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 20 categories: - image licenses: - apache-2.0 release_date: "2025-12-30" .base_spec: &qwen_image_2512_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen-Image-2512 backend: vLLM backend_parameters: - --omni specs: - <<: *qwen_image_2512_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen_image_2512_base_spec backend_version: *vllm_omni_stable_version - name: Z-Image description: Z-Image is the foundation model of the Z-Image family, engineered for good quality, robust generative diversity, broad stylistic coverage, and precise prompt adherence. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 6 categories: - image licenses: - apache-2.0 release_date: "2026-01-28" .base_spec: &z_image_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Tongyi-MAI/Z-Image backend: vLLM backend_parameters: - --omni specs: - <<: *z_image_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *z_image_base_spec backend_version: *vllm_omni_stable_version - name: Z-Image-Turbo description: Z-Image is a powerful and highly efficient image generation model with 6B parameters. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 6 categories: - image licenses: - apache-2.0 release_date: "2025-11-27" .base_spec: &z_image_turbo_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Tongyi-MAI/Z-Image-Turbo backend: vLLM backend_parameters: - --omni env: GPUSTACK_MODEL_VRAM_CLAIM: "24696061952" # 23 GiB observed. Weight file size is 33 GiB in F32 while vLLM loads in BF16. 
specs: - <<: *z_image_turbo_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *z_image_turbo_base_spec backend_version: *vllm_omni_stable_version - name: Qwen3-VL-8B-Instruct description: Qwen3-VL-8B-Instruct is a vision-language model that delivers comprehensive upgrades across text understanding, visual perception, and reasoning capabilities, supporting image/video/text unified understanding. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 8 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-10-15" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-8B-Instruct backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-8B-Thinking description: Qwen3-VL-8B-Thinking is a vision-language model that delivers comprehensive upgrades across text understanding, visual perception, and reasoning capabilities, supporting image/video/text unified understanding with thinking mode. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 8 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-10-15" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-8B-Thinking backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-32B-Instruct description: Qwen3-VL-32B-Instruct is a vision-language model featuring superior visual intelligence, enhanced spatial awareness capabilities, and OCR functionality. 
home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 32 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-10-21" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-32B-Instruct backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-32B-Thinking description: Qwen3-VL-32B-Thinking is a vision-language model featuring superior visual intelligence, enhanced spatial awareness capabilities, and OCR functionality with thinking mode. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 32 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-10-21" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-32B-Thinking backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-30B-A3B-Instruct description: Qwen3-VL-30B-A3B-Instruct is a mixture-of-experts vision-language model with 30B total parameters and 3B active parameters, featuring advanced spatial perception, 2D and 3D grounding. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 30 activated_size: 3 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-10-05" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-30B-A3B-Instruct backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-30B-A3B-Thinking description: Qwen3-VL-30B-A3B-Thinking is a mixture-of-experts vision-language model with 30B total parameters and 3B active parameters, featuring advanced spatial perception, 2D and 3D grounding with thinking mode. 
home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 30 activated_size: 3 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-10-05" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-30B-A3B-Thinking backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-235B-A22B-Instruct description: Qwen3-VL-235B-A22B-Instruct is the largest vision-language model in the Qwen3-VL series with 235B total parameters and 22B active parameters, featuring state-of-the-art visual understanding and reasoning capabilities. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 235 activated_size: 22 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-09-23" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-235B-A22B-Instruct backend: vLLM backend_parameters: - --max-model-len=65536 - name: Qwen3-VL-235B-A22B-Thinking description: Qwen3-VL-235B-A22B-Thinking is the largest vision-language model in the Qwen3-VL series with 235B total parameters and 22B active parameters, featuring state-of-the-art visual understanding and reasoning capabilities with thinking mode. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 235 activated_size: 22 categories: - llm capabilities: - context/1M - vision licenses: - apache-2.0 release_date: "2025-09-23" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-VL-235B-A22B-Thinking backend: vLLM backend_parameters: - --max-model-len=65536 # Audio models - name: CosyVoice2-0.5B description: CosyVoice2-0.5B is a speech generation model. It supports multilingual speech synthesis with high naturalness and expressiveness. 
home: https://github.com/FunAudioLLM icon: /static/catalog_icons/FunAudioLLM.png size: 0.5 categories: - text_to_speech licenses: - apache-2.0 release_date: "2024-12-01" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: gpustack/CosyVoice2-0.5B backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate. - name: CosyVoice-300M description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba. home: https://github.com/FunAudioLLM icon: /static/catalog_icons/FunAudioLLM.png size: 300 size_unit: M categories: - text_to_speech licenses: - apache-2.0 release_date: "2024-07-05" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: gpustack/CosyVoice-300M backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate. - name: CosyVoice-300M-SFT description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba. home: https://github.com/FunAudioLLM icon: /static/catalog_icons/FunAudioLLM.png size: 300 size_unit: M categories: - text_to_speech licenses: - apache-2.0 release_date: "2024-07-05" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: iic/CosyVoice-300M-SFT backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate. - name: CosyVoice-300M-Instruct description: CosyVoice is a multi-lingual large voice generation model developed by Alibaba. home: https://github.com/FunAudioLLM icon: /static/catalog_icons/FunAudioLLM.png size: 300 size_unit: M categories: - text_to_speech licenses: - apache-2.0 release_date: "2024-07-05" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: gpustack/CosyVoice-300M-Instruct backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "3221225472" # 3 GiB, CosyVoice family empirical estimate. 
- name: Faster-Whisper-Large-V3 description: Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper Robust Speech Recognition via Large-Scale Weak Supervision by Alec Radford et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. This is the conversion of openai/whisper-large-v3 to the CTranslate2 model format. home: https://huggingface.co/Systran icon: /static/catalog_icons/systran.png size: 1.55 categories: - speech_to_text licenses: - mit release_date: "2023-11-23" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: gpustack/faster-whisper-large-v3 backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, per OpenAI Whisper large reference VRAM. - name: Faster-Whisper-Medium description: Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. This is the conversion of openai/whisper-medium to the CTranslate2 model format. home: https://huggingface.co/Systran icon: /static/catalog_icons/systran.png size: 769 size_unit: M categories: - speech_to_text licenses: - mit release_date: "2023-03-23" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: gpustack/faster-whisper-medium backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "5368709120" # 5 GiB, per OpenAI Whisper medium reference VRAM. - name: Faster-Whisper-Small description: Whisper is a pre-trained model for automatic speech recognition (ASR) and speech translation. Trained on 680k hours of labelled data, Whisper models demonstrate a strong ability to generalise to many datasets and domains without the need for fine-tuning. 
This is the conversion of openai/whisper-small to the CTranslate2 model format. home: https://huggingface.co/Systran icon: /static/catalog_icons/systran.png size: 244 size_unit: M categories: - speech_to_text licenses: - mit release_date: "2023-03-23" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: gpustack/faster-whisper-small backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "2147483648" # 2 GiB, per OpenAI Whisper small reference VRAM. - name: Whisper-Large-V3-Turbo description: Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3. In other words, it's the exact same model, except that the number of decoding layers have reduced from 32 to 4. As a result, the model is way faster, at the expense of a minor quality degradation. home: https://openai.com icon: /static/catalog_icons/openai.png size: 809 size_unit: M categories: - speech_to_text licenses: - mit release_date: "2024-10-01" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: openai/whisper-large-v3-turbo backend: vLLM - name: Whisper-Large-V3 description: Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation. Trained on 5M hours of labeled data, Whisper large-v3 demonstrates strong ability to generalise to many datasets and domains in a zero-shot setting. home: https://openai.com icon: /static/catalog_icons/openai.png size: 1.55 categories: - speech_to_text licenses: - mit release_date: "2023-11-06" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: openai/whisper-large-v3 backend: vLLM env: GPUSTACK_MODEL_VRAM_CLAIM: "4294967296" # 4 GiB. The repo stores weight files in multiple formats so explicitly set VRAM claim to avoid over-allocation. - name: Voxtral-Mini-3B-2507 description: Voxtral-Mini-3B-2507 is a speech-to-text model from Mistral AI, designed for automatic speech recognition with high accuracy and efficiency. 
home: https://mistral.ai icon: /static/catalog_icons/mistral.png size: 3 categories: - speech_to_text licenses: - apache-2.0 release_date: "2025-07-18" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: mistralai/Voxtral-Mini-3B-2507 backend: vLLM - name: Granite-Speech-3.3-2B description: Granite-Speech-3.3-2B is a speech-to-text model from IBM, part of the Granite series, designed for automatic speech recognition with strong multilingual capabilities. home: https://www.ibm.com icon: /static/catalog_icons/ibm.png size: 2 categories: - speech_to_text licenses: - apache-2.0 release_date: "2025-06-19" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: ibm-granite/granite-speech-3.3-2b backend: vLLM - name: Granite-Speech-3.3-8B description: Granite-Speech-3.3-8B is a speech-to-text model from IBM, part of the Granite series, designed for automatic speech recognition with enhanced accuracy and multilingual support. home: https://www.ibm.com icon: /static/catalog_icons/ibm.png size: 8 categories: - speech_to_text licenses: - apache-2.0 release_date: "2025-06-19" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: ibm-granite/granite-speech-3.3-8b backend: vLLM - name: Qwen3-ASR-1.7B description: Qwen3-ASR-1.7B supports language identification and ASR for 52 languages and dialects. It leverages large-scale speech training data and the strong audio understanding capability of its foundation model, Qwen3-Omni. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 1.7 categories: - speech_to_text licenses: - apache-2.0 release_date: "2026-01-29" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-ASR-1.7B backend: vLLM categories: - speech_to_text - name: Qwen3-ASR-0.6B description: Qwen3-ASR-0.6B supports language identification and ASR for 52 languages and dialects.
It leverages large-scale speech training data and the strong audio understanding capability of its foundation model, Qwen3-Omni. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 0.6 categories: - speech_to_text licenses: - apache-2.0 release_date: "2026-01-29" specs: - mode: standard quantization: BF16 source: model_scope model_scope_model_id: Qwen/Qwen3-ASR-0.6B backend: vLLM categories: - speech_to_text - name: Dia-1.6B description: Dia is a text-to-speech model created by Nari Labs. Dia directly generates highly realistic dialogue from a transcript. You can condition the output on audio, enabling emotion and tone control. The model can also produce nonverbal communications like laughter, coughing, clearing throat, etc. home: https://narilabs.org icon: /static/catalog_icons/narilabs.png size: 1.6 categories: - text_to_speech licenses: - apache-2.0 release_date: "2025-04-21" specs: - mode: standard quantization: FP32 source: model_scope model_scope_model_id: nari-labs/Dia-1.6B backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "10737418240" # 10 GiB, Dia model empirical estimate. - name: Qwen3-TTS-12Hz-1.7B-Base description: Qwen3-TTS-12Hz-1.7B-Base is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting 12kHz audio generation. 
home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 1.7 categories: - text_to_speech licenses: - apache-2.0 release_date: "2026-01-22" .base_spec: &qwen3_tts_12hz_1_7b_base_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-TTS-12Hz-1.7B-Base backend: vLLM backend_parameters: - --omni specs: - <<: *qwen3_tts_12hz_1_7b_base_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen3_tts_12hz_1_7b_base_base_spec backend_version: *vllm_omni_stable_version - name: Qwen3-TTS-12Hz-1.7B-CustomVoice description: Qwen3-TTS-12Hz-1.7B-CustomVoice is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting custom voice cloning and 12kHz audio generation. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 1.7 categories: - text_to_speech licenses: - apache-2.0 release_date: "2026-01-22" .base_spec: &qwen3_tts_12hz_1_7b_customvoice_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice backend: vLLM backend_parameters: - --omni specs: - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen3_tts_12hz_1_7b_customvoice_base_spec backend_version: *vllm_omni_stable_version - name: Qwen3-TTS-12Hz-1.7B-VoiceDesign description: Qwen3-TTS-12Hz-1.7B-VoiceDesign is a text-to-speech model from the Qwen3-TTS series with 1.7B parameters, supporting voice design and 12kHz audio generation. 
home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 1.7 categories: - text_to_speech licenses: - apache-2.0 release_date: "2026-01-22" .base_spec: &qwen3_tts_12hz_1_7b_voicedesign_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign backend: vLLM backend_parameters: - --omni specs: - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen3_tts_12hz_1_7b_voicedesign_base_spec backend_version: *vllm_omni_stable_version - name: Qwen3-TTS-12Hz-0.6B-Base description: Qwen3-TTS-12Hz-0.6B-Base is a text-to-speech model from the Qwen3-TTS series with 0.6B parameters, supporting 12kHz audio generation. home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 0.6 categories: - text_to_speech licenses: - apache-2.0 release_date: "2026-01-22" .base_spec: &qwen3_tts_12hz_0_6b_base_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-TTS-12Hz-0.6B-Base backend: vLLM backend_parameters: - --omni specs: - <<: *qwen3_tts_12hz_0_6b_base_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen3_tts_12hz_0_6b_base_base_spec backend_version: *vllm_omni_stable_version - name: Qwen3-TTS-12Hz-0.6B-CustomVoice description: Qwen3-TTS-12Hz-0.6B-CustomVoice is a text-to-speech model from the Qwen3-TTS series with 0.6B parameters, supporting custom voice cloning and 12kHz audio generation. 
home: https://qwen.ai icon: /static/catalog_icons/qwen.png size: 0.6 categories: - text_to_speech licenses: - apache-2.0 release_date: "2026-01-22" .base_spec: &qwen3_tts_12hz_0_6b_customvoice_base_spec mode: standard quantization: "BF16" source: model_scope model_scope_model_id: Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice backend: vLLM backend_parameters: - --omni specs: - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec gpu_filters: vendor: ascend backend_version: *vllm_omni_ascend_stable_version - <<: *qwen3_tts_12hz_0_6b_customvoice_base_spec backend_version: *vllm_omni_stable_version - name: SenseVoice-Small description: SenseVoice is a speech foundation model with multiple speech understanding capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and audio event detection (AED). home: https://github.com/FunAudioLLM icon: /static/catalog_icons/FunAudioLLM.png categories: - speech_to_text licenses: - apache-2.0 release_date: "2024-07-31" specs: - mode: standard quantization: FP16 source: model_scope model_scope_model_id: iic/SenseVoiceSmall backend: VoxBox env: GPUSTACK_MODEL_VRAM_CLAIM: "12884901888" # 12 GiB, it depends on the audio length. This value works for ~10 minutes audio input.