---
# Docker Compose stack with three `vllm serve` containers: a chat model, an
# embedding model and a reranker. Every container listens on port 30000
# internally and is published on its own host port (8004 / 9003 / 9004); each
# one is pinned to its own device index/indices through CUDA_VISIBLE_DEVICES.
#
# API key: set VLLM_API_KEY in the shell or in a `.env` file next to this file.
# When it is unset, the previous hard-coded placeholder `sk-123456` is used, so
# existing clients keep working. Override it wherever the ports are reachable
# by clients you do not control.
#
# Commands use the list (exec) form: Compose hands the three items to the
# container unchanged, so the script is not subject to Compose's own
# shell-style word splitting. `>-` folds the script lines into one
# space-separated command line for `sh -c`.

# Kept for legacy docker-compose; newer Compose releases treat it as obsolete.
version: '3.8'

services:
  qwen3.6-27b:
    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
    container_name: qwen3.6-27b-w8a8-vllm  # qwen3.6-27b-w8a8 Qwen3.6-27B-W8A8
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"
    shm_size: '100gb'
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "8004:30000"
    environment:
      - CUDA_VISIBLE_DEVICES=0,1
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - MACA_SMALL_PAGESIZE_ENABLE=1
      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1
    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"  # script directory mapping
    # `${VLLM_API_KEY:-...}` is substituted by Compose before the container
    # starts (not by the shell), so the single quotes around it are safe.
    command:
      - sh
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8
        --served-model-name Qwen3.6-27B-W8A8
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 2
        --max-num-batched-tokens 4096
        --max-model-len 8192
        --reasoning-parser qwen3
        --enable-auto-tool-choice
        --tool-call-parser qwen3_coder
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log

  qwen3-embedding:
    image: vllm-metax:lq
    container_name: qwen3-embedding-vllm  # qwen3-embedding
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"
    shm_size: '100gb'
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9003:30000"
    environment:
      - CUDA_VISIBLE_DEVICES=2
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_TORCH_COMPILE=0
      - VLLM_DISABLE_TORCH_COMPILE=1
      # NOTE(review): Compose unescapes `$$` to one literal `$`, and list-form
      # environment values are not shell-expanded, so the directory is
      # literally `/tmp/torch_ext_$`, not a per-PID path. Confirm the intent.
      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$
      - MAX_JOBS=1
    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"  # script directory mapping
    # `${VLLM_API_KEY:-...}` is substituted by Compose before the container
    # starts (not by the shell), so the single quotes around it are safe.
    command:
      - sh
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B
        --served-model-name Qwen3-Embedding-8B
        --task embedding
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 1
        --max-num-batched-tokens 4096
        --max-model-len 16384
        --gpu-memory-utilization 0.45
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3-embedding-server.log

  qwen3-reranker:
    image: vllm-metax:lq
    container_name: qwen3-reranker-vllm  # qwen3-reranker
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"
    shm_size: '100gb'
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9004:30000"
    environment:
      - CUDA_VISIBLE_DEVICES=3
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_TORCH_COMPILE=0
      - VLLM_DISABLE_TORCH_COMPILE=1
      # NOTE(review): Compose unescapes `$$` to one literal `$`, and list-form
      # environment values are not shell-expanded, so the directory is
      # literally `/tmp/torch_ext_$`, not a per-PID path. Confirm the intent.
      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$
      - MAX_JOBS=1
    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"  # script directory mapping
    # `${VLLM_API_KEY:-...}` is substituted by Compose before the container
    # starts (not by the shell), so the single quotes around it are safe.
    # The --hf_overrides JSON is kept on one line so the argument text is
    # exactly what the original passed.
    command:
      - sh
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B
        --served-model-name Qwen3-Reranker-8B
        --task score
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 1
        --max-num-batched-tokens 4096
        --max-model-len 16384
        --gpu-memory-utilization 0.45
        --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3-reranker-server.log