---
# Docker Compose stack that runs three vLLM OpenAI-compatible servers, one per
# service, each pinned to its own accelerator device(s) through
# CUDA_VISIBLE_DEVICES:
#
#   service           model                 devices   host port
#   qwen3.6-27b       Qwen3.6-27B-W8A8      0,1       8004
#   qwen3-embedding   Qwen3-Embedding-8B    2         9003
#   qwen3-reranker    Qwen3-Reranker-8B     3         9004
#
# Every server listens on port 30000 inside its container.
#
# API key: all three servers take their key from the VLLM_API_KEY variable
# (shell environment or a `.env` file next to this file). When it is unset the
# previous hard-coded value `sk-123456` is used, so existing clients keep
# working. Override it for anything that is reachable by other users.

# Docker Compose v2 ignores `version` (and warns that it is obsolete); it is
# kept so that legacy docker-compose v1 still reads this file as the v3 format.
version: '3.8'

# Settings shared by every service, merged in with `<<: *vllm-common`.
# The merge is shallow: a service that redefines one of these keys replaces
# the whole value instead of extending it.
x-vllm-common: &vllm-common
  stdin_open: true
  tty: true
  restart: unless-stopped
  # network_mode: host
  devices:
    - "/dev/dri:/dev/dri"
    - "/dev/mxcd:/dev/mxcd"
    - "/dev/mem:/dev/mem"
  group_add:
    - "video"
  privileged: true
  security_opt:
    - "apparmor=unconfined"
    - "seccomp=unconfined"
  shm_size: '100gb'
  ulimits:
    memlock:
      soft: -1
      hard: -1
  volumes:
    - "/usr/local/:/usr/local/"
    - "/pde_ai:/pde_ai"
    # Model weights, mounted read-only.
    - "/opt/lq/models:/model:ro"
    - "~/.cache/huggingface:/root/.cache/huggingface"
    # Log directory mapping (target of the `tee` in each command).
    - "/opt/lq/deploy_models/logs:/var/log/vllm"
    # Script directory mapping.
    - "/opt/lq/deploy_models/bench_suite:/bench_suite"

# Notes that apply to every `command` below:
#   * List form is used so Compose passes the script to `sh -c` verbatim,
#     with no extra word-splitting or escaping layer.
#   * The script is a folded scalar (`>-`): its lines are joined with single
#     spaces. Keep all lines at the same indentation; a more-indented line
#     would keep its newline and split the command.
#   * Output is piped through `tee` without `-a`, so the log file is
#     overwritten on every container start, and the pipeline's exit status is
#     that of `tee`, not of `vllm`.
services:
  qwen3.6-27b:
    <<: *vllm-common
    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
    # Other spellings noted by the author: qwen3.6-27b-w8a8, Qwen3.6-27B-W8A8
    container_name: qwen3.6-27b-w8a8-vllm
    ports:
      - "8004:30000"
    environment:
      CUDA_VISIBLE_DEVICES: "0,1"
      # Ensure real-time (unbuffered) output.
      PYTHONUNBUFFERED: "1"
      MACA_SMALL_PAGESIZE_ENABLE: "1"
      MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE: "1"
      MACA_VLLM_ENABLE_MCTLASS_PYTHON_API: "1"
    command:
      - sh
      - -c
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8
        --served-model-name Qwen3.6-27B-W8A8
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 2
        --max-num-batched-tokens 4096
        --max-model-len 8192
        --reasoning-parser qwen3
        --enable-auto-tool-choice
        --tool-call-parser qwen3_coder
        --api-key ${VLLM_API_KEY:-sk-123456}
        2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log

  qwen3-embedding:
    <<: *vllm-common
    image: vllm-metax:lq
    # Service alias noted by the author: qwen3-embedding
    container_name: qwen3-embedding-vllm
    ports:
      - "9003:30000"
    environment:
      CUDA_VISIBLE_DEVICES: "2"
      # Ensure real-time (unbuffered) output.
      PYTHONUNBUFFERED: "1"
      VLLM_TORCH_COMPILE: "0"
      VLLM_DISABLE_TORCH_COMPILE: "1"
      # NOTE(review): Compose turns `$$` into a literal `$` and nothing here
      # is expanded by a shell, so the directory is literally
      # `/tmp/torch_ext_$`, not a per-PID path. Confirm that is acceptable.
      TORCH_EXTENSIONS_DIR: "/tmp/torch_ext_$$"
      MAX_JOBS: "1"
    # NOTE(review): recent vLLM releases deprecate `--task` in favour of
    # `--runner` / `--convert`; confirm against the vLLM version in this image.
    command:
      - sh
      - -c
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B
        --served-model-name Qwen3-Embedding-8B
        --task embedding
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 1
        --max-num-batched-tokens 4096
        --max-model-len 16384
        --gpu-memory-utilization 0.45
        --api-key ${VLLM_API_KEY:-sk-123456}
        2>&1 | tee /var/log/vllm/qwen3-embedding-server.log

  qwen3-reranker:
    <<: *vllm-common
    image: vllm-metax:lq
    # Service alias noted by the author: qwen3-reranker
    container_name: qwen3-reranker-vllm
    ports:
      - "9004:30000"
    environment:
      CUDA_VISIBLE_DEVICES: "3"
      # Ensure real-time (unbuffered) output.
      PYTHONUNBUFFERED: "1"
      VLLM_TORCH_COMPILE: "0"
      VLLM_DISABLE_TORCH_COMPILE: "1"
      # NOTE(review): Compose turns `$$` into a literal `$` and nothing here
      # is expanded by a shell, so the directory is literally
      # `/tmp/torch_ext_$`, not a per-PID path. Confirm that is acceptable.
      TORCH_EXTENSIONS_DIR: "/tmp/torch_ext_$$"
      MAX_JOBS: "1"
    # NOTE(review): recent vLLM releases deprecate `--task` in favour of
    # `--runner` / `--convert`; confirm against the vLLM version in this image.
    # The JSON passed to --hf_overrides is single-quoted for `sh` and must stay
    # on one line so its bytes are not altered by line folding.
    command:
      - sh
      - -c
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B
        --served-model-name Qwen3-Reranker-8B
        --task score
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 1
        --max-num-batched-tokens 4096
        --max-model-len 16384
        --gpu-memory-utilization 0.45
        --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
        --api-key ${VLLM_API_KEY:-sk-123456}
        2>&1 | tee /var/log/vllm/qwen3-reranker-server.log