---
# Docker Compose definition for a single vLLM server container.
# The container runs `vllm serve` on /model/Qwen3.6-27B-W8A8, listening on
# port 30000 inside the container, published on host port 8004.
version: "3.8"

services:
  qwen3.6-27b:
    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
    # Name variants kept from the original file:
    # qwen3.6-27b-w8a8 Qwen3.6-27B-W8A8
    container_name: qwen3.6-27b-w8a8-vllm
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host

    # Host device nodes passed through to the container.
    # NOTE(review): presumably the accelerator device nodes required by the
    # vllm-metax image - confirm against the vendor documentation.
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"

    shm_size: "100gb"
    # -1 removes the locked-memory limit (soft and hard).
    ulimits:
      memlock:
        soft: -1
        hard: -1

    # host port 8004 -> container port 30000 (the --port passed to vllm below)
    ports:
      - "8004:30000"

    environment:
      # Two devices are listed, matching --tensor-parallel-size 2 below.
      - CUDA_VISIBLE_DEVICES=0,1
      # Ensure real-time (unbuffered) output.
      - PYTHONUNBUFFERED=1
      - MACA_SMALL_PAGESIZE_ENABLE=1
      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1

    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      # Model directory, mounted read-only.
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      # Log directory mapping.
      - "/opt/lq/deploy_models/logs:/var/log/vllm"
      # Script directory mapping.
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"

    # Exec (list) form: Compose passes the three items to the container as-is,
    # so no shell-word splitting or backslash continuations are involved.
    # The folded scalar (>-) joins the lines below with single spaces into one
    # command line for `sh -c`; stdout and stderr are merged and copied by
    # `tee` into the mounted log directory.
    # The API key is taken from the VLLM_API_KEY variable at compose
    # interpolation time and falls back to the previous hard-coded value when
    # that variable is unset or empty. Set VLLM_API_KEY (shell or .env file)
    # instead of editing this file.
    command:
      - "sh"
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8
        --served-model-name Qwen3.6-27B-W8A8
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 2
        --max-num-batched-tokens 4096
        --max-model-len 8192
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log