---
# docker-compose.yaml
#
# Compose definition for the vLLM model-serving containers declared under
# `services`. Every service publishes its own host port and forwards it to
# port 30000 inside the container.
#
# The top-level `version` key is kept for older docker-compose (v1) clients;
# the current Compose Specification treats it as obsolete and only uses it
# for a warning.
version: '3.8'
services:
  # Serves /model/Qwen3.6-27B-W8A8 through `vllm serve`, listening on
  # container port 30000 and published on host port 8004.
  qwen3.6-27b:
    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
    container_name: qwen3.6-27b-w8a8-vllm  # qwen3.6-27b-w8a8 Qwen3.6-27B-W8A8
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host
    # Accelerator device nodes passed through from the host.
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"
    shm_size: '100gb'
    # -1 removes the locked-memory limit (soft and hard).
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "8004:30000"
    environment:
      # Two devices are exposed, matching --tensor-parallel-size 2 below.
      - CUDA_VISIBLE_DEVICES=0,1
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - MACA_SMALL_PAGESIZE_ENABLE=1
      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1
    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"  # script directory mapping
    # Exec-list form: the third item is handed to `sh -c` verbatim, so the
    # script no longer depends on how Compose splits a folded string that
    # contains backslash line continuations. `>-` folds the lines below into
    # one space-separated command line.
    # The API key can be overridden with VLLM_API_KEY (shell environment or
    # .env file); it falls back to the previous literal value when unset.
    # `2>&1 | tee` merges stderr into stdout and copies the stream to the
    # bind-mounted log directory.
    command:
      - sh
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8
        --served-model-name Qwen3.6-27B-W8A8
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 2
        --max-num-batched-tokens 4096
        --max-model-len 8192
        --reasoning-parser qwen3
        --enable-auto-tool-choice
        --tool-call-parser qwen3_coder
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log
  # Serves /model/Qwen3-Embedding-8B through `vllm serve`, listening on
  # container port 30000 and published on host port 9003.
  qwen3-embedding:
    image: vllm-metax:lq
    container_name: qwen3-embedding-vllm  # qwen3-embedding
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host
    # Accelerator device nodes passed through from the host.
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"
    shm_size: '100gb'
    # -1 removes the locked-memory limit (soft and hard).
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9003:30000"
    environment:
      # One device is exposed, matching --tensor-parallel-size 1 below.
      - CUDA_VISIBLE_DEVICES=2
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_TORCH_COMPILE=0
      - VLLM_DISABLE_TORCH_COMPILE=1
      # Compose interpolation turns `$$` into one literal `$`, and environment
      # values are not shell-expanded, so the container sees the fixed path
      # /tmp/torch_ext_$.
      # NOTE(review): if a per-process directory (shell PID) was intended,
      # this value does not provide it - confirm.
      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$
      - MAX_JOBS=1
    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"  # script directory mapping
    # Exec-list form: the third item is handed to `sh -c` verbatim, so the
    # script no longer depends on how Compose splits a folded string that
    # contains backslash line continuations. `>-` folds the lines below into
    # one space-separated command line.
    # The API key can be overridden with VLLM_API_KEY (shell environment or
    # .env file); it falls back to the previous literal value when unset.
    # NOTE(review): newer vLLM releases rename/deprecate `--task embedding`;
    # confirm the flag against the vLLM version shipped in this image.
    command:
      - sh
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B
        --served-model-name Qwen3-Embedding-8B
        --task embedding
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 1
        --max-num-batched-tokens 4096
        --max-model-len 16384
        --gpu-memory-utilization 0.45
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3-embedding-server.log
  # Serves /model/Qwen3-Reranker-8B through `vllm serve`, listening on
  # container port 30000 and published on host port 9004.
  qwen3-reranker:
    image: vllm-metax:lq
    container_name: qwen3-reranker-vllm  # qwen3-reranker
    stdin_open: true
    tty: true
    restart: unless-stopped
    # network_mode: host
    # Accelerator device nodes passed through from the host.
    devices:
      - "/dev/dri:/dev/dri"
      - "/dev/mxcd:/dev/mxcd"
      - "/dev/mem:/dev/mem"
    group_add:
      - "video"
    privileged: true
    security_opt:
      - "apparmor=unconfined"
      - "seccomp=unconfined"
    shm_size: '100gb'
    # -1 removes the locked-memory limit (soft and hard).
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9004:30000"
    environment:
      # One device is exposed, matching --tensor-parallel-size 1 below.
      - CUDA_VISIBLE_DEVICES=3
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_TORCH_COMPILE=0
      - VLLM_DISABLE_TORCH_COMPILE=1
      # Compose interpolation turns `$$` into one literal `$`, and environment
      # values are not shell-expanded, so the container sees the fixed path
      # /tmp/torch_ext_$.
      # NOTE(review): if a per-process directory (shell PID) was intended,
      # this value does not provide it - confirm.
      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$
      - MAX_JOBS=1
    volumes:
      - "/usr/local/:/usr/local/"
      - "/pde_ai:/pde_ai"
      - "/opt/lq/models:/model:ro"
      - "~/.cache/huggingface:/root/.cache/huggingface"
      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
      - "/opt/lq/deploy_models/bench_suite:/bench_suite"  # script directory mapping
    # Exec-list form: the third item is handed to `sh -c` verbatim, so the
    # script no longer depends on how Compose splits a folded string that
    # contains backslash continuations and backslash-escaped quotes. `>-`
    # folds the lines below into one space-separated command line.
    # The --hf_overrides JSON is single-quoted for `sh`, so its double quotes
    # reach vllm unchanged without any escaping.
    # The API key can be overridden with VLLM_API_KEY (shell environment or
    # .env file); it falls back to the previous literal value when unset.
    command:
      - sh
      - "-c"
      - >-
        /opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B
        --served-model-name Qwen3-Reranker-8B
        --task score
        --host 0.0.0.0
        --port 30000
        --tensor-parallel-size 1
        --max-num-batched-tokens 4096
        --max-model-len 16384
        --gpu-memory-utilization 0.45
        --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
        --api-key '${VLLM_API_KEY:-sk-123456}'
        2>&1 | tee /var/log/vllm/qwen3-reranker-server.log