services:
  qwen3.5-122b:
    image: vllm/vllm-openai:latest
    container_name: qwen3.5-122b-vllm
    runtime: nvidia
    shm_size: '10gb'
    ports:
      - "25423:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # log directory mapping
      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_LOGGING_LEVEL=INFO  # control the log level via environment variable
    # Run the startup script directly to avoid complex nested shell quoting
    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3.5-122b.sh"]
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0", "1"]  # modify for multiple GPUs, e.g. ["0", "1"]
              # count: all
              capabilities: [gpu]

  qwen3-8b:
    image: vllm/vllm-openai:latest
    container_name: qwen3-8b-vllm
    runtime: nvidia
    shm_size: '10gb'
    ports:
      - "25424:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # log directory mapping
      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_LOGGING_LEVEL=INFO  # control the log level via environment variable
    # Run the startup script directly to avoid complex nested shell quoting
    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3-8b.sh"]
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]  # modify for multiple GPUs, e.g. ["0", "1"]
              # count: all
              capabilities: [gpu]
    healthcheck:
      # NOTE(review): bearer token is hard-coded here — consider injecting it via an
      # environment variable or Docker secret instead of committing it to VCS.
      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
      interval: 10s
      timeout: 5s
      retries: 30
      start_period: 60s

  qwen3-embedding-8b:
    image: vllm/vllm-openai:latest
    # image: vllm/vllm-openai:v0.15.0
    container_name: qwen3-embedding-8b-vllm
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25425:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # log directory mapping
      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_LOGGING_LEVEL=INFO  # control the log level via environment variable
    # Run the startup script directly to avoid complex nested shell quoting
    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3-embedding-8b.sh"]
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]  # modify for multiple GPUs, e.g. ["0", "1"]
              # count: all
              capabilities: [gpu]
    depends_on:
      qwen3-8b:
        condition: service_healthy  # wait until the qwen3-8b health check passes

  qwen3-reranker-8b:
    # image: vllm/vllm-openai:latest  # the v0.18 image does not support rerank deployment
    image: vllm/vllm-openai:v0.15.0
    container_name: qwen3-reranker-8b-vllm
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25426:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # log directory mapping
      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro
      # - /data/app_workspace/deploy_models/vllm/sglang-main:/vllm/sglang-main:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_LOGGING_LEVEL=INFO  # control the log level via environment variable
    # Run the startup script directly to avoid complex nested shell quoting
    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3-reranker-8b-15.sh"]
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["3"]  # modify for multiple GPUs, e.g. ["0", "1"]
              # count: all
              capabilities: [gpu]

  qwen3.5-35b:
    image: vllm/vllm-openai:latest
    container_name: qwen3.5-35b-vllm
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25427:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # log directory mapping
      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) output
      - VLLM_LOGGING_LEVEL=INFO  # control the log level via environment variable
    # Run the startup script directly to avoid complex nested shell quoting
    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3.5-35b.sh"]
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["4"]  # modify for multiple GPUs, e.g. ["0", "1"]
              # count: all
              capabilities: [gpu]