---
# Docker Compose stack that serves several Qwen models with SGLang.
#
# Every service runs `python3 -m sglang.launch_server` on container port
# 30000 and mirrors its output into the shared log directory via `tee`.
#
#   service              host port   GPU device_ids
#   qwen3.5-122b         25423       "0", "1"  (--tp 2)
#   qwen3-8b             25424       "2"
#   qwen3-embedding-8b   25425       "2"  (starts after qwen3-8b is healthy)
#   qwen3-reranker-8b    25426       "3"
#   qwen3.5-35b          25427       "4"
#
# The API key is taken from SGLANG_API_KEY (shell environment or `.env`);
# when unset it falls back to the previous hard-coded value so existing
# deployments keep working. Override it for anything beyond local testing.
# The value is spliced into a double-quoted `sh -c` string, so it must not
# contain whitespace or quote characters.
#
# NOTE(review): `shm_size` is set together with `ipc: host`; it presumably
# has no effect while the host IPC namespace is shared -- confirm.
# NOTE(review): `CUDA_VISIBLE_DEVICES` is passed through from the host
# unchanged; verify it does not conflict with the per-service `device_ids`.
# NOTE(review): because of `... | tee`, the `sh -c` exit status is that of
# `tee`, not of the server process -- confirm this is acceptable.

services:
  qwen3.5-122b:
    image: lmsysorg/sglang:latest
    container_name: qwen3.5-122b-sglang
    runtime: nvidia
    shm_size: '10gb'
    ports:
      - "25423:30000"
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3.5-122B-A10B
      --tp 2
      --host 0.0.0.0
      --port 30000
      --api-key ${SGLANG_API_KEY:-lq123456}
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0", "1"]  # Modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]

  qwen3-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-8b-sglang
    runtime: nvidia
    shm_size: '10gb'
    ports:
      - "25424:30000"
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
    # GPU "2" is also used by qwen3-embedding-8b; both pass
    # --mem-fraction-static 0.45.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-8B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key ${SGLANG_API_KEY:-lq123456}
      --mem-fraction-static 0.45
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]
    # qwen3-embedding-8b waits on this check before it starts.
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:30000/v1/models"
        - "-H"
        - "Authorization: Bearer ${SGLANG_API_KEY:-lq123456}"
      interval: 10s
      timeout: 5s
      retries: 30
      start_period: 60s

  qwen3-embedding-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-embedding-8b-sglang
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25425:30000"
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-Embedding-8B
      --is-embedding
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key ${SGLANG_API_KEY:-lq123456}
      --mem-fraction-static 0.45
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]
    depends_on:
      qwen3-8b:
        # wait until the qwen3-8b health check passes
        condition: service_healthy

  qwen3-reranker-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-reranker-8b-sglang
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25426:30000"
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
      # SGLang checkout; supplies the chat template passed to --chat-template
      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-Reranker-8B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key ${SGLANG_API_KEY:-lq123456}
      --mem-fraction-static 0.50
      --disable-radix-cache
      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["3"]  # Modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]

  qwen3.5-35b:
    image: lmsysorg/sglang:latest
    container_name: qwen3.5-35b-sglang
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25427:30000"
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3.5-35B-A3B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key ${SGLANG_API_KEY:-lq123456}
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["4"]  # Modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]