# SGLang model-serving stack (Docker Compose).
#
# Every service runs the lmsysorg/sglang image, listens on container port
# 30000, and pipes the server output through `tee` into the shared log
# directory mounted at /var/log/sglang.
#
# Host port -> service (GPU ids reserved):
#   25423 -> qwen3.5-122b        (0, 1, 2, 3; --tp 4)
#   25424 -> qwen3.6-27b         (4)
#   25425 -> qwen3-embedding-8b  (5)
#   25426 -> qwen3-reranker-8b   (5, started after the embedding service is healthy)
#   25427 -> qwen3.5-35b         (7)
#
# NOTE(review): the API keys are hard-coded in this file. Consider moving them
# to an env file or secret store, and keep each healthcheck key identical to
# the --api-key of the server it probes.
# NOTE(review): `image: ...:latest` is unpinned, so a re-pull can change the
# server version; consider pinning a tag.
services:
  qwen3.5-122b:
    image: lmsysorg/sglang:latest
    container_name: qwen3.5-122b-sglang
    runtime: nvidia
    shm_size: '200gb'
    ports:
      - '25423:30000'
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
      # benchmark script directory mapping
      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite
    environment:
      # No value given: passed through from the host environment when set.
      # NOTE(review): confirm the host value is compatible with the GPU ids
      # reserved under deploy.resources below.
      - CUDA_VISIBLE_DEVICES
      # ensure real-time (unbuffered) output
      - PYTHONUNBUFFERED=1
    # Folded scalar: the lines below are joined with single spaces into one
    # `sh -c "..."` command line. Keep them at the same indentation.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3.5-122B-A10B
      --tp 4
      --host 0.0.0.0
      --port 30000
      --api-key sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
      --mem-fraction-static 0.95
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Modify for multiple GPUs, e.g. ["0", "1"]
              device_ids: ['0', '1', '2', '3']
              # count: all
              capabilities: [gpu]

  qwen3-embedding-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-embedding-8b-sglang
    runtime: nvidia
    shm_size: '100gb'
    ports:
      - '25425:30000'
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
      # benchmark script directory mapping
      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite
    environment:
      # No value given: passed through from the host environment when set.
      # NOTE(review): confirm the host value is compatible with the GPU ids
      # reserved under deploy.resources below.
      - CUDA_VISIBLE_DEVICES
      # ensure real-time (unbuffered) output
      - PYTHONUNBUFFERED=1
    # Folded scalar: the lines below are joined with single spaces into one
    # `sh -c "..."` command line. Keep them at the same indentation.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-Embedding-8B
      --is-embedding
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
      --mem-fraction-static 0.45
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Modify for multiple GPUs, e.g. ["0", "1"]
              device_ids: ['5']
              # count: all
              capabilities: [gpu]
    healthcheck:
      # Issues a real embedding request against the local server. The bearer
      # token must be the same key given to --api-key in `command` above;
      # with a different key the probe is rejected, the service never becomes
      # healthy, and qwen3-reranker-8b (which waits on service_healthy) never
      # starts.
      test:
        - 'CMD'
        - 'curl'
        - '-f'
        - 'http://localhost:30000/v1/embeddings'
        - '-H'
        - 'Authorization: Bearer sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f'
        - '-H'
        - 'Content-Type: application/json'
        - '-d'
        - '{"input": "health"}'
      interval: 10s
      timeout: 5s
      retries: 30
      start_period: 60s

  qwen3-reranker-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-reranker-8b-sglang
    runtime: nvidia
    shm_size: '100gb'
    ports:
      - '25426:30000'
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
      # SGLang source checkout (read-only); supplies the reranker chat template
      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
      # benchmark script directory mapping
      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite
    environment:
      # No value given: passed through from the host environment when set.
      # NOTE(review): confirm the host value is compatible with the GPU ids
      # reserved under deploy.resources below.
      - CUDA_VISIBLE_DEVICES
      # ensure real-time (unbuffered) output
      - PYTHONUNBUFFERED=1
    # Folded scalar: the lines below are joined with single spaces into one
    # `sh -c "..."` command line. Keep them at the same indentation.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-Reranker-8B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
      --mem-fraction-static 0.50
      --disable-radix-cache
      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Same GPU id as qwen3-embedding-8b.
              # Modify for multiple GPUs, e.g. ["0", "1"]
              device_ids: ['5']
              # count: all
              capabilities: [gpu]
    depends_on:
      qwen3-embedding-8b:
        # wait until the qwen3-embedding-8b health check passes
        condition: service_healthy

  qwen3.5-35b:
    image: lmsysorg/sglang:latest
    container_name: qwen3.5-35b-sglang
    runtime: nvidia
    shm_size: '100gb'
    ports:
      - '25427:30000'
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
      # benchmark script directory mapping
      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite
    environment:
      # No value given: passed through from the host environment when set.
      # NOTE(review): confirm the host value is compatible with the GPU ids
      # reserved under deploy.resources below.
      - CUDA_VISIBLE_DEVICES
      # ensure real-time (unbuffered) output
      - PYTHONUNBUFFERED=1
    # Folded scalar: the lines below are joined with single spaces into one
    # `sh -c "..."` command line. Keep them at the same indentation.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3.5-35B-A3B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Modify for multiple GPUs, e.g. ["0", "1"]
              device_ids: ['7']
              # count: all
              capabilities: [gpu]

  qwen3.6-27b:
    image: lmsysorg/sglang:latest
    container_name: qwen3.6-27b-sglang
    runtime: nvidia
    shm_size: '100gb'
    ports:
      - '25424:30000'
    volumes:
      # host path:container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      # log directory mapping
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang
      # benchmark script directory mapping
      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite
    environment:
      # No value given: passed through from the host environment when set.
      # NOTE(review): confirm the host value is compatible with the GPU ids
      # reserved under deploy.resources below.
      - CUDA_VISIBLE_DEVICES
      # ensure real-time (unbuffered) output
      - PYTHONUNBUFFERED=1
    # Folded scalar: the lines below are joined with single spaces into one
    # `sh -c "..."` command line. Keep them at the same indentation.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3.6-27B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
      --log-level info
      2>&1 | tee /var/log/sglang/qwen3.6-27b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Modify for multiple GPUs, e.g. ["0", "1"]
              device_ids: ['4']
              # count: all
              capabilities: [gpu]