---
services:
- qwen3.5-122b:
- image: lmsysorg/sglang:latest
- container_name: qwen3.5-122b-sglang
- runtime: nvidia
- shm_size: '10gb'
- ports:
- - "25423:30000"
- volumes:
- # # 宿主机路径:容器内路径
- - /data/app_workspace/models:/model:ro
- - ~/.cache/huggingface:/root/.cache/huggingface
- - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang # 日志目录映射
- environment:
- - CUDA_VISIBLE_DEVICES
- - PYTHONUNBUFFERED=1 # 确保实时输出
- command: >
- sh -c "mkdir -p /var/log/sglang &&
- python3 -m sglang.launch_server
- --model-path /model/Qwen3.5-122B-A10B
- --tp 2
- --host 0.0.0.0
- --port 30000
- --api-key lq123456
- --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
- ipc: host
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- device_ids: ["0","1"] # Modify for multiple GPUs: ["0", "1"]
- #count: all
- capabilities: [gpu]
- qwen3-8b:
- image: lmsysorg/sglang:latest
- container_name: qwen3-8b-sglang
- runtime: nvidia
- shm_size: '10gb'
- ports:
- - "25424:30000"
- volumes:
- # # 宿主机路径:容器内路径
- - /data/app_workspace/models:/model:ro
- - ~/.cache/huggingface:/root/.cache/huggingface
- - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang # 日志目录映射
- environment:
- - CUDA_VISIBLE_DEVICES
- - PYTHONUNBUFFERED=1 # 确保实时输出
- command: >
- sh -c "mkdir -p /var/log/sglang &&
- python3 -m sglang.launch_server
- --model-path /model/Qwen3-8B
- --tp 1
- --host 0.0.0.0
- --port 30000
- --api-key lq123456
- --mem-fraction-static 0.45
- --log-level info 2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
- ipc: host
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- device_ids: ["2"] # Modify for multiple GPUs: ["0", "1"]
- #count: all
- capabilities: [gpu]
- healthcheck:
- test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
- interval: 10s
- timeout: 5s
- retries: 30
- start_period: 60s
- qwen3-embedding-8b:
- image: lmsysorg/sglang:latest
- container_name: qwen3-embedding-8b-sglang
- runtime: nvidia
- shm_size: '5gb'
- ports:
- - "25425:30000"
- volumes:
- # # 宿主机路径:容器内路径
- - /data/app_workspace/models:/model:ro
- - ~/.cache/huggingface:/root/.cache/huggingface
- - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang # 日志目录映射
- environment:
- - CUDA_VISIBLE_DEVICES
- - PYTHONUNBUFFERED=1 # 确保实时输出
- command: >
- sh -c "mkdir -p /var/log/sglang &&
- python3 -m sglang.launch_server
- --model-path /model/Qwen3-Embedding-8B
- --is-embedding
- --tp 1
- --host 0.0.0.0
- --port 30000
- --api-key lq123456
- --mem-fraction-static 0.45
- --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
- ipc: host
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- device_ids: ["2"] # Modify for multiple GPUs: ["0", "1"]
- #count: all
- capabilities: [gpu]
- depends_on:
- qwen3-8b:
- condition: service_healthy # 等待 qwen3-8b 健康检查通过
- qwen3-reranker-8b:
- image: lmsysorg/sglang:latest
- container_name: qwen3-reranker-8b-sglang
- runtime: nvidia
- shm_size: '5gb'
- ports:
- - "25426:30000"
- volumes:
- # # 宿主机路径:容器内路径
- - /data/app_workspace/models:/model:ro
- - ~/.cache/huggingface:/root/.cache/huggingface
- - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang # 日志目录映射
- - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
- environment:
- - CUDA_VISIBLE_DEVICES
- - PYTHONUNBUFFERED=1 # 确保实时输出
- command: >
- sh -c "mkdir -p /var/log/sglang &&
- python3 -m sglang.launch_server
- --model-path /model/Qwen3-Reranker-8B
- --tp 1
- --host 0.0.0.0
- --port 30000
- --api-key lq123456
- --mem-fraction-static 0.50
- --disable-radix-cache
- --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
- --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
- ipc: host
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- device_ids: ["3"] # Modify for multiple GPUs: ["0", "1"]
- #count: all
- capabilities: [gpu]
- qwen3.5-35b:
- image: lmsysorg/sglang:latest
- container_name: qwen3.5-35b-sglang
- runtime: nvidia
- shm_size: '5gb'
- ports:
- - "25427:30000"
- volumes:
- # # 宿主机路径:容器内路径
- - /data/app_workspace/models:/model:ro
- - ~/.cache/huggingface:/root/.cache/huggingface
- - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang # 日志目录映射
- environment:
- - CUDA_VISIBLE_DEVICES
- - PYTHONUNBUFFERED=1 # 确保实时输出
- command: >
- sh -c "mkdir -p /var/log/sglang &&
- python3 -m sglang.launch_server
- --model-path /model/Qwen3.5-35B-A3B
- --tp 1
- --host 0.0.0.0
- --port 30000
- --api-key lq123456
- --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
- ipc: host
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- device_ids: ["4"] # Modify for multiple GPUs: ["0", "1"]
- #count: all
- capabilities: [gpu]