# docker-compose.yaml — SGLang deployments for Qwen3 / Qwen3.5 model family
  1. services:
  2. qwen3.5-122b:
  3. image: lmsysorg/sglang:latest
  4. container_name: qwen3.5-122b-sglang
  5. runtime: nvidia
  6. shm_size: '10gb'
  7. ports:
  8. - "25423:30000"
  9. volumes:
  10. # # 宿主机路径:容器内路径
  11. - /data/app_workspace/models:/model:ro
  12. - ~/.cache/huggingface:/root/.cache/huggingface
  13. - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang # 日志目录映射
  14. environment:
  15. - CUDA_VISIBLE_DEVICES
  16. - PYTHONUNBUFFERED=1 # 确保实时输出
  17. command: >
  18. sh -c "mkdir -p /var/log/sglang &&
  19. python3 -m sglang.launch_server
  20. --model-path /model/Qwen3.5-122B-A10B
  21. --tp 2
  22. --host 0.0.0.0
  23. --port 30000
  24. --api-key lq123456
  25. --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
  26. ipc: host
  27. deploy:
  28. resources:
  29. reservations:
  30. devices:
  31. - driver: nvidia
  32. device_ids: ["0","1"] # Modify for multiple GPUs: ["0", "1"]
  33. #count: all
  34. capabilities: [gpu]
  # Qwen3-8B chat model on GPU 2 (shares the GPU with the embedding service;
  # --mem-fraction-static 0.45 leaves room for it).
  qwen3-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-8b-sglang
    runtime: nvidia
    shm_size: '10gb'
    ports:
      - "25424:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) log output
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-8B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key lq123456
      --mem-fraction-static 0.45
      --log-level info 2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]  # modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]
    # Healthcheck gates dependent services (qwen3-embedding-8b waits on this).
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
      interval: 10s
      timeout: 5s
      retries: 30
      start_period: 60s
  # Qwen3-Embedding-8B served in embedding mode (--is-embedding) on GPU 2.
  # Starts only after qwen3-8b is healthy, since both share the same GPU.
  qwen3-embedding-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-embedding-8b-sglang
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25425:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) log output
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-Embedding-8B
      --is-embedding
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key lq123456
      --mem-fraction-static 0.45
      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["2"]  # shares GPU 2 with qwen3-8b
              # count: all
              capabilities: [gpu]
    depends_on:
      qwen3-8b:
        condition: service_healthy  # wait until the qwen3-8b healthcheck passes
  # Qwen3-Reranker-8B on GPU 3, using a custom chat template mounted from the
  # host-side sglang source tree. Radix cache disabled for reranking workloads.
  qwen3-reranker-8b:
    image: lmsysorg/sglang:latest
    container_name: qwen3-reranker-8b-sglang
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25426:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) log output
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3-Reranker-8B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key lq123456
      --mem-fraction-static 0.50
      --disable-radix-cache
      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["3"]  # modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]
  # Qwen3.5-35B (MoE, A3B active) on GPU 4.
  qwen3.5-35b:
    image: lmsysorg/sglang:latest
    container_name: qwen3.5-35b-sglang
    runtime: nvidia
    shm_size: '5gb'
    ports:
      - "25427:30000"
    volumes:
      # host path : container path
      - /data/app_workspace/models:/model:ro
      - ~/.cache/huggingface:/root/.cache/huggingface
      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
    environment:
      - CUDA_VISIBLE_DEVICES
      - PYTHONUNBUFFERED=1  # ensure real-time (unbuffered) log output
    # NOTE(review): log file is named qwen3-35b-server.log (no ".5") while the
    # service is qwen3.5-35b — kept as-is; confirm whether the name is intentional.
    command: >
      sh -c "mkdir -p /var/log/sglang &&
      python3 -m sglang.launch_server
      --model-path /model/Qwen3.5-35B-A3B
      --tp 1
      --host 0.0.0.0
      --port 30000
      --api-key lq123456
      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["4"]  # modify for multiple GPUs: ["0", "1"]
              # count: all
              capabilities: [gpu]