Преглед изворног кода

算力生成环境模型部署配置信息

lingmin_package@163.com пре 2 недеље
родитељ
комит
6800f79aef
25 измењених фајлова са 1192 додато и 3 уклоњено
  1. 4 2
      prod/models/sglang/docker-compose.yaml
  2. 2 0
      prod/models/sglang/logs/qwen3-35b-server.log
  3. 0 1
      prod/models/sglang/logs/qwen3-8b-server.log
  4. 1 0
      prod/models/sglang/logs/qwen3-embedding-8b-server.log
  5. 1 0
      prod/models/sglang/logs/qwen3-reranker-8b-server.log
  6. 2 0
      prod/models/sglang/logs/qwen3_5-122b-server.log
  7. 7 0
      prod/models/sglang/qwen3_reranker.jinja
  8. BIN
      prod/models/sglang/sglang-main.zip
  9. 284 0
      prod/models/sglang/test_models.sh
  10. 266 0
      prod/models/sglang/test_models.sh.bak
  11. 159 0
      prod/models/vllm/docker-compose.yaml
  12. 11 0
      prod/models/vllm/logs/qwen3-8b-server.log
  13. 15 0
      prod/models/vllm/logs/qwen3-embedding-8b-server.log
  14. 15 0
      prod/models/vllm/logs/qwen3-reranker-8b-server.log
  15. 17 0
      prod/models/vllm/logs/qwen3_5-122b-server.log
  16. 14 0
      prod/models/vllm/logs/qwen3_5-35b-server.log
  17. 7 0
      prod/models/vllm/qwen3_reranker.jinja
  18. 284 0
      prod/models/vllm/test_models.sh
  19. 14 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3-8b.sh
  20. 17 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3-embedding-8b-15.sh
  21. 15 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3-embedding-8b.sh
  22. 17 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3-reranker-8b-15.sh
  23. 14 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3-reranker-8b.sh
  24. 13 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3.5-122b.sh
  25. 13 0
      prod/models/vllm/vllm_start_shell/start-vllm-qwen3.5-35b.sh

+ 4 - 2
prod/models/sglang/docker-compose.yaml

@@ -22,7 +22,7 @@ services:
       --host 0.0.0.0
       --port 30000
       --api-key lq123456
-      --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122B-server.log"
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
     ipc: host
     deploy:
       resources:
@@ -126,6 +126,7 @@ services:
       - /data/app_workspace/models:/model:ro
       - ~/.cache/huggingface:/root/.cache/huggingface
       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
+      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
     environment:
       - CUDA_VISIBLE_DEVICES 
       - PYTHONUNBUFFERED=1  # 确保实时输出
@@ -133,12 +134,13 @@ services:
       sh -c "mkdir -p /var/log/sglang &&
       python3 -m sglang.launch_server
       --model-path /model/Qwen3-Reranker-8B
-      --is-embedding
       --tp 1
       --host 0.0.0.0
       --port 30000
       --api-key lq123456
       --mem-fraction-static 0.50
+      --disable-radix-cache
+      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
       --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
     ipc: host
     deploy:

Разлика између датотека није приказана због своје велике величине
+ 2 - 0
prod/models/sglang/logs/qwen3-35b-server.log


Разлика између датотека није приказана због своје велике величине
+ 0 - 1
prod/models/sglang/logs/qwen3-8b-server.log


Разлика између датотека није приказана због своје велике величине
+ 1 - 0
prod/models/sglang/logs/qwen3-embedding-8b-server.log


Разлика између датотека није приказана због своје велике величине
+ 1 - 0
prod/models/sglang/logs/qwen3-reranker-8b-server.log


Разлика између датотека није приказана због своје велике величине
+ 2 - 0
prod/models/sglang/logs/qwen3_5-122b-server.log


+ 7 - 0
prod/models/sglang/qwen3_reranker.jinja

@@ -0,0 +1,7 @@
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{ instruct | default("Given a web search query, retrieve relevant passages that answer the query.") }}
+<Query>: {{ messages[0]["content"] }}
+<Document>: {{ messages[1]["content"] }}<|im_end|>
+<|im_start|>assistant{{ '\n' }}

BIN
prod/models/sglang/sglang-main.zip


+ 284 - 0
prod/models/sglang/test_models.sh

@@ -0,0 +1,284 @@
+#!/bin/bash
+
+# SGLang 多模型 curl 测试脚本
+# 移除 set -e,避免遇到错误就终止
+
+# 颜色定义
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+# 模型配置(按顺序定义)
+MODEL_NAMES=("qwen3-8b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
+
+declare -A MODEL_PORTS=(
+    ["qwen3-8b"]="25424"
+    ["qwen3.5-35b"]="25427"
+    ["qwen3.5-122b"]="25423"
+    ["qwen3-embedding-8b"]="25425"
+    ["qwen3-reranker-8b"]="25426"
+)
+
+declare -A MODEL_PATHS=(
+    ["qwen3-8b"]="/model/Qwen3-8B"
+    ["qwen3.5-35b"]="/model/Qwen3.5-35B"
+    ["qwen3.5-122b"]="/model/Qwen3.5-122B-A10B"
+    ["qwen3-embedding-8b"]="/model/Qwen3-Embedding-8B"
+    ["qwen3-reranker-8b"]="/model/Qwen3-Reranker-8B"
+)
+
+declare -A MODEL_TYPES=(
+    ["qwen3-8b"]="chat"
+    ["qwen3.5-35b"]="chat"
+    ["qwen3.5-122b"]="chat"
+    ["qwen3-embedding-8b"]="embedding"
+    ["qwen3-reranker-8b"]="rerank"
+)
+
+API_KEY="lq123456"
+TIMEOUT=30
+
+echo "========================================"
+echo "SGLang 多模型健康检查 (curl)"
+echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "========================================"
+
+TOTAL=0
+SUCCESS=0
+
+# 测试对话模型
+test_chat_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (对话模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"messages\": [{\"role\": \"user\", \"content\": \"你好,请用一句话介绍自己\"}],
+            \"temperature\": 0.7,
+            \"max_tokens\": 50
+        }" \
+        "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local content=$(echo "$body" | grep -o '"content":"[^"]*"' | head -1 | cut -d'"' -f4)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "回复: ${content:0:100}..."
+        ((SUCCESS++))
+    else
+        echo -e "${RED}❌ 失败${NC} HTTP $code"
+        echo "响应: ${body:0:200}"
+    fi
+    ((TOTAL++))
+}
+
+# 测试嵌入模型
+test_embedding_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (嵌入模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"input\": [\"你好,这是一个测试句子\", \"Hello world\"]
+        }" \
+        "http://localhost:$port/v1/embeddings" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local dims=$(echo "$body" | grep -o '"embedding":\[[^]]*\]' | head -1 | grep -o ',' | wc -l)
+        dims=$((dims + 1))
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "向量维度: $dims"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Embedding 接口失败,尝试 Rerank 接口...${NC}"
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"query\": \"测试查询\",
+                \"documents\": [\"文档1\", \"文档2\"]
+            }" \
+            "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Rerank 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# 测试重排序模型
+test_rerank_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (重排序模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"query\": \"什么是机器学习\",
+            \"documents\": [\"机器学习是AI的分支\", \"Python是编程语言\", \"深度学习使用神经网络\"],
+            \"top_n\": 2
+        }" \
+        "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local top_doc=$(echo "$body" | grep -o '"text":"[^"]*"' | head -1 | cut -d'"' -f4)
+        local score=$(echo "$body" | grep -o '"score":[0-9.]*' | head -1 | cut -d':' -f2)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "Top1: ${top_doc:0:50}... (得分: $score)"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Rerank 接口失败,尝试 Chat 接口...${NC}"
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
+                \"max_tokens\": 20
+            }" \
+            "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Chat 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# 快速检查
+quick_check() {
+    echo ""
+    echo "========================================"
+    echo "快速检查模式"
+    echo "========================================"
+    
+    for key in "${MODEL_NAMES[@]}"; do
+        local port=${MODEL_PORTS[$key]}
+        
+        local code
+        code=$(curl -s -o /dev/null -w "%{http_code}" \
+            --max-time 5 \
+            -H "Authorization: Bearer $API_KEY" \
+            "http://localhost:$port/v1/models" 2>/dev/null || echo "000")
+        
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅${NC} $key (端口 $port)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌${NC} $key (端口 $port) HTTP $code"
+        fi
+        ((TOTAL++))
+    done
+}
+
+# 主函数
+main() {
+    if [ "$1" = "--quick" ]; then
+        quick_check
+    elif [ "$1" = "--model" ] && [ -n "$2" ]; then
+        local key=$2
+        local port=${MODEL_PORTS[$key]}
+        local path=${MODEL_PATHS[$key]}
+        local mtype=${MODEL_TYPES[$key]}
+        
+        case $mtype in
+            chat) test_chat_model "$key" "$port" "$path" ;;
+            embedding) test_embedding_model "$key" "$port" "$path" ;;
+            rerank) test_rerank_model "$key" "$port" "$path" ;;
+        esac
+    else
+        # 按顺序测试所有模型
+        for key in "${MODEL_NAMES[@]}"; do
+            local port=${MODEL_PORTS[$key]}
+            local path=${MODEL_PATHS[$key]}
+            local mtype=${MODEL_TYPES[$key]}
+            
+            case $mtype in
+                chat) test_chat_model "$key" "$port" "$path" ;;
+                embedding) test_embedding_model "$key" "$port" "$path" ;;
+                rerank) test_rerank_model "$key" "$port" "$path" ;;
+            esac
+        done
+    fi
+    
+    echo ""
+    echo "========================================"
+    echo "测试结果摘要"
+    echo "========================================"
+    echo "总计: $SUCCESS / $TOTAL 个模型正常"
+    
+    if [ $SUCCESS -eq $TOTAL ]; then
+        echo -e "${GREEN}所有模型运行正常!${NC}"
+        exit 0
+    else
+        echo -e "${RED}部分模型异常,请检查日志${NC}"
+        exit 1
+    fi
+}
+
+main "$@"

+ 266 - 0
prod/models/sglang/test_models.sh.bak

@@ -0,0 +1,266 @@
+#!/bin/bash
+
+# SGLang 多模型 curl 测试脚本
+# 测试模型:Qwen3-8B, Qwen3.5-35B, Qwen3.5-122B, Qwen3-Embedding-8B, Qwen3-Reranker-8B
+
+set -e
+
+# 颜色定义
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# 模型配置
+declare -A MODELS=(
+    ["qwen3-8b"]="25424|/model/Qwen3-8B|chat"
+    ["qwen3.5-35b"]="25427|/model/Qwen3.5-35B|chat"
+    ["qwen3.5-122b"]="25423|/model/Qwen3.5-122B-A10B|chat"
+    ["qwen3-embedding-8b"]="25425|/model/Qwen3-Embedding-8B|embedding"
+    ["qwen3-reranker-8b"]="25426|/model/Qwen3-Reranker-8B|rerank"
+)
+
+API_KEY="lq123456"
+TIMEOUT=30
+
+echo "========================================"
+echo "SGLang 多模型健康检查 (curl)"
+echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "========================================"
+
+# 测试计数器
+TOTAL=0
+SUCCESS=0
+
+# 测试对话模型
+test_chat_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (对话模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"messages\": [{\"role\": \"user\", \"content\": \"你好,请用一句话介绍自己\"}],
+            \"temperature\": 0.7,
+            \"max_tokens\": 50
+        }" \
+        "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+    
+    local body=$(echo "$response" | head -n -1)
+    local code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local content=$(echo "$body" | grep -o '"content":"[^"]*"' | head -1 | cut -d'"' -f4)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "回复: ${content:0:100}..."
+        ((SUCCESS++))
+    else
+        echo -e "${RED}❌ 失败${NC} HTTP $code"
+        echo "响应: ${body:0:200}"
+    fi
+    ((TOTAL++))
+}
+
+# 测试嵌入模型
+test_embedding_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (嵌入模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"input\": [\"你好,这是一个测试句子\", \"Hello world\"]
+        }" \
+        "http://localhost:$port/v1/embeddings" 2>/dev/null || echo -e "\n000")
+    
+    local body=$(echo "$response" | head -n -1)
+    local code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local dims=$(echo "$body" | grep -o '"embedding":\[[^]]*\]' | head -1 | grep -o ',' | wc -l)
+        dims=$((dims + 1))
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "向量维度: $dims"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Embedding 接口失败,尝试 Rerank 接口...${NC}"
+        # 尝试 rerank 接口
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"query\": \"测试查询\",
+                \"documents\": [\"文档1\", \"文档2\"]
+            }" \
+            "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Rerank 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# 测试重排序模型
+test_rerank_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (重排序模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    # 尝试 rerank 接口
+    local response
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"query\": \"什么是机器学习\",
+            \"documents\": [\"机器学习是AI的分支\", \"Python是编程语言\", \"深度学习使用神经网络\"],
+            \"top_n\": 2
+        }" \
+        "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+    
+    local body=$(echo "$response" | head -n -1)
+    local code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local top_doc=$(echo "$body" | grep -o '"text":"[^"]*"' | head -1 | cut -d'"' -f4)
+        local score=$(echo "$body" | grep -o '"relevance_score":[0-9.]*' | head -1 | cut -d':' -f2)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "Top1: ${top_doc:0:50}... (得分: $score)"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Rerank 接口失败,尝试 Chat 接口...${NC}"
+        # 尝试作为 chat 模型
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
+                \"max_tokens\": 20
+            }" \
+            "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Chat 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# 快速检查服务是否存活
+quick_check() {
+    echo ""
+    echo "========================================"
+    echo "快速检查模式"
+    echo "========================================"
+    
+    for key in "${!MODELS[@]}"; do
+        IFS='|' read -r port model_path mtype <<< "${MODELS[$key]}"
+        
+        local code
+        code=$(curl -s -o /dev/null -w "%{http_code}" \
+            --max-time 5 \
+            -H "Authorization: Bearer $API_KEY" \
+            "http://localhost:$port/v1/models" 2>/dev/null || echo "000")
+        
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅${NC} $key (端口 $port)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌${NC} $key (端口 $port) HTTP $code"
+        fi
+        ((TOTAL++))
+    done
+}
+
+# 主函数
+main() {
+    # 解析参数
+    if [ "$1" = "--quick" ]; then
+        quick_check
+    elif [ "$1" = "--model" ] && [ -n "$2" ]; then
+        # 测试指定模型
+        if [ -n "${MODELS[$2]}" ]; then
+            IFS='|' read -r port model_path mtype <<< "${MODELS[$2]}"
+            case $mtype in
+                chat) test_chat_model "$2" "$port" "$model_path" ;;
+                embedding) test_embedding_model "$2" "$port" "$model_path" ;;
+                rerank) test_rerank_model "$2" "$port" "$model_path" ;;
+            esac
+        else
+            echo "未知模型: $2"
+            echo "可用模型: ${!MODELS[@]}"
+            exit 1
+        fi
+    else
+        # 完整测试所有模型
+        for key in "${!MODELS[@]}"; do
+            IFS='|' read -r port model_path mtype <<< "${MODELS[$key]}"
+            case $mtype in
+                chat) test_chat_model "$key" "$port" "$model_path" ;;
+                embedding) test_embedding_model "$key" "$port" "$model_path" ;;
+                rerank) test_rerank_model "$key" "$port" "$model_path" ;;
+            esac
+        done
+    fi
+    
+    # 输出摘要
+    echo ""
+    echo "========================================"
+    echo "测试结果摘要"
+    echo "========================================"
+    echo "总计: $SUCCESS / $TOTAL 个模型正常"
+    
+    if [ $SUCCESS -eq $TOTAL ]; then
+        echo -e "${GREEN}所有模型运行正常!${NC}"
+        exit 0
+    else
+        echo -e "${RED}部分模型异常,请检查日志${NC}"
+        exit 1
+    fi
+}
+
+# 执行
+main "$@"

+ 159 - 0
prod/models/vllm/docker-compose.yaml

@@ -0,0 +1,159 @@
+services:
+  qwen3.5-122b:
+    image: vllm/vllm-openai:latest
+    container_name: qwen3.5-122b-vllm
+    runtime: nvidia
+    shm_size: '10gb'
+    ports:
+      - "25423:30000"
+    volumes:
+      # # 宿主机路径:容器内路径
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # 日志目录映射
+      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro  #
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - VLLM_LOGGING_LEVEL=INFO  # 使用环境变量控制日志级别
+     # 直接执行脚本,避免复杂的 shell 嵌套
+    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3.5-122b.sh"]
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0","1"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+
+  qwen3-8b:
+    image: vllm/vllm-openai:latest
+    container_name: qwen3-8b-vllm
+    runtime: nvidia
+    shm_size: '10gb'
+    ports:
+      - "25424:30000"
+    volumes:
+      # # 宿主机路径:容器内路径
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # 日志目录映射
+      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro  #
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - VLLM_LOGGING_LEVEL=INFO  # 使用环境变量控制日志级别
+     # 直接执行脚本,避免复杂的 shell 嵌套
+    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3-8b.sh"]
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+      start_period: 60s
+
+  qwen3-embedding-8b:
+    image: vllm/vllm-openai:latest
+    #image: vllm/vllm-openai:v0.15.0
+    container_name: qwen3-embedding-8b-vllm
+    runtime: nvidia
+    shm_size: '5gb'
+    ports:
+      - "25425:30000"
+    volumes:
+      # # 宿主机路径:容器内路径
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # 日志目录映射
+      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro  #
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - VLLM_LOGGING_LEVEL=INFO  # 使用环境变量控制日志级别
+     # 直接执行脚本,避免复杂的 shell 嵌套
+    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3-embedding-8b.sh"]
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+    depends_on:
+      qwen3-8b:
+        condition: service_healthy  # 等待 qwen3-8b 健康检查通过
+
+
+  qwen3-reranker-8b:
+    #image: vllm/vllm-openai:latest  # v0.18 版本不支持 rerank 部署
+    image: vllm/vllm-openai:v0.15.0
+    container_name: qwen3-reranker-8b-vllm
+    runtime: nvidia
+    shm_size: '5gb'
+    ports:
+      - "25426:30000"
+    volumes:
+      # # 宿主机路径:容器内路径
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # 日志目录映射
+      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro  #
+      - /data/app_workspace/deploy_models/vllm/sglang-main:/vllm/sglang-main:ro
+    environment:
+      - CUDA_VISIBLE_DEVICES
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - VLLM_LOGGING_LEVEL=INFO  # 使用环境变量控制日志级别
+     # 直接执行脚本,避免复杂的 shell 嵌套
+    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3-reranker-8b-15.sh"]
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["3"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+
+
+  qwen3.5-35b:
+    image: vllm/vllm-openai:latest
+    container_name: qwen3.5-35b-vllm
+    runtime: nvidia
+    shm_size: '5gb'
+    ports:
+      - "25427:30000"
+    volumes:
+      # # 宿主机路径:容器内路径
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/vllm/logs:/var/log/vllm  # 日志目录映射
+      - /data/app_workspace/deploy_models/vllm/vllm_start_shell:/vllm_start_shell:ro  #
+    environment:
+      - CUDA_VISIBLE_DEVICES
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - VLLM_LOGGING_LEVEL=INFO  # 使用环境变量控制日志级别
+     # 直接执行脚本,避免复杂的 shell 嵌套
+    entrypoint: ["/bin/bash", "/vllm_start_shell/start-vllm-qwen3.5-35b.sh"]
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["4"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]

Разлика између датотека није приказана због своје велике величине
+ 11 - 0
prod/models/vllm/logs/qwen3-8b-server.log


Разлика између датотека није приказана због своје велике величине
+ 15 - 0
prod/models/vllm/logs/qwen3-embedding-8b-server.log


Разлика између датотека није приказана због своје велике величине
+ 15 - 0
prod/models/vllm/logs/qwen3-reranker-8b-server.log


Разлика између датотека није приказана због своје велике величине
+ 17 - 0
prod/models/vllm/logs/qwen3_5-122b-server.log


Разлика између датотека није приказана због своје велике величине
+ 14 - 0
prod/models/vllm/logs/qwen3_5-35b-server.log


+ 7 - 0
prod/models/vllm/qwen3_reranker.jinja

@@ -0,0 +1,7 @@
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{ instruct | default("Given a web search query, retrieve relevant passages that answer the query.") }}
+<Query>: {{ messages[0]["content"] }}
+<Document>: {{ messages[1]["content"] }}<|im_end|>
+<|im_start|>assistant{{ '\n' }}

+ 284 - 0
prod/models/vllm/test_models.sh

@@ -0,0 +1,284 @@
+#!/bin/bash
+
+# vLLM 多模型 curl 测试脚本
+# 移除 set -e,避免遇到错误就终止
+
+# 颜色定义
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+# 模型配置(按顺序定义)
+MODEL_NAMES=("qwen3-8b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
+
+declare -A MODEL_PORTS=(
+    ["qwen3-8b"]="25424"
+    ["qwen3.5-35b"]="25427"
+    ["qwen3.5-122b"]="25423"
+    ["qwen3-embedding-8b"]="25425"
+    ["qwen3-reranker-8b"]="25426"
+)
+
+declare -A MODEL_PATHS=(
+    ["qwen3-8b"]="/model/Qwen3-8B"
+    ["qwen3.5-35b"]="/model/Qwen3.5-35B-A3B"
+    ["qwen3.5-122b"]="/model/Qwen3.5-122B-A10B"
+    ["qwen3-embedding-8b"]="/model/Qwen3-Embedding-8B"
+    ["qwen3-reranker-8b"]="/model/Qwen3-Reranker-8B"
+)
+
+declare -A MODEL_TYPES=(
+    ["qwen3-8b"]="chat"
+    ["qwen3.5-35b"]="chat"
+    ["qwen3.5-122b"]="chat"
+    ["qwen3-embedding-8b"]="embedding"
+    ["qwen3-reranker-8b"]="rerank"
+)
+
+API_KEY="lq123456"
+TIMEOUT=30
+
+echo "========================================"
+echo "vLLM 多模型健康检查 (curl)"
+echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "========================================"
+
+TOTAL=0
+SUCCESS=0
+
+# 测试对话模型
+test_chat_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (对话模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"messages\": [{\"role\": \"user\", \"content\": \"你好,请用一句话介绍自己\"}],
+            \"temperature\": 0.7,
+            \"max_tokens\": 50
+        }" \
+        "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local content=$(echo "$body" | grep -o '"content":"[^"]*"' | head -1 | cut -d'"' -f4)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "回复: ${content:0:100}..."
+        ((SUCCESS++))
+    else
+        echo -e "${RED}❌ 失败${NC} HTTP $code"
+        echo "响应: ${body:0:200}"
+    fi
+    ((TOTAL++))
+}
+
+# 测试嵌入模型
+test_embedding_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (嵌入模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"input\": [\"你好,这是一个测试句子\", \"Hello world\"]
+        }" \
+        "http://localhost:$port/v1/embeddings" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local dims=$(echo "$body" | grep -o '"embedding":\[[^]]*\]' | head -1 | grep -o ',' | wc -l)
+        dims=$((dims + 1))
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "向量维度: $dims"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Embedding 接口失败,尝试 Rerank 接口...${NC}"
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"query\": \"测试查询\",
+                \"documents\": [\"文档1\", \"文档2\"]
+            }" \
+            "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Rerank 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# Test a reranker model: POST /v1/rerank; if that fails, probe
+# /v1/chat/completions as a fallback. Args: $1 name, $2 port, $3 model path.
+# Increments the SUCCESS/TOTAL globals.
+test_rerank_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (重排序模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+
+    local response
+    local body
+    local code
+
+    # -w appends the HTTP status on its own line; "000" marks a transport error.
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time "$TIMEOUT" \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"query\": \"什么是机器学习\",
+            \"documents\": [\"机器学习是AI的分支\", \"Python是编程语言\", \"深度学习使用神经网络\"],
+            \"top_n\": 2
+        }" \
+        "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+
+    # sed '$d' (drop last line) is portable; 'head -n -1' is GNU-only.
+    body=$(echo "$response" | sed '$d')
+    code=$(echo "$response" | tail -n 1)
+
+    if [ "$code" = "200" ]; then
+        # Pull the top-ranked document text and its relevance score.
+        local top_doc score
+        top_doc=$(echo "$body" | grep -o '"text":"[^"]*"' | head -1 | cut -d'"' -f4)
+        score=$(echo "$body" | grep -o '"relevance_score":[0-9.]*' | head -1 | cut -d':' -f2)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "Top1: ${top_doc:0:50}... (得分: $score)"
+        SUCCESS=$((SUCCESS + 1))   # ((SUCCESS++)) returns 1 when 0 -> trips set -e
+    else
+        echo -e "${YELLOW}⚠️  Rerank 接口失败,尝试 Chat 接口...${NC}"
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time "$TIMEOUT" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
+                \"max_tokens\": 20
+            }" \
+            "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Chat 接口可用)"
+            SUCCESS=$((SUCCESS + 1))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    TOTAL=$((TOTAL + 1))
+}
+
+# 快速检查
+quick_check() {
+    echo ""
+    echo "========================================"
+    echo "快速检查模式"
+    echo "========================================"
+    
+    for key in "${MODEL_NAMES[@]}"; do
+        local port=${MODEL_PORTS[$key]}
+        
+        local code
+        code=$(curl -s -o /dev/null -w "%{http_code}" \
+            --max-time 5 \
+            -H "Authorization: Bearer $API_KEY" \
+            "http://localhost:$port/v1/models" 2>/dev/null || echo "000")
+        
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅${NC} $key (端口 $port)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌${NC} $key (端口 $port) HTTP $code"
+        fi
+        ((TOTAL++))
+    done
+}
+
+# 主函数
+main() {
+    if [ "$1" = "--quick" ]; then
+        quick_check
+    elif [ "$1" = "--model" ] && [ -n "$2" ]; then
+        local key=$2
+        local port=${MODEL_PORTS[$key]}
+        local path=${MODEL_PATHS[$key]}
+        local mtype=${MODEL_TYPES[$key]}
+        
+        case $mtype in
+            chat) test_chat_model "$key" "$port" "$path" ;;
+            embedding) test_embedding_model "$key" "$port" "$path" ;;
+            rerank) test_rerank_model "$key" "$port" "$path" ;;
+        esac
+    else
+        # 按顺序测试所有模型
+        for key in "${MODEL_NAMES[@]}"; do
+            local port=${MODEL_PORTS[$key]}
+            local path=${MODEL_PATHS[$key]}
+            local mtype=${MODEL_TYPES[$key]}
+            
+            case $mtype in
+                chat) test_chat_model "$key" "$port" "$path" ;;
+                embedding) test_embedding_model "$key" "$port" "$path" ;;
+                rerank) test_rerank_model "$key" "$port" "$path" ;;
+            esac
+        done
+    fi
+    
+    echo ""
+    echo "========================================"
+    echo "测试结果摘要"
+    echo "========================================"
+    echo "总计: $SUCCESS / $TOTAL 个模型正常"
+    
+    if [ $SUCCESS -eq $TOTAL ]; then
+        echo -e "${GREEN}所有模型运行正常!${NC}"
+        exit 0
+    else
+        echo -e "${RED}部分模型异常,请检查日志${NC}"
+        exit 1
+    fi
+}
+
+main "$@"

+ 14 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3-8b.sh

@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+# 创建日志目录
+mkdir -p /var/log/vllm
+
+# 启动 vLLM 服务
+vllm serve /model/Qwen3-8B \
+    --trust-remote-code \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.45 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3-8b-server.log

+ 17 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3-embedding-8b-15.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e
+
+# 创建日志目录
+mkdir -p /var/log/vllm
+
+# 启动 vLLM 服务
+python3 -m vllm.entrypoints.openai.api_server \
+    --model /model/Qwen3-Embedding-8B \
+    --runner pooling \
+    --convert embed  \
+    --trust-remote-code \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.45 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3-embedding-8b-server.log

+ 15 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3-embedding-8b.sh

@@ -0,0 +1,15 @@
+#!/bin/bash
+# Launch vLLM serving Qwen3-Embedding-8B as an embedding model on :30000.
+# pipefail: otherwise the pipeline's status is tee's and vllm failures are masked.
+set -euo pipefail
+
+# Create the log directory
+mkdir -p /var/log/vllm
+
+# Start the vLLM service
+vllm serve /model/Qwen3-Embedding-8B \
+    --convert embed \
+    --trust-remote-code \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.45 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3-embedding-8b-server.log

+ 17 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3-reranker-8b-15.sh

@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e
+
+# 创建日志目录
+mkdir -p /var/log/vllm
+
+# 启动 vLLM 服务 --convert classify -task embed
+python3 -m vllm.entrypoints.openai.api_server \
+    --model /model/Qwen3-Reranker-8B \
+    --runner pooling \
+    --trust-remote-code \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.45 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --chat-template /vllm/sglang-main/examples/chat_template/qwen3_reranker.jinja
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3-reranker-8b-server.log

+ 14 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3-reranker-8b.sh

@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+# 创建日志目录
+mkdir -p /var/log/vllm
+
+# 启动 vLLM 服务 --convert classify  --convert embed --task score --convert auto --task classify
+vllm serve /model/Qwen3-Reranker-8B \
+    --trust-remote-code \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.5 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3-reranker-8b-server.log

+ 13 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3.5-122b.sh

@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+
+# 创建日志目录
+mkdir -p /var/log/vllm
+
+# 启动 vLLM 服务
+exec vllm serve /model/Qwen3.5-122B-A10B \
+    --trust-remote-code \
+    --tensor-parallel-size 2 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3_5-122b-server.log

+ 13 - 0
prod/models/vllm/vllm_start_shell/start-vllm-qwen3.5-35b.sh

@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+
+# 创建日志目录
+mkdir -p /var/log/vllm
+
+# 启动 vLLM 服务
+exec vllm serve /model/Qwen3.5-35B-A3B \
+    --trust-remote-code \
+    --tensor-parallel-size 1 \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --api-key lq123456 2>&1 | tee /var/log/vllm/qwen3_5-35b-server.log

Неке датотеке нису приказане због велике количине промена