Quellcode durchsuchen

配置文件提交

lingmin_package@163.com vor 1 Woche
Ursprung
Commit
dbf13b5d37
29 geänderte Dateien mit 2139 neuen und 62 gelöschten Zeilen
  1. 25 0
      dev/minerU-dev/Dockerfile
  2. 155 0
      dev/minerU-dev/compose.yaml
  3. 389 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/benchmark.log
  4. 65 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_args.json
  5. BIN
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
  6. 112 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_percentile.json
  7. 17 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_summary.json
  8. 65 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_args.json
  9. BIN
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
  10. 112 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_percentile.json
  11. 17 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_summary.json
  12. 65 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_args.json
  13. BIN
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
  14. 112 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_percentile.json
  15. 17 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_summary.json
  16. 32 0
      dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt
  17. BIN
      dev/models/bench_suite/outputs_qwen3.6_27b-wa8a_0515.zip
  18. BIN
      dev/models/bench_suite/outputs_qwen3.6_27b_0515.zip
  19. 158 0
      dev/models/docker-compose.yaml
  20. 50 0
      dev/models/docker-compose.yaml.bak
  21. 29 0
      dev/models/yaml_bak/docker-compose.yaml.nvidia
  22. 49 0
      dev/models/yaml_bak/docker-compose.yaml.qwen3.6-27b-vllm
  23. 30 0
      dev/models/yaml_bak/docker-compose.yaml.test_vllm-metax
  24. 69 0
      prod/models/glm_ocr/docker-compose.yml
  25. 51 46
      prod/models/sglang/docker-compose.yaml
  26. 188 0
      prod/models/sglang/docker-compose_20260518.yaml
  27. 31 16
      prod/models/sglang/test_models.sh
  28. 284 0
      prod/models/sglang/test_models_20260518.sh
  29. 17 0
      prod/models/sglang/需求文档.md

+ 25 - 0
dev/minerU-dev/Dockerfile

@@ -0,0 +1,25 @@
+# Base image: DaoCloud mirror (for the China region) of the vLLM OpenAI image.
+# Intended for GPUs with Volta, Turing, Ampere, Ada Lovelace, Hopper or Blackwell architecture (7.0 <= Compute Capability <= 12.0).
+# Compute Capability version query (https://developer.nvidia.com/cuda-gpus)
+# Supports the x86_64 and ARM (AArch64) architectures.
+FROM docker.m.daocloud.io/vllm/vllm-openai:v0.11.2
+
+# Install libgl for opencv support & Noto fonts for Chinese characters,
+# refresh the font cache, then drop the apt caches and package lists.
+RUN apt-get update && \
+    apt-get install -y \
+        fonts-noto-core \
+        fonts-noto-cjk \
+        fontconfig \
+        libgl1 && \
+    fc-cache -fv && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install/upgrade mineru[core] (>= 3.0.0) from the Aliyun PyPI mirror, then purge the pip cache
+RUN python3 -m pip install -U 'mineru[core]>=3.0.0' -i https://mirrors.aliyun.com/pypi/simple --break-system-packages && \
+    python3 -m pip cache purge
+
+# Download models and update the configuration file
+RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
+
+# Entry point: export MINERU_MODEL_SOURCE=local, then exec the command passed to the container ("$@").
+# No virtual environment is activated here.
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]

+ 155 - 0
dev/minerU-dev/compose.yaml

@@ -0,0 +1,155 @@
+services:
+  # Runs the `mineru-openai-server` entrypoint; only started when the "openai-server" profile is enabled.
+  mineru-openai-server:
+    image: mineru:latest
+    container_name: mineru-openai-server
+    restart: always
+    profiles: ["openai-server"]
+    ports:
+      - 30000:30000  # host port 30000 -> container port 30000
+    environment:
+      MINERU_MODEL_SOURCE: local
+    # Overrides the image's ENTRYPOINT; `command` below supplies the arguments.
+    entrypoint: mineru-openai-server
+    command:
+      --host 0.0.0.0
+      --port 30000
+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
+    ulimits:
+      memlock: -1  # unlimited locked memory
+      stack: 67108864  # 64 MiB
+    ipc: host  # share the host IPC namespace
+    # Healthy when the /health endpoint on the service port answers successfully.
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
+    # Reserve NVIDIA GPU 0 for this container.
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
+              capabilities: [gpu]
+
+  # Runs the `mineru-api` entrypoint; only started when the "api" profile is enabled.
+  mineru-api:
+    image: mineru:latest
+    container_name: mineru-api
+    restart: always
+    profiles: ["api"]
+    ports:
+      - 23428:8000  # host port 23428 -> container port 8000
+    environment:
+      # Model source: keep consistent with `--source modelscope`
+      # (alternative: MINERU_MODEL_SOURCE=local)
+      - MINERU_MODEL_SOURCE=modelscope
+      # Model cache paths (inside the container)
+      - MODELSCOPE_CACHE=/root/.cache/modelscope
+      - MINERU_CACHE_DIR=/root/.cache/mineru
+      # Transformers/HF caches, to avoid path conflicts
+      - TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers
+      - HF_HOME=/root/.cache/huggingface
+      # Logging and locale
+      - LOG_DIR=/app/logs
+      - LANG=zh_CN.UTF-8
+      - PYTHONUNBUFFERED=1
+      - DEVICE=cuda
+      # API key (choose according to the actual MinerU version).
+      # Never commit the key itself: supply MINERU_API_KEY through the shell
+      # environment or a .env file next to this compose file. Compose aborts
+      # with the message below when the variable is missing or empty.
+      # NOTE(review): the key that was previously committed here must be
+      # treated as leaked and rotated.
+      - "MINERU_API_KEY=${MINERU_API_KEY:?MINERU_API_KEY must be set}"
+
+    # Overrides the image's ENTRYPOINT; `command` below supplies the arguments.
+    entrypoint: mineru-api
+    command:
+      --host 0.0.0.0
+      --port 8000
+      # --allow-public-http-client  # Disabled by default; when binding to 0.0.0.0 or ::, this re-enables *-http-client backends and server_url. Enable only if you accept the SSRF risk.
+      # parameters for vllm-engine
+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
+    volumes:
+      # 1. Persist the model cache (key point: avoids repeated downloads)
+      - /home/ubuntu/.cache/modelscope:/root/.cache/modelscope:rw
+      # 2. Persist the MinerU cache
+      - /home/ubuntu/.cache/mineru:/root/.cache/mineru:rw
+      - /home/ubuntu/.cache/huggingface:/root/.cache/huggingface:rw  # Added: avoids transformers cache conflicts
+      # 3. Log directory mapping
+      - /home/ubuntu/lq_workspace/minerU/logs:/app/logs:rw
+      # 4. Input file directory (optional, if the API supports processing uploaded files)
+      - /home/ubuntu/lq_workspace/minerU/input:/app/input:ro
+      # 5. Output directory (optional)
+      - /home/ubuntu/lq_workspace/minerU/output:/app/output:rw
+      # 6. Configuration directory (optional, for custom configuration)
+      - /home/ubuntu/lq_workspace/minerU/config:/app/config:ro
+
+    ulimits:
+      memlock: -1  # unlimited locked memory
+      stack: 67108864  # 64 MiB
+    ipc: host  # share the host IPC namespace
+    # Healthy when the /health endpoint on the container port answers successfully.
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
+    # Reserve NVIDIA GPU 0 for this container.
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
+              capabilities: [gpu]
+
+  # Runs the `mineru-router` entrypoint; only started when the "router" profile is enabled.
+  mineru-router:
+    image: mineru:latest
+    container_name: mineru-router
+    restart: always
+    profiles: ["router"]
+    ports:
+      - 8002:8002  # host port 8002 -> container port 8002
+    environment:
+      MINERU_MODEL_SOURCE: local
+      # API key (choose according to the actual MinerU version).
+      # Never commit the key itself: supply MINERU_API_KEY through the shell
+      # environment or a .env file next to this compose file. Compose aborts
+      # with the message below when the variable is missing or empty.
+      # NOTE(review): the key that was previously committed here must be
+      # treated as leaked and rotated.
+      MINERU_API_KEY: "${MINERU_API_KEY:?MINERU_API_KEY must be set}"
+    # Overrides the image's ENTRYPOINT; `command` below supplies the arguments.
+    entrypoint: mineru-router
+    command:
+      --host 0.0.0.0
+      --port 8002
+      --local-gpus auto
+      # --allow-public-http-client  # Disabled by default; when binding to 0.0.0.0 or ::, this re-enables *-http-client backends and server_url. Enable only if you accept the SSRF risk.
+      # To aggregate existing mineru-api services instead of starting local workers:
+      # --local-gpus none
+      # --upstream-url http://mineru-api:8000
+      # --upstream-url http://mineru-api-2:8000
+      # parameters for vllm-engine
+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
+    ulimits:
+      memlock: -1  # unlimited locked memory
+      stack: 67108864  # 64 MiB
+    ipc: host  # share the host IPC namespace
+    # Healthy when the /health endpoint on the service port answers successfully.
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:8002/health || exit 1"]
+    # Reserve NVIDIA GPU 0 for this container.
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
+              capabilities: [gpu]
+
+  # Runs the `mineru-gradio` entrypoint; only started when the "gradio" profile is enabled.
+  # Note: unlike the other services in this file, no healthcheck is defined here.
+  mineru-gradio:
+    image: mineru:latest
+    container_name: mineru-gradio
+    restart: always
+    profiles: ["gradio"]
+    ports:
+      - 7860:7860  # host port 7860 -> container port 7860
+    environment:
+      MINERU_MODEL_SOURCE: local
+    # Overrides the image's ENTRYPOINT; `command` below supplies the arguments.
+    entrypoint: mineru-gradio
+    command:
+      --server-name 0.0.0.0
+      --server-port 7860
+      # --enable-api false  # If you want to disable the API, set this to false
+      # --max-convert-pages 20  # If you want to limit the number of pages for conversion, set this to a specific number
+      # parameters for vllm-engine
+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
+    ulimits:
+      memlock: -1  # unlimited locked memory
+      stack: 67108864  # 64 MiB
+    ipc: host  # share the host IPC namespace
+    # Reserve NVIDIA GPU 0 for this container.
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
+              capabilities: [gpu]

+ 389 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/benchmark.log

@@ -0,0 +1,389 @@
+2026-05-15 08:41:06 - evalscope - INFO: Starting benchmark with args: 
+2026-05-15 08:41:06 - evalscope - INFO: {
+    "model": "Qwen3.6-27B-W8A8",
+    "model_id": "Qwen3.6-27B-W8A8",
+    "attn_implementation": null,
+    "api": "openai",
+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
+    "port": 8877,
+    "url": "http://127.0.0.1:8004/v1/chat/completions",
+    "headers": {
+        "Authorization": "Bearer sk-123456"
+    },
+    "connect_timeout": null,
+    "read_timeout": null,
+    "total_timeout": 21600,
+    "api_key": "sk-123456",
+    "no_test_connection": false,
+    "number": 1,
+    "parallel": 1,
+    "rate": -1,
+    "sleep_interval": 5,
+    "sla_auto_tune": false,
+    "sla_variable": "parallel",
+    "sla_params": null,
+    "sla_num_runs": 3,
+    "sla_upper_bound": 65536,
+    "sla_lower_bound": 1,
+    "db_commit_interval": 1000,
+    "queue_size_multiplier": 5,
+    "in_flight_task_multiplier": 2,
+    "log_every_n_query": 10,
+    "debug": false,
+    "visualizer": null,
+    "wandb_api_key": null,
+    "swanlab_api_key": null,
+    "name": null,
+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
+    "no_timestamp": false,
+    "max_prompt_length": 2048,
+    "min_prompt_length": 2048,
+    "prefix_length": 0,
+    "prompt": null,
+    "query_template": null,
+    "apply_chat_template": true,
+    "image_width": 224,
+    "image_height": 224,
+    "image_format": "RGB",
+    "image_num": 1,
+    "image_patch_size": 28,
+    "dataset": "random",
+    "dataset_path": null,
+    "frequency_penalty": null,
+    "repetition_penalty": null,
+    "logprobs": null,
+    "max_tokens": 128,
+    "min_tokens": 128,
+    "n_choices": null,
+    "seed": null,
+    "stop": null,
+    "stop_token_ids": null,
+    "stream": true,
+    "temperature": 0.0,
+    "top_p": null,
+    "top_k": null,
+    "extra_args": {}
+}
+2026-05-15 08:41:22 - evalscope - INFO: Test connection successful.
+2026-05-15 08:41:25 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
+2026-05-15 08:41:25 - evalscope - INFO: Sampling input lengths from [2046, 2047)
+2026-05-15 08:41:26 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
+2026-05-15 08:41:33 - evalscope - INFO: Processing 100%| 1/1 [Elapsed: 00:07 < Remaining: 00:00,  7.37s/it]
+2026-05-15 08:41:33 - evalscope - INFO: 
+Benchmarking summary:
++-----------------------------------+-----------+
+| Key                               |     Value |
++===================================+===========+
+| Time taken for tests (s)          |    7.3689 |
++-----------------------------------+-----------+
+| Number of concurrency             |    1      |
++-----------------------------------+-----------+
+| Request rate (req/s)              |   -1      |
++-----------------------------------+-----------+
+| Total requests                    |    1      |
++-----------------------------------+-----------+
+| Succeed requests                  |    1      |
++-----------------------------------+-----------+
+| Failed requests                   |    0      |
++-----------------------------------+-----------+
+| Output token throughput (tok/s)   |   17.3704 |
++-----------------------------------+-----------+
+| Total token throughput (tok/s)    |  296.383  |
++-----------------------------------+-----------+
+| Request throughput (req/s)        |    0.1357 |
++-----------------------------------+-----------+
+| Average latency (s)               |    7.3689 |
++-----------------------------------+-----------+
+| Average time to first token (s)   |    0.8157 |
++-----------------------------------+-----------+
+| Average time per output token (s) |    0.0516 |
++-----------------------------------+-----------+
+| Average inter-token latency (s)   |    0.0512 |
++-----------------------------------+-----------+
+| Average input tokens per request  | 2056      |
++-----------------------------------+-----------+
+| Average output tokens per request |  128      |
++-----------------------------------+-----------+
+2026-05-15 08:41:33 - evalscope - INFO: 
+Percentile results:
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+|     10%     |  0.8157  | 0.0514  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     25%     |  0.8157  | 0.0515  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     50%     |  0.8157  | 0.0516  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     66%     |  0.8157  | 0.0517  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     75%     |  0.8157  | 0.0518  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     80%     |  0.8157  | 0.0518  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     90%     |  0.8157  | 0.0519  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     95%     |  0.8157  |  0.052  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     98%     |  0.8157  | 0.0523  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+|     99%     |  0.8157  | 0.0523  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+2026-05-15 08:41:33 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1
+2026-05-15 08:41:33 - evalscope - INFO: Sleeping for 5 seconds before the next run...
+2026-05-15 08:41:38 - evalscope - INFO: Starting benchmark with args: 
+2026-05-15 08:41:38 - evalscope - INFO: {
+    "model": "Qwen3.6-27B-W8A8",
+    "model_id": "Qwen3.6-27B-W8A8",
+    "attn_implementation": null,
+    "api": "openai",
+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
+    "port": 8877,
+    "url": "http://127.0.0.1:8004/v1/chat/completions",
+    "headers": {
+        "Authorization": "Bearer sk-123456"
+    },
+    "connect_timeout": null,
+    "read_timeout": null,
+    "total_timeout": 21600,
+    "api_key": "sk-123456",
+    "no_test_connection": false,
+    "number": 5,
+    "parallel": 5,
+    "rate": -1,
+    "sleep_interval": 5,
+    "sla_auto_tune": false,
+    "sla_variable": "parallel",
+    "sla_params": null,
+    "sla_num_runs": 3,
+    "sla_upper_bound": 65536,
+    "sla_lower_bound": 1,
+    "db_commit_interval": 1000,
+    "queue_size_multiplier": 5,
+    "in_flight_task_multiplier": 2,
+    "log_every_n_query": 10,
+    "debug": false,
+    "visualizer": null,
+    "wandb_api_key": null,
+    "swanlab_api_key": null,
+    "name": null,
+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
+    "no_timestamp": false,
+    "max_prompt_length": 2048,
+    "min_prompt_length": 2048,
+    "prefix_length": 0,
+    "prompt": null,
+    "query_template": null,
+    "apply_chat_template": true,
+    "image_width": 224,
+    "image_height": 224,
+    "image_format": "RGB",
+    "image_num": 1,
+    "image_patch_size": 28,
+    "dataset": "random",
+    "dataset_path": null,
+    "frequency_penalty": null,
+    "repetition_penalty": null,
+    "logprobs": null,
+    "max_tokens": 128,
+    "min_tokens": 128,
+    "n_choices": null,
+    "seed": null,
+    "stop": null,
+    "stop_token_ids": null,
+    "stream": true,
+    "temperature": 0.0,
+    "top_p": null,
+    "top_k": null,
+    "extra_args": {}
+}
+2026-05-15 08:41:48 - evalscope - INFO: Test connection successful.
+2026-05-15 08:41:51 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
+2026-05-15 08:41:51 - evalscope - INFO: Sampling input lengths from [2046, 2047)
+2026-05-15 08:41:51 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
+2026-05-15 08:41:58 - evalscope - INFO: Processing 100%| 5/5 [Elapsed: 00:06 < Remaining: 00:00,  1.01it/s]
+2026-05-15 08:41:58 - evalscope - INFO: 
+Benchmarking summary:
++-----------------------------------+-----------+
+| Key                               |     Value |
++===================================+===========+
+| Time taken for tests (s)          |    6.6303 |
++-----------------------------------+-----------+
+| Number of concurrency             |    5      |
++-----------------------------------+-----------+
+| Request rate (req/s)              |   -1      |
++-----------------------------------+-----------+
+| Total requests                    |    5      |
++-----------------------------------+-----------+
+| Succeed requests                  |    5      |
++-----------------------------------+-----------+
+| Failed requests                   |    0      |
++-----------------------------------+-----------+
+| Output token throughput (tok/s)   |   96.5268 |
++-----------------------------------+-----------+
+| Total token throughput (tok/s)    | 1647.29   |
++-----------------------------------+-----------+
+| Request throughput (req/s)        |    0.7541 |
++-----------------------------------+-----------+
+| Average latency (s)               |    6.5697 |
++-----------------------------------+-----------+
+| Average time to first token (s)   |    2.1216 |
++-----------------------------------+-----------+
+| Average time per output token (s) |    0.035  |
++-----------------------------------+-----------+
+| Average inter-token latency (s)   |    0.0348 |
++-----------------------------------+-----------+
+| Average input tokens per request  | 2056.4    |
++-----------------------------------+-----------+
+| Average output tokens per request |  128      |
++-----------------------------------+-----------+
+2026-05-15 08:41:58 - evalscope - INFO: 
+Percentile results:
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+|     10%     |  0.6655  | 0.0295  |  0.0299  |   6.5201    |     2056     |      128      |    19.3128     |   329.6753    |
+|     25%     |  1.6872  | 0.0296  |  0.0304  |   6.5483    |     2056     |      128      |    19.4617     |   332.0644    |
+|     50%     |  2.7127  | 0.0297  |  0.0304  |   6.5751    |     2056     |      128      |    19.4674     |   332.1626    |
+|     66%     |  2.7127  | 0.0298  |  0.0383  |    6.577    |     2057     |      128      |    19.5469     |   333.6721    |
+|     75%     |  2.7127  | 0.0299  |  0.0383  |    6.577    |     2057     |      128      |    19.5469     |   333.6721    |
+|     80%     |  2.8297  | 0.0299  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
+|     90%     |  2.8297  | 0.0301  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
+|     95%     |  2.8297  | 0.0304  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
+|     98%     |  2.8297  |  0.031  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
+|     99%     |  2.8297  | 0.1171  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+2026-05-15 08:41:58 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5
+2026-05-15 08:41:58 - evalscope - INFO: Sleeping for 5 seconds before the next run...
+2026-05-15 08:42:03 - evalscope - INFO: Starting benchmark with args: 
+2026-05-15 08:42:03 - evalscope - INFO: {
+    "model": "Qwen3.6-27B-W8A8",
+    "model_id": "Qwen3.6-27B-W8A8",
+    "attn_implementation": null,
+    "api": "openai",
+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
+    "port": 8877,
+    "url": "http://127.0.0.1:8004/v1/chat/completions",
+    "headers": {
+        "Authorization": "Bearer sk-123456"
+    },
+    "connect_timeout": null,
+    "read_timeout": null,
+    "total_timeout": 21600,
+    "api_key": "sk-123456",
+    "no_test_connection": false,
+    "number": 10,
+    "parallel": 10,
+    "rate": -1,
+    "sleep_interval": 5,
+    "sla_auto_tune": false,
+    "sla_variable": "parallel",
+    "sla_params": null,
+    "sla_num_runs": 3,
+    "sla_upper_bound": 65536,
+    "sla_lower_bound": 1,
+    "db_commit_interval": 1000,
+    "queue_size_multiplier": 5,
+    "in_flight_task_multiplier": 2,
+    "log_every_n_query": 10,
+    "debug": false,
+    "visualizer": null,
+    "wandb_api_key": null,
+    "swanlab_api_key": null,
+    "name": null,
+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
+    "no_timestamp": false,
+    "max_prompt_length": 2048,
+    "min_prompt_length": 2048,
+    "prefix_length": 0,
+    "prompt": null,
+    "query_template": null,
+    "apply_chat_template": true,
+    "image_width": 224,
+    "image_height": 224,
+    "image_format": "RGB",
+    "image_num": 1,
+    "image_patch_size": 28,
+    "dataset": "random",
+    "dataset_path": null,
+    "frequency_penalty": null,
+    "repetition_penalty": null,
+    "logprobs": null,
+    "max_tokens": 128,
+    "min_tokens": 128,
+    "n_choices": null,
+    "seed": null,
+    "stop": null,
+    "stop_token_ids": null,
+    "stream": true,
+    "temperature": 0.0,
+    "top_p": null,
+    "top_k": null,
+    "extra_args": {}
+}
+2026-05-15 08:42:12 - evalscope - INFO: Test connection successful.
+2026-05-15 08:42:16 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
+2026-05-15 08:42:16 - evalscope - INFO: Sampling input lengths from [2046, 2047)
+2026-05-15 08:42:16 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
+2026-05-15 08:42:26 - evalscope - INFO: {
+  "Time taken for tests (s)": 9.6293,
+  "Number of concurrency": 10,
+  "Request rate (req/s)": -1,
+  "Total requests": 10,
+  "Succeed requests": 10,
+  "Failed requests": 0,
+  "Output token throughput (tok/s)": 132.928,
+  "Total token throughput (tok/s)": 2268.4997,
+  "Request throughput (req/s)": 1.0385,
+  "Average latency (s)": 9.567,
+  "Average time to first token (s)": 3.6071,
+  "Average time per output token (s)": 0.0469,
+  "Average inter-token latency (s)": 0.0466,
+  "Average input tokens per request": 2056.4,
+  "Average output tokens per request": 128.0
+}
+2026-05-15 08:42:26 - evalscope - INFO: Processing 100%| 10/10 [Elapsed: 00:09 < Remaining: 00:00,  1.01s/it]
+2026-05-15 08:42:26 - evalscope - INFO: 
+Benchmarking summary:
++-----------------------------------+-----------+
+| Key                               |     Value |
++===================================+===========+
+| Time taken for tests (s)          |    9.6293 |
++-----------------------------------+-----------+
+| Number of concurrency             |   10      |
++-----------------------------------+-----------+
+| Request rate (req/s)              |   -1      |
++-----------------------------------+-----------+
+| Total requests                    |   10      |
++-----------------------------------+-----------+
+| Succeed requests                  |   10      |
++-----------------------------------+-----------+
+| Failed requests                   |    0      |
++-----------------------------------+-----------+
+| Output token throughput (tok/s)   |  132.928  |
++-----------------------------------+-----------+
+| Total token throughput (tok/s)    | 2268.5    |
++-----------------------------------+-----------+
+| Request throughput (req/s)        |    1.0385 |
++-----------------------------------+-----------+
+| Average latency (s)               |    9.567  |
++-----------------------------------+-----------+
+| Average time to first token (s)   |    3.6071 |
++-----------------------------------+-----------+
+| Average time per output token (s) |    0.0469 |
++-----------------------------------+-----------+
+| Average inter-token latency (s)   |    0.0466 |
++-----------------------------------+-----------+
+| Average input tokens per request  | 2056.4    |
++-----------------------------------+-----------+
+| Average output tokens per request |  128      |
++-----------------------------------+-----------+
+2026-05-15 08:42:26 - evalscope - INFO: 
+Percentile results:
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+|     10%     |  1.7449  | 0.0332  |  0.0332  |   9.5131    |     2056     |      128      |    13.2998     |   226.9284    |
+|     25%     |  2.7691  | 0.0333  |  0.0376  |   9.5425    |     2056     |      128      |    13.3343     |   227.5162    |
+|     50%     |  3.7961  | 0.0333  |  0.0455  |   9.5729    |     2056     |      128      |    13.3741     |   228.2483    |
+|     66%     |  4.8242  | 0.0334  |  0.0533  |   9.5984    |     2056     |      128      |     13.412     |   228.8415    |
+|     75%     |  4.825   | 0.0334  |  0.0533  |   9.5993    |     2057     |      128      |    13.4137     |   229.0807    |
+|     80%     |  5.4098  | 0.0335  |  0.0612  |   9.6242    |     2057     |      128      |    13.4552     |   229.5791    |
+|     90%     |   5.41   | 0.0336  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
+|     95%     |   5.41   | 0.0337  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
+|     98%     |   5.41   | 0.0367  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
+|     99%     |   5.41   | 1.0169  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
+2026-05-15 08:42:26 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10
+2026-05-15 08:42:26 - evalscope - INFO: Performance summary saved to: outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt

+ 65 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_args.json

@@ -0,0 +1,65 @@
+{
+    "model": "Qwen3.6-27B-W8A8",
+    "model_id": "Qwen3.6-27B-W8A8",
+    "attn_implementation": null,
+    "api": "openai",
+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
+    "port": 8877,
+    "url": "http://127.0.0.1:8004/v1/chat/completions",
+    "headers": {
+        "Authorization": "Bearer sk-123456"
+    },
+    "connect_timeout": null,
+    "read_timeout": null,
+    "total_timeout": 21600,
+    "api_key": "sk-123456",
+    "no_test_connection": false,
+    "number": 10,
+    "parallel": 10,
+    "rate": -1,
+    "sleep_interval": 5,
+    "sla_auto_tune": false,
+    "sla_variable": "parallel",
+    "sla_params": null,
+    "sla_num_runs": 3,
+    "sla_upper_bound": 65536,
+    "sla_lower_bound": 1,
+    "db_commit_interval": 1000,
+    "queue_size_multiplier": 5,
+    "in_flight_task_multiplier": 2,
+    "log_every_n_query": 10,
+    "debug": false,
+    "visualizer": null,
+    "wandb_api_key": null,
+    "swanlab_api_key": null,
+    "name": null,
+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10",
+    "no_timestamp": false,
+    "max_prompt_length": 2048,
+    "min_prompt_length": 2048,
+    "prefix_length": 0,
+    "prompt": null,
+    "query_template": null,
+    "apply_chat_template": true,
+    "image_width": 224,
+    "image_height": 224,
+    "image_format": "RGB",
+    "image_num": 1,
+    "image_patch_size": 28,
+    "dataset": "random",
+    "dataset_path": null,
+    "frequency_penalty": null,
+    "repetition_penalty": null,
+    "logprobs": null,
+    "max_tokens": 128,
+    "min_tokens": 128,
+    "n_choices": null,
+    "seed": null,
+    "stop": null,
+    "stop_token_ids": null,
+    "stream": true,
+    "temperature": 0.0,
+    "top_p": null,
+    "top_k": null,
+    "extra_args": {}
+}

BIN
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db


+ 112 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_percentile.json

@@ -0,0 +1,112 @@
+[
+    {
+        "Percentiles": "10%",
+        "TTFT (s)": 1.7449,
+        "ITL (s)": 0.0332,
+        "TPOT (s)": 0.0332,
+        "Latency (s)": 9.5131,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.2998,
+        "Total (tok/s)": 226.9284
+    },
+    {
+        "Percentiles": "25%",
+        "TTFT (s)": 2.7691,
+        "ITL (s)": 0.0333,
+        "TPOT (s)": 0.0376,
+        "Latency (s)": 9.5425,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.3343,
+        "Total (tok/s)": 227.5162
+    },
+    {
+        "Percentiles": "50%",
+        "TTFT (s)": 3.7961,
+        "ITL (s)": 0.0333,
+        "TPOT (s)": 0.0455,
+        "Latency (s)": 9.5729,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.3741,
+        "Total (tok/s)": 228.2483
+    },
+    {
+        "Percentiles": "66%",
+        "TTFT (s)": 4.8242,
+        "ITL (s)": 0.0334,
+        "TPOT (s)": 0.0533,
+        "Latency (s)": 9.5984,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.412,
+        "Total (tok/s)": 228.8415
+    },
+    {
+        "Percentiles": "75%",
+        "TTFT (s)": 4.825,
+        "ITL (s)": 0.0334,
+        "TPOT (s)": 0.0533,
+        "Latency (s)": 9.5993,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.4137,
+        "Total (tok/s)": 229.0807
+    },
+    {
+        "Percentiles": "80%",
+        "TTFT (s)": 5.4098,
+        "ITL (s)": 0.0335,
+        "TPOT (s)": 0.0612,
+        "Latency (s)": 9.6242,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.4552,
+        "Total (tok/s)": 229.5791
+    },
+    {
+        "Percentiles": "90%",
+        "TTFT (s)": 5.41,
+        "ITL (s)": 0.0336,
+        "TPOT (s)": 0.0689,
+        "Latency (s)": 9.6253,
+        "Input tokens": 2058,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.5021,
+        "Total (tok/s)": 230.4848
+    },
+    {
+        "Percentiles": "95%",
+        "TTFT (s)": 5.41,
+        "ITL (s)": 0.0337,
+        "TPOT (s)": 0.0689,
+        "Latency (s)": 9.6253,
+        "Input tokens": 2058,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.5021,
+        "Total (tok/s)": 230.4848
+    },
+    {
+        "Percentiles": "98%",
+        "TTFT (s)": 5.41,
+        "ITL (s)": 0.0367,
+        "TPOT (s)": 0.0689,
+        "Latency (s)": 9.6253,
+        "Input tokens": 2058,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.5021,
+        "Total (tok/s)": 230.4848
+    },
+    {
+        "Percentiles": "99%",
+        "TTFT (s)": 5.41,
+        "ITL (s)": 1.0169,
+        "TPOT (s)": 0.0689,
+        "Latency (s)": 9.6253,
+        "Input tokens": 2058,
+        "Output tokens": 128,
+        "Output (tok/s)": 13.5021,
+        "Total (tok/s)": 230.4848
+    }
+]

+ 17 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_summary.json

@@ -0,0 +1,17 @@
+{
+    "Time taken for tests (s)": 9.6293,
+    "Number of concurrency": 10,
+    "Request rate (req/s)": -1,
+    "Total requests": 10,
+    "Succeed requests": 10,
+    "Failed requests": 0,
+    "Output token throughput (tok/s)": 132.928,
+    "Total token throughput (tok/s)": 2268.4997,
+    "Request throughput (req/s)": 1.0385,
+    "Average latency (s)": 9.567,
+    "Average time to first token (s)": 3.6071,
+    "Average time per output token (s)": 0.0469,
+    "Average inter-token latency (s)": 0.0466,
+    "Average input tokens per request": 2056.4,
+    "Average output tokens per request": 128.0
+}

+ 65 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_args.json

@@ -0,0 +1,65 @@
+{
+    "model": "Qwen3.6-27B-W8A8",
+    "model_id": "Qwen3.6-27B-W8A8",
+    "attn_implementation": null,
+    "api": "openai",
+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
+    "port": 8877,
+    "url": "http://127.0.0.1:8004/v1/chat/completions",
+    "headers": {
+        "Authorization": "Bearer sk-123456"
+    },
+    "connect_timeout": null,
+    "read_timeout": null,
+    "total_timeout": 21600,
+    "api_key": "sk-123456",
+    "no_test_connection": false,
+    "number": 1,
+    "parallel": 1,
+    "rate": -1,
+    "sleep_interval": 5,
+    "sla_auto_tune": false,
+    "sla_variable": "parallel",
+    "sla_params": null,
+    "sla_num_runs": 3,
+    "sla_upper_bound": 65536,
+    "sla_lower_bound": 1,
+    "db_commit_interval": 1000,
+    "queue_size_multiplier": 5,
+    "in_flight_task_multiplier": 2,
+    "log_every_n_query": 10,
+    "debug": false,
+    "visualizer": null,
+    "wandb_api_key": null,
+    "swanlab_api_key": null,
+    "name": null,
+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1",
+    "no_timestamp": false,
+    "max_prompt_length": 2048,
+    "min_prompt_length": 2048,
+    "prefix_length": 0,
+    "prompt": null,
+    "query_template": null,
+    "apply_chat_template": true,
+    "image_width": 224,
+    "image_height": 224,
+    "image_format": "RGB",
+    "image_num": 1,
+    "image_patch_size": 28,
+    "dataset": "random",
+    "dataset_path": null,
+    "frequency_penalty": null,
+    "repetition_penalty": null,
+    "logprobs": null,
+    "max_tokens": 128,
+    "min_tokens": 128,
+    "n_choices": null,
+    "seed": null,
+    "stop": null,
+    "stop_token_ids": null,
+    "stream": true,
+    "temperature": 0.0,
+    "top_p": null,
+    "top_k": null,
+    "extra_args": {}
+}

BIN
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db


+ 112 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_percentile.json

@@ -0,0 +1,112 @@
+[
+    {
+        "Percentiles": "10%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0514,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "25%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0515,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "50%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0516,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "66%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0517,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "75%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0518,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "80%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0518,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "90%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0519,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "95%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.052,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "98%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0523,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    },
+    {
+        "Percentiles": "99%",
+        "TTFT (s)": 0.8157,
+        "ITL (s)": 0.0523,
+        "TPOT (s)": 0.0516,
+        "Latency (s)": 7.3689,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 17.3704,
+        "Total (tok/s)": 296.3827
+    }
+]

+ 17 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_summary.json

@@ -0,0 +1,17 @@
+{
+    "Time taken for tests (s)": 7.3689,
+    "Number of concurrency": 1,
+    "Request rate (req/s)": -1,
+    "Total requests": 1,
+    "Succeed requests": 1,
+    "Failed requests": 0,
+    "Output token throughput (tok/s)": 17.3704,
+    "Total token throughput (tok/s)": 296.3827,
+    "Request throughput (req/s)": 0.1357,
+    "Average latency (s)": 7.3689,
+    "Average time to first token (s)": 0.8157,
+    "Average time per output token (s)": 0.0516,
+    "Average inter-token latency (s)": 0.0512,
+    "Average input tokens per request": 2056.0,
+    "Average output tokens per request": 128.0
+}

+ 65 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_args.json

@@ -0,0 +1,65 @@
+{
+    "model": "Qwen3.6-27B-W8A8",
+    "model_id": "Qwen3.6-27B-W8A8",
+    "attn_implementation": null,
+    "api": "openai",
+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
+    "port": 8877,
+    "url": "http://127.0.0.1:8004/v1/chat/completions",
+    "headers": {
+        "Authorization": "Bearer sk-123456"
+    },
+    "connect_timeout": null,
+    "read_timeout": null,
+    "total_timeout": 21600,
+    "api_key": "sk-123456",
+    "no_test_connection": false,
+    "number": 5,
+    "parallel": 5,
+    "rate": -1,
+    "sleep_interval": 5,
+    "sla_auto_tune": false,
+    "sla_variable": "parallel",
+    "sla_params": null,
+    "sla_num_runs": 3,
+    "sla_upper_bound": 65536,
+    "sla_lower_bound": 1,
+    "db_commit_interval": 1000,
+    "queue_size_multiplier": 5,
+    "in_flight_task_multiplier": 2,
+    "log_every_n_query": 10,
+    "debug": false,
+    "visualizer": null,
+    "wandb_api_key": null,
+    "swanlab_api_key": null,
+    "name": null,
+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5",
+    "no_timestamp": false,
+    "max_prompt_length": 2048,
+    "min_prompt_length": 2048,
+    "prefix_length": 0,
+    "prompt": null,
+    "query_template": null,
+    "apply_chat_template": true,
+    "image_width": 224,
+    "image_height": 224,
+    "image_format": "RGB",
+    "image_num": 1,
+    "image_patch_size": 28,
+    "dataset": "random",
+    "dataset_path": null,
+    "frequency_penalty": null,
+    "repetition_penalty": null,
+    "logprobs": null,
+    "max_tokens": 128,
+    "min_tokens": 128,
+    "n_choices": null,
+    "seed": null,
+    "stop": null,
+    "stop_token_ids": null,
+    "stream": true,
+    "temperature": 0.0,
+    "top_p": null,
+    "top_k": null,
+    "extra_args": {}
+}

BIN
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db


+ 112 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_percentile.json

@@ -0,0 +1,112 @@
+[
+    {
+        "Percentiles": "10%",
+        "TTFT (s)": 0.6655,
+        "ITL (s)": 0.0295,
+        "TPOT (s)": 0.0299,
+        "Latency (s)": 6.5201,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.3128,
+        "Total (tok/s)": 329.6753
+    },
+    {
+        "Percentiles": "25%",
+        "TTFT (s)": 1.6872,
+        "ITL (s)": 0.0296,
+        "TPOT (s)": 0.0304,
+        "Latency (s)": 6.5483,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.4617,
+        "Total (tok/s)": 332.0644
+    },
+    {
+        "Percentiles": "50%",
+        "TTFT (s)": 2.7127,
+        "ITL (s)": 0.0297,
+        "TPOT (s)": 0.0304,
+        "Latency (s)": 6.5751,
+        "Input tokens": 2056,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.4674,
+        "Total (tok/s)": 332.1626
+    },
+    {
+        "Percentiles": "66%",
+        "TTFT (s)": 2.7127,
+        "ITL (s)": 0.0298,
+        "TPOT (s)": 0.0383,
+        "Latency (s)": 6.577,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.5469,
+        "Total (tok/s)": 333.6721
+    },
+    {
+        "Percentiles": "75%",
+        "TTFT (s)": 2.7127,
+        "ITL (s)": 0.0299,
+        "TPOT (s)": 0.0383,
+        "Latency (s)": 6.577,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.5469,
+        "Total (tok/s)": 333.6721
+    },
+    {
+        "Percentiles": "80%",
+        "TTFT (s)": 2.8297,
+        "ITL (s)": 0.0299,
+        "TPOT (s)": 0.0461,
+        "Latency (s)": 6.6277,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.6315,
+        "Total (tok/s)": 334.9623
+    },
+    {
+        "Percentiles": "90%",
+        "TTFT (s)": 2.8297,
+        "ITL (s)": 0.0301,
+        "TPOT (s)": 0.0461,
+        "Latency (s)": 6.6277,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.6315,
+        "Total (tok/s)": 334.9623
+    },
+    {
+        "Percentiles": "95%",
+        "TTFT (s)": 2.8297,
+        "ITL (s)": 0.0304,
+        "TPOT (s)": 0.0461,
+        "Latency (s)": 6.6277,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.6315,
+        "Total (tok/s)": 334.9623
+    },
+    {
+        "Percentiles": "98%",
+        "TTFT (s)": 2.8297,
+        "ITL (s)": 0.031,
+        "TPOT (s)": 0.0461,
+        "Latency (s)": 6.6277,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.6315,
+        "Total (tok/s)": 334.9623
+    },
+    {
+        "Percentiles": "99%",
+        "TTFT (s)": 2.8297,
+        "ITL (s)": 0.1171,
+        "TPOT (s)": 0.0461,
+        "Latency (s)": 6.6277,
+        "Input tokens": 2057,
+        "Output tokens": 128,
+        "Output (tok/s)": 19.6315,
+        "Total (tok/s)": 334.9623
+    }
+]

+ 17 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_summary.json

@@ -0,0 +1,17 @@
+{
+    "Time taken for tests (s)": 6.6303,
+    "Number of concurrency": 5,
+    "Request rate (req/s)": -1,
+    "Total requests": 5,
+    "Succeed requests": 5,
+    "Failed requests": 0,
+    "Output token throughput (tok/s)": 96.5268,
+    "Total token throughput (tok/s)": 1647.2903,
+    "Request throughput (req/s)": 0.7541,
+    "Average latency (s)": 6.5697,
+    "Average time to first token (s)": 2.1216,
+    "Average time per output token (s)": 0.035,
+    "Average inter-token latency (s)": 0.0348,
+    "Average input tokens per request": 2056.4,
+    "Average output tokens per request": 128.0
+}

+ 32 - 0
dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt

@@ -0,0 +1,32 @@
+╭──────────────────────────────────────────────────────────────────────────────╮
+│ Performance Test Summary Report                                              │
+╰──────────────────────────────────────────────────────────────────────────────╯
+
+Basic Information:
+┌───────────────────────┬──────────────────────────────────────────────────────┐
+│ Model                 │ Qwen3.6-27B-W8A8                                     │
+│ Test Dataset          │ random                                               │
+│ Total Generated       │ 2,048.0 tokens                                       │
+│ Total Test Time       │ 23.63 seconds                                        │
+│ Avg Output Rate       │ 86.67 tokens/sec                                     │
+│ Output Path           │ outputs/20260515_084106/Qwen3.6-27B-W8A8             │
+└───────────────────────┴──────────────────────────────────────────────────────┘
+
+
+                                    Detailed Performance Metrics                                    
+┏━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┓
+┃      ┃      ┃      ┃     Avg ┃     P99 ┃     Avg ┃     P99 ┃     Avg ┃    P99 ┃    Gen. ┃ Success┃
+┃Conc. ┃ Rate ┃  RPS ┃ Lat.(s) ┃ Lat.(s) ┃ TTFT(s) ┃ TTFT(s) ┃ TPOT(s) ┃ TPOT(… ┃  toks/s ┃    Rate┃
+┡━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━┩
+│    1 │  INF │ 0.14 │   7.369 │   7.369 │   0.816 │   0.816 │   0.052 │  0.052 │   17.37 │  100.0%│
+│    5 │  INF │ 0.75 │   6.570 │   6.628 │   2.122 │   2.830 │   0.035 │  0.046 │   96.53 │  100.0%│
+│   10 │  INF │ 1.04 │   9.567 │   9.625 │   3.607 │   5.410 │   0.047 │  0.069 │  132.93 │  100.0%│
+└──────┴──────┴──────┴─────────┴─────────┴─────────┴─────────┴─────────┴────────┴─────────┴────────┘
+
+
+               Best Performance Configuration               
+ Highest RPS         Concurrency 10 (1.04 req/sec)          
+ Lowest Latency      Concurrency 5 (6.570 seconds)          
+
+Performance Recommendations:
+• The system seems not to have reached its performance bottleneck, try higher concurrency

BIN
dev/models/bench_suite/outputs_qwen3.6_27b-wa8a_0515.zip


BIN
dev/models/bench_suite/outputs_qwen3.6_27b_0515.zip


+ 158 - 0
dev/models/docker-compose.yaml

@@ -0,0 +1,158 @@
+version: '3.8'
+services:
+  qwen3.6-27b:
+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
+    container_name: qwen3.6-27b-w8a8-vllm #qwen3.6-27b-w8a8  Qwen3.6-27B-W8A8
+    stdin_open: true
+    tty: true
+    restart: unless-stopped
+    #network_mode: host
+    devices:
+      - "/dev/dri:/dev/dri"
+      - "/dev/mxcd:/dev/mxcd"
+      - "/dev/mem:/dev/mem"
+    group_add:
+      - "video"
+    privileged: true
+    security_opt:
+      - "apparmor=unconfined"
+      - "seccomp=unconfined"
+    shm_size: '100gb'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    ports:
+      - "8004:30000"
+    environment:
+      - CUDA_VISIBLE_DEVICES=0,1
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - MACA_SMALL_PAGESIZE_ENABLE=1
+      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
+      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1
+    volumes:
+      - "/usr/local/:/usr/local/"
+      - "/pde_ai:/pde_ai"
+      - "/opt/lq/models:/model:ro"
+      - "~/.cache/huggingface:/root/.cache/huggingface"
+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # 日志目录映射
+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" #脚本目录映射
+    # Serve Qwen3.6-27B-W8A8 through vLLM's OpenAI-compatible API on container port 30000
+    # and copy everything the server prints into the log directory mounted at /var/log/vllm.
+    # The folded scalar (>) joins the lines below with single spaces; keep each trailing
+    # backslash as the very last character of its line (no whitespace after it).
+    command: >
+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8 \
+      --served-model-name Qwen3.6-27B-W8A8 \
+      --host 0.0.0.0 \
+      --port 30000 \
+      --tensor-parallel-size 2 \
+      --max-num-batched-tokens 4096 \
+      --max-model-len 8192 \
+      --reasoning-parser qwen3 \
+      --enable-auto-tool-choice \
+      --tool-call-parser qwen3_coder \
+      --api-key sk-123456 \
+      2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log"
+
+
+
+  qwen3-embedding:
+    image: vllm-metax:lq
+    container_name: qwen3-embedding-vllm #qwen3-embedding
+    stdin_open: true
+    tty: true
+    restart: unless-stopped
+    #network_mode: host
+    devices:
+      - "/dev/dri:/dev/dri"
+      - "/dev/mxcd:/dev/mxcd"
+      - "/dev/mem:/dev/mem"
+    group_add:
+      - "video"
+    privileged: true
+    security_opt:
+      - "apparmor=unconfined"
+      - "seccomp=unconfined"
+    shm_size: '100gb'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    ports:
+      - "9003:30000"
+    # Environment passed to the embedding server container.
+    environment:
+      - CUDA_VISIBLE_DEVICES=2
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+      - VLLM_TORCH_COMPILE=0
+      - VLLM_DISABLE_TORCH_COMPILE=1
+      # NOTE(review): Compose turns "$$" into a literal "$" and does not shell-expand
+      # environment values, so this presumably resolves to the fixed path
+      # /tmp/torch_ext_$ rather than a per-process directory - confirm the intent.
+      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$
+      - MAX_JOBS=1
+    volumes:
+      - "/usr/local/:/usr/local/"
+      - "/pde_ai:/pde_ai"
+      - "/opt/lq/models:/model:ro"
+      - "~/.cache/huggingface:/root/.cache/huggingface"
+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # 日志目录映射
+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" #脚本目录映射
+    command: >
+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B  \
+      --served-model-name Qwen3-Embedding-8B  \
+      --task embedding \
+      --host 0.0.0.0 \
+      --port 30000 \
+      --tensor-parallel-size 1 \
+      --max-num-batched-tokens 4096 \
+      --max-model-len 16384 \
+      --gpu-memory-utilization 0.45 \
+      --api-key sk-123456 \
+      2>&1 | tee /var/log/vllm/qwen3-embedding-server.log"
+
+
+  qwen3-reranker:
+    image: vllm-metax:lq
+    container_name: qwen3-reranker-vllm #qwen3-reranker
+    stdin_open: true
+    tty: true
+    restart: unless-stopped
+    #network_mode: host
+    devices:
+      - "/dev/dri:/dev/dri"
+      - "/dev/mxcd:/dev/mxcd"
+      - "/dev/mem:/dev/mem"
+    group_add:
+      - "video"
+    privileged: true
+    security_opt:
+      - "apparmor=unconfined"
+      - "seccomp=unconfined"
+    shm_size: '100gb'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    ports:
+      - "9004:30000"
+    # Environment passed to the reranker server container.
+    environment:
+      - CUDA_VISIBLE_DEVICES=3
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+      - VLLM_TORCH_COMPILE=0
+      - VLLM_DISABLE_TORCH_COMPILE=1
+      # NOTE(review): Compose turns "$$" into a literal "$" and does not shell-expand
+      # environment values, so this presumably resolves to the fixed path
+      # /tmp/torch_ext_$ rather than a per-process directory - confirm the intent.
+      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$
+      - MAX_JOBS=1
+    volumes:
+      - "/usr/local/:/usr/local/"
+      - "/pde_ai:/pde_ai"
+      - "/opt/lq/models:/model:ro"
+      - "~/.cache/huggingface:/root/.cache/huggingface"
+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # 日志目录映射
+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" #脚本目录映射
+    command: >
+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B  \
+      --served-model-name Qwen3-Reranker-8B  \
+      --task score \
+      --host 0.0.0.0 \
+      --port 30000 \
+      --tensor-parallel-size 1 \
+      --max-num-batched-tokens 4096 \
+      --max-model-len 16384 \
+      --gpu-memory-utilization 0.45 \
+      --hf_overrides '{\"architectures\": [\"Qwen3ForSequenceClassification\"],\"classifier_from_token\": [\"no\", \"yes\"],\"is_original_qwen3_reranker\": true}' \
+      --api-key sk-123456 \
+      2>&1 | tee /var/log/vllm/qwen3-reranker-server.log"

+ 50 - 0
dev/models/docker-compose.yaml.bak

@@ -0,0 +1,50 @@
+version: '3.8'
+services:
+  qwen3.6-27b:
+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
+    container_name: qwen3.6-27b-w8a8-vllm #qwen3.6-27b-w8a8  Qwen3.6-27B-W8A8
+    stdin_open: true
+    tty: true
+    restart: unless-stopped
+    #network_mode: host
+    devices:
+      - "/dev/dri:/dev/dri"
+      - "/dev/mxcd:/dev/mxcd"
+      - "/dev/mem:/dev/mem"
+    group_add:
+      - "video"
+    privileged: true
+    security_opt:
+      - "apparmor=unconfined"
+      - "seccomp=unconfined"
+    shm_size: '100gb'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    ports:
+      - "8004:30000"
+    environment:
+      - CUDA_VISIBLE_DEVICES=0,1
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+      - MACA_SMALL_PAGESIZE_ENABLE=1
+      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
+      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1
+    volumes:
+      - "/usr/local/:/usr/local/"
+      - "/pde_ai:/pde_ai"
+      - "/opt/lq/models:/model:ro"
+      - "~/.cache/huggingface:/root/.cache/huggingface"
+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # 日志目录映射
+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" #脚本目录映射
+    command: >
+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8 \
+      --served-model-name Qwen3.6-27B-W8A8 \
+      --host 0.0.0.0 \
+      --port 30000 \
+      --tensor-parallel-size 2 \
+      --max-num-batched-tokens 4096 \
+      --max-model-len 8192 \
+      --api-key sk-123456 \
+      2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log"
+

+ 29 - 0
dev/models/yaml_bak/docker-compose.yaml.nvidia

@@ -0,0 +1,29 @@
+services:
+  qwen3.6-27b:
+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
+    container_name: qwen3.6-27b-vllm
+    shm_size: '10gb'
+    ports:
+      - "8004:30000"
+    volumes:
+      # # 宿主机路径:容器内路径
+      - /opt/lq/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /opt/lq/deploy_models/logs/logs:/var/log/vllm  # 日志目录映射
+      - /opt/lq/deploy_models/bench_suite:/bench_suite #脚本目录映射
+    environment:
+      - CUDA_VISIBLE_DEVICES=1
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+    # Create the log directory, then run vLLM's OpenAI-compatible API server on container
+    # port 30000 and tee its combined stdout/stderr into /var/log/vllm.
+    # The vLLM api_server takes the model location via --model and the HTTP server log
+    # level via --uvicorn-log-level; --model-path and --log-level are SGLang launch_server
+    # flags and make vLLM exit with an "unrecognized arguments" error.
+    command: >
+      sh -c "mkdir -p /var/log/vllm &&
+      python3 -m vllm.entrypoints.openai.api_server
+      --model /model/Qwen3.6-27B
+      --served-model-name Qwen3.6-27B
+      --host 0.0.0.0
+      --port 30000
+      --tensor-parallel-size 1
+      --max-num-batched-tokens 4096
+      --max-model-len 8192
+      --api-key sk-12345
+      --uvicorn-log-level info 2>&1 | tee /var/log/vllm/qwen3.6-27b-server.log"
+    ipc: host

+ 49 - 0
dev/models/yaml_bak/docker-compose.yaml.qwen3.6-27b-vllm

@@ -0,0 +1,49 @@
+version: '3.8'
+services:
+  qwen3.6-27b:
+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
+    container_name: qwen3.6-27b-vllm
+    stdin_open: true
+    tty: true
+    restart: unless-stopped
+    # Host networking is left disabled: this service publishes "8004:30000" under `ports`,
+    # and port mappings must not be combined with network_mode: host.
+    #network_mode: host
+    devices:
+      - "/dev/dri:/dev/dri"
+      - "/dev/mxcd:/dev/mxcd"
+      - "/dev/mem:/dev/mem"
+    group_add:
+      - "video"
+    privileged: true
+    security_opt:
+      - "apparmor=unconfined"
+      - "seccomp=unconfined"
+    shm_size: '100gb'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    ports:
+      - "8004:30000"
+    environment:
+      - CUDA_VISIBLE_DEVICES=1
+      - PYTHONUNBUFFERED=1  # 确保实时输出
+    volumes:
+      - "/usr/local/:/usr/local/"
+      - "/pde_ai:/pde_ai"
+      - "/opt/lq/models:/model:ro"
+      - "~/.cache/huggingface:/root/.cache/huggingface"
+      - "/opt/lq/deploy_models/logs/logs:/var/log/vllm"  # 日志目录映射
+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" #脚本目录映射
+    # Create the log directory, then run vLLM's OpenAI-compatible API server on container
+    # port 30000 and tee its combined stdout/stderr into /var/log/vllm.
+    # The vLLM api_server takes the model location via --model and the HTTP server log
+    # level via --uvicorn-log-level; --model-path and --log-level are SGLang launch_server
+    # flags and make vLLM exit with an "unrecognized arguments" error.
+    command: >
+      sh -c "mkdir -p /var/log/vllm &&
+      python3 -m vllm.entrypoints.openai.api_server
+      --model /model/Qwen3.6-27B
+      --served-model-name Qwen3.6-27B
+      --host 0.0.0.0
+      --port 30000
+      --tensor-parallel-size 1
+      --max-num-batched-tokens 4096
+      --max-model-len 8192
+      --api-key sk-123456
+      --uvicorn-log-level info 2>&1 | tee /var/log/vllm/qwen3.6-27b-server.log"
+

+ 30 - 0
dev/models/yaml_bak/docker-compose.yaml.test_vllm-metax

@@ -0,0 +1,30 @@
+version: '3.8'
+services:
+  test_vllm-metax:
+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
+    container_name: test_vllm-metax
+    command: tail -f /dev/null
+    stdin_open: true
+    tty: true
+    restart: unless-stopped
+    network_mode: host
+    devices:
+      - "/dev/dri:/dev/dri"
+      - "/dev/mxcd:/dev/mxcd"
+      - "/dev/mem:/dev/mem"
+    group_add:
+      - "video"
+    privileged: true
+    security_opt:
+      - "apparmor=unconfined"
+      - "seccomp=unconfined"
+    shm_size: '100gb'
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    volumes:
+      - "/usr/local/:/usr/local/"
+      - "/pde_ai:/pde_ai"
+    
+

+ 69 - 0
prod/models/glm_ocr/docker-compose.yml

@@ -0,0 +1,69 @@
+services:
+  glm-ocr-vllm-docker:
+    image: vllm/vllm-openai:nightly
+    container_name: glm-ocr-vllm
+    runtime: nvidia
+    restart: unless-stopped
+    environment:
+      - PYTHONUNBUFFERED=1
+      - PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
+      - PIP_TRUSTED_HOST=pypi.tuna.tsinghua.edu.cn
+      - PYTHONPATH=/opt/pip-packages:/usr/local/lib/python3.12/dist-packages
+    ports:
+      - "25429:30000"
+    volumes:
+      - /data/app_workspace/models/GLM-OCR:/models/GLM-OCR:ro
+      - /data/app_workspace/glm-ocr/logs-docker:/var/log/vllm
+      - /data/app_workspace/glm-ocr/pip-packages:/opt/pip-packages:rw
+      - /data/app_workspace/glm-ocr/pip-cache:/root/.cache/pip:rw
+    entrypoint: ["/bin/bash", "-c"]
+    # Startup script handed to the `/bin/bash -c` entrypoint:
+    #   1. reuse the transformers build cached in /opt/pip-packages, installing it only when
+    #      it is missing or older than 5.3.0;
+    #   2. start the vLLM OpenAI-compatible server for GLM-OCR on container port 30000.
+    # Literal dollar signs meant for bash are written as "$$" so Compose does not try to
+    # interpolate them.
+    command:
+      - |
+        # Check the cache first; install only when needed.
+        echo "=== 检查 transformers 版本 ==="
+        # Compare parsed versions: a plain string comparison is lexicographic and would
+        # rank e.g. '10.0.0' below '5.3.0', forcing a reinstall on every start.
+        if python3 -c "import transformers; from packaging.version import Version; assert Version(transformers.__version__) >= Version('5.3.0'), f'需要 transformers>=5.3.0,当前版本 {transformers.__version__}'" 2>/dev/null; then
+          echo "✅ 使用已安装的 transformers $$(python3 -c 'import transformers; print(transformers.__version__)')"
+        else
+          echo "⚠️ 安装或更新 transformers>=5.3.0..."
+          pip3 install "transformers>=5.3.0" \
+            --target /opt/pip-packages \
+            --root-user-action=ignore \
+            -q 2>&1 | tail -5
+        fi
+        
+        echo "=== 启动 vLLM ==="
+        # NOTE(review): the production API key is hard-coded here (and in the healthcheck)
+        # and is committed to the repository; consider injecting it from an env file or
+        # secret and rotating this value.
+        python3 -m vllm.entrypoints.openai.api_server \
+          --model /models/GLM-OCR \
+          --served-model-name GLM-OCR \
+          --host 0.0.0.0 \
+          --port 30000 \
+          --api-key sk_prod_sXgHYxfVvZdw7O-cki6i7Cp2TbguOvbA_f4beb12a \
+          --gpu-memory-utilization 0.60 \
+          --max-model-len 4096 \
+          --max-num-seqs 96 \
+          --enable-prefix-caching \
+          --trust-remote-code \
+          --allowed-local-media-path / \
+          --dtype bfloat16
+    ulimits:
+      memlock: 67108864
+      stack: 67108864
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["6"]
+              capabilities: [gpu]
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:30000/health -H 'Authorization: Bearer sk_prod_sXgHYxfVvZdw7O-cki6i7Cp2TbguOvbA_f4beb12a' || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
+      start_period: 180s
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "500m"
+        max-file: "3"

+ 51 - 46
prod/models/sglang/docker-compose.yaml

@@ -3,7 +3,7 @@ services:
     image: lmsysorg/sglang:latest
     container_name: qwen3.5-122b-sglang
     runtime: nvidia
-    shm_size: '10gb'
+    shm_size: '200gb'
     ports:
       - "25423:30000"
     volumes:
@@ -11,6 +11,7 @@ services:
       - /data/app_workspace/models:/model:ro
       - ~/.cache/huggingface:/root/.cache/huggingface
       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
     environment:
       - CUDA_VISIBLE_DEVICES 
       - PYTHONUNBUFFERED=1  # 确保实时输出
@@ -18,10 +19,11 @@ services:
       sh -c "mkdir -p /var/log/sglang &&
       python3 -m sglang.launch_server
       --model-path /model/Qwen3.5-122B-A10B
-      --tp 2
+      --tp 4
       --host 0.0.0.0
       --port 30000
-      --api-key lq123456
+      --api-key sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
+      --mem-fraction-static 0.95
       --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
     ipc: host
     deploy:
@@ -29,154 +31,157 @@ services:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ["0","1"]  # Modify for multiple GPUs: ["0", "1"]
+              device_ids: ["0","1","2","3"]  # Modify for multiple GPUs: ["0", "1"]
               #count: all
               capabilities: [gpu]
 
-  qwen3-8b:
+
+  qwen3-embedding-8b:
     image: lmsysorg/sglang:latest
-    container_name: qwen3-8b-sglang
+    container_name: qwen3-embedding-8b-sglang
     runtime: nvidia
-    shm_size: '10gb'
+    shm_size: '100gb'
     ports:
-      - "25424:30000"
+      - "25425:30000"
     volumes:
       # # 宿主机路径:容器内路径
       - /data/app_workspace/models:/model:ro
       - ~/.cache/huggingface:/root/.cache/huggingface
       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
     environment:
       - CUDA_VISIBLE_DEVICES 
       - PYTHONUNBUFFERED=1  # 确保实时输出
     command: >
       sh -c "mkdir -p /var/log/sglang &&
       python3 -m sglang.launch_server
-      --model-path /model/Qwen3-8B
+      --model-path /model/Qwen3-Embedding-8B
+      --is-embedding
       --tp 1
       --host 0.0.0.0
       --port 30000
-      --api-key lq123456
+      --api-key sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
       --mem-fraction-static 0.45
-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
     ipc: host
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
+              device_ids: ["5"]  # Modify for multiple GPUs: ["0", "1"]
               #count: all
               capabilities: [gpu]
     healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
+      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/embeddings", "-H", "Authorization: Bearer sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f", "-H", "Content-Type: application/json", "-d", "{\"input\": \"health\"}"]  # Bearer token must equal this service's --api-key; with any other key the probe is rejected, the service never turns healthy, and qwen3-reranker-8b (depends_on: service_healthy) never starts
       interval: 10s
       timeout: 5s
       retries: 30
       start_period: 60s
 
-  qwen3-embedding-8b:
+  qwen3-reranker-8b:
     image: lmsysorg/sglang:latest
-    container_name: qwen3-embedding-8b-sglang
+    container_name: qwen3-reranker-8b-sglang
     runtime: nvidia
-    shm_size: '5gb'
+    shm_size: '100gb'
     ports:
-      - "25425:30000"
+      - "25426:30000"
     volumes:
       # # 宿主机路径:容器内路径
       - /data/app_workspace/models:/model:ro
       - ~/.cache/huggingface:/root/.cache/huggingface
       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
+      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
     environment:
       - CUDA_VISIBLE_DEVICES 
       - PYTHONUNBUFFERED=1  # 确保实时输出
     command: >
       sh -c "mkdir -p /var/log/sglang &&
       python3 -m sglang.launch_server
-      --model-path /model/Qwen3-Embedding-8B
-      --is-embedding
+      --model-path /model/Qwen3-Reranker-8B
       --tp 1
       --host 0.0.0.0
       --port 30000
-      --api-key lq123456
-      --mem-fraction-static 0.45
-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
+      --api-key sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
+      --mem-fraction-static 0.50
+      --disable-radix-cache
+      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
     ipc: host
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
+              device_ids: ["5"]  # Modify for multiple GPUs: ["0", "1"]
               #count: all
               capabilities: [gpu]
     depends_on:
-      qwen3-8b:
-        condition: service_healthy  # 等待 qwen3-8b 健康检查通过
-
+      qwen3-embedding-8b:
+        condition: service_healthy  # 等待 qwen3-embedding-8b 健康检查通过
 
-  qwen3-reranker-8b:
+  qwen3.5-35b:
     image: lmsysorg/sglang:latest
-    container_name: qwen3-reranker-8b-sglang
+    container_name: qwen3.5-35b-sglang
     runtime: nvidia
-    shm_size: '5gb'
+    shm_size: '100gb'
     ports:
-      - "25426:30000"
+      - "25427:30000"
     volumes:
       # # 宿主机路径:容器内路径
       - /data/app_workspace/models:/model:ro
       - ~/.cache/huggingface:/root/.cache/huggingface
       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
-      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
     environment:
       - CUDA_VISIBLE_DEVICES 
       - PYTHONUNBUFFERED=1  # 确保实时输出
     command: >
       sh -c "mkdir -p /var/log/sglang &&
       python3 -m sglang.launch_server
-      --model-path /model/Qwen3-Reranker-8B
+      --model-path /model/Qwen3.5-35B-A3B
       --tp 1
       --host 0.0.0.0
       --port 30000
-      --api-key lq123456
-      --mem-fraction-static 0.50
-      --disable-radix-cache
-      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
+      --api-key sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
     ipc: host
     deploy:
       resources:
         reservations:
           devices:
             - driver: nvidia
-              device_ids: ["3"]  # Modify for multiple GPUs: ["0", "1"]
+              device_ids: ["7"]  # Modify for multiple GPUs: ["0", "1"]
               #count: all
               capabilities: [gpu]
 
 
-  qwen3.5-35b:
+  qwen3.6-27b:
     image: lmsysorg/sglang:latest
-    container_name: qwen3.5-35b-sglang
+    container_name: qwen3.6-27b-sglang
     runtime: nvidia
-    shm_size: '5gb'
+    shm_size: '100gb'
     ports:
-      - "25427:30000"
+      - "25424:30000"
     volumes:
       # # 宿主机路径:容器内路径
       - /data/app_workspace/models:/model:ro
       - ~/.cache/huggingface:/root/.cache/huggingface
       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
     environment:
-      - CUDA_VISIBLE_DEVICES 
+      - CUDA_VISIBLE_DEVICES
       - PYTHONUNBUFFERED=1  # 确保实时输出
     command: >
       sh -c "mkdir -p /var/log/sglang &&
       python3 -m sglang.launch_server
-      --model-path /model/Qwen3.5-35B-A3B
+      --model-path /model/Qwen3.6-27B
       --tp 1
       --host 0.0.0.0
       --port 30000
-      --api-key lq123456
-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
+      --api-key sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3.6-27b-server.log"
     ipc: host
     deploy:
       resources:

+ 188 - 0
prod/models/sglang/docker-compose_20260518.yaml

@@ -0,0 +1,188 @@
+services:
+  # Qwen3.5-122B-A10B served by SGLang with --tp 2 on GPUs 0 and 1.
+  # Container port 30000 is published on host port 25423; server output is tee'd to
+  # /var/log/sglang/qwen3_5-122b-server.log (bind-mounted from the host logs directory).
+  # NOTE(review): the API key is embedded in the command below - consider supplying it from an env file.
+  qwen3.5-122b:
+    image: lmsysorg/sglang:latest
+    container_name: qwen3.5-122b-sglang
+    runtime: nvidia
+    shm_size: '10gb'
+    ports:
+      - "25423:30000"
+    volumes:
+      # host path:container path
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+    command: >
+      sh -c "mkdir -p /var/log/sglang &&
+      python3 -m sglang.launch_server
+      --model-path /model/Qwen3.5-122B-A10B
+      --tp 2
+      --host 0.0.0.0
+      --port 30000
+      --api-key lq123456
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0","1"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+
+  # Qwen3-8B served by SGLang (--tp 1, --mem-fraction-static 0.45) on GPU 2,
+  # the same device id that qwen3-embedding-8b reserves in this file.
+  # Container port 30000 is published on host port 25424.
+  # The healthcheck below is what qwen3-embedding-8b waits on through depends_on.
+  # NOTE(review): the API key is embedded in both the command and the healthcheck; keep the two in sync.
+  qwen3-8b:
+    image: lmsysorg/sglang:latest
+    container_name: qwen3-8b-sglang
+    runtime: nvidia
+    shm_size: '10gb'
+    ports:
+      - "25424:30000"
+    volumes:
+      # host path:container path
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+    command: >
+      sh -c "mkdir -p /var/log/sglang &&
+      python3 -m sglang.launch_server
+      --model-path /model/Qwen3-8B
+      --tp 1
+      --host 0.0.0.0
+      --port 30000
+      --api-key lq123456
+      --mem-fraction-static 0.45
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+    # Probe: authenticated GET of /v1/models inside the container; curl -f fails on HTTP errors.
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
+      interval: 10s
+      timeout: 5s
+      retries: 30
+      start_period: 60s
+
+  # Qwen3-Embedding-8B served by SGLang in embedding mode (--is-embedding, --tp 1,
+  # --mem-fraction-static 0.45) on GPU 2, the same device id that qwen3-8b reserves.
+  # Container port 30000 is published on host port 25425.
+  # Startup is delayed until qwen3-8b reports healthy (depends_on at the bottom).
+  # NOTE(review): the API key is embedded in the command below - consider supplying it from an env file.
+  qwen3-embedding-8b:
+    image: lmsysorg/sglang:latest
+    container_name: qwen3-embedding-8b-sglang
+    runtime: nvidia
+    shm_size: '5gb'
+    ports:
+      - "25425:30000"
+    volumes:
+      # host path:container path
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+    command: >
+      sh -c "mkdir -p /var/log/sglang &&
+      python3 -m sglang.launch_server
+      --model-path /model/Qwen3-Embedding-8B
+      --is-embedding
+      --tp 1
+      --host 0.0.0.0
+      --port 30000
+      --api-key lq123456
+      --mem-fraction-static 0.45
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+    depends_on:
+      qwen3-8b:
+        condition: service_healthy  # wait until qwen3-8b passes its healthcheck
+
+
+  # Qwen3-Reranker-8B served by SGLang (--tp 1, --mem-fraction-static 0.50,
+  # --disable-radix-cache) on GPU 3; container port 30000 is published on host port 25426.
+  # The chat template is read from the read-only sglang-main checkout mounted at /sglang/sglang-main.
+  # NOTE(review): the API key is embedded in the command below - consider supplying it from an env file.
+  qwen3-reranker-8b:
+    image: lmsysorg/sglang:latest
+    container_name: qwen3-reranker-8b-sglang
+    runtime: nvidia
+    shm_size: '5gb'
+    ports:
+      - "25426:30000"
+    volumes:
+      # host path:container path
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
+      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+    command: >
+      sh -c "mkdir -p /var/log/sglang &&
+      python3 -m sglang.launch_server
+      --model-path /model/Qwen3-Reranker-8B
+      --tp 1
+      --host 0.0.0.0
+      --port 30000
+      --api-key lq123456
+      --mem-fraction-static 0.50
+      --disable-radix-cache
+      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["3"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]
+
+
+  # Qwen3.5-35B-A3B served by SGLang (--tp 1) on GPU 4.
+  # Container port 30000 is published on host port 25427; server output is tee'd to
+  # /var/log/sglang/qwen3-35b-server.log.
+  # NOTE(review): the API key is embedded in the command below - consider supplying it from an env file.
+  qwen3.5-35b:
+    image: lmsysorg/sglang:latest
+    container_name: qwen3.5-35b-sglang
+    runtime: nvidia
+    shm_size: '5gb'
+    ports:
+      - "25427:30000"
+    volumes:
+      # host path:container path
+      - /data/app_workspace/models:/model:ro
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
+    environment:
+      - CUDA_VISIBLE_DEVICES 
+      - PYTHONUNBUFFERED=1  # make sure output is emitted in real time
+    command: >
+      sh -c "mkdir -p /var/log/sglang &&
+      python3 -m sglang.launch_server
+      --model-path /model/Qwen3.5-35B-A3B
+      --tp 1
+      --host 0.0.0.0
+      --port 30000
+      --api-key lq123456
+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
+    ipc: host
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["4"]  # Modify for multiple GPUs: ["0", "1"]
+              #count: all
+              capabilities: [gpu]

+ 31 - 16
prod/models/sglang/test_models.sh

@@ -9,11 +9,12 @@ RED='\033[0;31m'
 YELLOW='\033[1;33m'
 NC='\033[0m'
 
-# 模型配置(按顺序定义)
-MODEL_NAMES=("qwen3-8b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
+# 模型配置(按顺序定义) "qwen3-8b" 
+MODEL_NAMES=("qwen3.6-27b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
 
 declare -A MODEL_PORTS=(
-    ["qwen3-8b"]="25424"
+    ["qwen3-8b"]="25428"
+    ["qwen3.6-27b"]="25424"
     ["qwen3.5-35b"]="25427"
     ["qwen3.5-122b"]="25423"
     ["qwen3-embedding-8b"]="25425"
@@ -21,22 +22,32 @@ declare -A MODEL_PORTS=(
 )
 
 declare -A MODEL_PATHS=(
-    ["qwen3-8b"]="/model/Qwen3-8B"
-    ["qwen3.5-35b"]="/model/Qwen3.5-35B"
-    ["qwen3.5-122b"]="/model/Qwen3.5-122B-A10B"
-    ["qwen3-embedding-8b"]="/model/Qwen3-Embedding-8B"
-    ["qwen3-reranker-8b"]="/model/Qwen3-Reranker-8B"
+    ["qwen3-8b"]="Qwen/Qwen3-8B"
+    ["qwen3.6-27b"]="Qwen/Qwen3.6-27B"
+    ["qwen3.5-35b"]="Qwen/Qwen3.5-35B-A3B"
+    ["qwen3.5-122b"]="Qwen/Qwen3.5-122B-A10B"
+    ["qwen3-embedding-8b"]="Qwen/Qwen3-Embedding-8B"
+    ["qwen3-reranker-8b"]="Qwen/Qwen3-Reranker-8B"
 )
 
 declare -A MODEL_TYPES=(
     ["qwen3-8b"]="chat"
+    ["qwen3.6-27b"]="chat"
     ["qwen3.5-35b"]="chat"
     ["qwen3.5-122b"]="chat"
     ["qwen3-embedding-8b"]="embedding"
     ["qwen3-reranker-8b"]="rerank"
 )
 
-API_KEY="lq123456"
+declare -A API_KEYS=(
+    ["qwen3-8b"]="sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615"
+    ["qwen3.6-27b"]="sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6"
+    ["qwen3.5-35b"]="sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae"
+    ["qwen3.5-122b"]="sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f"
+    ["qwen3-embedding-8b"]="sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f"
+    ["qwen3-reranker-8b"]="sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608"
+)
+
 TIMEOUT=30
 
 echo "========================================"
@@ -52,6 +63,7 @@ test_chat_model() {
     local name=$1
     local port=$2
     local model_path=$3
+    local api_key=${API_KEYS[$name]}
     
     echo ""
     echo "----------------------------------------"
@@ -66,7 +78,7 @@ test_chat_model() {
     response=$(curl -s -w "\n%{http_code}" \
         --max-time $TIMEOUT \
         -H "Content-Type: application/json" \
-        -H "Authorization: Bearer $API_KEY" \
+        -H "Authorization: Bearer $api_key" \
         -d "{
             \"model\": \"$model_path\",
             \"messages\": [{\"role\": \"user\", \"content\": \"你好,请用一句话介绍自己\"}],
@@ -95,6 +107,7 @@ test_embedding_model() {
     local name=$1
     local port=$2
     local model_path=$3
+    local api_key=${API_KEYS[$name]}
     
     echo ""
     echo "----------------------------------------"
@@ -109,7 +122,7 @@ test_embedding_model() {
     response=$(curl -s -w "\n%{http_code}" \
         --max-time $TIMEOUT \
         -H "Content-Type: application/json" \
-        -H "Authorization: Bearer $API_KEY" \
+        -H "Authorization: Bearer $api_key" \
         -d "{
             \"model\": \"$model_path\",
             \"input\": [\"你好,这是一个测试句子\", \"Hello world\"]
@@ -130,7 +143,7 @@ test_embedding_model() {
         response=$(curl -s -w "\n%{http_code}" \
             --max-time $TIMEOUT \
             -H "Content-Type: application/json" \
-            -H "Authorization: Bearer $API_KEY" \
+            -H "Authorization: Bearer $api_key" \
             -d "{
                 \"model\": \"$model_path\",
                 \"query\": \"测试查询\",
@@ -154,6 +167,7 @@ test_rerank_model() {
     local name=$1
     local port=$2
     local model_path=$3
+    local api_key=${API_KEYS[$name]}
     
     echo ""
     echo "----------------------------------------"
@@ -168,7 +182,7 @@ test_rerank_model() {
     response=$(curl -s -w "\n%{http_code}" \
         --max-time $TIMEOUT \
         -H "Content-Type: application/json" \
-        -H "Authorization: Bearer $API_KEY" \
+        -H "Authorization: Bearer $api_key" \
         -d "{
             \"model\": \"$model_path\",
             \"query\": \"什么是机器学习\",
@@ -191,7 +205,7 @@ test_rerank_model() {
         response=$(curl -s -w "\n%{http_code}" \
             --max-time $TIMEOUT \
             -H "Content-Type: application/json" \
-            -H "Authorization: Bearer $API_KEY" \
+            -H "Authorization: Bearer $api_key" \
             -d "{
                 \"model\": \"$model_path\",
                 \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
@@ -219,11 +233,12 @@ quick_check() {
     
     for key in "${MODEL_NAMES[@]}"; do
         local port=${MODEL_PORTS[$key]}
-        
+        local api_key=${API_KEYS[$key]}
+
         local code
         code=$(curl -s -o /dev/null -w "%{http_code}" \
             --max-time 5 \
-            -H "Authorization: Bearer $API_KEY" \
+            -H "Authorization: Bearer $api_key" \
             "http://localhost:$port/v1/models" 2>/dev/null || echo "000")
         
         if [ "$code" = "200" ]; then

+ 284 - 0
prod/models/sglang/test_models_20260518.sh

@@ -0,0 +1,284 @@
+#!/bin/bash
+
+# curl-based smoke test for the SGLang model servers listed below.
+# `set -e` is deliberately not used, so one failing check does not abort the whole run.
+
+# ANSI color escapes, rendered by `echo -e` in the result lines
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+# Model configuration. MODEL_NAMES fixes the order in which models are tested;
+# the associative arrays below are all keyed by these names.
+MODEL_NAMES=("qwen3-8b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
+
+# Port on localhost that each model is queried on
+declare -A MODEL_PORTS=(
+    ["qwen3-8b"]="25424"
+    ["qwen3.5-35b"]="25427"
+    ["qwen3.5-122b"]="25423"
+    ["qwen3-embedding-8b"]="25425"
+    ["qwen3-reranker-8b"]="25426"
+)
+
+# Value sent as the "model" field of each request.
+# NOTE(review): the compose file launches /model/Qwen3.5-35B-A3B, while the entry here is
+# /model/Qwen3.5-35B - confirm whether the server cares about this field.
+declare -A MODEL_PATHS=(
+    ["qwen3-8b"]="/model/Qwen3-8B"
+    ["qwen3.5-35b"]="/model/Qwen3.5-35B"
+    ["qwen3.5-122b"]="/model/Qwen3.5-122B-A10B"
+    ["qwen3-embedding-8b"]="/model/Qwen3-Embedding-8B"
+    ["qwen3-reranker-8b"]="/model/Qwen3-Reranker-8B"
+)
+
+# Selects the test function in main: chat | embedding | rerank
+declare -A MODEL_TYPES=(
+    ["qwen3-8b"]="chat"
+    ["qwen3.5-35b"]="chat"
+    ["qwen3.5-122b"]="chat"
+    ["qwen3-embedding-8b"]="embedding"
+    ["qwen3-reranker-8b"]="rerank"
+)
+
+# Bearer token sent with every request; TIMEOUT is handed to curl as --max-time (seconds)
+API_KEY="lq123456"
+TIMEOUT=30
+
+echo "========================================"
+echo "SGLang 多模型健康检查 (curl)"
+echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
+echo "========================================"
+
+# Counters updated by the test functions and reported by main
+TOTAL=0
+SUCCESS=0
+
+# 测试对话模型
+test_chat_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (对话模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"messages\": [{\"role\": \"user\", \"content\": \"你好,请用一句话介绍自己\"}],
+            \"temperature\": 0.7,
+            \"max_tokens\": 50
+        }" \
+        "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local content=$(echo "$body" | grep -o '"content":"[^"]*"' | head -1 | cut -d'"' -f4)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "回复: ${content:0:100}..."
+        ((SUCCESS++))
+    else
+        echo -e "${RED}❌ 失败${NC} HTTP $code"
+        echo "响应: ${body:0:200}"
+    fi
+    ((TOTAL++))
+}
+
+# 测试嵌入模型
+test_embedding_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (嵌入模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"input\": [\"你好,这是一个测试句子\", \"Hello world\"]
+        }" \
+        "http://localhost:$port/v1/embeddings" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local dims=$(echo "$body" | grep -o '"embedding":\[[^]]*\]' | head -1 | grep -o ',' | wc -l)
+        dims=$((dims + 1))
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "向量维度: $dims"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Embedding 接口失败,尝试 Rerank 接口...${NC}"
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"query\": \"测试查询\",
+                \"documents\": [\"文档1\", \"文档2\"]
+            }" \
+            "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Rerank 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# 测试重排序模型
+test_rerank_model() {
+    local name=$1
+    local port=$2
+    local model_path=$3
+    
+    echo ""
+    echo "----------------------------------------"
+    echo "测试模型: $name (重排序模型)"
+    echo "端口: $port"
+    echo "----------------------------------------"
+    
+    local response
+    local body
+    local code
+    
+    response=$(curl -s -w "\n%{http_code}" \
+        --max-time $TIMEOUT \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer $API_KEY" \
+        -d "{
+            \"model\": \"$model_path\",
+            \"query\": \"什么是机器学习\",
+            \"documents\": [\"机器学习是AI的分支\", \"Python是编程语言\", \"深度学习使用神经网络\"],
+            \"top_n\": 2
+        }" \
+        "http://localhost:$port/v1/rerank" 2>/dev/null || echo -e "\n000")
+    
+    body=$(echo "$response" | head -n -1)
+    code=$(echo "$response" | tail -n 1)
+    
+    if [ "$code" = "200" ]; then
+        local top_doc=$(echo "$body" | grep -o '"text":"[^"]*"' | head -1 | cut -d'"' -f4)
+        local score=$(echo "$body" | grep -o '"score":[0-9.]*' | head -1 | cut -d':' -f2)
+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
+        echo "Top1: ${top_doc:0:50}... (得分: $score)"
+        ((SUCCESS++))
+    else
+        echo -e "${YELLOW}⚠️  Rerank 接口失败,尝试 Chat 接口...${NC}"
+        response=$(curl -s -w "\n%{http_code}" \
+            --max-time $TIMEOUT \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer $API_KEY" \
+            -d "{
+                \"model\": \"$model_path\",
+                \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
+                \"max_tokens\": 20
+            }" \
+            "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
+        
+        code=$(echo "$response" | tail -n 1)
+        if [ "$code" = "200" ]; then
+            echo -e "${GREEN}✅ 成功${NC} (Chat 接口可用)"
+            ((SUCCESS++))
+        else
+            echo -e "${RED}❌ 失败${NC} HTTP $code"
+        fi
+    fi
+    ((TOTAL++))
+}
+
+# Quick liveness sweep: GETs /v1/models on the port of every name in MODEL_NAMES.
+# Globals: reads MODEL_NAMES, MODEL_PORTS, API_KEY, GREEN, RED, NC;
+#          bumps SUCCESS for each HTTP 200 and TOTAL for every model checked
+quick_check() {
+    local key port code
+
+    echo ""
+    echo "========================================"
+    echo "快速检查模式"
+    echo "========================================"
+
+    for key in "${MODEL_NAMES[@]}"; do
+        port=${MODEL_PORTS[$key]}
+
+        # curl's -w output already reads "000" when no HTTP response was received, so the
+        # former `|| echo "000"` fallback produced "HTTP 000000" in the report.
+        # Default only when curl printed nothing at all (for example, curl is missing).
+        code=$(curl -s -o /dev/null -w "%{http_code}" \
+            --max-time 5 \
+            -H "Authorization: Bearer $API_KEY" \
+            "http://localhost:$port/v1/models" 2>/dev/null)
+        code=${code:-000}
+
+        if [[ "$code" == "200" ]]; then
+            echo -e "${GREEN}✅${NC} $key (端口 $port)"
+            SUCCESS=$((SUCCESS + 1))
+        else
+            echo -e "${RED}❌${NC} $key (端口 $port) HTTP $code"
+        fi
+        TOTAL=$((TOTAL + 1))
+    done
+}
+
+# 主函数
+main() {
+    if [ "$1" = "--quick" ]; then
+        quick_check
+    elif [ "$1" = "--model" ] && [ -n "$2" ]; then
+        local key=$2
+        local port=${MODEL_PORTS[$key]}
+        local path=${MODEL_PATHS[$key]}
+        local mtype=${MODEL_TYPES[$key]}
+        
+        case $mtype in
+            chat) test_chat_model "$key" "$port" "$path" ;;
+            embedding) test_embedding_model "$key" "$port" "$path" ;;
+            rerank) test_rerank_model "$key" "$port" "$path" ;;
+        esac
+    else
+        # 按顺序测试所有模型
+        for key in "${MODEL_NAMES[@]}"; do
+            local port=${MODEL_PORTS[$key]}
+            local path=${MODEL_PATHS[$key]}
+            local mtype=${MODEL_TYPES[$key]}
+            
+            case $mtype in
+                chat) test_chat_model "$key" "$port" "$path" ;;
+                embedding) test_embedding_model "$key" "$port" "$path" ;;
+                rerank) test_rerank_model "$key" "$port" "$path" ;;
+            esac
+        done
+    fi
+    
+    echo ""
+    echo "========================================"
+    echo "测试结果摘要"
+    echo "========================================"
+    echo "总计: $SUCCESS / $TOTAL 个模型正常"
+    
+    if [ $SUCCESS -eq $TOTAL ]; then
+        echo -e "${GREEN}所有模型运行正常!${NC}"
+        exit 0
+    else
+        echo -e "${RED}部分模型异常,请检查日志${NC}"
+        exit 1
+    fi
+}
+
+main "$@"

+ 17 - 0
prod/models/sglang/需求文档.md

@@ -0,0 +1,17 @@
+
+
+
+
+
+
+### 根据最新的信息修改脚本 test_models.sh
+    - 增加模型测试 qwen3.6-27b
+      - 模型名称: Qwen3.6-27B
+      - 端口号:   25424
+    - 修改不同模型不同的API KEY
+      - Qwen/Qwen3.5-122B-A10B  sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
+      - Qwen/Qwen3-Embedding-8B sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
+      - Qwen/Qwen3-Reranker-8B  sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
+      - Qwen/Qwen3.5-35B-A3B    sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
+      - Qwen/Qwen3.6-27B        sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
+      - Qwen/Qwen3-8B           sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615