vor 1 Woche · dbf13b5d37
--- a/dev/minerU-dev/Dockerfile
+++ b/dev/minerU-dev/Dockerfile
@@ -0,0 +1,25 @@
 
				+# Use the DaoCloud-mirrored vLLM image (for the China region); supports GPUs with Volta, Turing, Ampere, Ada Lovelace, Hopper, and Blackwell architectures (7.0 <= Compute Capability <= 12.0)
			
 
				+# Compute Capability version query (https://developer.nvidia.com/cuda-gpus)
			
 
				+# Supports both x86_64 and ARM (AArch64) architectures
			
 
				+FROM docker.m.daocloud.io/vllm/vllm-openai:v0.11.2
			
 
				+
			
 
				+# Install libgl for OpenCV support and Noto fonts for Chinese characters; recommended packages are skipped and apt lists removed in the same layer to keep it small
			
 
				+RUN apt-get update && \
			
 
				+    apt-get install -y --no-install-recommends \
			
 
				+        fontconfig \
			
 
				+        fonts-noto-cjk \
			
 
				+        fonts-noto-core \
			
 
				+        libgl1 && \
			
 
				+    fc-cache -fv && \
			
 
				+    apt-get clean && \
			
 
				+    rm -rf /var/lib/apt/lists/*
			
 
				+
			
 
				+# Install the latest mineru (>= 3.0.0) from the Aliyun PyPI mirror; --no-cache-dir keeps pip's download cache out of the layer
			
 
				+RUN python3 -m pip install -U --no-cache-dir 'mineru[core]>=3.0.0' \
			
 
				+    -i https://mirrors.aliyun.com/pypi/simple --break-system-packages
			
 
				+
			
 
				+# Download all MinerU models from ModelScope at build time (original note: this also updates the configuration file)
			
 
				+RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
			
 
				+
			
 
				+# Entry point: export MINERU_MODEL_SOURCE=local, then exec the command passed to the container ("$@") so it replaces the wrapper shell
			
 
				+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
			
--- a/dev/minerU-dev/compose.yaml
+++ b/dev/minerU-dev/compose.yaml
@@ -0,0 +1,155 @@
 
				+services:
			
 
				+  mineru-openai-server:
			
 
				+    image: mineru:latest
			
 
				+    container_name: mineru-openai-server
			
 
				+    restart: always
			
 
				+    profiles: ["openai-server"]
			
 
				+    ports:
			
 
				+      - 30000:30000
			
 
				+    environment:
			
 
				+      MINERU_MODEL_SOURCE: local
			
 
				+    entrypoint: mineru-openai-server
			
 
				+    command:
			
 
				+      --host 0.0.0.0
			
 
				+      --port 30000
			
 
				+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
			
 
				+    ulimits:
			
 
				+      memlock: -1
			
 
				+      stack: 67108864
			
 
				+    ipc: host
			
 
				+    healthcheck:
			
 
				+      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
			
 
				+    deploy:
			
 
				+      resources:
			
 
				+        reservations:
			
 
				+          devices:
			
 
				+            - driver: nvidia
			
 
				+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              capabilities: [gpu]
			
 
				+
			
 
				+  mineru-api:
			
 
				+    image: mineru:latest
			
 
				+    container_name: mineru-api
			
 
				+    restart: always
			
 
				+    profiles: ["api"]
			
 
				+    ports:
			
 
				+      - 23428:8000
			
 
				+    environment:
			
 
				+      #MINERU_MODEL_SOURCE: local
			
 
				+       # 模型源：与 --source modelscope 保持一致
			
 
				+       - MINERU_MODEL_SOURCE=modelscope
			
 
				+       # 模型缓存路径（容器内）
			
 
				+       - MODELSCOPE_CACHE=/root/.cache/modelscope
			
 
				+       - MINERU_CACHE_DIR=/root/.cache/mineru
			
 
				+       # Transformers/HF 缓存，避免路径冲突
			
 
				+       - TRANSFORMERS_CACHE=/root/.cache/huggingface/transformers
			
 
				+       - HF_HOME=/root/.cache/huggingface
			
 
				+       # 日志与语言
			
 
				+       - LOG_DIR=/app/logs
			
 
				+       - LANG=zh_CN.UTF-8
			
 
				+       - PYTHONUNBUFFERED=1
			
 
				+       - DEVICE=cuda
			
 
				+       # API key: passed through from the environment that runs `docker compose`; never commit the key itself (original note: whether it is used depends on the MinerU version)
			
 
				+       - MINERU_API_KEY
			
 
				+
			
 
				+    entrypoint: mineru-api
			
 
				+    command:
			
 
				+      --host 0.0.0.0
			
 
				+      --port 8000
			
 
				+      # --allow-public-http-client  # Disabled by default; when binding to 0.0.0.0 or ::, this re-enables *-http-client backends and server_url. Enable only if you accept the SSRF risk.
			
 
				+      # parameters for vllm-engine
			
 
				+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
			
 
				+    volumes:
			
 
				+      # 1. 模型缓存持久化 (核心：避免重复下载)
			
 
				+      - /home/ubuntu/.cache/modelscope:/root/.cache/modelscope:rw
			
 
				+      # 2. MinerU 缓存持久化
			
 
				+      - /home/ubuntu/.cache/mineru:/root/.cache/mineru:rw
			
 
				+      - /home/ubuntu/.cache/huggingface:/root/.cache/huggingface:rw  # 新增：避免 transformers 缓存冲突
			
 
				+      # 3. Log directory mapping
			
 
				+      - /home/ubuntu/lq_workspace/minerU/logs:/app/logs:rw
			
 
				+      # 4. 输入文件目录 (可选，如果 API 支持文件上传处理)
			
 
				+      - /home/ubuntu/lq_workspace/minerU/input:/app/input:ro
			
 
				+      # 5. 输出结果目录 (可选)
			
 
				+      - /home/ubuntu/lq_workspace/minerU/output:/app/output:rw
			
 
				+      # 6. 配置文件目录 (可选，如有自定义配置)
			
 
				+      - /home/ubuntu/lq_workspace/minerU/config:/app/config:ro
			
 
				+    
			
 
				+    ulimits:
			
 
				+      memlock: -1
			
 
				+      stack: 67108864
			
 
				+    ipc: host
			
 
				+    healthcheck:
			
 
				+      test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
			
 
				+    deploy:
			
 
				+      resources:
			
 
				+        reservations:
			
 
				+          devices:
			
 
				+            - driver: nvidia
			
 
				+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              capabilities: [gpu]
			
 
				+
			
 
				+  mineru-router:
			
 
				+    image: mineru:latest
			
 
				+    container_name: mineru-router
			
 
				+    restart: always
			
 
				+    profiles: ["router"]
			
 
				+    ports:
			
 
				+      - 8002:8002
			
 
				+    environment:
			
 
				+      MINERU_MODEL_SOURCE: local
			
 
				+      # API key: a key with no value is resolved from the environment that runs `docker compose`; never commit the key itself (original note: whether it is used depends on the MinerU version)
			
 
				+      MINERU_API_KEY:
			
 
				+    entrypoint: mineru-router
			
 
				+    command:
			
 
				+      --host 0.0.0.0
			
 
				+      --port 8002
			
 
				+      --local-gpus auto
			
 
				+      # --allow-public-http-client  # Disabled by default; when binding to 0.0.0.0 or ::, this re-enables *-http-client backends and server_url. Enable only if you accept the SSRF risk.
			
 
				+      # To aggregate existing mineru-api services instead of starting local workers:
			
 
				+      # --local-gpus none
			
 
				+      # --upstream-url http://mineru-api:8000
			
 
				+      # --upstream-url http://mineru-api-2:8000
			
 
				+      # parameters for vllm-engine
			
 
				+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
			
 
				+    ulimits:
			
 
				+      memlock: -1
			
 
				+      stack: 67108864
			
 
				+    ipc: host
			
 
				+    healthcheck:
			
 
				+      test: ["CMD-SHELL", "curl -f http://localhost:8002/health || exit 1"]
			
 
				+    deploy:
			
 
				+      resources:
			
 
				+        reservations:
			
 
				+          devices:
			
 
				+            - driver: nvidia
			
 
				+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              capabilities: [gpu]
			
 
				+
			
 
				+  mineru-gradio:
			
 
				+    image: mineru:latest
			
 
				+    container_name: mineru-gradio
			
 
				+    restart: always
			
 
				+    profiles: ["gradio"]
			
 
				+    ports:
			
 
				+      - 7860:7860
			
 
				+    environment:
			
 
				+      MINERU_MODEL_SOURCE: local
			
 
				+    entrypoint: mineru-gradio
			
 
				+    command:
			
 
				+      --server-name 0.0.0.0
			
 
				+      --server-port 7860
			
 
				+      # --enable-api false  # If you want to disable the API, set this to false
			
 
				+      # --max-convert-pages 20  # If you want to limit the number of pages for conversion, set this to a specific number
			
 
				+      # parameters for vllm-engine
			
 
				+      # --gpu-memory-utilization 0.5  # If encountering VRAM shortage, reduce the KV cache size by this parameter; if VRAM issues persist, try lowering it further to `0.4` or below.
			
 
				+    ulimits:
			
 
				+      memlock: -1
			
 
				+      stack: 67108864
			
 
				+    ipc: host
			
 
				+    deploy:
			
 
				+      resources:
			
 
				+        reservations:
			
 
				+          devices:
			
 
				+            - driver: nvidia
			
 
				+              device_ids: ["0"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              capabilities: [gpu]
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/benchmark.log
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/benchmark.log
@@ -0,0 +1,389 @@
 
				+2026-05-15 08:41:06 - evalscope - INFO: Starting benchmark with args: 
			
 
				+2026-05-15 08:41:06 - evalscope - INFO: {
			
 
				+    "model": "Qwen3.6-27B-W8A8",
			
 
				+    "model_id": "Qwen3.6-27B-W8A8",
			
 
				+    "attn_implementation": null,
			
 
				+    "api": "openai",
			
 
				+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
			
 
				+    "port": 8877,
			
 
				+    "url": "http://127.0.0.1:8004/v1/chat/completions",
			
 
				+    "headers": {
			
 
				+        "Authorization": "Bearer sk-123456"
			
 
				+    },
			
 
				+    "connect_timeout": null,
			
 
				+    "read_timeout": null,
			
 
				+    "total_timeout": 21600,
			
 
				+    "api_key": "sk-123456",
			
 
				+    "no_test_connection": false,
			
 
				+    "number": 1,
			
 
				+    "parallel": 1,
			
 
				+    "rate": -1,
			
 
				+    "sleep_interval": 5,
			
 
				+    "sla_auto_tune": false,
			
 
				+    "sla_variable": "parallel",
			
 
				+    "sla_params": null,
			
 
				+    "sla_num_runs": 3,
			
 
				+    "sla_upper_bound": 65536,
			
 
				+    "sla_lower_bound": 1,
			
 
				+    "db_commit_interval": 1000,
			
 
				+    "queue_size_multiplier": 5,
			
 
				+    "in_flight_task_multiplier": 2,
			
 
				+    "log_every_n_query": 10,
			
 
				+    "debug": false,
			
 
				+    "visualizer": null,
			
 
				+    "wandb_api_key": null,
			
 
				+    "swanlab_api_key": null,
			
 
				+    "name": null,
			
 
				+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
			
 
				+    "no_timestamp": false,
			
 
				+    "max_prompt_length": 2048,
			
 
				+    "min_prompt_length": 2048,
			
 
				+    "prefix_length": 0,
			
 
				+    "prompt": null,
			
 
				+    "query_template": null,
			
 
				+    "apply_chat_template": true,
			
 
				+    "image_width": 224,
			
 
				+    "image_height": 224,
			
 
				+    "image_format": "RGB",
			
 
				+    "image_num": 1,
			
 
				+    "image_patch_size": 28,
			
 
				+    "dataset": "random",
			
 
				+    "dataset_path": null,
			
 
				+    "frequency_penalty": null,
			
 
				+    "repetition_penalty": null,
			
 
				+    "logprobs": null,
			
 
				+    "max_tokens": 128,
			
 
				+    "min_tokens": 128,
			
 
				+    "n_choices": null,
			
 
				+    "seed": null,
			
 
				+    "stop": null,
			
 
				+    "stop_token_ids": null,
			
 
				+    "stream": true,
			
 
				+    "temperature": 0.0,
			
 
				+    "top_p": null,
			
 
				+    "top_k": null,
			
 
				+    "extra_args": {}
			
 
				+}
			
 
				+2026-05-15 08:41:22 - evalscope - INFO: Test connection successful.
			
 
				+2026-05-15 08:41:25 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
			
 
				+2026-05-15 08:41:25 - evalscope - INFO: Sampling input lengths from [2046, 2047)
			
 
				+2026-05-15 08:41:26 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
			
 
				+2026-05-15 08:41:33 - evalscope - INFO: Processing 100%| 1/1 [Elapsed: 00:07 < Remaining: 00:00,  7.37s/it]
			
 
				+2026-05-15 08:41:33 - evalscope - INFO: 
			
 
				+Benchmarking summary:
			
 
				++-----------------------------------+-----------+
			
 
				+| Key                               |     Value |
			
 
				++===================================+===========+
			
 
				+| Time taken for tests (s)          |    7.3689 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Number of concurrency             |    1      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Request rate (req/s)              |   -1      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Total requests                    |    1      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Succeed requests                  |    1      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Failed requests                   |    0      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Output token throughput (tok/s)   |   17.3704 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Total token throughput (tok/s)    |  296.383  |
			
 
				++-----------------------------------+-----------+
			
 
				+| Request throughput (req/s)        |    0.1357 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average latency (s)               |    7.3689 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average time to first token (s)   |    0.8157 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average time per output token (s) |    0.0516 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average inter-token latency (s)   |    0.0512 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average input tokens per request  | 2056      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average output tokens per request |  128      |
			
 
				++-----------------------------------+-----------+
			
 
				+2026-05-15 08:41:33 - evalscope - INFO: 
			
 
				+Percentile results:
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+|     10%     |  0.8157  | 0.0514  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     25%     |  0.8157  | 0.0515  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     50%     |  0.8157  | 0.0516  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     66%     |  0.8157  | 0.0517  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     75%     |  0.8157  | 0.0518  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     80%     |  0.8157  | 0.0518  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     90%     |  0.8157  | 0.0519  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     95%     |  0.8157  |  0.052  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     98%     |  0.8157  | 0.0523  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				+|     99%     |  0.8157  | 0.0523  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+2026-05-15 08:41:33 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1
			
 
				+2026-05-15 08:41:33 - evalscope - INFO: Sleeping for 5 seconds before the next run...
			
 
				+2026-05-15 08:41:38 - evalscope - INFO: Starting benchmark with args: 
			
 
				+2026-05-15 08:41:38 - evalscope - INFO: {
			
 
				+    "model": "Qwen3.6-27B-W8A8",
			
 
				+    "model_id": "Qwen3.6-27B-W8A8",
			
 
				+    "attn_implementation": null,
			
 
				+    "api": "openai",
			
 
				+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
			
 
				+    "port": 8877,
			
 
				+    "url": "http://127.0.0.1:8004/v1/chat/completions",
			
 
				+    "headers": {
			
 
				+        "Authorization": "Bearer sk-123456"
			
 
				+    },
			
 
				+    "connect_timeout": null,
			
 
				+    "read_timeout": null,
			
 
				+    "total_timeout": 21600,
			
 
				+    "api_key": "sk-123456",
			
 
				+    "no_test_connection": false,
			
 
				+    "number": 5,
			
 
				+    "parallel": 5,
			
 
				+    "rate": -1,
			
 
				+    "sleep_interval": 5,
			
 
				+    "sla_auto_tune": false,
			
 
				+    "sla_variable": "parallel",
			
 
				+    "sla_params": null,
			
 
				+    "sla_num_runs": 3,
			
 
				+    "sla_upper_bound": 65536,
			
 
				+    "sla_lower_bound": 1,
			
 
				+    "db_commit_interval": 1000,
			
 
				+    "queue_size_multiplier": 5,
			
 
				+    "in_flight_task_multiplier": 2,
			
 
				+    "log_every_n_query": 10,
			
 
				+    "debug": false,
			
 
				+    "visualizer": null,
			
 
				+    "wandb_api_key": null,
			
 
				+    "swanlab_api_key": null,
			
 
				+    "name": null,
			
 
				+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
			
 
				+    "no_timestamp": false,
			
 
				+    "max_prompt_length": 2048,
			
 
				+    "min_prompt_length": 2048,
			
 
				+    "prefix_length": 0,
			
 
				+    "prompt": null,
			
 
				+    "query_template": null,
			
 
				+    "apply_chat_template": true,
			
 
				+    "image_width": 224,
			
 
				+    "image_height": 224,
			
 
				+    "image_format": "RGB",
			
 
				+    "image_num": 1,
			
 
				+    "image_patch_size": 28,
			
 
				+    "dataset": "random",
			
 
				+    "dataset_path": null,
			
 
				+    "frequency_penalty": null,
			
 
				+    "repetition_penalty": null,
			
 
				+    "logprobs": null,
			
 
				+    "max_tokens": 128,
			
 
				+    "min_tokens": 128,
			
 
				+    "n_choices": null,
			
 
				+    "seed": null,
			
 
				+    "stop": null,
			
 
				+    "stop_token_ids": null,
			
 
				+    "stream": true,
			
 
				+    "temperature": 0.0,
			
 
				+    "top_p": null,
			
 
				+    "top_k": null,
			
 
				+    "extra_args": {}
			
 
				+}
			
 
				+2026-05-15 08:41:48 - evalscope - INFO: Test connection successful.
			
 
				+2026-05-15 08:41:51 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
			
 
				+2026-05-15 08:41:51 - evalscope - INFO: Sampling input lengths from [2046, 2047)
			
 
				+2026-05-15 08:41:51 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
			
 
				+2026-05-15 08:41:58 - evalscope - INFO: Processing 100%| 5/5 [Elapsed: 00:06 < Remaining: 00:00,  1.01it/s]
			
 
				+2026-05-15 08:41:58 - evalscope - INFO: 
			
 
				+Benchmarking summary:
			
 
				++-----------------------------------+-----------+
			
 
				+| Key                               |     Value |
			
 
				++===================================+===========+
			
 
				+| Time taken for tests (s)          |    6.6303 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Number of concurrency             |    5      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Request rate (req/s)              |   -1      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Total requests                    |    5      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Succeed requests                  |    5      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Failed requests                   |    0      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Output token throughput (tok/s)   |   96.5268 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Total token throughput (tok/s)    | 1647.29   |
			
 
				++-----------------------------------+-----------+
			
 
				+| Request throughput (req/s)        |    0.7541 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average latency (s)               |    6.5697 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average time to first token (s)   |    2.1216 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average time per output token (s) |    0.035  |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average inter-token latency (s)   |    0.0348 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average input tokens per request  | 2056.4    |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average output tokens per request |  128      |
			
 
				++-----------------------------------+-----------+
			
 
				+2026-05-15 08:41:58 - evalscope - INFO: 
			
 
				+Percentile results:
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+|     10%     |  0.6655  | 0.0295  |  0.0299  |   6.5201    |     2056     |      128      |    19.3128     |   329.6753    |
			
 
				+|     25%     |  1.6872  | 0.0296  |  0.0304  |   6.5483    |     2056     |      128      |    19.4617     |   332.0644    |
			
 
				+|     50%     |  2.7127  | 0.0297  |  0.0304  |   6.5751    |     2056     |      128      |    19.4674     |   332.1626    |
			
 
				+|     66%     |  2.7127  | 0.0298  |  0.0383  |    6.577    |     2057     |      128      |    19.5469     |   333.6721    |
			
 
				+|     75%     |  2.7127  | 0.0299  |  0.0383  |    6.577    |     2057     |      128      |    19.5469     |   333.6721    |
			
 
				+|     80%     |  2.8297  | 0.0299  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
			
 
				+|     90%     |  2.8297  | 0.0301  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
			
 
				+|     95%     |  2.8297  | 0.0304  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
			
 
				+|     98%     |  2.8297  |  0.031  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
			
 
				+|     99%     |  2.8297  | 0.1171  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+2026-05-15 08:41:58 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5
			
 
				+2026-05-15 08:41:58 - evalscope - INFO: Sleeping for 5 seconds before the next run...
			
 
				+2026-05-15 08:42:03 - evalscope - INFO: Starting benchmark with args: 
			
 
				+2026-05-15 08:42:03 - evalscope - INFO: {
			
 
				+    "model": "Qwen3.6-27B-W8A8",
			
 
				+    "model_id": "Qwen3.6-27B-W8A8",
			
 
				+    "attn_implementation": null,
			
 
				+    "api": "openai",
			
 
				+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
			
 
				+    "port": 8877,
			
 
				+    "url": "http://127.0.0.1:8004/v1/chat/completions",
			
 
				+    "headers": {
			
 
				+        "Authorization": "Bearer sk-123456"
			
 
				+    },
			
 
				+    "connect_timeout": null,
			
 
				+    "read_timeout": null,
			
 
				+    "total_timeout": 21600,
			
 
				+    "api_key": "sk-123456",
			
 
				+    "no_test_connection": false,
			
 
				+    "number": 10,
			
 
				+    "parallel": 10,
			
 
				+    "rate": -1,
			
 
				+    "sleep_interval": 5,
			
 
				+    "sla_auto_tune": false,
			
 
				+    "sla_variable": "parallel",
			
 
				+    "sla_params": null,
			
 
				+    "sla_num_runs": 3,
			
 
				+    "sla_upper_bound": 65536,
			
 
				+    "sla_lower_bound": 1,
			
 
				+    "db_commit_interval": 1000,
			
 
				+    "queue_size_multiplier": 5,
			
 
				+    "in_flight_task_multiplier": 2,
			
 
				+    "log_every_n_query": 10,
			
 
				+    "debug": false,
			
 
				+    "visualizer": null,
			
 
				+    "wandb_api_key": null,
			
 
				+    "swanlab_api_key": null,
			
 
				+    "name": null,
			
 
				+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
			
 
				+    "no_timestamp": false,
			
 
				+    "max_prompt_length": 2048,
			
 
				+    "min_prompt_length": 2048,
			
 
				+    "prefix_length": 0,
			
 
				+    "prompt": null,
			
 
				+    "query_template": null,
			
 
				+    "apply_chat_template": true,
			
 
				+    "image_width": 224,
			
 
				+    "image_height": 224,
			
 
				+    "image_format": "RGB",
			
 
				+    "image_num": 1,
			
 
				+    "image_patch_size": 28,
			
 
				+    "dataset": "random",
			
 
				+    "dataset_path": null,
			
 
				+    "frequency_penalty": null,
			
 
				+    "repetition_penalty": null,
			
 
				+    "logprobs": null,
			
 
				+    "max_tokens": 128,
			
 
				+    "min_tokens": 128,
			
 
				+    "n_choices": null,
			
 
				+    "seed": null,
			
 
				+    "stop": null,
			
 
				+    "stop_token_ids": null,
			
 
				+    "stream": true,
			
 
				+    "temperature": 0.0,
			
 
				+    "top_p": null,
			
 
				+    "top_k": null,
			
 
				+    "extra_args": {}
			
 
				+}
			
 
				+2026-05-15 08:42:12 - evalscope - INFO: Test connection successful.
			
 
				+2026-05-15 08:42:16 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
			
 
				+2026-05-15 08:42:16 - evalscope - INFO: Sampling input lengths from [2046, 2047)
			
 
				+2026-05-15 08:42:16 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
			
 
				+2026-05-15 08:42:26 - evalscope - INFO: {
			
 
				+  "Time taken for tests (s)": 9.6293,
			
 
				+  "Number of concurrency": 10,
			
 
				+  "Request rate (req/s)": -1,
			
 
				+  "Total requests": 10,
			
 
				+  "Succeed requests": 10,
			
 
				+  "Failed requests": 0,
			
 
				+  "Output token throughput (tok/s)": 132.928,
			
 
				+  "Total token throughput (tok/s)": 2268.4997,
			
 
				+  "Request throughput (req/s)": 1.0385,
			
 
				+  "Average latency (s)": 9.567,
			
 
				+  "Average time to first token (s)": 3.6071,
			
 
				+  "Average time per output token (s)": 0.0469,
			
 
				+  "Average inter-token latency (s)": 0.0466,
			
 
				+  "Average input tokens per request": 2056.4,
			
 
				+  "Average output tokens per request": 128.0
			
 
				+}
			
 
				+2026-05-15 08:42:26 - evalscope - INFO: Processing 100%| 10/10 [Elapsed: 00:09 < Remaining: 00:00,  1.01s/it]
			
 
				+2026-05-15 08:42:26 - evalscope - INFO: 
			
 
				+Benchmarking summary:
			
 
				++-----------------------------------+-----------+
			
 
				+| Key                               |     Value |
			
 
				++===================================+===========+
			
 
				+| Time taken for tests (s)          |    9.6293 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Number of concurrency             |   10      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Request rate (req/s)              |   -1      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Total requests                    |   10      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Succeed requests                  |   10      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Failed requests                   |    0      |
			
 
				++-----------------------------------+-----------+
			
 
				+| Output token throughput (tok/s)   |  132.928  |
			
 
				++-----------------------------------+-----------+
			
 
				+| Total token throughput (tok/s)    | 2268.5    |
			
 
				++-----------------------------------+-----------+
			
 
				+| Request throughput (req/s)        |    1.0385 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average latency (s)               |    9.567  |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average time to first token (s)   |    3.6071 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average time per output token (s) |    0.0469 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average inter-token latency (s)   |    0.0466 |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average input tokens per request  | 2056.4    |
			
 
				++-----------------------------------+-----------+
			
 
				+| Average output tokens per request |  128      |
			
 
				++-----------------------------------+-----------+
			
 
				+2026-05-15 08:42:26 - evalscope - INFO: 
			
 
				+Percentile results:
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+|     10%     |  1.7449  | 0.0332  |  0.0332  |   9.5131    |     2056     |      128      |    13.2998     |   226.9284    |
			
 
				+|     25%     |  2.7691  | 0.0333  |  0.0376  |   9.5425    |     2056     |      128      |    13.3343     |   227.5162    |
			
 
				+|     50%     |  3.7961  | 0.0333  |  0.0455  |   9.5729    |     2056     |      128      |    13.3741     |   228.2483    |
			
 
				+|     66%     |  4.8242  | 0.0334  |  0.0533  |   9.5984    |     2056     |      128      |     13.412     |   228.8415    |
			
 
				+|     75%     |  4.825   | 0.0334  |  0.0533  |   9.5993    |     2057     |      128      |    13.4137     |   229.0807    |
			
 
				+|     80%     |  5.4098  | 0.0335  |  0.0612  |   9.6242    |     2057     |      128      |    13.4552     |   229.5791    |
			
 
				+|     90%     |   5.41   | 0.0336  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
			
 
				+|     95%     |   5.41   | 0.0337  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
			
 
				+|     98%     |   5.41   | 0.0367  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
			
 
				+|     99%     |   5.41   | 1.0169  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
			
 
				++-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
			
 
				+2026-05-15 08:42:26 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10
			
 
				+2026-05-15 08:42:26 - evalscope - INFO: Performance summary saved to: outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_args.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_args.json
@@ -0,0 +1,65 @@
 
				+{
			
 
				+    "model": "Qwen3.6-27B-W8A8",
			
 
				+    "model_id": "Qwen3.6-27B-W8A8",
			
 
				+    "attn_implementation": null,
			
 
				+    "api": "openai",
			
 
				+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
			
 
				+    "port": 8877,
			
 
				+    "url": "http://127.0.0.1:8004/v1/chat/completions",
			
 
				+    "headers": {
			
 
				+        "Authorization": "Bearer sk-123456"
			
 
				+    },
			
 
				+    "connect_timeout": null,
			
 
				+    "read_timeout": null,
			
 
				+    "total_timeout": 21600,
			
 
				+    "api_key": "sk-123456",
			
 
				+    "no_test_connection": false,
			
 
				+    "number": 10,
			
 
				+    "parallel": 10,
			
 
				+    "rate": -1,
			
 
				+    "sleep_interval": 5,
			
 
				+    "sla_auto_tune": false,
			
 
				+    "sla_variable": "parallel",
			
 
				+    "sla_params": null,
			
 
				+    "sla_num_runs": 3,
			
 
				+    "sla_upper_bound": 65536,
			
 
				+    "sla_lower_bound": 1,
			
 
				+    "db_commit_interval": 1000,
			
 
				+    "queue_size_multiplier": 5,
			
 
				+    "in_flight_task_multiplier": 2,
			
 
				+    "log_every_n_query": 10,
			
 
				+    "debug": false,
			
 
				+    "visualizer": null,
			
 
				+    "wandb_api_key": null,
			
 
				+    "swanlab_api_key": null,
			
 
				+    "name": null,
			
 
				+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10",
			
 
				+    "no_timestamp": false,
			
 
				+    "max_prompt_length": 2048,
			
 
				+    "min_prompt_length": 2048,
			
 
				+    "prefix_length": 0,
			
 
				+    "prompt": null,
			
 
				+    "query_template": null,
			
 
				+    "apply_chat_template": true,
			
 
				+    "image_width": 224,
			
 
				+    "image_height": 224,
			
 
				+    "image_format": "RGB",
			
 
				+    "image_num": 1,
			
 
				+    "image_patch_size": 28,
			
 
				+    "dataset": "random",
			
 
				+    "dataset_path": null,
			
 
				+    "frequency_penalty": null,
			
 
				+    "repetition_penalty": null,
			
 
				+    "logprobs": null,
			
 
				+    "max_tokens": 128,
			
 
				+    "min_tokens": 128,
			
 
				+    "n_choices": null,
			
 
				+    "seed": null,
			
 
				+    "stop": null,
			
 
				+    "stop_token_ids": null,
			
 
				+    "stream": true,
			
 
				+    "temperature": 0.0,
			
 
				+    "top_p": null,
			
 
				+    "top_k": null,
			
 
				+    "extra_args": {}
			
 
				+}
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_percentile.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_percentile.json
@@ -0,0 +1,112 @@
 
				+[
			
 
				+    {
			
 
				+        "Percentiles": "10%",
			
 
				+        "TTFT (s)": 1.7449,
			
 
				+        "ITL (s)": 0.0332,
			
 
				+        "TPOT (s)": 0.0332,
			
 
				+        "Latency (s)": 9.5131,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.2998,
			
 
				+        "Total (tok/s)": 226.9284
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "25%",
			
 
				+        "TTFT (s)": 2.7691,
			
 
				+        "ITL (s)": 0.0333,
			
 
				+        "TPOT (s)": 0.0376,
			
 
				+        "Latency (s)": 9.5425,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.3343,
			
 
				+        "Total (tok/s)": 227.5162
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "50%",
			
 
				+        "TTFT (s)": 3.7961,
			
 
				+        "ITL (s)": 0.0333,
			
 
				+        "TPOT (s)": 0.0455,
			
 
				+        "Latency (s)": 9.5729,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.3741,
			
 
				+        "Total (tok/s)": 228.2483
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "66%",
			
 
				+        "TTFT (s)": 4.8242,
			
 
				+        "ITL (s)": 0.0334,
			
 
				+        "TPOT (s)": 0.0533,
			
 
				+        "Latency (s)": 9.5984,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.412,
			
 
				+        "Total (tok/s)": 228.8415
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "75%",
			
 
				+        "TTFT (s)": 4.825,
			
 
				+        "ITL (s)": 0.0334,
			
 
				+        "TPOT (s)": 0.0533,
			
 
				+        "Latency (s)": 9.5993,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.4137,
			
 
				+        "Total (tok/s)": 229.0807
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "80%",
			
 
				+        "TTFT (s)": 5.4098,
			
 
				+        "ITL (s)": 0.0335,
			
 
				+        "TPOT (s)": 0.0612,
			
 
				+        "Latency (s)": 9.6242,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.4552,
			
 
				+        "Total (tok/s)": 229.5791
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "90%",
			
 
				+        "TTFT (s)": 5.41,
			
 
				+        "ITL (s)": 0.0336,
			
 
				+        "TPOT (s)": 0.0689,
			
 
				+        "Latency (s)": 9.6253,
			
 
				+        "Input tokens": 2058,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.5021,
			
 
				+        "Total (tok/s)": 230.4848
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "95%",
			
 
				+        "TTFT (s)": 5.41,
			
 
				+        "ITL (s)": 0.0337,
			
 
				+        "TPOT (s)": 0.0689,
			
 
				+        "Latency (s)": 9.6253,
			
 
				+        "Input tokens": 2058,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.5021,
			
 
				+        "Total (tok/s)": 230.4848
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "98%",
			
 
				+        "TTFT (s)": 5.41,
			
 
				+        "ITL (s)": 0.0367,
			
 
				+        "TPOT (s)": 0.0689,
			
 
				+        "Latency (s)": 9.6253,
			
 
				+        "Input tokens": 2058,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.5021,
			
 
				+        "Total (tok/s)": 230.4848
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "99%",
			
 
				+        "TTFT (s)": 5.41,
			
 
				+        "ITL (s)": 1.0169,
			
 
				+        "TPOT (s)": 0.0689,
			
 
				+        "Latency (s)": 9.6253,
			
 
				+        "Input tokens": 2058,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 13.5021,
			
 
				+        "Total (tok/s)": 230.4848
			
 
				+    }
			
 
				+]
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_summary.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_summary.json
@@ -0,0 +1,17 @@
 
				+{
			
 
				+    "Time taken for tests (s)": 9.6293,
			
 
				+    "Number of concurrency": 10,
			
 
				+    "Request rate (req/s)": -1,
			
 
				+    "Total requests": 10,
			
 
				+    "Succeed requests": 10,
			
 
				+    "Failed requests": 0,
			
 
				+    "Output token throughput (tok/s)": 132.928,
			
 
				+    "Total token throughput (tok/s)": 2268.4997,
			
 
				+    "Request throughput (req/s)": 1.0385,
			
 
				+    "Average latency (s)": 9.567,
			
 
				+    "Average time to first token (s)": 3.6071,
			
 
				+    "Average time per output token (s)": 0.0469,
			
 
				+    "Average inter-token latency (s)": 0.0466,
			
 
				+    "Average input tokens per request": 2056.4,
			
 
				+    "Average output tokens per request": 128.0
			
 
				+}
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_args.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_args.json
@@ -0,0 +1,65 @@
 
				+{
			
 
				+    "model": "Qwen3.6-27B-W8A8",
			
 
				+    "model_id": "Qwen3.6-27B-W8A8",
			
 
				+    "attn_implementation": null,
			
 
				+    "api": "openai",
			
 
				+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
			
 
				+    "port": 8877,
			
 
				+    "url": "http://127.0.0.1:8004/v1/chat/completions",
			
 
				+    "headers": {
			
 
				+        "Authorization": "Bearer sk-123456"
			
 
				+    },
			
 
				+    "connect_timeout": null,
			
 
				+    "read_timeout": null,
			
 
				+    "total_timeout": 21600,
			
 
				+    "api_key": "sk-123456",
			
 
				+    "no_test_connection": false,
			
 
				+    "number": 1,
			
 
				+    "parallel": 1,
			
 
				+    "rate": -1,
			
 
				+    "sleep_interval": 5,
			
 
				+    "sla_auto_tune": false,
			
 
				+    "sla_variable": "parallel",
			
 
				+    "sla_params": null,
			
 
				+    "sla_num_runs": 3,
			
 
				+    "sla_upper_bound": 65536,
			
 
				+    "sla_lower_bound": 1,
			
 
				+    "db_commit_interval": 1000,
			
 
				+    "queue_size_multiplier": 5,
			
 
				+    "in_flight_task_multiplier": 2,
			
 
				+    "log_every_n_query": 10,
			
 
				+    "debug": false,
			
 
				+    "visualizer": null,
			
 
				+    "wandb_api_key": null,
			
 
				+    "swanlab_api_key": null,
			
 
				+    "name": null,
			
 
				+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1",
			
 
				+    "no_timestamp": false,
			
 
				+    "max_prompt_length": 2048,
			
 
				+    "min_prompt_length": 2048,
			
 
				+    "prefix_length": 0,
			
 
				+    "prompt": null,
			
 
				+    "query_template": null,
			
 
				+    "apply_chat_template": true,
			
 
				+    "image_width": 224,
			
 
				+    "image_height": 224,
			
 
				+    "image_format": "RGB",
			
 
				+    "image_num": 1,
			
 
				+    "image_patch_size": 28,
			
 
				+    "dataset": "random",
			
 
				+    "dataset_path": null,
			
 
				+    "frequency_penalty": null,
			
 
				+    "repetition_penalty": null,
			
 
				+    "logprobs": null,
			
 
				+    "max_tokens": 128,
			
 
				+    "min_tokens": 128,
			
 
				+    "n_choices": null,
			
 
				+    "seed": null,
			
 
				+    "stop": null,
			
 
				+    "stop_token_ids": null,
			
 
				+    "stream": true,
			
 
				+    "temperature": 0.0,
			
 
				+    "top_p": null,
			
 
				+    "top_k": null,
			
 
				+    "extra_args": {}
			
 
				+}
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_percentile.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_percentile.json
@@ -0,0 +1,112 @@
 
				+[
			
 
				+    {
			
 
				+        "Percentiles": "10%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0514,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "25%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0515,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "50%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0516,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "66%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0517,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "75%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0518,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "80%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0518,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "90%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0519,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "95%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.052,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "98%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0523,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "99%",
			
 
				+        "TTFT (s)": 0.8157,
			
 
				+        "ITL (s)": 0.0523,
			
 
				+        "TPOT (s)": 0.0516,
			
 
				+        "Latency (s)": 7.3689,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 17.3704,
			
 
				+        "Total (tok/s)": 296.3827
			
 
				+    }
			
 
				+]
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_summary.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_summary.json
@@ -0,0 +1,17 @@
 
				+{
			
 
				+    "Time taken for tests (s)": 7.3689,
			
 
				+    "Number of concurrency": 1,
			
 
				+    "Request rate (req/s)": -1,
			
 
				+    "Total requests": 1,
			
 
				+    "Succeed requests": 1,
			
 
				+    "Failed requests": 0,
			
 
				+    "Output token throughput (tok/s)": 17.3704,
			
 
				+    "Total token throughput (tok/s)": 296.3827,
			
 
				+    "Request throughput (req/s)": 0.1357,
			
 
				+    "Average latency (s)": 7.3689,
			
 
				+    "Average time to first token (s)": 0.8157,
			
 
				+    "Average time per output token (s)": 0.0516,
			
 
				+    "Average inter-token latency (s)": 0.0512,
			
 
				+    "Average input tokens per request": 2056.0,
			
 
				+    "Average output tokens per request": 128.0
			
 
				+}
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_args.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_args.json
@@ -0,0 +1,65 @@
 
				+{
			
 
				+    "model": "Qwen3.6-27B-W8A8",
			
 
				+    "model_id": "Qwen3.6-27B-W8A8",
			
 
				+    "attn_implementation": null,
			
 
				+    "api": "openai",
			
 
				+    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
			
 
				+    "port": 8877,
			
 
				+    "url": "http://127.0.0.1:8004/v1/chat/completions",
			
 
				+    "headers": {
			
 
				+        "Authorization": "Bearer sk-123456"
			
 
				+    },
			
 
				+    "connect_timeout": null,
			
 
				+    "read_timeout": null,
			
 
				+    "total_timeout": 21600,
			
 
				+    "api_key": "sk-123456",
			
 
				+    "no_test_connection": false,
			
 
				+    "number": 5,
			
 
				+    "parallel": 5,
			
 
				+    "rate": -1,
			
 
				+    "sleep_interval": 5,
			
 
				+    "sla_auto_tune": false,
			
 
				+    "sla_variable": "parallel",
			
 
				+    "sla_params": null,
			
 
				+    "sla_num_runs": 3,
			
 
				+    "sla_upper_bound": 65536,
			
 
				+    "sla_lower_bound": 1,
			
 
				+    "db_commit_interval": 1000,
			
 
				+    "queue_size_multiplier": 5,
			
 
				+    "in_flight_task_multiplier": 2,
			
 
				+    "log_every_n_query": 10,
			
 
				+    "debug": false,
			
 
				+    "visualizer": null,
			
 
				+    "wandb_api_key": null,
			
 
				+    "swanlab_api_key": null,
			
 
				+    "name": null,
			
 
				+    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5",
			
 
				+    "no_timestamp": false,
			
 
				+    "max_prompt_length": 2048,
			
 
				+    "min_prompt_length": 2048,
			
 
				+    "prefix_length": 0,
			
 
				+    "prompt": null,
			
 
				+    "query_template": null,
			
 
				+    "apply_chat_template": true,
			
 
				+    "image_width": 224,
			
 
				+    "image_height": 224,
			
 
				+    "image_format": "RGB",
			
 
				+    "image_num": 1,
			
 
				+    "image_patch_size": 28,
			
 
				+    "dataset": "random",
			
 
				+    "dataset_path": null,
			
 
				+    "frequency_penalty": null,
			
 
				+    "repetition_penalty": null,
			
 
				+    "logprobs": null,
			
 
				+    "max_tokens": 128,
			
 
				+    "min_tokens": 128,
			
 
				+    "n_choices": null,
			
 
				+    "seed": null,
			
 
				+    "stop": null,
			
 
				+    "stop_token_ids": null,
			
 
				+    "stream": true,
			
 
				+    "temperature": 0.0,
			
 
				+    "top_p": null,
			
 
				+    "top_k": null,
			
 
				+    "extra_args": {}
			
 
				+}
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_percentile.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_percentile.json
@@ -0,0 +1,112 @@
 
				+[
			
 
				+    {
			
 
				+        "Percentiles": "10%",
			
 
				+        "TTFT (s)": 0.6655,
			
 
				+        "ITL (s)": 0.0295,
			
 
				+        "TPOT (s)": 0.0299,
			
 
				+        "Latency (s)": 6.5201,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.3128,
			
 
				+        "Total (tok/s)": 329.6753
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "25%",
			
 
				+        "TTFT (s)": 1.6872,
			
 
				+        "ITL (s)": 0.0296,
			
 
				+        "TPOT (s)": 0.0304,
			
 
				+        "Latency (s)": 6.5483,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.4617,
			
 
				+        "Total (tok/s)": 332.0644
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "50%",
			
 
				+        "TTFT (s)": 2.7127,
			
 
				+        "ITL (s)": 0.0297,
			
 
				+        "TPOT (s)": 0.0304,
			
 
				+        "Latency (s)": 6.5751,
			
 
				+        "Input tokens": 2056,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.4674,
			
 
				+        "Total (tok/s)": 332.1626
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "66%",
			
 
				+        "TTFT (s)": 2.7127,
			
 
				+        "ITL (s)": 0.0298,
			
 
				+        "TPOT (s)": 0.0383,
			
 
				+        "Latency (s)": 6.577,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.5469,
			
 
				+        "Total (tok/s)": 333.6721
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "75%",
			
 
				+        "TTFT (s)": 2.7127,
			
 
				+        "ITL (s)": 0.0299,
			
 
				+        "TPOT (s)": 0.0383,
			
 
				+        "Latency (s)": 6.577,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.5469,
			
 
				+        "Total (tok/s)": 333.6721
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "80%",
			
 
				+        "TTFT (s)": 2.8297,
			
 
				+        "ITL (s)": 0.0299,
			
 
				+        "TPOT (s)": 0.0461,
			
 
				+        "Latency (s)": 6.6277,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.6315,
			
 
				+        "Total (tok/s)": 334.9623
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "90%",
			
 
				+        "TTFT (s)": 2.8297,
			
 
				+        "ITL (s)": 0.0301,
			
 
				+        "TPOT (s)": 0.0461,
			
 
				+        "Latency (s)": 6.6277,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.6315,
			
 
				+        "Total (tok/s)": 334.9623
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "95%",
			
 
				+        "TTFT (s)": 2.8297,
			
 
				+        "ITL (s)": 0.0304,
			
 
				+        "TPOT (s)": 0.0461,
			
 
				+        "Latency (s)": 6.6277,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.6315,
			
 
				+        "Total (tok/s)": 334.9623
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "98%",
			
 
				+        "TTFT (s)": 2.8297,
			
 
				+        "ITL (s)": 0.031,
			
 
				+        "TPOT (s)": 0.0461,
			
 
				+        "Latency (s)": 6.6277,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.6315,
			
 
				+        "Total (tok/s)": 334.9623
			
 
				+    },
			
 
				+    {
			
 
				+        "Percentiles": "99%",
			
 
				+        "TTFT (s)": 2.8297,
			
 
				+        "ITL (s)": 0.1171,
			
 
				+        "TPOT (s)": 0.0461,
			
 
				+        "Latency (s)": 6.6277,
			
 
				+        "Input tokens": 2057,
			
 
				+        "Output tokens": 128,
			
 
				+        "Output (tok/s)": 19.6315,
			
 
				+        "Total (tok/s)": 334.9623
			
 
				+    }
			
 
				+]
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_summary.json
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_summary.json
@@ -0,0 +1,17 @@
 
				+{
			
 
				+    "Time taken for tests (s)": 6.6303,
			
 
				+    "Number of concurrency": 5,
			
 
				+    "Request rate (req/s)": -1,
			
 
				+    "Total requests": 5,
			
 
				+    "Succeed requests": 5,
			
 
				+    "Failed requests": 0,
			
 
				+    "Output token throughput (tok/s)": 96.5268,
			
 
				+    "Total token throughput (tok/s)": 1647.2903,
			
 
				+    "Request throughput (req/s)": 0.7541,
			
 
				+    "Average latency (s)": 6.5697,
			
 
				+    "Average time to first token (s)": 2.1216,
			
 
				+    "Average time per output token (s)": 0.035,
			
 
				+    "Average inter-token latency (s)": 0.0348,
			
 
				+    "Average input tokens per request": 2056.4,
			
 
				+    "Average output tokens per request": 128.0
			
 
				+}
			
--- a/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt
+++ b/dev/models/bench_suite/outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt
@@ -0,0 +1,32 @@
 
				+╭──────────────────────────────────────────────────────────────────────────────╮
			
 
				+│ Performance Test Summary Report                                              │
			
 
				+╰──────────────────────────────────────────────────────────────────────────────╯
			
 
				+
			
 
				+Basic Information:
			
 
				+┌───────────────────────┬──────────────────────────────────────────────────────┐
			
 
				+│ Model                 │ Qwen3.6-27B-W8A8                                     │
			
 
				+│ Test Dataset          │ random                                               │
			
 
				+│ Total Generated       │ 2,048.0 tokens                                       │
			
 
				+│ Total Test Time       │ 23.63 seconds                                        │
			
 
				+│ Avg Output Rate       │ 86.67 tokens/sec                                     │
			
 
				+│ Output Path           │ outputs/20260515_084106/Qwen3.6-27B-W8A8             │
			
 
				+└───────────────────────┴──────────────────────────────────────────────────────┘
			
 
				+
			
 
				+
			
 
				+                                    Detailed Performance Metrics                                    
			
 
				+┏━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┓
			
 
				+┃      ┃      ┃      ┃     Avg ┃     P99 ┃     Avg ┃     P99 ┃     Avg ┃    P99 ┃    Gen. ┃ Success┃
			
 
				+┃Conc. ┃ Rate ┃  RPS ┃ Lat.(s) ┃ Lat.(s) ┃ TTFT(s) ┃ TTFT(s) ┃ TPOT(s) ┃ TPOT(… ┃  toks/s ┃    Rate┃
			
 
				+┡━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━┩
			
 
				+│    1 │  INF │ 0.14 │   7.369 │   7.369 │   0.816 │   0.816 │   0.052 │  0.052 │   17.37 │  100.0%│
			
 
				+│    5 │  INF │ 0.75 │   6.570 │   6.628 │   2.122 │   2.830 │   0.035 │  0.046 │   96.53 │  100.0%│
			
 
				+│   10 │  INF │ 1.04 │   9.567 │   9.625 │   3.607 │   5.410 │   0.047 │  0.069 │  132.93 │  100.0%│
			
 
				+└──────┴──────┴──────┴─────────┴─────────┴─────────┴─────────┴─────────┴────────┴─────────┴────────┘
			
 
				+
			
 
				+
			
 
				+               Best Performance Configuration               
			
 
				+ Highest RPS         Concurrency 10 (1.04 req/sec)          
			
 
				+ Lowest Latency      Concurrency 5 (6.570 seconds)          
			
 
				+
			
 
				+Performance Recommendations:
			
 
				+• The system seems not to have reached its performance bottleneck, try higher concurrency
			
--- a/dev/models/bench_suite/outputs_qwen3.6_27b-wa8a_0515.zip
+++ b/dev/models/bench_suite/outputs_qwen3.6_27b-wa8a_0515.zip
--- a/dev/models/bench_suite/outputs_qwen3.6_27b_0515.zip
+++ b/dev/models/bench_suite/outputs_qwen3.6_27b_0515.zip
--- a/dev/models/docker-compose.yaml
+++ b/dev/models/docker-compose.yaml
@@ -0,0 +1,158 @@
 
				+version: '3.8'
			
 
				+services:
			
 
				+  qwen3.6-27b:  # vLLM server for the Qwen3.6-27B-W8A8 model, published on host port 8004
			
 
				+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
			
 
				+    container_name: qwen3.6-27b-w8a8-vllm # alternative names considered: qwen3.6-27b-w8a8, Qwen3.6-27B-W8A8
			
 
				+    stdin_open: true
			
 
				+    tty: true
			
 
				+    restart: unless-stopped
			
 
				+    #network_mode: host
			
 
				+    devices:  # host device nodes passed through unchanged
			
 
				+      - "/dev/dri:/dev/dri"
			
 
				+      - "/dev/mxcd:/dev/mxcd"
			
 
				+      - "/dev/mem:/dev/mem"
			
 
				+    group_add:
			
 
				+      - "video"
			
 
				+    privileged: true  # NOTE(review): together with the unconfined profiles below this removes most container isolation - confirm the accelerator runtime needs it
			
 
				+    security_opt:
			
 
				+      - "apparmor=unconfined"
			
 
				+      - "seccomp=unconfined"
			
 
				+    shm_size: '100gb'
			
 
				+    ulimits:
			
 
				+      memlock:  # -1 = unlimited locked memory
			
 
				+        soft: -1
			
 
				+        hard: -1
			
 
				+    ports:
			
 
				+      - "8004:30000"  # host 8004 -> container 30000 (the --port given to vllm serve below)
			
 
				+    environment:
			
 
				+      - CUDA_VISIBLE_DEVICES=0,1  # presumably selects accelerators 0 and 1 (two devices, matching --tensor-parallel-size 2) - TODO confirm
			
 
				+      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
			
 
				+      - MACA_SMALL_PAGESIZE_ENABLE=1
			
 
				+      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
			
 
				+      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1
			
 
				+    volumes:
			
 
				+      - "/usr/local/:/usr/local/"  # NOTE(review): overlays the image's /usr/local with the host's - confirm this is intended
			
 
				+      - "/pde_ai:/pde_ai"
			
 
				+      - "/opt/lq/models:/model:ro"  # model directory, mounted read-only; served from /model/... below
			
 
				+      - "~/.cache/huggingface:/root/.cache/huggingface"
			
 
				+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping (target of the tee below)
			
 
				+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" # benchmark script directory mapping
			
 
				+    command: >
			
 
				+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8 \
			
 
				+      --served-model-name Qwen3.6-27B-W8A8 \
			
 
				+      --host 0.0.0.0 \
			
 
				+      --port 30000 \
			
 
				+      --tensor-parallel-size 2 \
			
 
				+      --max-num-batched-tokens 4096 \
			
 
				+      --max-model-len 8192 \
			
 
				+      --reasoning-parser qwen3 \
			
 
				+      --enable-auto-tool-choice \
			
 
				+      --tool-call-parser qwen3_coder \ 
			
 
				+      --api-key sk-123456 \
			
 
				+      2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log"
			
 
				+
			
 
				+
			
 
				+
			
 
				+  qwen3-embedding:  # vLLM embedding server for Qwen3-Embedding-8B, published on host port 9003
			
 
				+    image: vllm-metax:lq
			
 
				+    container_name: qwen3-embedding-vllm # alternative name considered: qwen3-embedding
			
 
				+    stdin_open: true
			
 
				+    tty: true
			
 
				+    restart: unless-stopped
			
 
				+    #network_mode: host
			
 
				+    devices:  # host device nodes passed through unchanged
			
 
				+      - "/dev/dri:/dev/dri"
			
 
				+      - "/dev/mxcd:/dev/mxcd"
			
 
				+      - "/dev/mem:/dev/mem"
			
 
				+    group_add:
			
 
				+      - "video"
			
 
				+    privileged: true  # NOTE(review): together with the unconfined profiles below this removes most container isolation - confirm the accelerator runtime needs it
			
 
				+    security_opt:
			
 
				+      - "apparmor=unconfined"
			
 
				+      - "seccomp=unconfined"
			
 
				+    shm_size: '100gb'
			
 
				+    ulimits:
			
 
				+      memlock:  # -1 = unlimited locked memory
			
 
				+        soft: -1
			
 
				+        hard: -1
			
 
				+    ports:
			
 
				+      - "9003:30000"  # host 9003 -> container 30000 (the --port given to vllm serve below)
			
 
				+    environment:
			
 
				+      - CUDA_VISIBLE_DEVICES=2  # presumably pins this service to accelerator 2 - TODO confirm
			
 
				+      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
			
 
				+      - VLLM_TORCH_COMPILE=0
			
 
				+      - VLLM_DISABLE_TORCH_COMPILE=1
			
 
				+      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$  # NOTE(review): Compose interpolation turns "$$" into one literal "$"; no PID is substituted - confirm intent
			
 
				+      - MAX_JOBS=1
			
 
				+    volumes:
			
 
				+      - "/usr/local/:/usr/local/"  # NOTE(review): overlays the image's /usr/local with the host's - confirm this is intended
			
 
				+      - "/pde_ai:/pde_ai"
			
 
				+      - "/opt/lq/models:/model:ro"  # model directory, mounted read-only; served from /model/... below
			
 
				+      - "~/.cache/huggingface:/root/.cache/huggingface"
			
 
				+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping (target of the tee below)
			
 
				+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" # benchmark script directory mapping
			
 
				+    command: >
			
 
				+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3-Embedding-8B  \
			
 
				+      --served-model-name Qwen3-Embedding-8B  \
			
 
				+      --task embedding \
			
 
				+      --host 0.0.0.0 \
			
 
				+      --port 30000 \
			
 
				+      --tensor-parallel-size 1 \
			
 
				+      --max-num-batched-tokens 4096 \
			
 
				+      --max-model-len 16384 \
			
 
				+      --gpu-memory-utilization 0.45 \
			
 
				+      --api-key sk-123456 \
			
 
				+      2>&1 | tee /var/log/vllm/qwen3-embedding-server.log"
			
 
				+
			
 
				+
			
 
				+  qwen3-reranker:  # vLLM scoring (reranker) server for Qwen3-Reranker-8B, published on host port 9004
			
 
				+    image: vllm-metax:lq
			
 
				+    container_name: qwen3-reranker-vllm # alternative name considered: qwen3-reranker
			
 
				+    stdin_open: true
			
 
				+    tty: true
			
 
				+    restart: unless-stopped
			
 
				+    #network_mode: host
			
 
				+    devices:  # host device nodes passed through unchanged
			
 
				+      - "/dev/dri:/dev/dri"
			
 
				+      - "/dev/mxcd:/dev/mxcd"
			
 
				+      - "/dev/mem:/dev/mem"
			
 
				+    group_add:
			
 
				+      - "video"
			
 
				+    privileged: true  # NOTE(review): together with the unconfined profiles below this removes most container isolation - confirm the accelerator runtime needs it
			
 
				+    security_opt:
			
 
				+      - "apparmor=unconfined"
			
 
				+      - "seccomp=unconfined"
			
 
				+    shm_size: '100gb'
			
 
				+    ulimits:
			
 
				+      memlock:  # -1 = unlimited locked memory
			
 
				+        soft: -1
			
 
				+        hard: -1
			
 
				+    ports:
			
 
				+      - "9004:30000"  # host 9004 -> container 30000 (the --port given to vllm serve below)
			
 
				+    environment:
			
 
				+      - CUDA_VISIBLE_DEVICES=3  # presumably pins this service to accelerator 3 - TODO confirm
			
 
				+      - PYTHONUNBUFFERED=1  # ensure output is emitted in real time
			
 
				+      - VLLM_TORCH_COMPILE=0
			
 
				+      - VLLM_DISABLE_TORCH_COMPILE=1
			
 
				+      - TORCH_EXTENSIONS_DIR=/tmp/torch_ext_$$  # NOTE(review): Compose interpolation turns "$$" into one literal "$"; no PID is substituted - confirm intent
			
 
				+      - MAX_JOBS=1
			
 
				+    volumes:
			
 
				+      - "/usr/local/:/usr/local/"  # NOTE(review): overlays the image's /usr/local with the host's - confirm this is intended
			
 
				+      - "/pde_ai:/pde_ai"
			
 
				+      - "/opt/lq/models:/model:ro"  # model directory, mounted read-only; served from /model/... below
			
 
				+      - "~/.cache/huggingface:/root/.cache/huggingface"
			
 
				+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping (target of the tee below)
			
 
				+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" # benchmark script directory mapping
			
 
				+    command: >
			
 
				+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3-Reranker-8B  \
			
 
				+      --served-model-name Qwen3-Reranker-8B  \
			
 
				+      --task score \
			
 
				+      --host 0.0.0.0 \
			
 
				+      --port 30000 \
			
 
				+      --tensor-parallel-size 1 \
			
 
				+      --max-num-batched-tokens 4096 \
			
 
				+      --max-model-len 16384 \
			
 
				+      --gpu-memory-utilization 0.45 \
			
 
				+      --hf_overrides '{\"architectures\": [\"Qwen3ForSequenceClassification\"],\"classifier_from_token\": [\"no\", \"yes\"],\"is_original_qwen3_reranker\": true}' \
			
 
				+      --api-key sk-123456 \
			
 
				+      2>&1 | tee /var/log/vllm/qwen3-reranker-server.log"
			
--- a/dev/models/docker-compose.yaml.bak
+++ b/dev/models/docker-compose.yaml.bak
@@ -0,0 +1,50 @@
 
				+# Serves /model/Qwen3.6-27B-W8A8 with vLLM from the MetaX (MACA) vLLM image.
				+# Host port 8004 is published to the server's port 30000 and the server
				+# output is copied to /var/log/vllm/qwen3.6-27b-w8a8-server.log.
				+#
				+# The top-level `version` key was removed: the Compose Specification marks
				+# it obsolete and current Compose releases only print a warning for it.
				+services:
				+  qwen3.6-27b:
				+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
				+    container_name: qwen3.6-27b-w8a8-vllm # other spellings seen for this model: qwen3.6-27b-w8a8, Qwen3.6-27B-W8A8
				+    stdin_open: true
				+    tty: true
				+    restart: unless-stopped
				+    #network_mode: host
				+    devices:
				+      - "/dev/dri:/dev/dri"
				+      - "/dev/mxcd:/dev/mxcd"
				+      - "/dev/mem:/dev/mem"
				+    group_add:
				+      - "video"
				+    # NOTE(review): privileged mode, /dev/mem and unconfined AppArmor/seccomp
				+    # together remove most container isolation; confirm the accelerator
				+    # runtime really needs all of them.
				+    privileged: true
				+    security_opt:
				+      - "apparmor=unconfined"
				+      - "seccomp=unconfined"
				+    shm_size: '100gb'
				+    ulimits:
				+      # -1 lifts the locked-memory limit (soft and hard).
				+      memlock:
				+        soft: -1
				+        hard: -1
				+    ports:
				+      - "8004:30000"
				+    environment:
				+      - CUDA_VISIBLE_DEVICES=0,1
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+      - MACA_SMALL_PAGESIZE_ENABLE=1
				+      - MACA_VLLM_ENABLE_MCTLASS_FUSED_MOE=1
				+      - MACA_VLLM_ENABLE_MCTLASS_PYTHON_API=1
				+    volumes:
				+      # NOTE(review): this bind-mounts the host's /usr/local over the image's
				+      # own /usr/local; presumably for host-installed driver libraries - confirm.
				+      - "/usr/local/:/usr/local/"
				+      - "/pde_ai:/pde_ai"
				+      - "/opt/lq/models:/model:ro"
				+      - "~/.cache/huggingface:/root/.cache/huggingface"
				+      - "/opt/lq/deploy_models/logs:/var/log/vllm"  # log directory mapping
				+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" # script directory mapping
				+    # The folded scalar (>) already joins the lines below with single spaces,
				+    # so no trailing backslashes are used: after folding they would turn into
				+    # backslash-space pairs that a POSIX-style splitter keeps as escaped
				+    # spaces attached to the following flag.
				+    # NOTE(review): the API key is a weak, hard-coded development value.
				+    command: >
				+      sh -c "/opt/conda/bin/vllm serve /model/Qwen3.6-27B-W8A8
				+      --served-model-name Qwen3.6-27B-W8A8
				+      --host 0.0.0.0
				+      --port 30000
				+      --tensor-parallel-size 2
				+      --max-num-batched-tokens 4096
				+      --max-model-len 8192
				+      --api-key sk-123456
				+      2>&1 | tee /var/log/vllm/qwen3.6-27b-w8a8-server.log"
			
 
				+
			
--- a/dev/models/yaml_bak/docker-compose.yaml.nvidia
+++ b/dev/models/yaml_bak/docker-compose.yaml.nvidia
@@ -0,0 +1,29 @@
 
				+# Serves /model/Qwen3.6-27B through vLLM's OpenAI-compatible API server.
				+# Host port 8004 is published to the server's port 30000 and the server
				+# output is copied to /var/log/vllm/qwen3.6-27b-server.log.
				+# NOTE(review): the file name says "nvidia" but the image is the MetaX
				+# vLLM build and no GPU runtime/reservation is declared - confirm intent.
				+services:
				+  qwen3.6-27b:
				+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
				+    container_name: qwen3.6-27b-vllm
				+    shm_size: '10gb'
				+    ports:
				+      - "8004:30000"
				+    volumes:
				+      # host path:container path
				+      - /opt/lq/models:/model:ro
				+      - ~/.cache/huggingface:/root/.cache/huggingface
				+      # NOTE(review): "logs/logs" may be an accidental duplication - confirm the host path.
				+      - /opt/lq/deploy_models/logs/logs:/var/log/vllm  # log directory mapping
				+      - /opt/lq/deploy_models/bench_suite:/bench_suite # script directory mapping
				+    environment:
				+      - CUDA_VISIBLE_DEVICES=1
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    # vLLM's API server takes the model via --model (--model-path is the
				+    # SGLang spelling) and its log verbosity via --uvicorn-log-level; the
				+    # previous --model-path / --log-level flags are not vLLM options.
				+    # NOTE(review): the API key is a weak, hard-coded development value.
				+    command: >
				+      sh -c "mkdir -p /var/log/vllm &&
				+      python3 -m vllm.entrypoints.openai.api_server
				+      --model /model/Qwen3.6-27B
				+      --served-model-name Qwen3.6-27B
				+      --host 0.0.0.0
				+      --port 30000
				+      --tensor-parallel-size 1
				+      --max-num-batched-tokens 4096
				+      --max-model-len 8192
				+      --api-key sk-12345
				+      --uvicorn-log-level info 2>&1 | tee /var/log/vllm/qwen3.6-27b-server.log"
				+    ipc: host
			
--- a/dev/models/yaml_bak/docker-compose.yaml.qwen3.6-27b-vllm
+++ b/dev/models/yaml_bak/docker-compose.yaml.qwen3.6-27b-vllm
@@ -0,0 +1,49 @@
 
				+# Serves /model/Qwen3.6-27B through vLLM's OpenAI-compatible API server on
				+# a MetaX (MACA) host, using the host's network stack. The server listens
				+# on host port 30000 and its output is copied to
				+# /var/log/vllm/qwen3.6-27b-server.log.
				+#
				+# Changes from the previous revision:
				+#   * the obsolete top-level `version` key was removed;
				+#   * the `ports` mapping (8004:30000) was removed because the Compose
				+#     Specification does not allow port mappings together with
				+#     `network_mode: host` - the mapping never took effect;
				+#   * the server flags were corrected to the vLLM spellings.
				+services:
				+  qwen3.6-27b:
				+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
				+    container_name: qwen3.6-27b-vllm
				+    stdin_open: true
				+    tty: true
				+    restart: unless-stopped
				+    # Host networking: the container shares the host's interfaces, so the
				+    # server's --port is the port clients connect to.
				+    network_mode: host
				+    devices:
				+      - "/dev/dri:/dev/dri"
				+      - "/dev/mxcd:/dev/mxcd"
				+      - "/dev/mem:/dev/mem"
				+    group_add:
				+      - "video"
				+    # NOTE(review): privileged mode, /dev/mem and unconfined AppArmor/seccomp
				+    # together remove most container isolation; confirm the accelerator
				+    # runtime really needs all of them.
				+    privileged: true
				+    security_opt:
				+      - "apparmor=unconfined"
				+      - "seccomp=unconfined"
				+    shm_size: '100gb'
				+    ulimits:
				+      # -1 lifts the locked-memory limit (soft and hard).
				+      memlock:
				+        soft: -1
				+        hard: -1
				+    environment:
				+      - CUDA_VISIBLE_DEVICES=1
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    volumes:
				+      # NOTE(review): this bind-mounts the host's /usr/local over the image's
				+      # own /usr/local; presumably for host-installed driver libraries - confirm.
				+      - "/usr/local/:/usr/local/"
				+      - "/pde_ai:/pde_ai"
				+      - "/opt/lq/models:/model:ro"
				+      - "~/.cache/huggingface:/root/.cache/huggingface"
				+      # NOTE(review): "logs/logs" may be an accidental duplication - confirm the host path.
				+      - "/opt/lq/deploy_models/logs/logs:/var/log/vllm"  # log directory mapping
				+      - "/opt/lq/deploy_models/bench_suite:/bench_suite" # script directory mapping
				+    # vLLM's API server takes the model via --model (--model-path is the
				+    # SGLang spelling) and its log verbosity via --uvicorn-log-level.
				+    # NOTE(review): the API key is a weak, hard-coded development value.
				+    command: >
				+      sh -c "mkdir -p /var/log/vllm &&
				+      python3 -m vllm.entrypoints.openai.api_server
				+      --model /model/Qwen3.6-27B
				+      --served-model-name Qwen3.6-27B
				+      --host 0.0.0.0
				+      --port 30000
				+      --tensor-parallel-size 1
				+      --max-num-batched-tokens 4096
				+      --max-model-len 8192
				+      --api-key sk-123456
				+      --uvicorn-log-level info 2>&1 | tee /var/log/vllm/qwen3.6-27b-server.log"
			
 
				+
			
--- a/dev/models/yaml_bak/docker-compose.yaml.test_vllm-metax
+++ b/dev/models/yaml_bak/docker-compose.yaml.test_vllm-metax
@@ -0,0 +1,30 @@
 
				+# Keep-alive container built from the MetaX vLLM image. It starts no model
				+# server: the command only blocks forever so the container stays up.
				+# NOTE(review): `version` is obsolete in the Compose Specification and only
				+# produces a warning with current Compose releases.
				+version: '3.8'
				+services:
				+  test_vllm-metax:
				+    image: cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-ubuntu22.04-amd64
				+    container_name: test_vllm-metax
				+    # Blocks indefinitely without doing any work.
				+    command: tail -f /dev/null
				+    stdin_open: true
				+    tty: true
				+    restart: unless-stopped
				+    # The container shares the host's network stack.
				+    network_mode: host
				+    devices:
				+      - "/dev/dri:/dev/dri"
				+      - "/dev/mxcd:/dev/mxcd"
				+      - "/dev/mem:/dev/mem"
				+    group_add:
				+      - "video"
				+    # NOTE(review): privileged mode, /dev/mem and unconfined AppArmor/seccomp
				+    # together remove most container isolation - confirm this is required.
				+    privileged: true
				+    security_opt:
				+      - "apparmor=unconfined"
				+      - "seccomp=unconfined"
				+    shm_size: '100gb'
				+    ulimits:
				+      # -1 lifts the locked-memory limit (soft and hard).
				+      memlock:
				+        soft: -1
				+        hard: -1
				+    volumes:
				+      # Host /usr/local replaces the image's /usr/local inside the container.
				+      - "/usr/local/:/usr/local/"
				+      - "/pde_ai:/pde_ai"
			
 
				+    
			
 
				+
			
--- a/prod/models/glm_ocr/docker-compose.yml
+++ b/prod/models/glm_ocr/docker-compose.yml
@@ -0,0 +1,69 @@
 
				+# Production GLM-OCR service: vLLM's OpenAI-compatible API server on GPU 6,
				+# published on host port 25429 (container port 30000).
				+#
				+# The API key is no longer stored in this file. Set GLM_OCR_API_KEY in the
				+# host environment (or the project's .env file) before `docker compose up`;
				+# Compose aborts with the message below when it is missing.
				+# NOTE(review): the key that was previously committed here should be
				+# treated as leaked and rotated.
				+services:
				+  glm-ocr-vllm-docker:
				+    # NOTE(review): ":nightly" is a moving tag, so two pulls can yield
				+    # different servers; consider pinning a release tag or digest.
				+    image: vllm/vllm-openai:nightly
				+    container_name: glm-ocr-vllm
				+    runtime: nvidia
				+    restart: unless-stopped
				+    environment:
				+      - PYTHONUNBUFFERED=1
				+      - PIP_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
				+      - PIP_TRUSTED_HOST=pypi.tuna.tsinghua.edu.cn
				+      # /opt/pip-packages comes first so packages installed there by the
				+      # startup script take precedence over the image's dist-packages.
				+      - PYTHONPATH=/opt/pip-packages:/usr/local/lib/python3.12/dist-packages
				+      # Passed into the container for the startup script and the healthcheck.
				+      - GLM_OCR_API_KEY=${GLM_OCR_API_KEY:?set GLM_OCR_API_KEY in the host environment or .env}
				+    ports:
				+      - "25429:30000"
				+    volumes:
				+      - /data/app_workspace/models/GLM-OCR:/models/GLM-OCR:ro
				+      - /data/app_workspace/glm-ocr/logs-docker:/var/log/vllm
				+      # Persist the extra packages and pip's cache across container restarts.
				+      - /data/app_workspace/glm-ocr/pip-packages:/opt/pip-packages:rw
				+      - /data/app_workspace/glm-ocr/pip-cache:/root/.cache/pip:rw
				+    entrypoint: ["/bin/bash", "-c"]
				+    # Every shell "$" in the script is written "$$" so Compose passes it
				+    # through instead of attempting variable interpolation.
				+    command:
				+      - |
				+        # Reuse the transformers install from the mounted package directory;
				+        # install only when the version requirement is not met. Versions are
				+        # compared with packaging.version because plain string comparison
				+        # ranks '5.10.0' below '5.3.0'.
				+        echo "=== 检查 transformers 版本 ==="
				+        if python3 -c "import transformers; from packaging.version import Version; assert Version(transformers.__version__) >= Version('5.3.0')" 2>/dev/null; then
				+          echo "✅ 使用已安装的 transformers $$(python3 -c 'import transformers; print(transformers.__version__)')"
				+        else
				+          echo "⚠️ 安装或更新 transformers>=5.3.0..."
				+          pip3 install "transformers>=5.3.0" \
				+            --target /opt/pip-packages \
				+            --root-user-action=ignore \
				+            -q 2>&1 | tail -5
				+        fi
				+
				+        echo "=== 启动 vLLM ==="
				+        # exec replaces the shell so the server becomes PID 1 and receives
				+        # the stop signal from Docker directly.
				+        # NOTE(review): --allowed-local-media-path / lets requests reference
				+        # any file the container can read; narrow it if possible.
				+        exec python3 -m vllm.entrypoints.openai.api_server \
				+          --model /models/GLM-OCR \
				+          --served-model-name GLM-OCR \
				+          --host 0.0.0.0 \
				+          --port 30000 \
				+          --api-key "$$GLM_OCR_API_KEY" \
				+          --gpu-memory-utilization 0.60 \
				+          --max-model-len 4096 \
				+          --max-num-seqs 96 \
				+          --enable-prefix-caching \
				+          --trust-remote-code \
				+          --allowed-local-media-path / \
				+          --dtype bfloat16
				+    ulimits:
				+      # Both limits are 67108864 bytes (64 MiB).
				+      memlock: 67108864
				+      stack: 67108864
				+    ipc: host
				+    deploy:
				+      resources:
				+        reservations:
				+          devices:
				+            - driver: nvidia
				+              device_ids: ["6"]
				+              capabilities: [gpu]
				+    healthcheck:
				+      # Runs inside the container, where GLM_OCR_API_KEY is set by the
				+      # environment section above.
				+      test: ["CMD-SHELL", "curl -f http://localhost:30000/health -H \"Authorization: Bearer $$GLM_OCR_API_KEY\" || exit 1"]
				+      interval: 30s
				+      timeout: 10s
				+      retries: 5
				+      # Failures during the first 180s do not count against `retries`.
				+      start_period: 180s
				+    logging:
				+      driver: "json-file"
				+      options:
				+        # Keep at most three rotated log files of up to 500 MB each.
				+        max-size: "500m"
				+        max-file: "3"
			
--- a/prod/models/sglang/docker-compose.yaml
+++ b/prod/models/sglang/docker-compose.yaml
@@ -3,7 +3,7 @@ services:
 
				     image: lmsysorg/sglang:latest
			
 
				     container_name: qwen3.5-122b-sglang
			
 
				     runtime: nvidia
			
 
				-    shm_size: '10gb'
			
 
				+    shm_size: '200gb'
			
 
				     ports:
			
 
				       - "25423:30000"
			
 
				     volumes:
			
@@ -11,6 +11,7 @@ services:
 
				       - /data/app_workspace/models:/model:ro
			
 
				       - ~/.cache/huggingface:/root/.cache/huggingface
			
 
				       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
			
 
				+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
			
 
				     environment:
			
 
				       - CUDA_VISIBLE_DEVICES 
			
 
				       - PYTHONUNBUFFERED=1  # 确保实时输出
			
@@ -18,10 +19,11 @@ services:
 
				       sh -c "mkdir -p /var/log/sglang &&
			
 
				       python3 -m sglang.launch_server
			
 
				       --model-path /model/Qwen3.5-122B-A10B
			
 
				-      --tp 2
			
 
				+      --tp 4
			
 
				       --host 0.0.0.0
			
 
				       --port 30000
			
 
				-      --api-key lq123456
			
 
				+      --api-key sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
			
 
				+      --mem-fraction-static 0.95
			
 
				       --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
			
 
				     ipc: host
			
 
				     deploy:
			
@@ -29,154 +31,157 @@ services:
 
				         reservations:
			
 
				           devices:
			
 
				             - driver: nvidia
			
 
				-              device_ids: ["0","1"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              device_ids: ["0","1","2","3"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				               #count: all
			
 
				               capabilities: [gpu]
			
 
				 
			
 
				-  qwen3-8b:
			
 
				+
			
 
				+  qwen3-embedding-8b:
			
 
				     image: lmsysorg/sglang:latest
			
 
				-    container_name: qwen3-8b-sglang
			
 
				+    container_name: qwen3-embedding-8b-sglang
			
 
				     runtime: nvidia
			
 
				-    shm_size: '10gb'
			
 
				+    shm_size: '100gb'
			
 
				     ports:
			
 
				-      - "25424:30000"
			
 
				+      - "25425:30000"
			
 
				     volumes:
			
 
				       # # 宿主机路径:容器内路径
			
 
				       - /data/app_workspace/models:/model:ro
			
 
				       - ~/.cache/huggingface:/root/.cache/huggingface
			
 
				       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
			
 
				+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
			
 
				     environment:
			
 
				       - CUDA_VISIBLE_DEVICES 
			
 
				       - PYTHONUNBUFFERED=1  # 确保实时输出
			
 
				     command: >
			
 
				       sh -c "mkdir -p /var/log/sglang &&
			
 
				       python3 -m sglang.launch_server
			
 
				-      --model-path /model/Qwen3-8B
			
 
				+      --model-path /model/Qwen3-Embedding-8B
			
 
				+      --is-embedding
			
 
				       --tp 1
			
 
				       --host 0.0.0.0
			
 
				       --port 30000
			
 
				-      --api-key lq123456
			
 
				+      --api-key sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
			
 
				       --mem-fraction-static 0.45
			
 
				-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
			
 
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
			
 
				     ipc: host
			
 
				     deploy:
			
 
				       resources:
			
 
				         reservations:
			
 
				           devices:
			
 
				             - driver: nvidia
			
 
				-              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              device_ids: ["5"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				               #count: all
			
 
				               capabilities: [gpu]
			
 
				     healthcheck:
			
 
				-      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
			
 
				+      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/embeddings", "-H", "Authorization: Bearer sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615", "-H", "Content-Type: application/json", "-d", "{\"input\": \"health\"}"]
			
 
				       interval: 10s
			
 
				       timeout: 5s
			
 
				       retries: 30
			
 
				       start_period: 60s
			
 
				 
			
 
				-  qwen3-embedding-8b:
			
 
				+  qwen3-reranker-8b:
			
 
				     image: lmsysorg/sglang:latest
			
 
				-    container_name: qwen3-embedding-8b-sglang
			
 
				+    container_name: qwen3-reranker-8b-sglang
			
 
				     runtime: nvidia
			
 
				-    shm_size: '5gb'
			
 
				+    shm_size: '100gb'
			
 
				     ports:
			
 
				-      - "25425:30000"
			
 
				+      - "25426:30000"
			
 
				     volumes:
			
 
				       # # 宿主机路径:容器内路径
			
 
				       - /data/app_workspace/models:/model:ro
			
 
				       - ~/.cache/huggingface:/root/.cache/huggingface
			
 
				       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
			
 
				+      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
			
 
				+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
			
 
				     environment:
			
 
				       - CUDA_VISIBLE_DEVICES 
			
 
				       - PYTHONUNBUFFERED=1  # 确保实时输出
			
 
				     command: >
			
 
				       sh -c "mkdir -p /var/log/sglang &&
			
 
				       python3 -m sglang.launch_server
			
 
				-      --model-path /model/Qwen3-Embedding-8B
			
 
				-      --is-embedding
			
 
				+      --model-path /model/Qwen3-Reranker-8B
			
 
				       --tp 1
			
 
				       --host 0.0.0.0
			
 
				       --port 30000
			
 
				-      --api-key lq123456
			
 
				-      --mem-fraction-static 0.45
			
 
				-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
			
 
				+      --api-key sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
			
 
				+      --mem-fraction-static 0.50
			
 
				+      --disable-radix-cache
			
 
				+      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
			
 
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
			
 
				     ipc: host
			
 
				     deploy:
			
 
				       resources:
			
 
				         reservations:
			
 
				           devices:
			
 
				             - driver: nvidia
			
 
				-              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              device_ids: ["5"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				               #count: all
			
 
				               capabilities: [gpu]
			
 
				     depends_on:
			
 
				-      qwen3-8b:
			
 
				-        condition: service_healthy  # 等待 qwen3-8b 健康检查通过
			
 
				-
			
 
				+      qwen3-embedding-8b:
			
 
				+        condition: service_healthy  # 等待 qwen3-embedding-8b 健康检查通过
			
 
				 
			
 
				-  qwen3-reranker-8b:
			
 
				+  qwen3.5-35b:
			
 
				     image: lmsysorg/sglang:latest
			
 
				-    container_name: qwen3-reranker-8b-sglang
			
 
				+    container_name: qwen3.5-35b-sglang
			
 
				     runtime: nvidia
			
 
				-    shm_size: '5gb'
			
 
				+    shm_size: '100gb'
			
 
				     ports:
			
 
				-      - "25426:30000"
			
 
				+      - "25427:30000"
			
 
				     volumes:
			
 
				       # # 宿主机路径:容器内路径
			
 
				       - /data/app_workspace/models:/model:ro
			
 
				       - ~/.cache/huggingface:/root/.cache/huggingface
			
 
				       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
			
 
				-      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
			
 
				+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
			
 
				     environment:
			
 
				       - CUDA_VISIBLE_DEVICES 
			
 
				       - PYTHONUNBUFFERED=1  # 确保实时输出
			
 
				     command: >
			
 
				       sh -c "mkdir -p /var/log/sglang &&
			
 
				       python3 -m sglang.launch_server
			
 
				-      --model-path /model/Qwen3-Reranker-8B
			
 
				+      --model-path /model/Qwen3.5-35B-A3B
			
 
				       --tp 1
			
 
				       --host 0.0.0.0
			
 
				       --port 30000
			
 
				-      --api-key lq123456
			
 
				-      --mem-fraction-static 0.50
			
 
				-      --disable-radix-cache
			
 
				-      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
			
 
				-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
			
 
				+      --api-key sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
			
 
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
			
 
				     ipc: host
			
 
				     deploy:
			
 
				       resources:
			
 
				         reservations:
			
 
				           devices:
			
 
				             - driver: nvidia
			
 
				-              device_ids: ["3"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				+              device_ids: ["7"]  # Modify for multiple GPUs: ["0", "1"]
			
 
				               #count: all
			
 
				               capabilities: [gpu]
			
 
				 
			
 
				 
			
 
				-  qwen3.5-35b:
			
 
				+  qwen3.6-27b:
			
 
				     image: lmsysorg/sglang:latest
			
 
				-    container_name: qwen3.5-35b-sglang
			
 
				+    container_name: qwen3.6-27b-sglang
			
 
				     runtime: nvidia
			
 
				-    shm_size: '5gb'
			
 
				+    shm_size: '100gb'
			
 
				     ports:
			
 
				-      - "25427:30000"
			
 
				+      - "25424:30000"
			
 
				     volumes:
			
 
				       # # 宿主机路径:容器内路径
			
 
				       - /data/app_workspace/models:/model:ro
			
 
				       - ~/.cache/huggingface:/root/.cache/huggingface
			
 
				       - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # 日志目录映射
			
 
				+      - /data/app_workspace/deploy_models/sglang/bench_suite:/bench_suite #脚本目录映射
			
 
				     environment:
			
 
				-      - CUDA_VISIBLE_DEVICES 
			
 
				+      - CUDA_VISIBLE_DEVICES
			
 
				       - PYTHONUNBUFFERED=1  # 确保实时输出
			
 
				     command: >
			
 
				       sh -c "mkdir -p /var/log/sglang &&
			
 
				       python3 -m sglang.launch_server
			
 
				-      --model-path /model/Qwen3.5-35B-A3B
			
 
				+      --model-path /model/Qwen3.6-27B
			
 
				       --tp 1
			
 
				       --host 0.0.0.0
			
 
				       --port 30000
			
 
				-      --api-key lq123456
			
 
				-      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
			
 
				+      --api-key sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
			
 
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3.6-27b-server.log"
			
 
				     ipc: host
			
 
				     deploy:
			
 
				       resources:
			
--- a/prod/models/sglang/docker-compose_20260518.yaml
+++ b/prod/models/sglang/docker-compose_20260518.yaml
@@ -0,0 +1,188 @@
 
				+services:
				+  # SGLang server for /model/Qwen3.5-122B-A10B with tensor parallelism 2 on
				+  # GPUs 0 and 1. Host port 25423 maps to the server's port 30000; output is
				+  # copied to /var/log/sglang/qwen3_5-122b-server.log.
				+  # NOTE(review): ":latest" is a moving tag and the API key is a weak,
				+  # hard-coded value.
				+  qwen3.5-122b:
				+    image: lmsysorg/sglang:latest
				+    container_name: qwen3.5-122b-sglang
				+    runtime: nvidia
				+    shm_size: '10gb'
				+    ports:
				+      - "25423:30000"
				+    volumes:
				+      # # host path:container path
				+      - /data/app_workspace/models:/model:ro
				+      - ~/.cache/huggingface:/root/.cache/huggingface
				+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
				+    environment:
				+      # No value given: Compose passes the host's CUDA_VISIBLE_DEVICES through.
				+      - CUDA_VISIBLE_DEVICES 
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    # The folded scalar joins the lines below into one sh -c command line.
				+    command: >
				+      sh -c "mkdir -p /var/log/sglang &&
				+      python3 -m sglang.launch_server
				+      --model-path /model/Qwen3.5-122B-A10B
				+      --tp 2
				+      --host 0.0.0.0
				+      --port 30000
				+      --api-key lq123456
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3_5-122b-server.log"
				+    ipc: host
				+    deploy:
				+      resources:
				+        reservations:
				+          devices:
				+            - driver: nvidia
				+              device_ids: ["0","1"]  # Modify for multiple GPUs: ["0", "1"]
				+              #count: all
				+              capabilities: [gpu]
			
 
				+
			
 
				+  # SGLang server for /model/Qwen3-8B on GPU 2 with --mem-fraction-static
				+  # 0.45. Host port 25424 maps to the server's port 30000; output is copied
				+  # to /var/log/sglang/qwen3-8b-server.log. Its healthcheck gates the start
				+  # of qwen3-embedding-8b.
				+  # NOTE(review): ":latest" is a moving tag and the API key is a weak,
				+  # hard-coded value.
				+  qwen3-8b:
				+    image: lmsysorg/sglang:latest
				+    container_name: qwen3-8b-sglang
				+    runtime: nvidia
				+    shm_size: '10gb'
				+    ports:
				+      - "25424:30000"
				+    volumes:
				+      # # host path:container path
				+      - /data/app_workspace/models:/model:ro
				+      - ~/.cache/huggingface:/root/.cache/huggingface
				+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
				+    environment:
				+      # No value given: Compose passes the host's CUDA_VISIBLE_DEVICES through.
				+      - CUDA_VISIBLE_DEVICES 
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    # The folded scalar joins the lines below into one sh -c command line.
				+    command: >
				+      sh -c "mkdir -p /var/log/sglang &&
				+      python3 -m sglang.launch_server
				+      --model-path /model/Qwen3-8B
				+      --tp 1
				+      --host 0.0.0.0
				+      --port 30000
				+      --api-key lq123456
				+      --mem-fraction-static 0.45
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-8b-server.log"
				+    ipc: host
				+    deploy:
				+      resources:
				+        reservations:
				+          devices:
				+            - driver: nvidia
				+              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
				+              #count: all
				+              capabilities: [gpu]
				+    healthcheck:
				+      # Healthy once an authenticated GET of /v1/models succeeds.
				+      test: ["CMD", "curl", "-f", "http://localhost:30000/v1/models", "-H", "Authorization: Bearer lq123456"]
				+      interval: 10s
				+      timeout: 5s
				+      retries: 30
				+      start_period: 60s
			
 
				+
			
 
				+  # SGLang embedding server (--is-embedding) for /model/Qwen3-Embedding-8B
				+  # on GPU 2 with --mem-fraction-static 0.45. Host port 25425 maps to the
				+  # server's port 30000; output is copied to
				+  # /var/log/sglang/qwen3-embedding-8b-server.log.
				+  # NOTE(review): uses the same GPU id as qwen3-8b; the depends_on below
				+  # presumably keeps the two servers from starting at the same time - confirm.
				+  qwen3-embedding-8b:
				+    image: lmsysorg/sglang:latest
				+    container_name: qwen3-embedding-8b-sglang
				+    runtime: nvidia
				+    shm_size: '5gb'
				+    ports:
				+      - "25425:30000"
				+    volumes:
				+      # # host path:container path
				+      - /data/app_workspace/models:/model:ro
				+      - ~/.cache/huggingface:/root/.cache/huggingface
				+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
				+    environment:
				+      # No value given: Compose passes the host's CUDA_VISIBLE_DEVICES through.
				+      - CUDA_VISIBLE_DEVICES 
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    # The folded scalar joins the lines below into one sh -c command line.
				+    command: >
				+      sh -c "mkdir -p /var/log/sglang &&
				+      python3 -m sglang.launch_server
				+      --model-path /model/Qwen3-Embedding-8B
				+      --is-embedding
				+      --tp 1
				+      --host 0.0.0.0
				+      --port 30000
				+      --api-key lq123456
				+      --mem-fraction-static 0.45
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-embedding-8b-server.log"
				+    ipc: host
				+    deploy:
				+      resources:
				+        reservations:
				+          devices:
				+            - driver: nvidia
				+              device_ids: ["2"]  # Modify for multiple GPUs: ["0", "1"]
				+              #count: all
				+              capabilities: [gpu]
				+    depends_on:
				+      qwen3-8b:
				+        condition: service_healthy  # wait until qwen3-8b passes its health check
			
 
				+
			
 
				+
			
 
				+  # SGLang server for /model/Qwen3-Reranker-8B on GPU 3 with
				+  # --mem-fraction-static 0.50 and the radix cache disabled. The chat
				+  # template is read from the read-only sglang-main mount. Host port 25426
				+  # maps to the server's port 30000; output is copied to
				+  # /var/log/sglang/qwen3-reranker-8b-server.log.
				+  # NOTE(review): ":latest" is a moving tag and the API key is a weak,
				+  # hard-coded value.
				+  qwen3-reranker-8b:
				+    image: lmsysorg/sglang:latest
				+    container_name: qwen3-reranker-8b-sglang
				+    runtime: nvidia
				+    shm_size: '5gb'
				+    ports:
				+      - "25426:30000"
				+    volumes:
				+      # # host path:container path
				+      - /data/app_workspace/models:/model:ro
				+      - ~/.cache/huggingface:/root/.cache/huggingface
				+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
				+      # Supplies the qwen3_reranker.jinja chat template used by the command below.
				+      - /data/app_workspace/deploy_models/sglang/sglang-main:/sglang/sglang-main:ro
				+    environment:
				+      # No value given: Compose passes the host's CUDA_VISIBLE_DEVICES through.
				+      - CUDA_VISIBLE_DEVICES 
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    # The folded scalar joins the lines below into one sh -c command line.
				+    command: >
				+      sh -c "mkdir -p /var/log/sglang &&
				+      python3 -m sglang.launch_server
				+      --model-path /model/Qwen3-Reranker-8B
				+      --tp 1
				+      --host 0.0.0.0
				+      --port 30000
				+      --api-key lq123456
				+      --mem-fraction-static 0.50
				+      --disable-radix-cache
				+      --chat-template /sglang/sglang-main/examples/chat_template/qwen3_reranker.jinja
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-reranker-8b-server.log"
				+    ipc: host
				+    deploy:
				+      resources:
				+        reservations:
				+          devices:
				+            - driver: nvidia
				+              device_ids: ["3"]  # Modify for multiple GPUs: ["0", "1"]
				+              #count: all
				+              capabilities: [gpu]
			
 
				+
			
 
				+
			
 
				+  # SGLang server for /model/Qwen3.5-35B-A3B on GPU 4. Host port 25427 maps
				+  # to the server's port 30000; output is copied to
				+  # /var/log/sglang/qwen3-35b-server.log.
				+  # NOTE(review): ":latest" is a moving tag and the API key is a weak,
				+  # hard-coded value.
				+  qwen3.5-35b:
				+    image: lmsysorg/sglang:latest
				+    container_name: qwen3.5-35b-sglang
				+    runtime: nvidia
				+    shm_size: '5gb'
				+    ports:
				+      - "25427:30000"
				+    volumes:
				+      # # host path:container path
				+      - /data/app_workspace/models:/model:ro
				+      - ~/.cache/huggingface:/root/.cache/huggingface
				+      - /data/app_workspace/deploy_models/sglang/logs:/var/log/sglang  # log directory mapping
				+    environment:
				+      # No value given: Compose passes the host's CUDA_VISIBLE_DEVICES through.
				+      - CUDA_VISIBLE_DEVICES 
				+      - PYTHONUNBUFFERED=1  # keep output unbuffered so logs appear in real time
				+    # The folded scalar joins the lines below into one sh -c command line.
				+    command: >
				+      sh -c "mkdir -p /var/log/sglang &&
				+      python3 -m sglang.launch_server
				+      --model-path /model/Qwen3.5-35B-A3B
				+      --tp 1
				+      --host 0.0.0.0
				+      --port 30000
				+      --api-key lq123456
				+      --log-level info 2>&1 | tee /var/log/sglang/qwen3-35b-server.log"
				+    ipc: host
				+    deploy:
				+      resources:
				+        reservations:
				+          devices:
				+            - driver: nvidia
				+              device_ids: ["4"]  # Modify for multiple GPUs: ["0", "1"]
				+              #count: all
				+              capabilities: [gpu]
			
--- a/prod/models/sglang/test_models.sh
+++ b/prod/models/sglang/test_models.sh
@@ -9,11 +9,12 @@ RED='\033[0;31m'
 
				 YELLOW='\033[1;33m'
			
 
				 NC='\033[0m'
			
 
				 
			
 
				-# 模型配置（按顺序定义）
			
 
				-MODEL_NAMES=("qwen3-8b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
			
 
				+# 模型配置（按顺序定义） "qwen3-8b" 
			
 
				+MODEL_NAMES=("qwen3.6-27b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
			
 
				 
			
 
				 declare -A MODEL_PORTS=(
			
 
				-    ["qwen3-8b"]="25424"
			
 
				+    ["qwen3-8b"]="25428"
			
 
				+    ["qwen3.6-27b"]="25424"
			
 
				     ["qwen3.5-35b"]="25427"
			
 
				     ["qwen3.5-122b"]="25423"
			
 
				     ["qwen3-embedding-8b"]="25425"
			
@@ -21,22 +22,32 @@ declare -A MODEL_PORTS=(
 
				 )
			
 
				 
			
 
				 declare -A MODEL_PATHS=(
			
 
				-    ["qwen3-8b"]="/model/Qwen3-8B"
			
 
				-    ["qwen3.5-35b"]="/model/Qwen3.5-35B"
			
 
				-    ["qwen3.5-122b"]="/model/Qwen3.5-122B-A10B"
			
 
				-    ["qwen3-embedding-8b"]="/model/Qwen3-Embedding-8B"
			
 
				-    ["qwen3-reranker-8b"]="/model/Qwen3-Reranker-8B"
			
 
				+    ["qwen3-8b"]="Qwen/Qwen3-8B"
			
 
				+    ["qwen3.6-27b"]="Qwen/Qwen3.6-27B"
			
 
				+    ["qwen3.5-35b"]="Qwen/Qwen3.5-35B-A3B"
			
 
				+    ["qwen3.5-122b"]="Qwen/Qwen3.5-122B-A10B"
			
 
				+    ["qwen3-embedding-8b"]="Qwen/Qwen3-Embedding-8B"
			
 
				+    ["qwen3-reranker-8b"]="Qwen/Qwen3-Reranker-8B"
			
 
				 )
			
 
				 
			
 
				 declare -A MODEL_TYPES=(
			
 
				     ["qwen3-8b"]="chat"
			
 
				+    ["qwen3.6-27b"]="chat"
			
 
				     ["qwen3.5-35b"]="chat"
			
 
				     ["qwen3.5-122b"]="chat"
			
 
				     ["qwen3-embedding-8b"]="embedding"
			
 
				     ["qwen3-reranker-8b"]="rerank"
			
 
				 )
			
 
				 
			
 
				-API_KEY="lq123456"
			
 
				+declare -A API_KEYS=(
			
 
				+    ["qwen3-8b"]="sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615"
			
 
				+    ["qwen3.6-27b"]="sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6"
			
 
				+    ["qwen3.5-35b"]="sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae"
			
 
				+    ["qwen3.5-122b"]="sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f"
			
 
				+    ["qwen3-embedding-8b"]="sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f"
			
 
				+    ["qwen3-reranker-8b"]="sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608"
			
 
				+)
			
 
				+
			
 
				 TIMEOUT=30
			
 
				 
			
 
				 echo "========================================"
			
@@ -52,6 +63,7 @@ test_chat_model() {
 
				     local name=$1
			
 
				     local port=$2
			
 
				     local model_path=$3
			
 
				+    local api_key=${API_KEYS[$name]}
			
 
				     
			
 
				     echo ""
			
 
				     echo "----------------------------------------"
			
@@ -66,7 +78,7 @@ test_chat_model() {
 
				     response=$(curl -s -w "\n%{http_code}" \
			
 
				         --max-time $TIMEOUT \
			
 
				         -H "Content-Type: application/json" \
			
 
				-        -H "Authorization: Bearer $API_KEY" \
			
 
				+        -H "Authorization: Bearer $api_key" \
			
 
				         -d "{
			
 
				             \"model\": \"$model_path\",
			
 
				             \"messages\": [{\"role\": \"user\", \"content\": \"你好，请用一句话介绍自己\"}],
			
@@ -95,6 +107,7 @@ test_embedding_model() {
 
				     local name=$1
			
 
				     local port=$2
			
 
				     local model_path=$3
			
 
				+    local api_key=${API_KEYS[$name]}
			
 
				     
			
 
				     echo ""
			
 
				     echo "----------------------------------------"
			
@@ -109,7 +122,7 @@ test_embedding_model() {
 
				     response=$(curl -s -w "\n%{http_code}" \
			
 
				         --max-time $TIMEOUT \
			
 
				         -H "Content-Type: application/json" \
			
 
				-        -H "Authorization: Bearer $API_KEY" \
			
 
				+        -H "Authorization: Bearer $api_key" \
			
 
				         -d "{
			
 
				             \"model\": \"$model_path\",
			
 
				             \"input\": [\"你好，这是一个测试句子\", \"Hello world\"]
			
@@ -130,7 +143,7 @@ test_embedding_model() {
 
				         response=$(curl -s -w "\n%{http_code}" \
			
 
				             --max-time $TIMEOUT \
			
 
				             -H "Content-Type: application/json" \
			
 
				-            -H "Authorization: Bearer $API_KEY" \
			
 
				+            -H "Authorization: Bearer $api_key" \
			
 
				             -d "{
			
 
				                 \"model\": \"$model_path\",
			
 
				                 \"query\": \"测试查询\",
			
@@ -154,6 +167,7 @@ test_rerank_model() {
 
				     local name=$1
			
 
				     local port=$2
			
 
				     local model_path=$3
			
 
				+    local api_key=${API_KEYS[$name]}
			
 
				     
			
 
				     echo ""
			
 
				     echo "----------------------------------------"
			
@@ -168,7 +182,7 @@ test_rerank_model() {
 
				     response=$(curl -s -w "\n%{http_code}" \
			
 
				         --max-time $TIMEOUT \
			
 
				         -H "Content-Type: application/json" \
			
 
				-        -H "Authorization: Bearer $API_KEY" \
			
 
				+        -H "Authorization: Bearer $api_key" \
			
 
				         -d "{
			
 
				             \"model\": \"$model_path\",
			
 
				             \"query\": \"什么是机器学习\",
			
@@ -191,7 +205,7 @@ test_rerank_model() {
 
				         response=$(curl -s -w "\n%{http_code}" \
			
 
				             --max-time $TIMEOUT \
			
 
				             -H "Content-Type: application/json" \
			
 
				-            -H "Authorization: Bearer $API_KEY" \
			
 
				+            -H "Authorization: Bearer $api_key" \
			
 
				             -d "{
			
 
				                 \"model\": \"$model_path\",
			
 
				                 \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
			
@@ -219,11 +233,12 @@ quick_check() {
 
				     
			
 
				     for key in "${MODEL_NAMES[@]}"; do
			
 
				         local port=${MODEL_PORTS[$key]}
			
 
				-        
			
 
				+        local api_key=${API_KEYS[$key]}
			
 
				+
			
 
				         local code
			
 
				         code=$(curl -s -o /dev/null -w "%{http_code}" \
			
 
				             --max-time 5 \
			
 
				-            -H "Authorization: Bearer $API_KEY" \
			
 
				+            -H "Authorization: Bearer $api_key" \
			
 
				             "http://localhost:$port/v1/models" 2>/dev/null || echo "000")
			
 
				         
			
 
				         if [ "$code" = "200" ]; then
			
--- a/prod/models/sglang/test_models_20260518.sh
+++ b/prod/models/sglang/test_models_20260518.sh
@@ -0,0 +1,284 @@
 
				+#!/bin/bash
				+
				+# SGLang multi-model smoke test driven by curl.
				+# `set -e` is intentionally not enabled, so a failing check does not
				+# stop the remaining ones.
				+
				+# ANSI colour escapes for coloured status output.
				+GREEN='\033[0;32m'
				+RED='\033[0;31m'
				+YELLOW='\033[1;33m'
				+NC='\033[0m'
				+
				+# Model registry. MODEL_NAMES lists the keys in a fixed order; the
				+# associative arrays below are all indexed by those keys.
				+MODEL_NAMES=("qwen3-8b" "qwen3.5-35b" "qwen3.5-122b" "qwen3-embedding-8b" "qwen3-reranker-8b")
				+
				+# Port number per model.
				+declare -A MODEL_PORTS=(
				+    ["qwen3-8b"]="25424"
				+    ["qwen3.5-35b"]="25427"
				+    ["qwen3.5-122b"]="25423"
				+    ["qwen3-embedding-8b"]="25425"
				+    ["qwen3-reranker-8b"]="25426"
				+)
				+
				+# Model path per model.
				+declare -A MODEL_PATHS=(
				+    ["qwen3-8b"]="/model/Qwen3-8B"
				+    ["qwen3.5-35b"]="/model/Qwen3.5-35B"
				+    ["qwen3.5-122b"]="/model/Qwen3.5-122B-A10B"
				+    ["qwen3-embedding-8b"]="/model/Qwen3-Embedding-8B"
				+    ["qwen3-reranker-8b"]="/model/Qwen3-Reranker-8B"
				+)
				+
				+# Model kind per model: chat | embedding | rerank.
				+declare -A MODEL_TYPES=(
				+    ["qwen3-8b"]="chat"
				+    ["qwen3.5-35b"]="chat"
				+    ["qwen3.5-122b"]="chat"
				+    ["qwen3-embedding-8b"]="embedding"
				+    ["qwen3-reranker-8b"]="rerank"
				+)
				+
				+# API key. Taken from the environment when set, so the credential can
				+# be supplied without editing this file; otherwise the previous
				+# built-in value is used.
				+API_KEY="${API_KEY:-lq123456}"
				+# Timeout value (default 30); also overridable from the environment.
				+TIMEOUT="${TIMEOUT:-30}"
				+
				+echo "========================================"
				+echo "SGLang 多模型健康检查 (curl)"
				+echo "时间: $(date '+%Y-%m-%d %H:%M:%S')"
				+echo "========================================"
				+
				+# Running counters, both starting at zero.
				+TOTAL=0
				+SUCCESS=0
			
 
				+
			
 
				+# 测试对话模型
			
 
				+test_chat_model() {
			
 
				+    local name=$1
			
 
				+    local port=$2
			
 
				+    local model_path=$3
			
 
				+    
			
 
				+    echo ""
			
 
				+    echo "----------------------------------------"
			
 
				+    echo "测试模型: $name (对话模型)"
			
 
				+    echo "端口: $port"
			
 
				+    echo "----------------------------------------"
			
 
				+    
			
 
				+    local response
			
 
				+    local body
			
 
				+    local code
			
 
				+    
			
 
				+    response=$(curl -s -w "\n%{http_code}" \
			
 
				+        --max-time $TIMEOUT \
			
 
				+        -H "Content-Type: application/json" \
			
 
				+        -H "Authorization: Bearer $API_KEY" \
			
 
				+        -d "{
			
 
				+            \"model\": \"$model_path\",
			
 
				+            \"messages\": [{\"role\": \"user\", \"content\": \"你好，请用一句话介绍自己\"}],
			
 
				+            \"temperature\": 0.7,
			
 
				+            \"max_tokens\": 50
			
 
				+        }" \
			
 
				+        "http://localhost:$port/v1/chat/completions" 2>/dev/null || echo -e "\n000")
			
 
				+    
			
 
				+    body=$(echo "$response" | head -n -1)
			
 
				+    code=$(echo "$response" | tail -n 1)
			
 
				+    
			
 
				+    if [ "$code" = "200" ]; then
			
 
				+        local content=$(echo "$body" | grep -o '"content":"[^"]*"' | head -1 | cut -d'"' -f4)
			
 
				+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
			
 
				+        echo "回复: ${content:0:100}..."
			
 
				+        ((SUCCESS++))
			
 
				+    else
			
 
				+        echo -e "${RED}❌ 失败${NC} HTTP $code"
			
 
				+        echo "响应: ${body:0:200}"
			
 
				+    fi
			
 
				+    ((TOTAL++))
			
 
				+}
			
 
				+
			
 
				+# Test an embedding model through /v1/embeddings, falling back to
				+# /v1/rerank when the embeddings call does not return HTTP 200.
				+#
				+# Arguments:
				+#   $1  name        display name used in the report
				+#   $2  port        localhost port to query
				+#   $3  model_path  value sent as the request's "model" field
				+# Globals read:    API_KEY, TIMEOUT, GREEN, YELLOW, RED, NC
				+# Globals written: TOTAL (always +1), SUCCESS (+1 if either call returns 200)
				+test_embedding_model() {
				+    local name=$1
				+    local port=$2
				+    local model_path=$3
				+
				+    echo ""
				+    echo "----------------------------------------"
				+    echo "测试模型: $name (嵌入模型)"
				+    echo "端口: $port"
				+    echo "----------------------------------------"
				+
				+    local response body code vector dims
				+
				+    # curl's --write-out already prints 000 on a failed transfer, so no
				+    # extra fallback status is appended (it would leak into $body).
				+    response=$(curl -s -w "\n%{http_code}" \
				+        --max-time "$TIMEOUT" \
				+        -H "Content-Type: application/json" \
				+        -H "Authorization: Bearer $API_KEY" \
				+        -d "{
				+            \"model\": \"$model_path\",
				+            \"input\": [\"你好，这是一个测试句子\", \"Hello world\"]
				+        }" \
				+        "http://localhost:$port/v1/embeddings" 2>/dev/null)
				+
				+    # The last line is the status code, everything before it the body.
				+    code=${response##*$'\n'}
				+    body=${response%$'\n'*}
				+    if [[ ! $code =~ ^[0-9]{3}$ ]]; then
				+        code="000"
				+    fi
				+
				+    if [ "$code" = "200" ]; then
				+        # Dimension = number of commas in the first embedding array + 1.
				+        # When no embedding array is present, report "unknown" instead
				+        # of a misleading dimension of 1.
				+        vector=$(echo "$body" | grep -o '"embedding":\[[^]]*\]' | head -1)
				+        if [ -n "$vector" ]; then
				+            dims=$(echo "$vector" | grep -o ',' | wc -l)
				+            dims=$((dims + 1))
				+        else
				+            dims="未知"
				+        fi
				+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
				+        echo "向量维度: $dims"
				+        ((SUCCESS++))
				+    else
				+        echo -e "${YELLOW}⚠️  Embedding 接口失败，尝试 Rerank 接口...${NC}"
				+        response=$(curl -s -w "\n%{http_code}" \
				+            --max-time "$TIMEOUT" \
				+            -H "Content-Type: application/json" \
				+            -H "Authorization: Bearer $API_KEY" \
				+            -d "{
				+                \"model\": \"$model_path\",
				+                \"query\": \"测试查询\",
				+                \"documents\": [\"文档1\", \"文档2\"]
				+            }" \
				+            "http://localhost:$port/v1/rerank" 2>/dev/null)
				+
				+        code=${response##*$'\n'}
				+        if [[ ! $code =~ ^[0-9]{3}$ ]]; then
				+            code="000"
				+        fi
				+        if [ "$code" = "200" ]; then
				+            echo -e "${GREEN}✅ 成功${NC} (Rerank 接口可用)"
				+            ((SUCCESS++))
				+        else
				+            echo -e "${RED}❌ 失败${NC} HTTP $code"
				+        fi
				+    fi
				+    ((TOTAL++))
				+}
			
 
				+
			
 
				+# Test a rerank model through /v1/rerank, falling back to
				+# /v1/chat/completions when the rerank call does not return HTTP 200.
				+#
				+# Arguments:
				+#   $1  name        display name used in the report
				+#   $2  port        localhost port to query
				+#   $3  model_path  value sent as the request's "model" field
				+# Globals read:    API_KEY, TIMEOUT, GREEN, YELLOW, RED, NC
				+# Globals written: TOTAL (always +1), SUCCESS (+1 if either call returns 200)
				+test_rerank_model() {
				+    local name=$1
				+    local port=$2
				+    local model_path=$3
				+
				+    echo ""
				+    echo "----------------------------------------"
				+    echo "测试模型: $name (重排序模型)"
				+    echo "端口: $port"
				+    echo "----------------------------------------"
				+
				+    local response body code top_doc score
				+
				+    # curl's --write-out already prints 000 on a failed transfer, so no
				+    # extra fallback status is appended (it would leak into $body).
				+    response=$(curl -s -w "\n%{http_code}" \
				+        --max-time "$TIMEOUT" \
				+        -H "Content-Type: application/json" \
				+        -H "Authorization: Bearer $API_KEY" \
				+        -d "{
				+            \"model\": \"$model_path\",
				+            \"query\": \"什么是机器学习\",
				+            \"documents\": [\"机器学习是AI的分支\", \"Python是编程语言\", \"深度学习使用神经网络\"],
				+            \"top_n\": 2
				+        }" \
				+        "http://localhost:$port/v1/rerank" 2>/dev/null)
				+
				+    # The last line is the status code, everything before it the body.
				+    code=${response##*$'\n'}
				+    body=${response%$'\n'*}
				+    if [[ ! $code =~ ^[0-9]{3}$ ]]; then
				+        code="000"
				+    fi
				+
				+    if [ "$code" = "200" ]; then
				+        # First "text" string and first "score" number found in the body.
				+        top_doc=$(echo "$body" | grep -o '"text":"[^"]*"' | head -1 | cut -d'"' -f4)
				+        # Accept a leading minus sign and exponent notation so that
				+        # negative or very small scores are not printed as empty.
				+        score=$(echo "$body" | grep -oE '"score":-?[0-9][0-9.eE+-]*' | head -1 | cut -d':' -f2)
				+        echo -e "${GREEN}✅ 成功${NC} HTTP $code"
				+        echo "Top1: ${top_doc:0:50}... (得分: $score)"
				+        ((SUCCESS++))
				+    else
				+        echo -e "${YELLOW}⚠️  Rerank 接口失败，尝试 Chat 接口...${NC}"
				+        response=$(curl -s -w "\n%{http_code}" \
				+            --max-time "$TIMEOUT" \
				+            -H "Content-Type: application/json" \
				+            -H "Authorization: Bearer $API_KEY" \
				+            -d "{
				+                \"model\": \"$model_path\",
				+                \"messages\": [{\"role\": \"user\", \"content\": \"你好\"}],
				+                \"max_tokens\": 20
				+            }" \
				+            "http://localhost:$port/v1/chat/completions" 2>/dev/null)
				+
				+        code=${response##*$'\n'}
				+        if [[ ! $code =~ ^[0-9]{3}$ ]]; then
				+            code="000"
				+        fi
				+        if [ "$code" = "200" ]; then
				+            echo -e "${GREEN}✅ 成功${NC} (Chat 接口可用)"
				+            ((SUCCESS++))
				+        else
				+            echo -e "${RED}❌ 失败${NC} HTTP $code"
				+        fi
				+    fi
				+    ((TOTAL++))
				+}
			
 
				+
			
 
				+# 快速检查
			
 
				+quick_check() {
			
 
				+    echo ""
			
 
				+    echo "========================================"
			
 
				+    echo "快速检查模式"
			
 
				+    echo "========================================"
			
 
				+    
			
 
				+    for key in "${MODEL_NAMES[@]}"; do
			
 
				+        local port=${MODEL_PORTS[$key]}
			
 
				+        
			
 
				+        local code
			
 
				+        code=$(curl -s -o /dev/null -w "%{http_code}" \
			
 
				+            --max-time 5 \
			
 
				+            -H "Authorization: Bearer $API_KEY" \
			
 
				+            "http://localhost:$port/v1/models" 2>/dev/null || echo "000")
			
 
				+        
			
 
				+        if [ "$code" = "200" ]; then
			
 
				+            echo -e "${GREEN}✅${NC} $key (端口 $port)"
			
 
				+            ((SUCCESS++))
			
 
				+        else
			
 
				+            echo -e "${RED}❌${NC} $key (端口 $port) HTTP $code"
			
 
				+        fi
			
 
				+        ((TOTAL++))
			
 
				+    done
			
 
				+}
			
 
				+
			
 
				+# 主函数
			
 
				+main() {
			
 
				+    if [ "$1" = "--quick" ]; then
			
 
				+        quick_check
			
 
				+    elif [ "$1" = "--model" ] && [ -n "$2" ]; then
			
 
				+        local key=$2
			
 
				+        local port=${MODEL_PORTS[$key]}
			
 
				+        local path=${MODEL_PATHS[$key]}
			
 
				+        local mtype=${MODEL_TYPES[$key]}
			
 
				+        
			
 
				+        case $mtype in
			
 
				+            chat) test_chat_model "$key" "$port" "$path" ;;
			
 
				+            embedding) test_embedding_model "$key" "$port" "$path" ;;
			
 
				+            rerank) test_rerank_model "$key" "$port" "$path" ;;
			
 
				+        esac
			
 
				+    else
			
 
				+        # 按顺序测试所有模型
			
 
				+        for key in "${MODEL_NAMES[@]}"; do
			
 
				+            local port=${MODEL_PORTS[$key]}
			
 
				+            local path=${MODEL_PATHS[$key]}
			
 
				+            local mtype=${MODEL_TYPES[$key]}
			
 
				+            
			
 
				+            case $mtype in
			
 
				+                chat) test_chat_model "$key" "$port" "$path" ;;
			
 
				+                embedding) test_embedding_model "$key" "$port" "$path" ;;
			
 
				+                rerank) test_rerank_model "$key" "$port" "$path" ;;
			
 
				+            esac
			
 
				+        done
			
 
				+    fi
			
 
				+    
			
 
				+    echo ""
			
 
				+    echo "========================================"
			
 
				+    echo "测试结果摘要"
			
 
				+    echo "========================================"
			
 
				+    echo "总计: $SUCCESS / $TOTAL 个模型正常"
			
 
				+    
			
 
				+    if [ $SUCCESS -eq $TOTAL ]; then
			
 
				+        echo -e "${GREEN}所有模型运行正常！${NC}"
			
 
				+        exit 0
			
 
				+    else
			
 
				+        echo -e "${RED}部分模型异常，请检查日志${NC}"
			
 
				+        exit 1
			
 
				+    fi
			
 
				+}
			
 
				+
			
 
				+main "$@"
			
--- a/prod/models/sglang/需求文档.md
+++ b/prod/models/sglang/需求文档.md
@@ -0,0 +1,17 @@
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+### 根据最新的信息修改脚本 test_models.sh
			
 
				+    - 增加模型测试 qwen3.6-27b
			
 
				+      - 模型名称： Qwen3.6-27B
			
 
				+      - 端口号：   25424
			
 
				+    - 修改不同模型不同的API KEY
			
 
				+      - Qwen/Qwen3.5-122B-A10B  sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
			
 
				+      - Qwen/Qwen3-Embedding-8B sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
			
 
				+      - Qwen/Qwen3-Reranker-8B  sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
			
 
				+      - Qwen/Qwen3.5-35B-A3B    sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
			
 
				+      - Qwen/Qwen3.6-27B        sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
			
 
				+      - Qwen/Qwen3-8B           sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615