CRBC-MaaS-Platform-Project
/
LQDeployConfig


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
							2026-05-15 08:41:06 - evalscope - INFO: Starting benchmark with args: 
2026-05-15 08:41:06 - evalscope - INFO: {
    "model": "Qwen3.6-27B-W8A8",
    "model_id": "Qwen3.6-27B-W8A8",
    "attn_implementation": null,
    "api": "openai",
    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
    "port": 8877,
    "url": "http://127.0.0.1:8004/v1/chat/completions",
    "headers": {
        "Authorization": "Bearer sk-123456"
    },
    "connect_timeout": null,
    "read_timeout": null,
    "total_timeout": 21600,
    "api_key": "sk-123456",
    "no_test_connection": false,
    "number": 1,
    "parallel": 1,
    "rate": -1,
    "sleep_interval": 5,
    "sla_auto_tune": false,
    "sla_variable": "parallel",
    "sla_params": null,
    "sla_num_runs": 3,
    "sla_upper_bound": 65536,
    "sla_lower_bound": 1,
    "db_commit_interval": 1000,
    "queue_size_multiplier": 5,
    "in_flight_task_multiplier": 2,
    "log_every_n_query": 10,
    "debug": false,
    "visualizer": null,
    "wandb_api_key": null,
    "swanlab_api_key": null,
    "name": null,
    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
    "no_timestamp": false,
    "max_prompt_length": 2048,
    "min_prompt_length": 2048,
    "prefix_length": 0,
    "prompt": null,
    "query_template": null,
    "apply_chat_template": true,
    "image_width": 224,
    "image_height": 224,
    "image_format": "RGB",
    "image_num": 1,
    "image_patch_size": 28,
    "dataset": "random",
    "dataset_path": null,
    "frequency_penalty": null,
    "repetition_penalty": null,
    "logprobs": null,
    "max_tokens": 128,
    "min_tokens": 128,
    "n_choices": null,
    "seed": null,
    "stop": null,
    "stop_token_ids": null,
    "stream": true,
    "temperature": 0.0,
    "top_p": null,
    "top_k": null,
    "extra_args": {}
}
2026-05-15 08:41:22 - evalscope - INFO: Test connection successful.
2026-05-15 08:41:25 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
2026-05-15 08:41:25 - evalscope - INFO: Sampling input lengths from [2046, 2047)
2026-05-15 08:41:26 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
2026-05-15 08:41:33 - evalscope - INFO: Processing 100%| 1/1 [Elapsed: 00:07 < Remaining: 00:00,  7.37s/it]
2026-05-15 08:41:33 - evalscope - INFO: 
Benchmarking summary:
+-----------------------------------+-----------+
| Key                               |     Value |
+===================================+===========+
| Time taken for tests (s)          |    7.3689 |
+-----------------------------------+-----------+
| Number of concurrency             |    1      |
+-----------------------------------+-----------+
| Request rate (req/s)              |   -1      |
+-----------------------------------+-----------+
| Total requests                    |    1      |
+-----------------------------------+-----------+
| Succeed requests                  |    1      |
+-----------------------------------+-----------+
| Failed requests                   |    0      |
+-----------------------------------+-----------+
| Output token throughput (tok/s)   |   17.3704 |
+-----------------------------------+-----------+
| Total token throughput (tok/s)    |  296.383  |
+-----------------------------------+-----------+
| Request throughput (req/s)        |    0.1357 |
+-----------------------------------+-----------+
| Average latency (s)               |    7.3689 |
+-----------------------------------+-----------+
| Average time to first token (s)   |    0.8157 |
+-----------------------------------+-----------+
| Average time per output token (s) |    0.0516 |
+-----------------------------------+-----------+
| Average inter-token latency (s)   |    0.0512 |
+-----------------------------------+-----------+
| Average input tokens per request  | 2056      |
+-----------------------------------+-----------+
| Average output tokens per request |  128      |
+-----------------------------------+-----------+
2026-05-15 08:41:33 - evalscope - INFO: 
Percentile results:
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
|     10%     |  0.8157  | 0.0514  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     25%     |  0.8157  | 0.0515  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     50%     |  0.8157  | 0.0516  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     66%     |  0.8157  | 0.0517  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     75%     |  0.8157  | 0.0518  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     80%     |  0.8157  | 0.0518  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     90%     |  0.8157  | 0.0519  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     95%     |  0.8157  |  0.052  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     98%     |  0.8157  | 0.0523  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
|     99%     |  0.8157  | 0.0523  |  0.0516  |   7.3689    |     2056     |      128      |    17.3704     |   296.3827    |
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
2026-05-15 08:41:33 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1
2026-05-15 08:41:33 - evalscope - INFO: Sleeping for 5 seconds before the next run...
2026-05-15 08:41:38 - evalscope - INFO: Starting benchmark with args: 
2026-05-15 08:41:38 - evalscope - INFO: {
    "model": "Qwen3.6-27B-W8A8",
    "model_id": "Qwen3.6-27B-W8A8",
    "attn_implementation": null,
    "api": "openai",
    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
    "port": 8877,
    "url": "http://127.0.0.1:8004/v1/chat/completions",
    "headers": {
        "Authorization": "Bearer sk-123456"
    },
    "connect_timeout": null,
    "read_timeout": null,
    "total_timeout": 21600,
    "api_key": "sk-123456",
    "no_test_connection": false,
    "number": 5,
    "parallel": 5,
    "rate": -1,
    "sleep_interval": 5,
    "sla_auto_tune": false,
    "sla_variable": "parallel",
    "sla_params": null,
    "sla_num_runs": 3,
    "sla_upper_bound": 65536,
    "sla_lower_bound": 1,
    "db_commit_interval": 1000,
    "queue_size_multiplier": 5,
    "in_flight_task_multiplier": 2,
    "log_every_n_query": 10,
    "debug": false,
    "visualizer": null,
    "wandb_api_key": null,
    "swanlab_api_key": null,
    "name": null,
    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
    "no_timestamp": false,
    "max_prompt_length": 2048,
    "min_prompt_length": 2048,
    "prefix_length": 0,
    "prompt": null,
    "query_template": null,
    "apply_chat_template": true,
    "image_width": 224,
    "image_height": 224,
    "image_format": "RGB",
    "image_num": 1,
    "image_patch_size": 28,
    "dataset": "random",
    "dataset_path": null,
    "frequency_penalty": null,
    "repetition_penalty": null,
    "logprobs": null,
    "max_tokens": 128,
    "min_tokens": 128,
    "n_choices": null,
    "seed": null,
    "stop": null,
    "stop_token_ids": null,
    "stream": true,
    "temperature": 0.0,
    "top_p": null,
    "top_k": null,
    "extra_args": {}
}
2026-05-15 08:41:48 - evalscope - INFO: Test connection successful.
2026-05-15 08:41:51 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
2026-05-15 08:41:51 - evalscope - INFO: Sampling input lengths from [2046, 2047)
2026-05-15 08:41:51 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
2026-05-15 08:41:58 - evalscope - INFO: Processing 100%| 5/5 [Elapsed: 00:06 < Remaining: 00:00,  1.01it/s]
2026-05-15 08:41:58 - evalscope - INFO: 
Benchmarking summary:
+-----------------------------------+-----------+
| Key                               |     Value |
+===================================+===========+
| Time taken for tests (s)          |    6.6303 |
+-----------------------------------+-----------+
| Number of concurrency             |    5      |
+-----------------------------------+-----------+
| Request rate (req/s)              |   -1      |
+-----------------------------------+-----------+
| Total requests                    |    5      |
+-----------------------------------+-----------+
| Succeed requests                  |    5      |
+-----------------------------------+-----------+
| Failed requests                   |    0      |
+-----------------------------------+-----------+
| Output token throughput (tok/s)   |   96.5268 |
+-----------------------------------+-----------+
| Total token throughput (tok/s)    | 1647.29   |
+-----------------------------------+-----------+
| Request throughput (req/s)        |    0.7541 |
+-----------------------------------+-----------+
| Average latency (s)               |    6.5697 |
+-----------------------------------+-----------+
| Average time to first token (s)   |    2.1216 |
+-----------------------------------+-----------+
| Average time per output token (s) |    0.035  |
+-----------------------------------+-----------+
| Average inter-token latency (s)   |    0.0348 |
+-----------------------------------+-----------+
| Average input tokens per request  | 2056.4    |
+-----------------------------------+-----------+
| Average output tokens per request |  128      |
+-----------------------------------+-----------+
2026-05-15 08:41:58 - evalscope - INFO: 
Percentile results:
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
|     10%     |  0.6655  | 0.0295  |  0.0299  |   6.5201    |     2056     |      128      |    19.3128     |   329.6753    |
|     25%     |  1.6872  | 0.0296  |  0.0304  |   6.5483    |     2056     |      128      |    19.4617     |   332.0644    |
|     50%     |  2.7127  | 0.0297  |  0.0304  |   6.5751    |     2056     |      128      |    19.4674     |   332.1626    |
|     66%     |  2.7127  | 0.0298  |  0.0383  |    6.577    |     2057     |      128      |    19.5469     |   333.6721    |
|     75%     |  2.7127  | 0.0299  |  0.0383  |    6.577    |     2057     |      128      |    19.5469     |   333.6721    |
|     80%     |  2.8297  | 0.0299  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
|     90%     |  2.8297  | 0.0301  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
|     95%     |  2.8297  | 0.0304  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
|     98%     |  2.8297  |  0.031  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
|     99%     |  2.8297  | 0.1171  |  0.0461  |   6.6277    |     2057     |      128      |    19.6315     |   334.9623    |
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
2026-05-15 08:41:58 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5
2026-05-15 08:41:58 - evalscope - INFO: Sleeping for 5 seconds before the next run...
2026-05-15 08:42:03 - evalscope - INFO: Starting benchmark with args: 
2026-05-15 08:42:03 - evalscope - INFO: {
    "model": "Qwen3.6-27B-W8A8",
    "model_id": "Qwen3.6-27B-W8A8",
    "attn_implementation": null,
    "api": "openai",
    "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
    "port": 8877,
    "url": "http://127.0.0.1:8004/v1/chat/completions",
    "headers": {
        "Authorization": "Bearer sk-123456"
    },
    "connect_timeout": null,
    "read_timeout": null,
    "total_timeout": 21600,
    "api_key": "sk-123456",
    "no_test_connection": false,
    "number": 10,
    "parallel": 10,
    "rate": -1,
    "sleep_interval": 5,
    "sla_auto_tune": false,
    "sla_variable": "parallel",
    "sla_params": null,
    "sla_num_runs": 3,
    "sla_upper_bound": 65536,
    "sla_lower_bound": 1,
    "db_commit_interval": 1000,
    "queue_size_multiplier": 5,
    "in_flight_task_multiplier": 2,
    "log_every_n_query": 10,
    "debug": false,
    "visualizer": null,
    "wandb_api_key": null,
    "swanlab_api_key": null,
    "name": null,
    "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
    "no_timestamp": false,
    "max_prompt_length": 2048,
    "min_prompt_length": 2048,
    "prefix_length": 0,
    "prompt": null,
    "query_template": null,
    "apply_chat_template": true,
    "image_width": 224,
    "image_height": 224,
    "image_format": "RGB",
    "image_num": 1,
    "image_patch_size": 28,
    "dataset": "random",
    "dataset_path": null,
    "frequency_penalty": null,
    "repetition_penalty": null,
    "logprobs": null,
    "max_tokens": 128,
    "min_tokens": 128,
    "n_choices": null,
    "seed": null,
    "stop": null,
    "stop_token_ids": null,
    "stream": true,
    "temperature": 0.0,
    "top_p": null,
    "top_k": null,
    "extra_args": {}
}
2026-05-15 08:42:12 - evalscope - INFO: Test connection successful.
2026-05-15 08:42:16 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
2026-05-15 08:42:16 - evalscope - INFO: Sampling input lengths from [2046, 2047)
2026-05-15 08:42:16 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
2026-05-15 08:42:26 - evalscope - INFO: {
  "Time taken for tests (s)": 9.6293,
  "Number of concurrency": 10,
  "Request rate (req/s)": -1,
  "Total requests": 10,
  "Succeed requests": 10,
  "Failed requests": 0,
  "Output token throughput (tok/s)": 132.928,
  "Total token throughput (tok/s)": 2268.4997,
  "Request throughput (req/s)": 1.0385,
  "Average latency (s)": 9.567,
  "Average time to first token (s)": 3.6071,
  "Average time per output token (s)": 0.0469,
  "Average inter-token latency (s)": 0.0466,
  "Average input tokens per request": 2056.4,
  "Average output tokens per request": 128.0
}
2026-05-15 08:42:26 - evalscope - INFO: Processing 100%| 10/10 [Elapsed: 00:09 < Remaining: 00:00,  1.01s/it]
2026-05-15 08:42:26 - evalscope - INFO: 
Benchmarking summary:
+-----------------------------------+-----------+
| Key                               |     Value |
+===================================+===========+
| Time taken for tests (s)          |    9.6293 |
+-----------------------------------+-----------+
| Number of concurrency             |   10      |
+-----------------------------------+-----------+
| Request rate (req/s)              |   -1      |
+-----------------------------------+-----------+
| Total requests                    |   10      |
+-----------------------------------+-----------+
| Succeed requests                  |   10      |
+-----------------------------------+-----------+
| Failed requests                   |    0      |
+-----------------------------------+-----------+
| Output token throughput (tok/s)   |  132.928  |
+-----------------------------------+-----------+
| Total token throughput (tok/s)    | 2268.5    |
+-----------------------------------+-----------+
| Request throughput (req/s)        |    1.0385 |
+-----------------------------------+-----------+
| Average latency (s)               |    9.567  |
+-----------------------------------+-----------+
| Average time to first token (s)   |    3.6071 |
+-----------------------------------+-----------+
| Average time per output token (s) |    0.0469 |
+-----------------------------------+-----------+
| Average inter-token latency (s)   |    0.0466 |
+-----------------------------------+-----------+
| Average input tokens per request  | 2056.4    |
+-----------------------------------+-----------+
| Average output tokens per request |  128      |
+-----------------------------------+-----------+
2026-05-15 08:42:26 - evalscope - INFO: 
Percentile results:
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
|     10%     |  1.7449  | 0.0332  |  0.0332  |   9.5131    |     2056     |      128      |    13.2998     |   226.9284    |
|     25%     |  2.7691  | 0.0333  |  0.0376  |   9.5425    |     2056     |      128      |    13.3343     |   227.5162    |
|     50%     |  3.7961  | 0.0333  |  0.0455  |   9.5729    |     2056     |      128      |    13.3741     |   228.2483    |
|     66%     |  4.8242  | 0.0334  |  0.0533  |   9.5984    |     2056     |      128      |     13.412     |   228.8415    |
|     75%     |  4.825   | 0.0334  |  0.0533  |   9.5993    |     2057     |      128      |    13.4137     |   229.0807    |
|     80%     |  5.4098  | 0.0335  |  0.0612  |   9.6242    |     2057     |      128      |    13.4552     |   229.5791    |
|     90%     |   5.41   | 0.0336  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
|     95%     |   5.41   | 0.0337  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
|     98%     |   5.41   | 0.0367  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
|     99%     |   5.41   | 1.0169  |  0.0689  |   9.6253    |     2058     |      128      |    13.5021     |   230.4848    |
+-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
2026-05-15 08:42:26 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10
2026-05-15 08:42:26 - evalscope - INFO: Performance summary saved to: outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt