benchmark.log 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. 2026-05-15 08:41:06 - evalscope - INFO: Starting benchmark with args:
  2. 2026-05-15 08:41:06 - evalscope - INFO: {
  3. "model": "Qwen3.6-27B-W8A8",
  4. "model_id": "Qwen3.6-27B-W8A8",
  5. "attn_implementation": null,
  6. "api": "openai",
  7. "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
  8. "port": 8877,
  9. "url": "http://127.0.0.1:8004/v1/chat/completions",
  10. "headers": {
  11. "Authorization": "Bearer sk-123456"
  12. },
  13. "connect_timeout": null,
  14. "read_timeout": null,
  15. "total_timeout": 21600,
  16. "api_key": "sk-123456",
  17. "no_test_connection": false,
  18. "number": 1,
  19. "parallel": 1,
  20. "rate": -1,
  21. "sleep_interval": 5,
  22. "sla_auto_tune": false,
  23. "sla_variable": "parallel",
  24. "sla_params": null,
  25. "sla_num_runs": 3,
  26. "sla_upper_bound": 65536,
  27. "sla_lower_bound": 1,
  28. "db_commit_interval": 1000,
  29. "queue_size_multiplier": 5,
  30. "in_flight_task_multiplier": 2,
  31. "log_every_n_query": 10,
  32. "debug": false,
  33. "visualizer": null,
  34. "wandb_api_key": null,
  35. "swanlab_api_key": null,
  36. "name": null,
  37. "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
  38. "no_timestamp": false,
  39. "max_prompt_length": 2048,
  40. "min_prompt_length": 2048,
  41. "prefix_length": 0,
  42. "prompt": null,
  43. "query_template": null,
  44. "apply_chat_template": true,
  45. "image_width": 224,
  46. "image_height": 224,
  47. "image_format": "RGB",
  48. "image_num": 1,
  49. "image_patch_size": 28,
  50. "dataset": "random",
  51. "dataset_path": null,
  52. "frequency_penalty": null,
  53. "repetition_penalty": null,
  54. "logprobs": null,
  55. "max_tokens": 128,
  56. "min_tokens": 128,
  57. "n_choices": null,
  58. "seed": null,
  59. "stop": null,
  60. "stop_token_ids": null,
  61. "stream": true,
  62. "temperature": 0.0,
  63. "top_p": null,
  64. "top_k": null,
  65. "extra_args": {}
  66. }
  67. 2026-05-15 08:41:22 - evalscope - INFO: Test connection successful.
  68. 2026-05-15 08:41:25 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
  69. 2026-05-15 08:41:25 - evalscope - INFO: Sampling input lengths from [2046, 2047)
  70. 2026-05-15 08:41:26 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1/benchmark_data.db
  71. 2026-05-15 08:41:33 - evalscope - INFO: Processing 100%| 1/1 [Elapsed: 00:07 < Remaining: 00:00, 7.37s/it]
  72. 2026-05-15 08:41:33 - evalscope - INFO:
  73. Benchmarking summary:
  74. +-----------------------------------+-----------+
  75. | Key | Value |
  76. +===================================+===========+
  77. | Time taken for tests (s) | 7.3689 |
  78. +-----------------------------------+-----------+
  79. | Number of concurrency | 1 |
  80. +-----------------------------------+-----------+
  81. | Request rate (req/s) | -1 |
  82. +-----------------------------------+-----------+
  83. | Total requests | 1 |
  84. +-----------------------------------+-----------+
  85. | Succeed requests | 1 |
  86. +-----------------------------------+-----------+
  87. | Failed requests | 0 |
  88. +-----------------------------------+-----------+
  89. | Output token throughput (tok/s) | 17.3704 |
  90. +-----------------------------------+-----------+
  91. | Total token throughput (tok/s) | 296.383 |
  92. +-----------------------------------+-----------+
  93. | Request throughput (req/s) | 0.1357 |
  94. +-----------------------------------+-----------+
  95. | Average latency (s) | 7.3689 |
  96. +-----------------------------------+-----------+
  97. | Average time to first token (s) | 0.8157 |
  98. +-----------------------------------+-----------+
  99. | Average time per output token (s) | 0.0516 |
  100. +-----------------------------------+-----------+
  101. | Average inter-token latency (s) | 0.0512 |
  102. +-----------------------------------+-----------+
  103. | Average input tokens per request | 2056 |
  104. +-----------------------------------+-----------+
  105. | Average output tokens per request | 128 |
  106. +-----------------------------------+-----------+
  107. 2026-05-15 08:41:33 - evalscope - INFO:
  108. Percentile results:
  109. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  110. | Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
  111. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  112. | 10% | 0.8157 | 0.0514 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  113. | 25% | 0.8157 | 0.0515 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  114. | 50% | 0.8157 | 0.0516 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  115. | 66% | 0.8157 | 0.0517 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  116. | 75% | 0.8157 | 0.0518 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  117. | 80% | 0.8157 | 0.0518 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  118. | 90% | 0.8157 | 0.0519 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  119. | 95% | 0.8157 | 0.052 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  120. | 98% | 0.8157 | 0.0523 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  121. | 99% | 0.8157 | 0.0523 | 0.0516 | 7.3689 | 2056 | 128 | 17.3704 | 296.3827 |
  122. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  123. 2026-05-15 08:41:33 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_1_number_1
  124. 2026-05-15 08:41:33 - evalscope - INFO: Sleeping for 5 seconds before the next run...
  125. 2026-05-15 08:41:38 - evalscope - INFO: Starting benchmark with args:
  126. 2026-05-15 08:41:38 - evalscope - INFO: {
  127. "model": "Qwen3.6-27B-W8A8",
  128. "model_id": "Qwen3.6-27B-W8A8",
  129. "attn_implementation": null,
  130. "api": "openai",
  131. "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
  132. "port": 8877,
  133. "url": "http://127.0.0.1:8004/v1/chat/completions",
  134. "headers": {
  135. "Authorization": "Bearer sk-123456"
  136. },
  137. "connect_timeout": null,
  138. "read_timeout": null,
  139. "total_timeout": 21600,
  140. "api_key": "sk-123456",
  141. "no_test_connection": false,
  142. "number": 5,
  143. "parallel": 5,
  144. "rate": -1,
  145. "sleep_interval": 5,
  146. "sla_auto_tune": false,
  147. "sla_variable": "parallel",
  148. "sla_params": null,
  149. "sla_num_runs": 3,
  150. "sla_upper_bound": 65536,
  151. "sla_lower_bound": 1,
  152. "db_commit_interval": 1000,
  153. "queue_size_multiplier": 5,
  154. "in_flight_task_multiplier": 2,
  155. "log_every_n_query": 10,
  156. "debug": false,
  157. "visualizer": null,
  158. "wandb_api_key": null,
  159. "swanlab_api_key": null,
  160. "name": null,
  161. "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
  162. "no_timestamp": false,
  163. "max_prompt_length": 2048,
  164. "min_prompt_length": 2048,
  165. "prefix_length": 0,
  166. "prompt": null,
  167. "query_template": null,
  168. "apply_chat_template": true,
  169. "image_width": 224,
  170. "image_height": 224,
  171. "image_format": "RGB",
  172. "image_num": 1,
  173. "image_patch_size": 28,
  174. "dataset": "random",
  175. "dataset_path": null,
  176. "frequency_penalty": null,
  177. "repetition_penalty": null,
  178. "logprobs": null,
  179. "max_tokens": 128,
  180. "min_tokens": 128,
  181. "n_choices": null,
  182. "seed": null,
  183. "stop": null,
  184. "stop_token_ids": null,
  185. "stream": true,
  186. "temperature": 0.0,
  187. "top_p": null,
  188. "top_k": null,
  189. "extra_args": {}
  190. }
  191. 2026-05-15 08:41:48 - evalscope - INFO: Test connection successful.
  192. 2026-05-15 08:41:51 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
  193. 2026-05-15 08:41:51 - evalscope - INFO: Sampling input lengths from [2046, 2047)
  194. 2026-05-15 08:41:51 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5/benchmark_data.db
  195. 2026-05-15 08:41:58 - evalscope - INFO: Processing 100%| 5/5 [Elapsed: 00:06 < Remaining: 00:00, 1.01it/s]
  196. 2026-05-15 08:41:58 - evalscope - INFO:
  197. Benchmarking summary:
  198. +-----------------------------------+-----------+
  199. | Key | Value |
  200. +===================================+===========+
  201. | Time taken for tests (s) | 6.6303 |
  202. +-----------------------------------+-----------+
  203. | Number of concurrency | 5 |
  204. +-----------------------------------+-----------+
  205. | Request rate (req/s) | -1 |
  206. +-----------------------------------+-----------+
  207. | Total requests | 5 |
  208. +-----------------------------------+-----------+
  209. | Succeed requests | 5 |
  210. +-----------------------------------+-----------+
  211. | Failed requests | 0 |
  212. +-----------------------------------+-----------+
  213. | Output token throughput (tok/s) | 96.5268 |
  214. +-----------------------------------+-----------+
  215. | Total token throughput (tok/s) | 1647.29 |
  216. +-----------------------------------+-----------+
  217. | Request throughput (req/s) | 0.7541 |
  218. +-----------------------------------+-----------+
  219. | Average latency (s) | 6.5697 |
  220. +-----------------------------------+-----------+
  221. | Average time to first token (s) | 2.1216 |
  222. +-----------------------------------+-----------+
  223. | Average time per output token (s) | 0.035 |
  224. +-----------------------------------+-----------+
  225. | Average inter-token latency (s) | 0.0348 |
  226. +-----------------------------------+-----------+
  227. | Average input tokens per request | 2056.4 |
  228. +-----------------------------------+-----------+
  229. | Average output tokens per request | 128 |
  230. +-----------------------------------+-----------+
  231. 2026-05-15 08:41:58 - evalscope - INFO:
  232. Percentile results:
  233. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  234. | Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
  235. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  236. | 10% | 0.6655 | 0.0295 | 0.0299 | 6.5201 | 2056 | 128 | 19.3128 | 329.6753 |
  237. | 25% | 1.6872 | 0.0296 | 0.0304 | 6.5483 | 2056 | 128 | 19.4617 | 332.0644 |
  238. | 50% | 2.7127 | 0.0297 | 0.0304 | 6.5751 | 2056 | 128 | 19.4674 | 332.1626 |
  239. | 66% | 2.7127 | 0.0298 | 0.0383 | 6.577 | 2057 | 128 | 19.5469 | 333.6721 |
  240. | 75% | 2.7127 | 0.0299 | 0.0383 | 6.577 | 2057 | 128 | 19.5469 | 333.6721 |
  241. | 80% | 2.8297 | 0.0299 | 0.0461 | 6.6277 | 2057 | 128 | 19.6315 | 334.9623 |
  242. | 90% | 2.8297 | 0.0301 | 0.0461 | 6.6277 | 2057 | 128 | 19.6315 | 334.9623 |
  243. | 95% | 2.8297 | 0.0304 | 0.0461 | 6.6277 | 2057 | 128 | 19.6315 | 334.9623 |
  244. | 98% | 2.8297 | 0.031 | 0.0461 | 6.6277 | 2057 | 128 | 19.6315 | 334.9623 |
  245. | 99% | 2.8297 | 0.1171 | 0.0461 | 6.6277 | 2057 | 128 | 19.6315 | 334.9623 |
  246. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  247. 2026-05-15 08:41:58 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_5_number_5
  248. 2026-05-15 08:41:58 - evalscope - INFO: Sleeping for 5 seconds before the next run...
  249. 2026-05-15 08:42:03 - evalscope - INFO: Starting benchmark with args:
  250. 2026-05-15 08:42:03 - evalscope - INFO: {
  251. "model": "Qwen3.6-27B-W8A8",
  252. "model_id": "Qwen3.6-27B-W8A8",
  253. "attn_implementation": null,
  254. "api": "openai",
  255. "tokenizer_path": "/opt/lq/models/Qwen3.6-27B-W8A8",
  256. "port": 8877,
  257. "url": "http://127.0.0.1:8004/v1/chat/completions",
  258. "headers": {
  259. "Authorization": "Bearer sk-123456"
  260. },
  261. "connect_timeout": null,
  262. "read_timeout": null,
  263. "total_timeout": 21600,
  264. "api_key": "sk-123456",
  265. "no_test_connection": false,
  266. "number": 10,
  267. "parallel": 10,
  268. "rate": -1,
  269. "sleep_interval": 5,
  270. "sla_auto_tune": false,
  271. "sla_variable": "parallel",
  272. "sla_params": null,
  273. "sla_num_runs": 3,
  274. "sla_upper_bound": 65536,
  275. "sla_lower_bound": 1,
  276. "db_commit_interval": 1000,
  277. "queue_size_multiplier": 5,
  278. "in_flight_task_multiplier": 2,
  279. "log_every_n_query": 10,
  280. "debug": false,
  281. "visualizer": null,
  282. "wandb_api_key": null,
  283. "swanlab_api_key": null,
  284. "name": null,
  285. "outputs_dir": "outputs/20260515_084106/Qwen3.6-27B-W8A8",
  286. "no_timestamp": false,
  287. "max_prompt_length": 2048,
  288. "min_prompt_length": 2048,
  289. "prefix_length": 0,
  290. "prompt": null,
  291. "query_template": null,
  292. "apply_chat_template": true,
  293. "image_width": 224,
  294. "image_height": 224,
  295. "image_format": "RGB",
  296. "image_num": 1,
  297. "image_patch_size": 28,
  298. "dataset": "random",
  299. "dataset_path": null,
  300. "frequency_penalty": null,
  301. "repetition_penalty": null,
  302. "logprobs": null,
  303. "max_tokens": 128,
  304. "min_tokens": 128,
  305. "n_choices": null,
  306. "seed": null,
  307. "stop": null,
  308. "stop_token_ids": null,
  309. "stream": true,
  310. "temperature": 0.0,
  311. "top_p": null,
  312. "top_k": null,
  313. "extra_args": {}
  314. }
  315. 2026-05-15 08:42:12 - evalscope - INFO: Test connection successful.
  316. 2026-05-15 08:42:16 - evalscope - INFO: Using 248044 allowed tokens out of 248044 total tokens
  317. 2026-05-15 08:42:16 - evalscope - INFO: Sampling input lengths from [2046, 2047)
  318. 2026-05-15 08:42:16 - evalscope - INFO: Save the data base to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10/benchmark_data.db
  319. 2026-05-15 08:42:26 - evalscope - INFO: {
  320. "Time taken for tests (s)": 9.6293,
  321. "Number of concurrency": 10,
  322. "Request rate (req/s)": -1,
  323. "Total requests": 10,
  324. "Succeed requests": 10,
  325. "Failed requests": 0,
  326. "Output token throughput (tok/s)": 132.928,
  327. "Total token throughput (tok/s)": 2268.4997,
  328. "Request throughput (req/s)": 1.0385,
  329. "Average latency (s)": 9.567,
  330. "Average time to first token (s)": 3.6071,
  331. "Average time per output token (s)": 0.0469,
  332. "Average inter-token latency (s)": 0.0466,
  333. "Average input tokens per request": 2056.4,
  334. "Average output tokens per request": 128.0
  335. }
  336. 2026-05-15 08:42:26 - evalscope - INFO: Processing 100%| 10/10 [Elapsed: 00:09 < Remaining: 00:00, 1.01s/it]
  337. 2026-05-15 08:42:26 - evalscope - INFO:
  338. Benchmarking summary:
  339. +-----------------------------------+-----------+
  340. | Key | Value |
  341. +===================================+===========+
  342. | Time taken for tests (s) | 9.6293 |
  343. +-----------------------------------+-----------+
  344. | Number of concurrency | 10 |
  345. +-----------------------------------+-----------+
  346. | Request rate (req/s) | -1 |
  347. +-----------------------------------+-----------+
  348. | Total requests | 10 |
  349. +-----------------------------------+-----------+
  350. | Succeed requests | 10 |
  351. +-----------------------------------+-----------+
  352. | Failed requests | 0 |
  353. +-----------------------------------+-----------+
  354. | Output token throughput (tok/s) | 132.928 |
  355. +-----------------------------------+-----------+
  356. | Total token throughput (tok/s) | 2268.5 |
  357. +-----------------------------------+-----------+
  358. | Request throughput (req/s) | 1.0385 |
  359. +-----------------------------------+-----------+
  360. | Average latency (s) | 9.567 |
  361. +-----------------------------------+-----------+
  362. | Average time to first token (s) | 3.6071 |
  363. +-----------------------------------+-----------+
  364. | Average time per output token (s) | 0.0469 |
  365. +-----------------------------------+-----------+
  366. | Average inter-token latency (s) | 0.0466 |
  367. +-----------------------------------+-----------+
  368. | Average input tokens per request | 2056.4 |
  369. +-----------------------------------+-----------+
  370. | Average output tokens per request | 128 |
  371. +-----------------------------------+-----------+
  372. 2026-05-15 08:42:26 - evalscope - INFO:
  373. Percentile results:
  374. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  375. | Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
  376. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  377. | 10% | 1.7449 | 0.0332 | 0.0332 | 9.5131 | 2056 | 128 | 13.2998 | 226.9284 |
  378. | 25% | 2.7691 | 0.0333 | 0.0376 | 9.5425 | 2056 | 128 | 13.3343 | 227.5162 |
  379. | 50% | 3.7961 | 0.0333 | 0.0455 | 9.5729 | 2056 | 128 | 13.3741 | 228.2483 |
  380. | 66% | 4.8242 | 0.0334 | 0.0533 | 9.5984 | 2056 | 128 | 13.412 | 228.8415 |
  381. | 75% | 4.825 | 0.0334 | 0.0533 | 9.5993 | 2057 | 128 | 13.4137 | 229.0807 |
  382. | 80% | 5.4098 | 0.0335 | 0.0612 | 9.6242 | 2057 | 128 | 13.4552 | 229.5791 |
  383. | 90% | 5.41 | 0.0336 | 0.0689 | 9.6253 | 2058 | 128 | 13.5021 | 230.4848 |
  384. | 95% | 5.41 | 0.0337 | 0.0689 | 9.6253 | 2058 | 128 | 13.5021 | 230.4848 |
  385. | 98% | 5.41 | 0.0367 | 0.0689 | 9.6253 | 2058 | 128 | 13.5021 | 230.4848 |
  386. | 99% | 5.41 | 1.0169 | 0.0689 | 9.6253 | 2058 | 128 | 13.5021 | 230.4848 |
  387. +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
  388. 2026-05-15 08:42:26 - evalscope - INFO: Save the summary to: outputs/20260515_084106/Qwen3.6-27B-W8A8/parallel_10_number_10
  389. 2026-05-15 08:42:26 - evalscope - INFO: Performance summary saved to: outputs/20260515_084106/Qwen3.6-27B-W8A8/performance_summary.txt