generate_throughput_optimization_doc.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. #!/usr/bin/env python3
  2. """
  3. Generate a throughput optimization markdown document from benchmark result JSON files.
  4. Example:
  5. python3 hack/perf/generate_throughput_optimization_doc.py \
  6. --title "Optimizing Qwen3.5-35B-A3B Throughput" \
  7. --model Qwen/Qwen3.5-35B-A3B-FP8 \
  8. --baseline-file ./output/baseline.json \
  9. --optimized-file ./output/optimized.json \
  10. --group "Baseline of the Inference Engine=./output/baseline.json,./output/sglang.json" \
  11. --group "Choosing the Inference Engine=vLLM:: ./output/baseline.json,SGLang:: ./output/sglang.json" \
  12. --group "Quantization=./output/fp8.json" \
  13. --other-group "ShareGPT=./output/sharegpt_baseline.json,./output/sharegpt_optimized.json" \
  14. --other-group "Long Context=./output/long_context_baseline.json,./output/long_context_optimized.json" \
  15. --output ./output/throughput_doc.md
  16. """
  17. from __future__ import annotations
  18. from pathlib import Path
  19. from perf_doc_common import (
  20. BenchmarkRecord,
  21. build_arg_parser,
  22. collect_experimental_setup,
  23. collect_profiles_from_results,
  24. fmt_number,
  25. load_record,
  26. load_grouped_records,
  27. request_success_rate,
  28. render_benchmark_method_intro,
  29. render_benchmark_result_block,
  30. render_backend_parameters,
  31. render_profile_config,
  32. render_recommended_configuration,
  33. render_record_section,
  34. render_result_image,
  35. render_setup_list,
  36. render_standard_note_block,
  37. )
  38. def fmt_tps_compare(baseline_value: float, candidate_value: float) -> str:
  39. if baseline_value <= 0 or candidate_value <= 0:
  40. return "N/A"
  41. ratio = (candidate_value - baseline_value) / baseline_value * 100
  42. if ratio >= 0:
  43. return f'<span style="background-color:lightgreen;">(+{ratio:.2f}%)</span>'
  44. return f'<span style="background-color:#ffd6d6;">({ratio:.2f}%)</span>'
  45. def throughput_annotation(
  46. baseline_tps: float,
  47. candidate_tps: float,
  48. candidate_record: BenchmarkRecord,
  49. ) -> str:
  50. success_rate = request_success_rate(candidate_record)
  51. if success_rate is not None and success_rate < 1:
  52. return (
  53. '<span style="background-color:#ffd6d6;">'
  54. f'(Success rate: {success_rate * 100:.1f}%, optimization skipped)'
  55. "</span>"
  56. )
  57. return fmt_tps_compare(baseline_tps, candidate_tps)
  58. def format_tpot_delta_summary(faster_value: float, slower_value: float) -> str:
  59. delta = slower_value - faster_value
  60. if faster_value <= 0 or slower_value <= 0:
  61. return f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms"
  62. if abs(delta) < 1e-9:
  63. return f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, unchanged"
  64. if delta > 0:
  65. ratio = delta / slower_value * 100
  66. return (
  67. f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, "
  68. f"reduced by {fmt_number(delta)} ms ({fmt_number(ratio)}%)"
  69. )
  70. increase = abs(delta)
  71. ratio = increase / slower_value * 100
  72. return (
  73. f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, "
  74. f"increased by {fmt_number(increase)} ms ({fmt_number(ratio)}% slower)"
  75. )
  76. def build_summary_rows(
  77. baseline: BenchmarkRecord, records: list[BenchmarkRecord]
  78. ) -> list[str]:
  79. baseline_tps = float(baseline.payload.get("tokens_per_second_mean") or 0)
  80. baseline_tpot = float(baseline.payload.get("time_per_output_token_mean") or 0)
  81. records_by_group: dict[str, list[BenchmarkRecord]] = {}
  82. for record in records:
  83. records_by_group.setdefault(record.group_name, []).append(record)
  84. best_by_group: dict[str, BenchmarkRecord] = {}
  85. for group_name, group_records in records_by_group.items():
  86. # Skip the engine baseline comparison group from the optimization summary.
  87. if any(record.path == baseline.path for record in group_records):
  88. continue
  89. best_record: BenchmarkRecord | None = None
  90. best_tps = float("-inf")
  91. for record in group_records:
  92. record_tps = float(record.payload.get("tokens_per_second_mean") or 0)
  93. if record_tps > best_tps:
  94. best_tps = record_tps
  95. best_record = record
  96. if best_record is not None:
  97. best_by_group[group_name] = best_record
  98. rows = [
  99. "| Benchmark Case | Group | Optimized | Baseline |",
  100. "|---|---|---|---|",
  101. ]
  102. for record in best_by_group.values():
  103. benchmark_case = record.profile
  104. if record.request_rate is not None:
  105. benchmark_case = f"{benchmark_case} (r={record.request_rate})"
  106. candidate_tps = float(record.payload.get("tokens_per_second_mean") or 0)
  107. candidate_tpot = float(record.payload.get("time_per_output_token_mean") or 0)
  108. rows.append(
  109. "| "
  110. + f"{benchmark_case} | {record.group_name} | "
  111. + f"Total TPS: {candidate_tps:.2f} {throughput_annotation(baseline_tps, candidate_tps, record)}"
  112. + f"<br>Mean TPOT(ms): {fmt_number(candidate_tpot)} | "
  113. + f"Total TPS: {baseline_tps:.2f}<br>Mean TPOT(ms): {fmt_number(baseline_tpot)} |"
  114. )
  115. return rows
  116. def build_optimization_option_rows(
  117. baseline: BenchmarkRecord, records: list[BenchmarkRecord]
  118. ) -> list[str]:
  119. baseline_tps = float(baseline.payload.get("tokens_per_second_mean") or 0)
  120. baseline_tpot = float(baseline.payload.get("time_per_output_token_mean") or 0)
  121. records_by_group: dict[str, list[BenchmarkRecord]] = {}
  122. for record in records:
  123. records_by_group.setdefault(record.group_name, []).append(record)
  124. rows: list[str] = []
  125. for group_name, group_records in records_by_group.items():
  126. best_record: BenchmarkRecord | None = None
  127. best_tps = float("-inf")
  128. for record in group_records:
  129. record_tps = float(record.payload.get("tokens_per_second_mean") or 0)
  130. if record_tps > best_tps:
  131. best_tps = record_tps
  132. best_record = record
  133. if best_record is None:
  134. continue
  135. candidate_tps = float(best_record.payload.get("tokens_per_second_mean") or 0)
  136. candidate_tpot = float(
  137. best_record.payload.get("time_per_output_token_mean") or 0
  138. )
  139. rows.append(
  140. "| "
  141. + f"{group_name} | "
  142. + f"Total TPS: {candidate_tps:.2f} {throughput_annotation(baseline_tps, candidate_tps, best_record)}"
  143. + f"<br>Mean TPOT(ms): {fmt_number(candidate_tpot)} | "
  144. + f"Total TPS: {baseline_tps:.2f}<br>Mean TPOT(ms): {fmt_number(baseline_tpot)} |"
  145. )
  146. return rows
  147. def build_profile_comparison_rows(
  148. other_grouped_records: list[tuple[str, list[BenchmarkRecord]]],
  149. ) -> list[str]:
  150. rows = [
  151. "| Benchmark Case | baseline (vLLM without any optimizations) | Optimized |",
  152. "|----------|-------------------------------------------|-----------|",
  153. ]
  154. for profile_name, records in other_grouped_records:
  155. if len(records) < 2:
  156. continue
  157. baseline, optimized = records[0], records[1]
  158. baseline_tps = float(baseline.payload.get("tokens_per_second_mean") or 0)
  159. baseline_tpot = float(baseline.payload.get("time_per_output_token_mean") or 0)
  160. optimized_tps = float(optimized.payload.get("tokens_per_second_mean") or 0)
  161. optimized_tpot = float(optimized.payload.get("time_per_output_token_mean") or 0)
  162. rows.append(
  163. "| "
  164. + f"**{profile_name}** | "
  165. + f"Total TPS: {baseline_tps:.2f}<br>Mean TPOT(ms): {fmt_number(baseline_tpot)} | "
  166. + f"Total TPS: {optimized_tps:.2f} {throughput_annotation(baseline_tps, optimized_tps, optimized)}"
  167. + f"<br>Mean TPOT(ms): {fmt_number(optimized_tpot)} |"
  168. )
  169. return rows
  170. def record_display_name(record: BenchmarkRecord) -> str:
  171. return record.custom_title or record.name
  172. def render_group_speed_summary(records: list[BenchmarkRecord]) -> str:
  173. if len(records) < 2:
  174. return ""
  175. ranked_records = sorted(
  176. records,
  177. key=lambda record: float(record.payload.get("tokens_per_second_mean") or 0),
  178. )
  179. slowest = ranked_records[0]
  180. fastest = ranked_records[-1]
  181. slowest_tps = float(slowest.payload.get("tokens_per_second_mean") or 0)
  182. fastest_tps = float(fastest.payload.get("tokens_per_second_mean") or 0)
  183. slowest_tpot = float(slowest.payload.get("time_per_output_token_mean") or 0)
  184. fastest_tpot = float(fastest.payload.get("time_per_output_token_mean") or 0)
  185. if slowest_tps <= 0 or fastest_tps <= 0:
  186. return ""
  187. tps_gap = fastest_tps - slowest_tps
  188. tps_ratio = tps_gap / slowest_tps * 100
  189. summary = (
  190. f"- Summary: `{record_display_name(fastest)}` Total TPS = {fmt_number(fastest_tps)}, "
  191. f"`{record_display_name(slowest)}` Total TPS = {fmt_number(slowest_tps)}. "
  192. f"`{record_display_name(fastest)}` is faster by {fmt_number(tps_gap)} tok/s "
  193. f"({fmt_number(tps_ratio)}%)"
  194. )
  195. if slowest_tpot > 0 and fastest_tpot > 0:
  196. summary += "; " + format_tpot_delta_summary(fastest_tpot, slowest_tpot) + "."
  197. else:
  198. summary += "."
  199. return summary
  200. def render_other_group_pair_section(
  201. group_name: str, records: list[BenchmarkRecord]
  202. ) -> str:
  203. if not records:
  204. return ""
  205. lines = [f"#### {group_name}", ""]
  206. if len(records) >= 1:
  207. baseline = records[0]
  208. baseline_instance = baseline.payload.get("snapshot", {}).get("instances", {})
  209. if isinstance(baseline_instance, dict) and baseline_instance:
  210. baseline_instance = next(iter(baseline_instance.values()))
  211. else:
  212. baseline_instance = {}
  213. lines.extend(
  214. [
  215. f"- Baseline Backend Parameters:{render_backend_parameters(baseline_instance)}",
  216. "",
  217. render_benchmark_result_block(baseline.payload).replace(
  218. '??? info "Benchmark result"',
  219. '??? info "Baseline benchmark result"',
  220. 1,
  221. ),
  222. "",
  223. ]
  224. )
  225. if len(records) >= 2:
  226. optimized = records[1]
  227. optimized_instance = optimized.payload.get("snapshot", {}).get("instances", {})
  228. if isinstance(optimized_instance, dict) and optimized_instance:
  229. optimized_instance = next(iter(optimized_instance.values()))
  230. else:
  231. optimized_instance = {}
  232. lines.extend(
  233. [
  234. f"- Optimized Backend Parameters:{render_backend_parameters(optimized_instance)}",
  235. "",
  236. render_benchmark_result_block(optimized.payload).replace(
  237. '??? info "Benchmark result"',
  238. '??? info "Optimized benchmark result"',
  239. 1,
  240. ),
  241. "",
  242. ]
  243. )
  244. return "\n".join(lines)
  245. def generate_throughput_markdown(
  246. title: str,
  247. baseline: BenchmarkRecord,
  248. grouped_records: list[tuple[str, list[BenchmarkRecord]]],
  249. other_grouped_records: list[tuple[str, list[BenchmarkRecord]]],
  250. declared_models: list[str] | None = None,
  251. optimized_record: BenchmarkRecord | None = None,
  252. image_name: str = "replace-this-image.png",
  253. ) -> str:
  254. summary_records = [record for _, records in grouped_records for record in records]
  255. all_records = summary_records + [
  256. record for _, records in other_grouped_records for record in records
  257. ]
  258. conclusion_rows = build_profile_comparison_rows(other_grouped_records)
  259. optimization_rows = build_optimization_option_rows(baseline, summary_records)
  260. models, hardware, engine_versions = collect_experimental_setup(
  261. all_records, declared_models=declared_models
  262. )
  263. profile_configs = collect_profiles_from_results(all_records)
  264. used_profiles = list(profile_configs.keys())
  265. sections = [
  266. f"# {title}",
  267. "",
  268. "## Conclusion",
  269. "",
  270. render_result_image("Throughput Optimization Result", image_name),
  271. "",
  272. *render_recommended_configuration(
  273. "throughput", optimized_record, declared_models=declared_models
  274. ),
  275. "Comparison of benchmark results before and after optimization:",
  276. "",
  277. *conclusion_rows,
  278. "",
  279. *render_standard_note_block(),
  280. "## Experimental Setup",
  281. "",
  282. *render_setup_list("Model", models),
  283. *render_setup_list("Hardware", hardware),
  284. *render_setup_list("Engine Version", engine_versions),
  285. *render_benchmark_method_intro(),
  286. ]
  287. for profile_name in used_profiles:
  288. sections.extend(
  289. render_profile_config(profile_name, profile_configs.get(profile_name))
  290. )
  291. sections.extend(
  292. [
  293. "## Experiment Results",
  294. "",
  295. ]
  296. )
  297. for group_name, records in grouped_records:
  298. sections.append(f"### {group_name}")
  299. sections.append("")
  300. include_subheading = len(records) > 1
  301. for record in records:
  302. sections.append(
  303. render_record_section(record, include_heading=include_subheading)
  304. )
  305. group_summary = render_group_speed_summary(records)
  306. if group_summary:
  307. sections.extend([group_summary, ""])
  308. sections.extend(
  309. [
  310. "### Summary of Optimization Options",
  311. "",
  312. "| Benchmark Cases | Optimized | Baseline |",
  313. "| --------------------------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------ |",
  314. *optimization_rows,
  315. "",
  316. ]
  317. )
  318. if other_grouped_records:
  319. sections.extend(
  320. [
  321. "### Other Benchmark Cases",
  322. "",
  323. ]
  324. )
  325. for group_name, records in other_grouped_records:
  326. rendered = render_other_group_pair_section(group_name, records)
  327. if rendered:
  328. sections.append(rendered)
  329. return "\n".join(sections).rstrip() + "\n"
  330. def main() -> None:
  331. args = build_arg_parser("throughput").parse_args()
  332. baseline_record, grouped_records, other_grouped_records = load_grouped_records(
  333. args.baseline_file, args.group, args.other_group
  334. )
  335. optimized_record = load_record(args.optimized_file) if args.optimized_file else None
  336. output_file = Path(args.output)
  337. output_file.parent.mkdir(parents=True, exist_ok=True)
  338. output_file.write_text(
  339. generate_throughput_markdown(
  340. args.title,
  341. baseline_record,
  342. grouped_records,
  343. other_grouped_records,
  344. declared_models=args.model,
  345. optimized_record=optimized_record,
  346. image_name=args.image_name,
  347. ),
  348. encoding="utf-8",
  349. )
  350. print(f"Generated markdown document at {output_file}")
  351. if __name__ == "__main__":
  352. main()