#!/usr/bin/env python3
"""
Generate a latency optimization markdown document from benchmark result JSON files.

Example:
    python3 hack/perf/generate_latency_optimization_doc.py \
        --title "Optimizing Qwen3-8B Latency" \
        --model Qwen/Qwen3-8B-FP8 \
        --baseline-file ./output/baseline.json \
        --optimized-file ./output/optimized.json \
        --group "Inference Engine=./output/vllm.json,./output/sglang.json" \
        --group "Inference Engine=vLLM::./output/vllm.json,SGLang::./output/sglang.json" \
        --group "Speculative Decoding=./output/speculative.json" \
        --other-group "ShareGPT BS=1=./output/sharegpt_bs1_baseline.json,./output/sharegpt_bs1_optimized.json" \
        --other-group "ShareGPT BS=2=./output/sharegpt_bs2_baseline.json,./output/sharegpt_bs2_optimized.json" \
        --output ./output/latency_doc.md
"""
  17. from __future__ import annotations
  18. from perf_doc_common import (
  19. BenchmarkRecord,
  20. build_arg_parser,
  21. collect_experimental_setup,
  22. collect_profiles_from_results,
  23. fmt_number,
  24. load_record,
  25. load_grouped_records,
  26. request_success_rate,
  27. render_benchmark_method_intro,
  28. render_benchmark_result_block,
  29. render_backend_parameters,
  30. render_profile_config,
  31. render_recommended_configuration,
  32. render_record_section,
  33. render_result_image,
  34. render_setup_list,
  35. render_open_source_replacement,
  36. render_standard_note_block,
  37. )
  38. def fmt_latency_compare(baseline_value: float, candidate_value: float) -> str:
  39. if baseline_value <= 0 or candidate_value <= 0:
  40. return "N/A"
  41. ratio = baseline_value / candidate_value
  42. if ratio >= 1:
  43. return (
  44. f'<span style="background-color:lightgreen;">({ratio:.2f}x faster)</span>'
  45. )
  46. return f'<span style="background-color:#ffd6d6;">({(1 / ratio):.2f}x slower)</span>'
  47. def is_successful_record(record: BenchmarkRecord) -> bool:
  48. success_rate = request_success_rate(record)
  49. return success_rate is None or success_rate >= 1
  50. def latency_annotation(
  51. baseline_latency: float,
  52. candidate_latency: float,
  53. candidate_record: BenchmarkRecord,
  54. ) -> str:
  55. success_rate = request_success_rate(candidate_record)
  56. if success_rate is not None and success_rate < 1:
  57. return (
  58. '<span style="background-color:#ffd6d6;">'
  59. f'(Success rate: {success_rate * 100:.1f}%, optimization skipped)'
  60. "</span>"
  61. )
  62. return fmt_latency_compare(baseline_latency, candidate_latency)
  63. def format_latency_delta_summary(
  64. metric_name: str, faster_value: float, slower_value: float
  65. ) -> str:
  66. delta = slower_value - faster_value
  67. if faster_value <= 0 or slower_value <= 0:
  68. return f"{metric_name} = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms"
  69. if abs(delta) < 1e-9:
  70. return f"{metric_name} = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, unchanged"
  71. if delta > 0:
  72. ratio = slower_value / faster_value
  73. return (
  74. f"{metric_name} = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, "
  75. f"reduced by {fmt_number(delta)} ms ({fmt_number(ratio)}x faster)"
  76. )
  77. increase = abs(delta)
  78. ratio = faster_value / slower_value
  79. return (
  80. f"{metric_name} = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, "
  81. f"increased by {fmt_number(increase)} ms ({fmt_number(ratio)}x slower)"
  82. )
  83. def build_summary_rows(
  84. baseline: BenchmarkRecord, records: list[BenchmarkRecord]
  85. ) -> list[str]:
  86. baseline_latency = float(baseline.payload.get("request_latency_mean") or 0)
  87. best_by_group: dict[str, BenchmarkRecord] = {}
  88. for record in records:
  89. if not is_successful_record(record):
  90. continue
  91. current_best = best_by_group.get(record.group_name)
  92. record_latency = float(
  93. record.payload.get("request_latency_mean") or float("inf")
  94. )
  95. if current_best is None:
  96. best_by_group[record.group_name] = record
  97. continue
  98. current_best_latency = float(
  99. current_best.payload.get("request_latency_mean") or float("inf")
  100. )
  101. if record_latency < current_best_latency:
  102. best_by_group[record.group_name] = record
  103. rows = [
  104. "| Benchmark Case | Group | Optimized | Baseline | Comparison |",
  105. "|---|---|---:|---:|---|",
  106. ]
  107. for record in best_by_group.values():
  108. candidate_latency = float(record.payload.get("request_latency_mean") or 0)
  109. benchmark_case = record.profile
  110. if record.request_rate is not None:
  111. benchmark_case = f"{benchmark_case} (r={record.request_rate})"
  112. rows.append(
  113. "| "
  114. + f"{benchmark_case} | {record.group_name} | "
  115. + f"{candidate_latency:.2f}s | {baseline_latency:.2f}s | "
  116. + f"{latency_annotation(baseline_latency, candidate_latency, record)} |"
  117. )
  118. return rows
  119. def build_profile_comparison_rows(
  120. other_grouped_records: list[tuple[str, list[BenchmarkRecord]]],
  121. ) -> list[str]:
  122. rows = [
  123. "| Benchmark Case | Baseline (vLLM without any optimizations) | Optimized |",
  124. "|----------|-------------------------------------------|-----------|",
  125. ]
  126. for profile_name, records in other_grouped_records:
  127. if len(records) < 2:
  128. continue
  129. baseline, optimized = records[0], records[1]
  130. baseline_latency = float(baseline.payload.get("request_latency_mean") or 0)
  131. optimized_latency = float(optimized.payload.get("request_latency_mean") or 0)
  132. rows.append(
  133. "| "
  134. + f"**{profile_name}** | "
  135. + f"Mean latency: {baseline_latency:.2f}s/req | "
  136. + f"Mean latency: {optimized_latency:.2f}s/req "
  137. + f"{latency_annotation(baseline_latency, optimized_latency, optimized)} |"
  138. )
  139. return rows
  140. def record_display_name(record: BenchmarkRecord) -> str:
  141. return record.custom_title or record.name
  142. def render_group_latency_summary(records: list[BenchmarkRecord]) -> str:
  143. successful_records = [record for record in records if is_successful_record(record)]
  144. if len(successful_records) < 2:
  145. return ""
  146. ranked_records = sorted(
  147. successful_records,
  148. key=lambda record: float(
  149. record.payload.get("request_latency_mean") or float("inf")
  150. ),
  151. )
  152. fastest = ranked_records[0]
  153. slowest = ranked_records[-1]
  154. fastest_latency = float(fastest.payload.get("request_latency_mean") or 0)
  155. slowest_latency = float(slowest.payload.get("request_latency_mean") or 0)
  156. fastest_ttft = float(fastest.payload.get("time_to_first_token_mean") or 0)
  157. slowest_ttft = float(slowest.payload.get("time_to_first_token_mean") or 0)
  158. fastest_tpot = float(fastest.payload.get("time_per_output_token_mean") or 0)
  159. slowest_tpot = float(slowest.payload.get("time_per_output_token_mean") or 0)
  160. if fastest_latency <= 0 or slowest_latency <= 0:
  161. return ""
  162. latency_delta = slowest_latency - fastest_latency
  163. latency_ratio = slowest_latency / fastest_latency
  164. summary = (
  165. f"- Summary: `{record_display_name(fastest)}` Mean Latency = {fmt_number(fastest_latency)}s, "
  166. f"`{record_display_name(slowest)}` Mean Latency = {fmt_number(slowest_latency)}s. "
  167. f"`{record_display_name(fastest)}` is faster by {fmt_number(latency_delta)}s "
  168. f"({fmt_number(latency_ratio)}x faster)."
  169. )
  170. extra_metrics: list[str] = []
  171. if slowest_ttft > 0 and fastest_ttft > 0:
  172. extra_metrics.append(
  173. format_latency_delta_summary("TTFT", fastest_ttft, slowest_ttft)
  174. )
  175. if slowest_tpot > 0 and fastest_tpot > 0:
  176. extra_metrics.append(
  177. format_latency_delta_summary("TPOT", fastest_tpot, slowest_tpot)
  178. )
  179. if extra_metrics:
  180. summary += " " + "; ".join(extra_metrics) + "."
  181. return summary
  182. def render_other_group_pair_section(
  183. group_name: str, records: list[BenchmarkRecord]
  184. ) -> str:
  185. if not records:
  186. return ""
  187. lines = [f"#### {group_name}", ""]
  188. if len(records) >= 1:
  189. baseline = records[0]
  190. baseline_instance = baseline.payload.get("snapshot", {}).get("instances", {})
  191. if isinstance(baseline_instance, dict) and baseline_instance:
  192. baseline_instance = next(iter(baseline_instance.values()))
  193. else:
  194. baseline_instance = {}
  195. lines.extend(
  196. [
  197. f"- Baseline Backend Parameters:{render_backend_parameters(baseline_instance)}",
  198. "",
  199. render_benchmark_result_block(baseline.payload).replace(
  200. '??? info "Benchmark result"',
  201. '??? info "Baseline benchmark result"',
  202. 1,
  203. ),
  204. "",
  205. ]
  206. )
  207. if len(records) >= 2:
  208. optimized = records[1]
  209. optimized_instance = optimized.payload.get("snapshot", {}).get("instances", {})
  210. if isinstance(optimized_instance, dict) and optimized_instance:
  211. optimized_instance = next(iter(optimized_instance.values()))
  212. else:
  213. optimized_instance = {}
  214. lines.extend(
  215. [
  216. f"- Optimized Backend Parameters:{render_backend_parameters(optimized_instance)}",
  217. "",
  218. render_benchmark_result_block(optimized.payload).replace(
  219. '??? info "Benchmark result"',
  220. '??? info "Optimized benchmark result"',
  221. 1,
  222. ),
  223. "",
  224. ]
  225. )
  226. return "\n".join(lines)
  227. def generate_latency_markdown(
  228. title: str,
  229. baseline: BenchmarkRecord,
  230. grouped_records: list[tuple[str, list[BenchmarkRecord]]],
  231. other_grouped_records: list[tuple[str, list[BenchmarkRecord]]],
  232. declared_models: list[str] | None = None,
  233. optimized_record: BenchmarkRecord | None = None,
  234. image_name: str = "replace-this-image.png",
  235. ) -> str:
  236. summary_records = [record for _, records in grouped_records for record in records]
  237. all_records = summary_records + [
  238. record for _, records in other_grouped_records for record in records
  239. ]
  240. summary_rows = build_profile_comparison_rows(other_grouped_records)
  241. optimization_rows = build_summary_rows(baseline, summary_records)
  242. models, hardware, engine_versions = collect_experimental_setup(
  243. all_records, declared_models=declared_models
  244. )
  245. profile_configs = collect_profiles_from_results(all_records)
  246. used_profiles = list(profile_configs.keys())
  247. sections = [
  248. f"# {title}",
  249. "",
  250. "## Conclusion",
  251. "",
  252. render_result_image("Latency Optimization Result", image_name),
  253. "",
  254. *render_recommended_configuration(
  255. "latency", optimized_record, declared_models=declared_models
  256. ),
  257. "Comparison of benchmark results before and after optimization:",
  258. "",
  259. *summary_rows,
  260. "",
  261. *render_standard_note_block(),
  262. "## Experimental Setup",
  263. "",
  264. *render_setup_list("Model", models),
  265. *render_setup_list("Hardware", hardware),
  266. *render_setup_list("Engine Version", engine_versions),
  267. *render_benchmark_method_intro(),
  268. ]
  269. for profile_name in used_profiles:
  270. sections.extend(
  271. render_profile_config(profile_name, profile_configs.get(profile_name))
  272. )
  273. sections.extend(render_open_source_replacement(profile_configs))
  274. sections.extend(
  275. [
  276. "## Experiment Results",
  277. "",
  278. ]
  279. )
  280. for group_name, records in grouped_records:
  281. sections.append(f"### {group_name}")
  282. sections.append("")
  283. include_subheading = len(records) > 1
  284. for record in records:
  285. sections.append(
  286. render_record_section(record, include_heading=include_subheading)
  287. )
  288. group_summary = render_group_latency_summary(records)
  289. if group_summary:
  290. sections.extend([group_summary, ""])
  291. sections.extend(
  292. [
  293. "### Summary of Optimization Options",
  294. "",
  295. "| Benchmark Case | Group | Optimized | Baseline | Comparison |",
  296. "|---|---|---:|---:|---|",
  297. *optimization_rows[2:],
  298. "",
  299. ]
  300. )
  301. if other_grouped_records:
  302. sections.extend(
  303. [
  304. "### Other Benchmark Cases",
  305. "",
  306. ]
  307. )
  308. for group_name, records in other_grouped_records:
  309. rendered = render_other_group_pair_section(group_name, records)
  310. if rendered:
  311. sections.append(rendered)
  312. return "\n".join(sections).rstrip() + "\n"
  313. def main() -> None:
  314. args = build_arg_parser("latency").parse_args()
  315. baseline_record, grouped_records, other_grouped_records = load_grouped_records(
  316. args.baseline_file, args.group, args.other_group
  317. )
  318. optimized_record = load_record(args.optimized_file) if args.optimized_file else None
  319. output_path = args.output
  320. from pathlib import Path
  321. output_file = Path(output_path)
  322. output_file.parent.mkdir(parents=True, exist_ok=True)
  323. output_file.write_text(
  324. generate_latency_markdown(
  325. args.title,
  326. baseline_record,
  327. grouped_records,
  328. other_grouped_records,
  329. declared_models=args.model,
  330. optimized_record=optimized_record,
  331. image_name=args.image_name,
  332. ),
  333. encoding="utf-8",
  334. )
  335. print(f"Generated markdown document at {output_file}")
  336. if __name__ == "__main__":
  337. main()