#!/usr/bin/env python3 """ Generate a throughput optimization markdown document from benchmark result JSON files. Example: python3 hack/perf/generate_throughput_optimization_doc.py \ --title "Optimizing Qwen3.5-35B-A3B Throughput" \ --model Qwen/Qwen3.5-35B-A3B-FP8 \ --baseline-file ./output/baseline.json \ --optimized-file ./output/optimized.json \ --group "Baseline of the Inference Engine=./output/baseline.json,./output/sglang.json" \ --group "Choosing the Inference Engine=vLLM:: ./output/baseline.json,SGLang:: ./output/sglang.json" \ --group "Quantization=./output/fp8.json" \ --other-group "ShareGPT=./output/sharegpt_baseline.json,./output/sharegpt_optimized.json" \ --other-group "Long Context=./output/long_context_baseline.json,./output/long_context_optimized.json" \ --output ./output/throughput_doc.md """ from __future__ import annotations from pathlib import Path from perf_doc_common import ( BenchmarkRecord, build_arg_parser, collect_experimental_setup, collect_profiles_from_results, fmt_number, load_record, load_grouped_records, request_success_rate, render_benchmark_method_intro, render_benchmark_result_block, render_backend_parameters, render_profile_config, render_recommended_configuration, render_record_section, render_result_image, render_setup_list, render_standard_note_block, ) def fmt_tps_compare(baseline_value: float, candidate_value: float) -> str: if baseline_value <= 0 or candidate_value <= 0: return "N/A" ratio = (candidate_value - baseline_value) / baseline_value * 100 if ratio >= 0: return f'(+{ratio:.2f}%)' return f'({ratio:.2f}%)' def throughput_annotation( baseline_tps: float, candidate_tps: float, candidate_record: BenchmarkRecord, ) -> str: success_rate = request_success_rate(candidate_record) if success_rate is not None and success_rate < 1: return ( '' f'(Success rate: {success_rate * 100:.1f}%, optimization skipped)' "" ) return fmt_tps_compare(baseline_tps, candidate_tps) def format_tpot_delta_summary(faster_value: float, slower_value: float) -> str: delta = slower_value - faster_value if faster_value <= 0 or slower_value <= 0: return f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms" if abs(delta) < 1e-9: return f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, unchanged" if delta > 0: ratio = delta / slower_value * 100 return ( f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, " f"reduced by {fmt_number(delta)} ms ({fmt_number(ratio)}%)" ) increase = abs(delta) ratio = increase / slower_value * 100 return ( f"Mean TPOT = {fmt_number(faster_value)} ms vs {fmt_number(slower_value)} ms, " f"increased by {fmt_number(increase)} ms ({fmt_number(ratio)}% slower)" ) def build_summary_rows( baseline: BenchmarkRecord, records: list[BenchmarkRecord] ) -> list[str]: baseline_tps = float(baseline.payload.get("tokens_per_second_mean") or 0) baseline_tpot = float(baseline.payload.get("time_per_output_token_mean") or 0) records_by_group: dict[str, list[BenchmarkRecord]] = {} for record in records: records_by_group.setdefault(record.group_name, []).append(record) best_by_group: dict[str, BenchmarkRecord] = {} for group_name, group_records in records_by_group.items(): # Skip the engine baseline comparison group from the optimization summary. if any(record.path == baseline.path for record in group_records): continue best_record: BenchmarkRecord | None = None best_tps = float("-inf") for record in group_records: record_tps = float(record.payload.get("tokens_per_second_mean") or 0) if record_tps > best_tps: best_tps = record_tps best_record = record if best_record is not None: best_by_group[group_name] = best_record rows = [ "| Benchmark Case | Group | Optimized | Baseline |", "|---|---|---|---|", ] for record in best_by_group.values(): benchmark_case = record.profile if record.request_rate is not None: benchmark_case = f"{benchmark_case} (r={record.request_rate})" candidate_tps = float(record.payload.get("tokens_per_second_mean") or 0) candidate_tpot = float(record.payload.get("time_per_output_token_mean") or 0) rows.append( "| " + f"{benchmark_case} | {record.group_name} | " + f"Total TPS: {candidate_tps:.2f} {throughput_annotation(baseline_tps, candidate_tps, record)}" + f"
Mean TPOT(ms): {fmt_number(candidate_tpot)} | " + f"Total TPS: {baseline_tps:.2f}
Mean TPOT(ms): {fmt_number(baseline_tpot)} |" ) return rows def build_optimization_option_rows( baseline: BenchmarkRecord, records: list[BenchmarkRecord] ) -> list[str]: baseline_tps = float(baseline.payload.get("tokens_per_second_mean") or 0) baseline_tpot = float(baseline.payload.get("time_per_output_token_mean") or 0) records_by_group: dict[str, list[BenchmarkRecord]] = {} for record in records: records_by_group.setdefault(record.group_name, []).append(record) rows: list[str] = [] for group_name, group_records in records_by_group.items(): best_record: BenchmarkRecord | None = None best_tps = float("-inf") for record in group_records: record_tps = float(record.payload.get("tokens_per_second_mean") or 0) if record_tps > best_tps: best_tps = record_tps best_record = record if best_record is None: continue candidate_tps = float(best_record.payload.get("tokens_per_second_mean") or 0) candidate_tpot = float( best_record.payload.get("time_per_output_token_mean") or 0 ) rows.append( "| " + f"{group_name} | " + f"Total TPS: {candidate_tps:.2f} {throughput_annotation(baseline_tps, candidate_tps, best_record)}" + f"
Mean TPOT(ms): {fmt_number(candidate_tpot)} | " + f"Total TPS: {baseline_tps:.2f}
Mean TPOT(ms): {fmt_number(baseline_tpot)} |" ) return rows def build_profile_comparison_rows( other_grouped_records: list[tuple[str, list[BenchmarkRecord]]], ) -> list[str]: rows = [ "| Benchmark Case | baseline (vLLM without any optimizations) | Optimized |", "|----------|-------------------------------------------|-----------|", ] for profile_name, records in other_grouped_records: if len(records) < 2: continue baseline, optimized = records[0], records[1] baseline_tps = float(baseline.payload.get("tokens_per_second_mean") or 0) baseline_tpot = float(baseline.payload.get("time_per_output_token_mean") or 0) optimized_tps = float(optimized.payload.get("tokens_per_second_mean") or 0) optimized_tpot = float(optimized.payload.get("time_per_output_token_mean") or 0) rows.append( "| " + f"**{profile_name}** | " + f"Total TPS: {baseline_tps:.2f}
Mean TPOT(ms): {fmt_number(baseline_tpot)} | " + f"Total TPS: {optimized_tps:.2f} {throughput_annotation(baseline_tps, optimized_tps, optimized)}" + f"
Mean TPOT(ms): {fmt_number(optimized_tpot)} |" ) return rows def record_display_name(record: BenchmarkRecord) -> str: return record.custom_title or record.name def render_group_speed_summary(records: list[BenchmarkRecord]) -> str: if len(records) < 2: return "" ranked_records = sorted( records, key=lambda record: float(record.payload.get("tokens_per_second_mean") or 0), ) slowest = ranked_records[0] fastest = ranked_records[-1] slowest_tps = float(slowest.payload.get("tokens_per_second_mean") or 0) fastest_tps = float(fastest.payload.get("tokens_per_second_mean") or 0) slowest_tpot = float(slowest.payload.get("time_per_output_token_mean") or 0) fastest_tpot = float(fastest.payload.get("time_per_output_token_mean") or 0) if slowest_tps <= 0 or fastest_tps <= 0: return "" tps_gap = fastest_tps - slowest_tps tps_ratio = tps_gap / slowest_tps * 100 summary = ( f"- Summary: `{record_display_name(fastest)}` Total TPS = {fmt_number(fastest_tps)}, " f"`{record_display_name(slowest)}` Total TPS = {fmt_number(slowest_tps)}. " f"`{record_display_name(fastest)}` is faster by {fmt_number(tps_gap)} tok/s " f"({fmt_number(tps_ratio)}%)" ) if slowest_tpot > 0 and fastest_tpot > 0: summary += "; " + format_tpot_delta_summary(fastest_tpot, slowest_tpot) + "." else: summary += "." return summary def render_other_group_pair_section( group_name: str, records: list[BenchmarkRecord] ) -> str: if not records: return "" lines = [f"#### {group_name}", ""] if len(records) >= 1: baseline = records[0] baseline_instance = baseline.payload.get("snapshot", {}).get("instances", {}) if isinstance(baseline_instance, dict) and baseline_instance: baseline_instance = next(iter(baseline_instance.values())) else: baseline_instance = {} lines.extend( [ f"- Baseline Backend Parameters:{render_backend_parameters(baseline_instance)}", "", render_benchmark_result_block(baseline.payload).replace( '??? info "Benchmark result"', '??? info "Baseline benchmark result"', 1, ), "", ] ) if len(records) >= 2: optimized = records[1] optimized_instance = optimized.payload.get("snapshot", {}).get("instances", {}) if isinstance(optimized_instance, dict) and optimized_instance: optimized_instance = next(iter(optimized_instance.values())) else: optimized_instance = {} lines.extend( [ f"- Optimized Backend Parameters:{render_backend_parameters(optimized_instance)}", "", render_benchmark_result_block(optimized.payload).replace( '??? info "Benchmark result"', '??? info "Optimized benchmark result"', 1, ), "", ] ) return "\n".join(lines) def generate_throughput_markdown( title: str, baseline: BenchmarkRecord, grouped_records: list[tuple[str, list[BenchmarkRecord]]], other_grouped_records: list[tuple[str, list[BenchmarkRecord]]], declared_models: list[str] | None = None, optimized_record: BenchmarkRecord | None = None, image_name: str = "replace-this-image.png", ) -> str: summary_records = [record for _, records in grouped_records for record in records] all_records = summary_records + [ record for _, records in other_grouped_records for record in records ] conclusion_rows = build_profile_comparison_rows(other_grouped_records) optimization_rows = build_optimization_option_rows(baseline, summary_records) models, hardware, engine_versions = collect_experimental_setup( all_records, declared_models=declared_models ) profile_configs = collect_profiles_from_results(all_records) used_profiles = list(profile_configs.keys()) sections = [ f"# {title}", "", "## Conclusion", "", render_result_image("Throughput Optimization Result", image_name), "", *render_recommended_configuration( "throughput", optimized_record, declared_models=declared_models ), "Comparison of benchmark results before and after optimization:", "", *conclusion_rows, "", *render_standard_note_block(), "## Experimental Setup", "", *render_setup_list("Model", models), *render_setup_list("Hardware", hardware), *render_setup_list("Engine Version", engine_versions), *render_benchmark_method_intro(), ] for profile_name in used_profiles: sections.extend( render_profile_config(profile_name, profile_configs.get(profile_name)) ) sections.extend( [ "## Experiment Results", "", ] ) for group_name, records in grouped_records: sections.append(f"### {group_name}") sections.append("") include_subheading = len(records) > 1 for record in records: sections.append( render_record_section(record, include_heading=include_subheading) ) group_summary = render_group_speed_summary(records) if group_summary: sections.extend([group_summary, ""]) sections.extend( [ "### Summary of Optimization Options", "", "| Benchmark Cases | Optimized | Baseline |", "| --------------------------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------ |", *optimization_rows, "", ] ) if other_grouped_records: sections.extend( [ "### Other Benchmark Cases", "", ] ) for group_name, records in other_grouped_records: rendered = render_other_group_pair_section(group_name, records) if rendered: sections.append(rendered) return "\n".join(sections).rstrip() + "\n" def main() -> None: args = build_arg_parser("throughput").parse_args() baseline_record, grouped_records, other_grouped_records = load_grouped_records( args.baseline_file, args.group, args.other_group ) optimized_record = load_record(args.optimized_file) if args.optimized_file else None output_file = Path(args.output) output_file.parent.mkdir(parents=True, exist_ok=True) output_file.write_text( generate_throughput_markdown( args.title, baseline_record, grouped_records, other_grouped_records, declared_models=args.model, optimized_record=optimized_record, image_name=args.image_name, ), encoding="utf-8", ) print(f"Generated markdown document at {output_file}") if __name__ == "__main__": main()