| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- #!/usr/bin/env python3
- """
- Extract one or more JSON-path values from all JSON files under a directory and print
- a markdown table.
- Examples:
- python3 hack/perf/extract_json_key_table.py \
- --dir ./output \
- --path request_successful
- python3 hack/perf/extract_json_key_table.py \
- --dir ./output \
- --path request_successful \
- --path request_total
- python3 hack/perf/extract_json_key_table.py \
- --dir ./output \
- --path raw_metrics.benchmarks[0].duration
- python3 hack/perf/extract_json_key_table.py \
- --dir hack/perf/run-cases/high-throughput/qwen_3.5_35b_fp8/output \
- --path snapshot.instances.*.computed_resource_claim.is_unified_memory
- python3 hack/perf/extract_json_key_table.py \
- --dir ./output \
- --path request_successful \
- --sort-by request_successful \
- --sort-order desc \
- --output /tmp/request_successful.md
- """
- from __future__ import annotations
- import argparse
- import json
- from pathlib import Path
- from typing import Any
- def build_parser() -> argparse.ArgumentParser:
- parser = argparse.ArgumentParser(
- description=(
- "Scan all JSON files under a directory and print a markdown table for "
- "the given JSON path."
- )
- )
- parser.add_argument(
- "--dir",
- required=True,
- help="Directory containing JSON files. The scan is recursive.",
- )
- parser.add_argument(
- "--path",
- action="append",
- required=True,
- help=(
- "JSON path such as key.subkey or list[0].field. "
- "Use * to match all values under an object level, for example "
- "snapshot.instances.*.computed_resource_claim.is_unified_memory. "
- "Can be repeated to add multiple columns."
- ),
- )
- parser.add_argument(
- "--output",
- help="Optional output file path. If omitted, print to stdout.",
- )
- parser.add_argument(
- "--sort-by",
- help=(
- "Optional sort column. Use 'file_name' to sort by file name, or pass "
- "one of the values given to --path."
- ),
- )
- parser.add_argument(
- "--sort-order",
- choices=["asc", "desc"],
- default="asc",
- help="Sort order for --sort-by. Default: asc.",
- )
- return parser
- def parse_json_path(path: str) -> list[str | int]:
- tokens: list[str | int] = []
- current = ""
- i = 0
- while i < len(path):
- char = path[i]
- if char == ".":
- if current:
- tokens.append(current)
- current = ""
- i += 1
- continue
- if char == "*":
- if current:
- tokens.append(current)
- current = ""
- tokens.append("*")
- i += 1
- continue
- if char == "[":
- if current:
- tokens.append(current)
- current = ""
- end = path.find("]", i)
- if end == -1:
- raise ValueError(f"Invalid JSON path: {path}")
- index_text = path[i + 1 : end].strip()
- if index_text == "*":
- tokens.append("*")
- i = end + 1
- continue
- if not index_text.isdigit():
- raise ValueError(f"Invalid list index in JSON path: {path}")
- tokens.append(int(index_text))
- i = end + 1
- continue
- current += char
- i += 1
- if current:
- tokens.append(current)
- return tokens
- def get_path_value(payload: Any, path_tokens: list[str | int]) -> Any:
- current_items = [payload]
- for token in path_tokens:
- next_items: list[Any] = []
- for current in current_items:
- if token == "*":
- if isinstance(current, dict):
- next_items.extend(current.values())
- elif isinstance(current, list):
- next_items.extend(current)
- continue
- if isinstance(token, int):
- if not isinstance(current, list) or token >= len(current):
- continue
- next_items.append(current[token])
- continue
- if not isinstance(current, dict) or token not in current:
- continue
- next_items.append(current[token])
- if not next_items:
- return None
- current_items = next_items
- if len(current_items) == 1:
- return current_items[0]
- return current_items
- def format_value(value: Any) -> str:
- if value is None:
- return "N/A"
- if isinstance(value, list):
- if not value:
- return "N/A"
- if all(not isinstance(item, (dict, list)) for item in value):
- return ", ".join(str(item) for item in value)
- if isinstance(value, (dict, list)):
- return json.dumps(value, ensure_ascii=True)
- return str(value)
- def escape_cell(value: str) -> str:
- return value.replace("|", r"\|").replace("\n", " ")
- def normalize_sort_value(value: Any) -> tuple[int, Any]:
- if isinstance(value, bool):
- return (0, int(value))
- if isinstance(value, (int, float)):
- return (1, value)
- if isinstance(value, str):
- return (2, value)
- if isinstance(value, list):
- return (3, json.dumps(value, ensure_ascii=True, sort_keys=True))
- if isinstance(value, dict):
- return (4, json.dumps(value, ensure_ascii=True, sort_keys=True))
- return (5, str(value))
- def collect_rows(
- directory: Path, json_paths: list[str]
- ) -> list[tuple[str, list[Any], list[str]]]:
- path_tokens_by_path = {
- json_path: parse_json_path(json_path) for json_path in json_paths
- }
- rows: list[tuple[str, list[Any], list[str]]] = []
- for file_path in sorted(directory.rglob("*.json")):
- with file_path.open("r", encoding="utf-8") as file:
- payload = json.load(file)
- raw_values: list[Any] = []
- values: list[str] = []
- for json_path in json_paths:
- value = get_path_value(payload, path_tokens_by_path[json_path])
- raw_values.append(value)
- values.append(format_value(value))
- rows.append((file_path.name, raw_values, values))
- return rows
- def sort_rows(
- rows: list[tuple[str, list[Any], list[str]]],
- json_paths: list[str],
- sort_by: str | None,
- sort_order: str,
- ) -> list[tuple[str, list[Any], list[str]]]:
- if not sort_by:
- return rows
- if sort_by == "file_name":
- present_rows = [row for row in rows if row[0] is not None]
- return sorted(
- present_rows,
- key=lambda row: row[0],
- reverse=sort_order == "desc",
- )
- if sort_by not in json_paths:
- raise SystemExit(
- f"Invalid --sort-by value: {sort_by}. Expected 'file_name' or one of: "
- + ", ".join(json_paths)
- )
- sort_index = json_paths.index(sort_by)
- present_rows = [row for row in rows if row[1][sort_index] is not None]
- missing_rows = [row for row in rows if row[1][sort_index] is None]
- present_rows = sorted(
- present_rows,
- key=lambda row: normalize_sort_value(row[1][sort_index]),
- reverse=sort_order == "desc",
- )
- return [*present_rows, *missing_rows]
- def render_markdown_table(
- rows: list[tuple[str, list[Any], list[str]]], json_paths: list[str]
- ) -> str:
- header_cells = ["File Name", *json_paths]
- lines = [
- "| " + " | ".join(escape_cell(cell) for cell in header_cells) + " |",
- "|" + "|".join("---" for _ in header_cells) + "|",
- ]
- for file_name, _, values in rows:
- cells = [file_name, *values]
- lines.append("| " + " | ".join(escape_cell(cell) for cell in cells) + " |")
- return "\n".join(lines)
- def main() -> None:
- args = build_parser().parse_args()
- directory = Path(args.dir).resolve()
- if not directory.is_dir():
- raise SystemExit(f"Directory not found: {directory}")
- rows = collect_rows(directory, args.path)
- if not rows:
- raise SystemExit(f"No JSON files found under: {directory}")
- rows = sort_rows(rows, args.path, args.sort_by, args.sort_order)
- output = render_markdown_table(rows, args.path)
- if args.output:
- output_path = Path(args.output)
- output_path.parent.mkdir(parents=True, exist_ok=True)
- output_path.write_text(output + "\n", encoding="utf-8")
- print(f"Saved markdown table to {output_path}")
- return
- print(output)
- if __name__ == "__main__":
- main()
|