extract_json_key_table.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. #!/usr/bin/env python3
  2. """
  3. Extract one or more JSON-path values from all JSON files under a directory and print
  4. a markdown table.
  5. Examples:
  6. python3 hack/perf/extract_json_key_table.py \
  7. --dir ./output \
  8. --path request_successful
  9. python3 hack/perf/extract_json_key_table.py \
  10. --dir ./output \
  11. --path request_successful \
  12. --path request_total
  13. python3 hack/perf/extract_json_key_table.py \
  14. --dir ./output \
  15. --path raw_metrics.benchmarks[0].duration
  16. python3 hack/perf/extract_json_key_table.py \
  17. --dir hack/perf/run-cases/high-throughput/qwen_3.5_35b_fp8/output \
  18. --path snapshot.instances.*.computed_resource_claim.is_unified_memory
  19. python3 hack/perf/extract_json_key_table.py \
  20. --dir ./output \
  21. --path request_successful \
  22. --sort-by request_successful \
  23. --sort-order desc \
  24. --output /tmp/request_successful.md
  25. """
  26. from __future__ import annotations
  27. import argparse
  28. import json
  29. from pathlib import Path
  30. from typing import Any
  31. def build_parser() -> argparse.ArgumentParser:
  32. parser = argparse.ArgumentParser(
  33. description=(
  34. "Scan all JSON files under a directory and print a markdown table for "
  35. "the given JSON path."
  36. )
  37. )
  38. parser.add_argument(
  39. "--dir",
  40. required=True,
  41. help="Directory containing JSON files. The scan is recursive.",
  42. )
  43. parser.add_argument(
  44. "--path",
  45. action="append",
  46. required=True,
  47. help=(
  48. "JSON path such as key.subkey or list[0].field. "
  49. "Use * to match all values under an object level, for example "
  50. "snapshot.instances.*.computed_resource_claim.is_unified_memory. "
  51. "Can be repeated to add multiple columns."
  52. ),
  53. )
  54. parser.add_argument(
  55. "--output",
  56. help="Optional output file path. If omitted, print to stdout.",
  57. )
  58. parser.add_argument(
  59. "--sort-by",
  60. help=(
  61. "Optional sort column. Use 'file_name' to sort by file name, or pass "
  62. "one of the values given to --path."
  63. ),
  64. )
  65. parser.add_argument(
  66. "--sort-order",
  67. choices=["asc", "desc"],
  68. default="asc",
  69. help="Sort order for --sort-by. Default: asc.",
  70. )
  71. return parser
  72. def parse_json_path(path: str) -> list[str | int]:
  73. tokens: list[str | int] = []
  74. current = ""
  75. i = 0
  76. while i < len(path):
  77. char = path[i]
  78. if char == ".":
  79. if current:
  80. tokens.append(current)
  81. current = ""
  82. i += 1
  83. continue
  84. if char == "*":
  85. if current:
  86. tokens.append(current)
  87. current = ""
  88. tokens.append("*")
  89. i += 1
  90. continue
  91. if char == "[":
  92. if current:
  93. tokens.append(current)
  94. current = ""
  95. end = path.find("]", i)
  96. if end == -1:
  97. raise ValueError(f"Invalid JSON path: {path}")
  98. index_text = path[i + 1 : end].strip()
  99. if index_text == "*":
  100. tokens.append("*")
  101. i = end + 1
  102. continue
  103. if not index_text.isdigit():
  104. raise ValueError(f"Invalid list index in JSON path: {path}")
  105. tokens.append(int(index_text))
  106. i = end + 1
  107. continue
  108. current += char
  109. i += 1
  110. if current:
  111. tokens.append(current)
  112. return tokens
  113. def get_path_value(payload: Any, path_tokens: list[str | int]) -> Any:
  114. current_items = [payload]
  115. for token in path_tokens:
  116. next_items: list[Any] = []
  117. for current in current_items:
  118. if token == "*":
  119. if isinstance(current, dict):
  120. next_items.extend(current.values())
  121. elif isinstance(current, list):
  122. next_items.extend(current)
  123. continue
  124. if isinstance(token, int):
  125. if not isinstance(current, list) or token >= len(current):
  126. continue
  127. next_items.append(current[token])
  128. continue
  129. if not isinstance(current, dict) or token not in current:
  130. continue
  131. next_items.append(current[token])
  132. if not next_items:
  133. return None
  134. current_items = next_items
  135. if len(current_items) == 1:
  136. return current_items[0]
  137. return current_items
  138. def format_value(value: Any) -> str:
  139. if value is None:
  140. return "N/A"
  141. if isinstance(value, list):
  142. if not value:
  143. return "N/A"
  144. if all(not isinstance(item, (dict, list)) for item in value):
  145. return ", ".join(str(item) for item in value)
  146. if isinstance(value, (dict, list)):
  147. return json.dumps(value, ensure_ascii=True)
  148. return str(value)
  149. def escape_cell(value: str) -> str:
  150. return value.replace("|", r"\|").replace("\n", " ")
  151. def normalize_sort_value(value: Any) -> tuple[int, Any]:
  152. if isinstance(value, bool):
  153. return (0, int(value))
  154. if isinstance(value, (int, float)):
  155. return (1, value)
  156. if isinstance(value, str):
  157. return (2, value)
  158. if isinstance(value, list):
  159. return (3, json.dumps(value, ensure_ascii=True, sort_keys=True))
  160. if isinstance(value, dict):
  161. return (4, json.dumps(value, ensure_ascii=True, sort_keys=True))
  162. return (5, str(value))
  163. def collect_rows(
  164. directory: Path, json_paths: list[str]
  165. ) -> list[tuple[str, list[Any], list[str]]]:
  166. path_tokens_by_path = {
  167. json_path: parse_json_path(json_path) for json_path in json_paths
  168. }
  169. rows: list[tuple[str, list[Any], list[str]]] = []
  170. for file_path in sorted(directory.rglob("*.json")):
  171. with file_path.open("r", encoding="utf-8") as file:
  172. payload = json.load(file)
  173. raw_values: list[Any] = []
  174. values: list[str] = []
  175. for json_path in json_paths:
  176. value = get_path_value(payload, path_tokens_by_path[json_path])
  177. raw_values.append(value)
  178. values.append(format_value(value))
  179. rows.append((file_path.name, raw_values, values))
  180. return rows
  181. def sort_rows(
  182. rows: list[tuple[str, list[Any], list[str]]],
  183. json_paths: list[str],
  184. sort_by: str | None,
  185. sort_order: str,
  186. ) -> list[tuple[str, list[Any], list[str]]]:
  187. if not sort_by:
  188. return rows
  189. if sort_by == "file_name":
  190. present_rows = [row for row in rows if row[0] is not None]
  191. return sorted(
  192. present_rows,
  193. key=lambda row: row[0],
  194. reverse=sort_order == "desc",
  195. )
  196. if sort_by not in json_paths:
  197. raise SystemExit(
  198. f"Invalid --sort-by value: {sort_by}. Expected 'file_name' or one of: "
  199. + ", ".join(json_paths)
  200. )
  201. sort_index = json_paths.index(sort_by)
  202. present_rows = [row for row in rows if row[1][sort_index] is not None]
  203. missing_rows = [row for row in rows if row[1][sort_index] is None]
  204. present_rows = sorted(
  205. present_rows,
  206. key=lambda row: normalize_sort_value(row[1][sort_index]),
  207. reverse=sort_order == "desc",
  208. )
  209. return [*present_rows, *missing_rows]
  210. def render_markdown_table(
  211. rows: list[tuple[str, list[Any], list[str]]], json_paths: list[str]
  212. ) -> str:
  213. header_cells = ["File Name", *json_paths]
  214. lines = [
  215. "| " + " | ".join(escape_cell(cell) for cell in header_cells) + " |",
  216. "|" + "|".join("---" for _ in header_cells) + "|",
  217. ]
  218. for file_name, _, values in rows:
  219. cells = [file_name, *values]
  220. lines.append("| " + " | ".join(escape_cell(cell) for cell in cells) + " |")
  221. return "\n".join(lines)
  222. def main() -> None:
  223. args = build_parser().parse_args()
  224. directory = Path(args.dir).resolve()
  225. if not directory.is_dir():
  226. raise SystemExit(f"Directory not found: {directory}")
  227. rows = collect_rows(directory, args.path)
  228. if not rows:
  229. raise SystemExit(f"No JSON files found under: {directory}")
  230. rows = sort_rows(rows, args.path, args.sort_by, args.sort_order)
  231. output = render_markdown_table(rows, args.path)
  232. if args.output:
  233. output_path = Path(args.output)
  234. output_path.parent.mkdir(parents=True, exist_ok=True)
  235. output_path.write_text(output + "\n", encoding="utf-8")
  236. print(f"Saved markdown table to {output_path}")
  237. return
  238. print(output)
  239. if __name__ == "__main__":
  240. main()