benchmark_runner.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. # The data structures in this file are adapted from:
  2. # https://github.com/vllm-project/guidellm/blob/62b0f8e01f5c558920fd5d02fe828459264b4f87/src/guidellm/benchmark/schemas/generative/report.py#L58
  3. # Modifications have been made to fit project requirements.
  4. import json
  5. import uuid
  6. from pathlib import Path
  7. from typing import Generic, Literal, Optional, Self, TypeVar
  8. from pydantic import BaseModel, Field
  9. from gpustack.schemas.benchmark import BenchmarkMetrics
  # Type variables constrained to pydantic models / classes. Neither is
  # referenced by the definitions visible in this module.
  BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
  RegisterClassT = TypeVar("RegisterClassT", bound=type)

  # One type variable per bucket of ``StatusBreakdown``.
  SuccessfulT = TypeVar("SuccessfulT")
  ErroredT = TypeVar("ErroredT")
  IncompleteT = TypeVar("IncompleteT")
  TotalT = TypeVar("TotalT")

  # Closed set of request type names used to annotate
  # ``GenerativeRequestStats.request_type``.
  GenerativeRequestType = Literal[
      "text_completions",
      "chat_completions",
      "audio_transcriptions",
      "audio_translations",
  ]
  class StatusBreakdown(BaseModel, Generic[SuccessfulT, ErroredT, IncompleteT, TotalT]):
      """
      Generic container that groups a result by request status.

      Holds one value for each of the ``successful``, ``errored`` and
      ``incomplete`` buckets plus a ``total`` value. Every bucket has its own
      type parameter, so the same model can carry counts, distribution
      summaries or lists of per-request records.

      Every field is declared with ``default=None`` regardless of its type
      parameter, so a bucket that is not supplied reads back as ``None``.

      Example:
      ::
          # Define a breakdown for request counts
          breakdown = StatusBreakdown[int, int, int, int](
              successful=150,
              errored=5,
              incomplete=10,
              total=165,
          )
      """

      successful: SuccessfulT = Field(
          default=None,  # type: ignore[assignment]
          description="Results or metrics for requests with successful completion status",
      )
      errored: ErroredT = Field(
          default=None,  # type: ignore[assignment]
          description="Results or metrics for requests with error completion status",
      )
      incomplete: IncompleteT = Field(
          default=None,  # type: ignore[assignment]
          description="Results or metrics for requests with incomplete processing status",
      )
      total: TotalT = Field(
          default=None,  # type: ignore[assignment]
          description="Aggregated results or metrics combining all status categories",
      )
  class SchedulerMetrics(BaseModel):
      """
      Timestamps that mark the phases of a benchmark run.

      All six fields are required floats described as Unix timestamps. They
      are declared in pairs: the run itself, the span in which requests were
      issued, and the measurement window.
      """

      # Overall timings for the scheduler
      start_time: float = Field(
          description="Unix timestamp when the benchmark run started",
      )
      request_start_time: float = Field(
          description="Unix timestamp when first request was made",
      )
      measure_start_time: float = Field(
          description="Unix timestamp when measurement period started",
      )
      measure_end_time: float = Field(
          description="Unix timestamp when measurement period ended",
      )
      request_end_time: float = Field(
          description="Unix timestamp when last request completed",
      )
      end_time: float = Field(
          description="Unix timestamp when the benchmark run ended",
      )
  class Percentiles(BaseModel):
      """
      Selected percentile values of a distribution.

      Carries four required points -- the 50th, 90th, 95th and 99th
      percentiles -- which describe the median and the upper tail of a
      benchmark metric.
      """

      p50: float = Field(
          description="50th percentile (median) value",
      )
      p90: float = Field(
          description="90th percentile value",
      )
      p95: float = Field(
          description="95th percentile value",
      )
      p99: float = Field(
          description="99th percentile value",
      )
  class DistributionSummary(BaseModel):
      """
      Statistical summary of a distribution of values.

      Stores the mean, median, minimum and maximum together with a nested
      ``Percentiles`` model. All fields are required; this model only holds
      the numbers and does not compute them.
      """

      mean: float = Field(
          description="Mean/average value",
      )
      median: float = Field(
          description="Median (50th percentile) value",
      )
      min: float = Field(
          description="Minimum value",
      )
      max: float = Field(
          description="Maximum value",
      )
      percentiles: Percentiles = Field(
          description="Standard percentile values",
      )
  class StatusDistributionSummary(
      StatusBreakdown[
          DistributionSummary, DistributionSummary, DistributionSummary, DistributionSummary
      ]
  ):
      """
      A ``StatusBreakdown`` whose four buckets are all ``DistributionSummary``.

      Gives one statistical summary each for successful, errored and
      incomplete requests plus one for the total. Adds no fields of its own.
      """
  class GenerativeMetrics(BaseModel):
      """
      Aggregated metrics for one generative benchmark.

      Holds the per-status request counts and, for every tracked quantity
      (request rate, concurrency, latency, token counts, token timings and
      token throughput), a ``StatusDistributionSummary`` broken down by
      request status. All fields are required.
      """

      # Request stats
      request_totals: StatusBreakdown[int, int, int, int] = Field(
          description="Request counts by status: successful, incomplete, errored, total",
      )
      requests_per_second: StatusDistributionSummary = Field(
          description="Distribution of requests per second across benchmark execution",
      )
      request_concurrency: StatusDistributionSummary = Field(
          description="Distribution of concurrent request counts during execution",
      )
      request_latency: StatusDistributionSummary = Field(
          description="Distribution of request latencies for completed requests",
      )
      request_streaming_iterations_count: StatusDistributionSummary = Field(
          description="Distribution of stream iterations for completed requests",
      )

      # General token stats
      prompt_token_count: StatusDistributionSummary = Field(
          description="Distribution of prompt token counts by request status",
      )
      output_token_count: StatusDistributionSummary = Field(
          description="Distribution of output token counts by request status",
      )
      total_token_count: StatusDistributionSummary = Field(
          description="Distribution of total token counts by request status",
      )
      time_to_first_token_ms: StatusDistributionSummary = Field(
          description="Distribution of first token latencies in milliseconds",
      )
      time_per_output_token_ms: StatusDistributionSummary = Field(
          description="Distribution of average time per output token in milliseconds",
      )
      inter_token_latency_ms: StatusDistributionSummary = Field(
          description="Distribution of inter-token latencies in milliseconds",
      )
      prompt_tokens_per_second: StatusDistributionSummary = Field(
          description="Distribution of prompt token processing rates",
      )
      output_tokens_per_second: StatusDistributionSummary = Field(
          description="Distribution of output token generation rates",
      )
      tokens_per_second: StatusDistributionSummary = Field(
          description="Distribution of total token throughput including prompt and output",
      )
      output_tokens_per_iteration: StatusDistributionSummary = Field(
          description="Distribution of output tokens generated per streaming iteration",
      )
      iter_tokens_per_iteration: StatusDistributionSummary = Field(
          description="Distribution of output tokens (without first) generated per streaming iteration",
      )
  class RequestTimings(BaseModel):
      """
      Timestamps and iteration counters recorded over a request's lifecycle.

      Every timestamp field is optional and defaults to ``None``; the two
      iteration counters default to ``0``. Fields are declared in the order
      targeting -> queueing -> scheduling -> backend resolution -> request
      execution -> finalization.
      """

      targeted_start: float | None = Field(
          description="Unix timestamp when request was initially targeted for execution",
          default=None,
      )
      queued: float | None = Field(
          description="Unix timestamp when request was placed into processing queue",
          default=None,
      )
      dequeued: float | None = Field(
          description="Unix timestamp when request was removed from queue for processing",
          default=None,
      )
      scheduled_at: float | None = Field(
          description="Unix timestamp when the request was scheduled for processing",
          default=None,
      )
      resolve_start: float | None = Field(
          description="Unix timestamp when backend resolution of the request began",
          default=None,
      )
      request_start: float | None = Field(
          description="Unix timestamp when the backend began processing the request",
          default=None,
      )

      # The six fields below carry no schema description in this module.
      # NOTE(review): presumably timestamps of, and counts of, the streaming
      # iterations of the request/response -- confirm against the producer.
      first_request_iteration: float | None = Field(default=None)
      first_token_iteration: float | None = Field(default=None)
      last_token_iteration: float | None = Field(default=None)
      last_request_iteration: float | None = Field(default=None)
      request_iterations: int = Field(default=0)
      token_iterations: int = Field(default=0)

      request_end: float | None = Field(
          description="Unix timestamp when the backend completed processing the request",
          default=None,
      )
      resolve_end: float | None = Field(
          description="Unix timestamp when backend resolution of the request completed",
          default=None,
      )
      finalized: float | None = Field(
          description="Unix timestamp when request was processed by the scheduler",
          default=None,
      )
  class RequestInfo(BaseModel):
      """
      Metadata, status and timing information for a single request.

      Every field has a default, so ``RequestInfo()`` is valid: it gets a
      freshly generated UUID4 string as ``request_id``, status ``"queued"``,
      ``-1`` for the scheduler node id, process id and start time, an empty
      ``RequestTimings`` and no error details.

      Example:
      ::
          request = RequestInfo()
          request.status = "in_progress"
          queued_at = request.timings.queued
      """

      request_id: str = Field(
          default_factory=lambda: str(uuid.uuid4()),
          description="Unique identifier for the request",
      )
      status: Literal[
          "queued",
          "pending",
          "in_progress",
          "completed",
          "errored",
          "cancelled",
      ] = Field(
          default="queued",
          description="Current processing status of the request",
      )
      scheduler_node_id: int = Field(
          default=-1,
          description="ID/rank of the scheduler node handling the request",
      )
      scheduler_process_id: int = Field(
          default=-1,
          description="ID/rank of the node's scheduler process handling the request",
      )
      scheduler_start_time: float = Field(
          default=-1,
          description="Unix timestamp when scheduler processing began",
      )
      timings: RequestTimings = Field(
          default_factory=RequestTimings,
          description="Timing measurements for the request lifecycle",
      )
      error: str | None = Field(
          default=None,
          description="Error message if the request status is 'errored'",
      )
      traceback: str | None = Field(
          default=None,
          description="Full traceback of the error if the request status is 'errored'",
      )
  class UsageMetrics(BaseModel):
      """
      Text usage counters for one side (input or output) of a request.

      Tracks token, word and character counts. Each counter is optional and
      defaults to ``None`` when it was not reported.
      """

      # Text stats
      text_tokens: int | None = Field(
          description="Number of text tokens processed/generated.",
          default=None,
      )
      text_words: int | None = Field(
          description="Number of text words processed/generated.",
          default=None,
      )
      text_characters: int | None = Field(
          description="Number of text characters processed/generated.",
          default=None,
      )
  class GenerativeRequestStats(BaseModel):
      """
      Per-request record of a generative benchmark.

      Combines the request's identifiers, its type, the optional backend
      arguments and generated output, the scheduler-side ``RequestInfo`` and
      the input/output ``UsageMetrics``. ``type_`` is a fixed discriminator
      string. ``request_type`` is annotated as ``GenerativeRequestType | str``,
      so strings outside that Literal are accepted as well.

      Example:
      ::
          stats = GenerativeRequestStats(
              request_id="req_123",
              request_type="text_completions",
              info=request_info,
              input_metrics=input_usage,
              output_metrics=output_usage,
          )
          output_tokens = stats.output_metrics.text_tokens
      """

      type_: Literal["generative_request_stats"] = "generative_request_stats"
      request_id: str = Field(
          description="Unique identifier for the request",
      )
      request_type: GenerativeRequestType | str = Field(
          description="Type of generative request (text_completion or chat_completion)",
      )
      response_id: str | None = Field(
          description="Unique identifier matching vLLM Response ID",
          default=None,
      )
      request_args: str | None = Field(
          description="Backend arguments used for this request",
          default=None,
      )
      output: str | None = Field(
          description="Generated text output from the request",
          default=None,
      )
      info: RequestInfo = Field(
          description="Request metadata and timing information",
      )
      input_metrics: UsageMetrics = Field(
          description="Token usage statistics for the input prompt",
      )
      output_metrics: UsageMetrics = Field(
          description="Token usage statistics for the generated output",
      )
  class GenerativeBenchmark(BaseModel):
      """
      Result of a single generative benchmark execution.

      Bundles the scheduler timestamps, the aggregated ``GenerativeMetrics``,
      the benchmark's start/end time and duration, and per-request records
      grouped by status. ``requests_truncated`` defaults to a breakdown with
      three empty lists and ``total=None``.
      """

      scheduler_metrics: SchedulerMetrics = Field(
          description="Scheduler timing and performance statistics",
      )
      metrics: GenerativeMetrics = Field(
          description="Performance metrics and statistical distributions",
      )
      start_time: float = Field(
          description="Benchmark start time in seconds since epoch",
      )
      end_time: float = Field(
          description="Benchmark end time in seconds since epoch",
      )
      duration: float = Field(
          description="Total benchmark execution duration in seconds",
      )
      requests_truncated: StatusBreakdown[
          list[GenerativeRequestStats],
          list[GenerativeRequestStats],
          list[GenerativeRequestStats],
          None,
      ] = Field(
          description="Request details grouped by status: successful, incomplete, errored",
          # A factory is used so that every instance gets its own fresh lists.
          default_factory=lambda: StatusBreakdown(
              successful=[],
              errored=[],
              incomplete=[],
              total=None,
          ),
      )
  class GenerativeBenchmarksReport(BaseModel):
      """
      Container for one or more benchmark results, loadable from a JSON file.

      Aggregates ``GenerativeBenchmark`` entries into a single report. The
      report can be read from a ``.json`` file with :meth:`load_file` and
      condensed into a gpustack ``BenchmarkMetrics`` object with
      :meth:`to_metrics`.
      """

      benchmarks: list[GenerativeBenchmark] = Field(
          description="List of completed benchmarks in the report",
          default_factory=list,
      )

      def to_metrics(self) -> Optional[BenchmarkMetrics]:
          """
          Convert the report to a gpustack benchmark metrics object.

          Only the first benchmark of the report is summarised. The mean/max
          values are read from the ``successful`` bucket of each distribution,
          the request counts from ``request_totals``, and the whole report is
          attached as ``raw_metrics`` via ``model_dump()``.

          The ``successful`` buckets are dereferenced directly. Their fields
          default to ``None`` in ``StatusBreakdown``, so a report lacking
          them raises ``AttributeError`` here.

          :return: The populated ``BenchmarkMetrics``, or ``None`` when the
              report has no benchmarks or the first benchmark has no metrics
          """
          if not self.benchmarks:
              return None

          fbm = self.benchmarks[0].metrics
          if fbm is None:
              return None

          return BenchmarkMetrics(
              raw_metrics=self.model_dump(),
              requests_per_second_mean=fbm.requests_per_second.successful.mean,
              request_latency_mean=fbm.request_latency.successful.mean,
              time_per_output_token_mean=fbm.time_per_output_token_ms.successful.mean,
              inter_token_latency_mean=fbm.inter_token_latency_ms.successful.mean,
              time_to_first_token_mean=fbm.time_to_first_token_ms.successful.mean,
              tokens_per_second_mean=fbm.tokens_per_second.successful.mean,
              output_tokens_per_second_mean=fbm.output_tokens_per_second.successful.mean,
              input_tokens_per_second_mean=fbm.prompt_tokens_per_second.successful.mean,
              request_concurrency_max=fbm.request_concurrency.successful.max,
              request_concurrency_mean=fbm.request_concurrency.successful.mean,
              request_total=fbm.request_totals.total,
              request_successful=fbm.request_totals.successful,
              request_errored=fbm.request_totals.errored,
              request_incomplete=fbm.request_totals.incomplete,
          )

      @classmethod
      def load_file(cls, path: str | Path) -> Self:
          """
          Load a report from a JSON file.

          The format is taken from the file extension (compared
          case-insensitively); only ``.json`` is supported. The extension is
          validated before the file is opened, so an unsupported path is
          rejected without touching the filesystem.

          :param path: Path of the JSON report file
          :return: Report instance validated from the file contents
          :raises ValueError: If the file extension is not ``.json``. Malformed
              JSON raises ``json.JSONDecodeError``, a ``ValueError`` subclass
          :raises FileNotFoundError: If the specified file does not exist
          """
          file_path = Path(path)
          # ``suffix`` includes the leading dot; strip it for the comparison
          # and for the error message.
          file_type = file_path.suffix.lower()[1:]
          if file_type != "json":
              raise ValueError(f"Unsupported file type: {file_type} for {file_path}.")

          with open(file_path, "r", encoding="utf-8") as metrics_file:
              model_dict = json.load(metrics_file)
          return cls.model_validate(model_dict)