test_ocr_effectiveness.py 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571
  1. """
  2. OCR 模型效果与稳定性测试脚本
  3. 测试 GLM-OCR 模型在施工方案 PDF 上的表现:
  4. - 目录提取测试(默认): YOLO检测目录页 → GLM-OCR识别 → 规则解析 → 保存 catalog JSON
  5. - 版面检测(--detection): RapidLayout 表格/图片区域检测
  6. - OCR 识别(--detection): GLM-OCR 对表格区域的识别质量
  7. - 全链路测试(--detection --full-pipeline): 检测 → OCR → 文本回填
  8. - 稳定性测试(--detection --stability): 高并发下的错误率和延迟分布
  9. 运行方式:
  10. # 默认:仅目录提取测试(快速)
  11. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p utils_test/minimal_pipeline/完整性1.pdf
  12. # 目录OCR稳定性测试(10次提取对比一致性)
  13. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p utils_test/minimal_pipeline/完整性1.pdf --catalog-stability
  14. # 目录OCR稳定性测试(30次)
  15. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p utils_test/minimal_pipeline/完整性1.pdf --catalog-stability --catalog-iterations 30
  16. # 目录提取 + 版面检测 + OCR识别
  17. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p utils_test/minimal_pipeline/完整性1.pdf --detection
  18. # 目录+检测+全链路(检测→OCR→文本回填)
  19. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p utils_test/minimal_pipeline/完整性1.pdf --detection --full-pipeline
  20. # 目录+检测+稳定性测试(20并发,50次调用)
  21. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p utils_test/minimal_pipeline/完整性1.pdf --detection --stability --concurrency 20 --iterations 50
  22. # 批量测试
  23. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -d <pdf_dir>
  24. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -d <pdf_dir> --detection
  25. 输出目录: utils_test/minimal_pipeline/temp/test_ocr_effectiveness/
  26. ├── catalog/ 目录提取结果(每次带时间戳)
  27. ├── detection/ 版面检测全页标注图
  28. ├── table/ 表格区域截图+OCR文本
  29. ├── figure/ 图片区域截图+OCR文本
  30. └── results/ JSON 汇总结果
  31. """
  32. import argparse
  33. import configparser
  34. import json
  35. import os
  36. import sys
  37. import time
  38. import statistics
  39. from collections import Counter
  40. from concurrent.futures import ThreadPoolExecutor, as_completed
  41. from dataclasses import dataclass, field
  42. from pathlib import Path
  43. from typing import Any, Dict, List, Optional, Tuple
  44. import fitz
  45. import numpy as np
  46. # 从 config.ini 读取 OCR 配置
  47. _CONFIG_PATH = Path(__file__).resolve().parent.parent.parent / "config" / "config.ini"
  48. _OCR_CONFIG: Dict[str, str] = {}
  49. if _CONFIG_PATH.exists():
  50. _cp = configparser.ConfigParser()
  51. _cp.read(str(_CONFIG_PATH), encoding="utf-8")
  52. if _cp.has_section("ocr"):
  53. _OCR_CONFIG = {
  54. "GLM_OCR_API_URL": _cp.get("ocr", "GLM_OCR_API_URL", fallback="http://183.220.37.46:25429/v1/chat/completions"),
  55. "GLM_OCR_API_KEY": _cp.get("ocr", "GLM_OCR_API_KEY", fallback=""),
  56. "GLM_OCR_TIMEOUT": _cp.get("ocr", "GLM_OCR_TIMEOUT", fallback="600"),
  57. }
  58. else:
  59. _OCR_CONFIG = {
  60. "GLM_OCR_API_URL": "http://183.220.37.46:25429/v1/chat/completions",
  61. "GLM_OCR_API_KEY": "",
  62. "GLM_OCR_TIMEOUT": "600",
  63. }
  64. else:
  65. _OCR_CONFIG = {
  66. "GLM_OCR_API_URL": "http://183.220.37.46:25429/v1/chat/completions",
  67. "GLM_OCR_API_KEY": "",
  68. "GLM_OCR_TIMEOUT": "600",
  69. }
# Module under test (local, decoupled copy; does not depend on core/foundation).
# The sys.path insert must run before the import below so that the
# ``utils_test`` package resolves when this file is executed as a script.
TEST_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(TEST_DIR.parent.parent))  # project root, so that utils_test is importable
from utils_test.minimal_pipeline._ocr_processor import (
    OcrProcessor,
    RAPID_LAYOUT_AVAILABLE,
    TableRegion,
    OcrResult,
)
  79. # ============================================================
  80. # 数据结构
  81. # ============================================================
@dataclass
class DetectionSample:
    """One region reported by layout detection on a single page."""

    page_num: int  # 1-based page number (callers pass page index + 1)
    label: str  # region class; the tester counts "table" and "figure"
    score: float  # detector confidence for this region
    bbox: Tuple[float, float, float, float]  # (x1, y1, x2, y2)
    width: float  # x2 - x1
    height: float  # y2 - y1
@dataclass
class PageDetectionResult:
    """Layout-detection outcome for one page."""

    page_num: int  # 1-based page number
    samples: List[DetectionSample]  # every region detected on the page
    table_count: int  # number of samples labelled "table"
    figure_count: int  # number of samples labelled "figure"
@dataclass
class OcrSampleResult:
    """Outcome of one OCR recognition attempt sequence on a single region."""

    page_num: int  # 1-based page number of the region
    label: str  # region class copied from the detection result
    score: float  # detection confidence copied from the detection result
    bbox: Tuple[float, float, float, float]  # region bounding box
    text: str  # recognised text ("" when nothing was returned)
    text_length: int  # len(text.strip())
    success: bool  # whether the sample is counted as successful
    latency_ms: float  # wall-clock time spent on the region, including retries
    retry_count: int = 0  # 0-based index of the attempt that succeeded
    error: Optional[str] = None  # truncated error message, None when not failed
@dataclass
class OcrTestResult:
    """Aggregated OCR test results for one file."""

    file_name: str  # name of the tested file
    total_pages: int  # page count of the tested file
    # Each dict below defaults to empty via its own factory, so instances
    # never share state.
    # NOTE(review): presumably these hold the dicts returned by the
    # detection / OCR / full-pipeline tests; confirm against the caller.
    detection: Dict[str, Any] = field(default_factory=dict)
    ocr: Dict[str, Any] = field(default_factory=dict)
    pipeline: Dict[str, Any] = field(default_factory=dict)
  119. # ============================================================
  120. # OCR 测试器
  121. # ============================================================
  122. class OcrEffectivenessTester:
  123. """OCR 模型效果与稳定性测试器"""
  124. def __init__(
  125. self,
  126. ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
  127. ocr_api_key: str = "",
  128. ocr_timeout: int = 600,
  129. dpi: int = 200,
  130. clip_top: float = 60,
  131. clip_bottom: float = 60,
  132. confidence_threshold: float = 0.5,
  133. concurrent_workers: int = 5,
  134. ):
  135. self.dpi = dpi
  136. self.clip_top = clip_top
  137. self.clip_bottom = clip_bottom
  138. self.confidence_threshold = confidence_threshold
  139. self.concurrent_workers = concurrent_workers
  140. # 初始化 OcrProcessor 用以复用其版面检测和 OCR 逻辑
  141. self.ocr_processor = OcrProcessor(
  142. ocr_api_url=ocr_api_url,
  143. ocr_api_key=ocr_api_key,
  144. ocr_timeout=ocr_timeout,
  145. ocr_dpi=dpi,
  146. confidence_threshold=confidence_threshold,
  147. concurrent_workers=concurrent_workers,
  148. )
  149. # 工具: 检查 RapidLayout 是否可用
  150. def check_environment(self) -> Dict[str, bool]:
  151. """检查运行环境依赖"""
  152. return {
  153. "rapid_layout_available": RAPID_LAYOUT_AVAILABLE,
  154. "pymupdf_available": True,
  155. "numpy_available": True,
  156. }
  157. # ============================================================
  158. # 效果测试: 版面检测
  159. # ============================================================
  160. def test_detection(
  161. self,
  162. pdf_path: Path,
  163. pages: Optional[List[int]] = None,
  164. save_images_dir: Optional[Path] = None,
  165. ) -> Dict[str, Any]:
  166. """测试 RapidLayout 版面检测效果"""
  167. if not RAPID_LAYOUT_AVAILABLE:
  168. return {"error": "RapidLayout 未安装,无法测试版面检测"}
  169. doc = fitz.open(str(pdf_path))
  170. try:
  171. total_pages = len(doc)
  172. target_pages = pages if pages is not None else list(range(total_pages))
  173. all_samples: List[DetectionSample] = []
  174. page_results: List[PageDetectionResult] = []
  175. for page_num in target_pages:
  176. page = doc.load_page(page_num)
  177. rect = page.rect
  178. clip_box = fitz.Rect(
  179. 0, self.clip_top,
  180. rect.width, rect.height - self.clip_bottom,
  181. )
  182. # 使用 OcrProcessor 的版面检测逻辑
  183. regions = self.ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
  184. page_samples: List[DetectionSample] = []
  185. for bbox, score, label in regions:
  186. x1, y1, x2, y2 = bbox
  187. page_samples.append(DetectionSample(
  188. page_num=page_num + 1,
  189. label=label,
  190. score=score,
  191. bbox=bbox,
  192. width=x2 - x1,
  193. height=y2 - y1,
  194. ))
  195. all_samples.extend(page_samples)
  196. page_results.append(PageDetectionResult(
  197. page_num=page_num + 1,
  198. samples=page_samples,
  199. table_count=sum(1 for s in page_samples if s.label == "table"),
  200. figure_count=sum(1 for s in page_samples if s.label == "figure"),
  201. ))
  202. # 保存标注图片
  203. if save_images_dir and page_samples:
  204. self._save_detection_image(page, clip_box, page_samples, page_num + 1, save_images_dir)
  205. finally:
  206. doc.close()
  207. # 汇总统计
  208. label_counter = Counter(s.label for s in all_samples)
  209. table_count = label_counter.get("table", 0)
  210. figure_count = label_counter.get("figure", 0)
  211. # 尺寸分布
  212. table_widths = [s.width for s in all_samples if s.label == "table"]
  213. table_heights = [s.height for s in all_samples if s.label == "table"]
  214. # 置信度分布
  215. table_scores = [s.score for s in all_samples if s.label == "table"]
  216. figure_scores = [s.score for s in all_samples if s.label == "figure"]
  217. return {
  218. "status": "ok",
  219. "total_pages": total_pages,
  220. "analyzed_pages": len(target_pages),
  221. "total_regions": len(all_samples),
  222. "label_distribution": dict(label_counter.most_common()),
  223. "table_count": table_count,
  224. "figure_count": figure_count,
  225. "tables_per_page_avg": round(table_count / max(len(target_pages), 1), 2),
  226. "figures_per_page_avg": round(figure_count / max(len(target_pages), 1), 2),
  227. "table_width_avg": round(statistics.mean(table_widths), 1) if table_widths else None,
  228. "table_height_avg": round(statistics.mean(table_heights), 1) if table_heights else None,
  229. "table_score_avg": round(statistics.mean(table_scores), 4) if table_scores else None,
  230. "figure_score_avg": round(statistics.mean(figure_scores), 4) if figure_scores else None,
  231. "table_score_min": round(min(table_scores), 4) if table_scores else None,
  232. "table_score_max": round(max(table_scores), 4) if table_scores else None,
  233. "page_details": [
  234. {
  235. "page": r.page_num,
  236. "table_count": r.table_count,
  237. "figure_count": r.figure_count,
  238. "regions": [
  239. {
  240. "label": s.label,
  241. "score": round(s.score, 4),
  242. "bbox": [round(c, 1) for c in s.bbox],
  243. "size": [round(s.width, 1), round(s.height, 1)],
  244. }
  245. for s in r.samples
  246. ],
  247. }
  248. for r in page_results if r.samples
  249. ],
  250. }
  251. # ============================================================
  252. # 效果测试: OCR 识别
  253. # ============================================================
  254. def test_ocr_recognition(
  255. self,
  256. pdf_path: Path,
  257. pages: Optional[List[int]] = None,
  258. max_regions_per_page: int = 5,
  259. ) -> Dict[str, Any]:
  260. """测试 GLM-OCR 识别质量,先检测表格区域再逐个识别"""
  261. doc = fitz.open(str(pdf_path))
  262. try:
  263. total_pages = len(doc)
  264. target_pages = pages if pages is not None else list(range(total_pages))
  265. # 阶段1: 收集表格区域
  266. all_regions: List[TableRegion] = []
  267. for page_num in target_pages:
  268. page = doc.load_page(page_num)
  269. rect = page.rect
  270. clip_box = fitz.Rect(
  271. 0, self.clip_top,
  272. rect.width, rect.height - self.clip_bottom,
  273. )
  274. regions = self.ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
  275. for bbox, score, label in regions[:max_regions_per_page]:
  276. all_regions.append(TableRegion(
  277. page_num=page_num + 1,
  278. page=page,
  279. bbox=bbox,
  280. score=score,
  281. label=label,
  282. ))
  283. if not all_regions:
  284. return {
  285. "status": "no_regions",
  286. "message": "未检测到表格区域,无需 OCR 识别",
  287. "total_pages": total_pages,
  288. }
  289. # 阶段2: 串行逐个识别(记录详细统计)
  290. ocr_samples: List[OcrSampleResult] = []
  291. total = len(all_regions)
  292. print(f"\n [OCR识别测试] 共 {total} 个区域,开始串行识别...")
  293. for idx, region in enumerate(all_regions):
  294. start_time = time.perf_counter()
  295. retry_count = 0
  296. error = None
  297. text = ""
  298. success = False
  299. # 手动调用 _ocr_table_region 并记录重试次数
  300. # (使用指数退避重试,最多3次)
  301. for attempt in range(3):
  302. try:
  303. text = self.ocr_processor._ocr_table_region(
  304. region.page, region.bbox, max_retries=1,
  305. )
  306. success = True
  307. retry_count = attempt
  308. break
  309. except Exception as e:
  310. error = str(e)[:200]
  311. if attempt < 2:
  312. time.sleep(1)
  313. latency = (time.perf_counter() - start_time) * 1000
  314. # 判断是否为 Non-table
  315. is_non_table = text.strip() == ""
  316. ocr_samples.append(OcrSampleResult(
  317. page_num=region.page_num,
  318. label=region.label,
  319. score=region.score,
  320. bbox=region.bbox,
  321. text=text,
  322. text_length=len(text.strip()),
  323. success=success or is_non_table, # Non-table 也算成功
  324. latency_ms=round(latency, 1),
  325. retry_count=retry_count,
  326. error=error if not success and not is_non_table else None,
  327. ))
  328. progress = f"[{idx + 1}/{total}]"
  329. status = "OK" if success else f"FAIL({error[:40]})"
  330. print(f" {progress} 第{region.page_num}页 [{region.label}] "
  331. f"score={region.score:.2f} 耗时={latency:.0f}ms 状态={status}")
  332. finally:
  333. doc.close()
  334. # 统计
  335. total_count = len(ocr_samples)
  336. success_count = sum(1 for s in ocr_samples if s.success)
  337. non_table_count = sum(1 for s in ocr_samples if not s.text.strip())
  338. table_with_content = sum(1 for s in ocr_samples if s.text.strip())
  339. latencies = [s.latency_ms for s in ocr_samples if s.success]
  340. text_lengths = [s.text_length for s in ocr_samples if s.text_length > 0]
  341. return {
  342. "status": "ok",
  343. "total_regions": total_count,
  344. "success_count": success_count,
  345. "non_table_count": non_table_count,
  346. "table_with_content": table_with_content,
  347. "success_rate": round(success_count / max(total_count, 1) * 100, 1),
  348. "content_rate": round(table_with_content / max(total_count, 1) * 100, 1),
  349. "latency_ms_avg": round(statistics.mean(latencies), 0) if latencies else None,
  350. "latency_ms_min": round(min(latencies), 0) if latencies else None,
  351. "latency_ms_max": round(max(latencies), 0) if latencies else None,
  352. "latency_ms_p50": self._percentile(latencies, 50) if latencies else None,
  353. "latency_ms_p95": self._percentile(latencies, 95) if latencies else None,
  354. "text_length_avg": round(statistics.mean(text_lengths), 0) if text_lengths else None,
  355. "text_length_max": max(text_lengths) if text_lengths else None,
  356. "retry_distribution": dict(Counter(s.retry_count for s in ocr_samples).most_common()),
  357. "label_breakdown": {
  358. label: {
  359. "count": sum(1 for s in ocr_samples if s.label == label),
  360. "success": sum(1 for s in ocr_samples if s.label == label and s.success),
  361. "with_content": sum(1 for s in ocr_samples if s.label == label and s.text.strip()),
  362. }
  363. for label in set(s.label for s in ocr_samples)
  364. },
  365. "errors": list(set(s.error for s in ocr_samples if s.error))[:10],
  366. "samples": [
  367. {
  368. "page": s.page_num,
  369. "label": s.label,
  370. "score": round(s.score, 4),
  371. "text_preview": s.text[:200] if s.text else "(empty/Non-table)",
  372. "text_length": s.text_length,
  373. "success": s.success,
  374. "latency_ms": s.latency_ms,
  375. "retry_count": s.retry_count,
  376. }
  377. for s in ocr_samples[:20] # 只保留前20个样本
  378. ],
  379. }
  380. # ============================================================
  381. # 稳定性测试: 并发 + 重试
  382. # ============================================================
  383. def test_stability(
  384. self,
  385. pdf_path: Path,
  386. concurrency: int = 5,
  387. iterations: int = 10,
  388. pages: Optional[List[int]] = None,
  389. ) -> Dict[str, Any]:
  390. """稳定性测试:高并发 OCR 调用,观测错误率、延迟分布、资源泄漏
  391. Args:
  392. concurrency: 并发线程数
  393. iterations: 总 OCR 调用次数(分配到各区域)
  394. """
  395. doc = fitz.open(str(pdf_path))
  396. try:
  397. total_pages = len(doc)
  398. target_pages = pages if pages is not None else list(range(min(total_pages, 10)))
  399. # 收集一定数量的表格区域作为测试样本
  400. all_regions: List[TableRegion] = []
  401. for page_num in target_pages:
  402. page = doc.load_page(page_num)
  403. rect = page.rect
  404. clip_box = fitz.Rect(
  405. 0, self.clip_top,
  406. rect.width, rect.height - self.clip_bottom,
  407. )
  408. regions = self.ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
  409. for bbox, score, label in regions:
  410. all_regions.append(TableRegion(
  411. page_num=page_num + 1,
  412. page=page,
  413. bbox=bbox,
  414. score=score,
  415. label=label,
  416. ))
  417. if not all_regions:
  418. return {
  419. "status": "no_regions",
  420. "message": "未检测到表格区域,跳过稳定性测试",
  421. }
  422. # 循环分配任务: 每次从 regions 列表循环取一个
  423. total_tasks = min(iterations, len(all_regions) * 3)
  424. task_regions = [all_regions[i % len(all_regions)] for i in range(total_tasks)]
  425. print(f"\n [稳定性测试] 并发={concurrency}, 任务数={total_tasks}, 区域样本数={len(all_regions)}")
  426. # 并发执行 OCR
  427. ocr_samples: List[OcrSampleResult] = []
  428. progress_lock = [0]
  429. def _ocr_task(region: TableRegion, task_idx: int) -> OcrSampleResult:
  430. start_time = time.perf_counter()
  431. error = None
  432. text = ""
  433. success = False
  434. retry_count = 0
  435. for attempt in range(3):
  436. try:
  437. text = self.ocr_processor._ocr_table_region(
  438. region.page, region.bbox, max_retries=1,
  439. )
  440. success = True
  441. retry_count = attempt
  442. break
  443. except Exception as e:
  444. error = str(e)[:200]
  445. time.sleep(0.5)
  446. latency = (time.perf_counter() - start_time) * 1000
  447. with ThreadPoolExecutor._thread_queues:
  448. pass # dummy for lock
  449. # 简单进度
  450. progress_lock[0] += 1
  451. done = progress_lock[0]
  452. if done % max(1, total_tasks // 10) == 0 or done == total_tasks:
  453. pct = done / total_tasks * 100
  454. print(f" [进度] {done}/{total_tasks} ({pct:.0f}%)", flush=True)
  455. return OcrSampleResult(
  456. page_num=region.page_num,
  457. label=region.label,
  458. score=region.score,
  459. bbox=region.bbox,
  460. text=text,
  461. text_length=len(text.strip()),
  462. success=success,
  463. latency_ms=round(latency, 1),
  464. retry_count=retry_count,
  465. error=error if not success else None,
  466. )
  467. # 使用 ThreadPoolExecutor 并发执行
  468. results: List[OcrSampleResult] = []
  469. with ThreadPoolExecutor(max_workers=concurrency) as executor:
  470. futures = {
  471. executor.submit(_ocr_task, region, idx): (region, idx)
  472. for idx, region in enumerate(task_regions)
  473. }
  474. for future in as_completed(futures):
  475. try:
  476. results.append(future.result())
  477. except Exception as e:
  478. # 不会发生,因为内部已 catch
  479. pass
  480. ocr_samples = results
  481. finally:
  482. doc.close()
  483. # 统计
  484. total_count = len(ocr_samples)
  485. success_count = sum(1 for s in ocr_samples if s.success)
  486. non_table_count = sum(1 for s in ocr_samples if not s.text.strip())
  487. table_with_content = sum(1 for s in ocr_samples if s.text.strip())
  488. fail_count = total_count - success_count
  489. latencies = sorted(s.latency_ms for s in ocr_samples if s.success)
  490. return {
  491. "status": "ok",
  492. "concurrency": concurrency,
  493. "total_requests": total_count,
  494. "success_count": success_count,
  495. "fail_count": fail_count,
  496. "non_table_count": non_table_count,
  497. "table_with_content": table_with_content,
  498. "success_rate": round(success_count / max(total_count, 1) * 100, 1),
  499. "error_rate": round(fail_count / max(total_count, 1) * 100, 1),
  500. "latency_ms_avg": round(statistics.mean(latencies), 0) if latencies else None,
  501. "latency_ms_min": min(latencies) if latencies else None,
  502. "latency_ms_max": max(latencies) if latencies else None,
  503. "latency_ms_p50": self._percentile(latencies, 50) if latencies else None,
  504. "latency_ms_p95": self._percentile(latencies, 95) if latencies else None,
  505. "latency_ms_p99": self._percentile(latencies, 99) if latencies else None,
  506. "latency_ms_std": round(statistics.stdev(latencies), 0) if len(latencies) > 1 else None,
  507. "retry_distribution": dict(Counter(s.retry_count for s in ocr_samples).most_common()),
  508. "errors": list(set(s.error for s in ocr_samples if s.error))[:10],
  509. }
  510. # ============================================================
  511. # 全链路测试: 检测 → OCR → 回填
  512. # ============================================================
  513. def test_full_pipeline(
  514. self,
  515. pdf_path: Path,
  516. pages: Optional[List[int]] = None,
  517. ) -> Dict[str, Any]:
  518. """测试 OCR 全链路: 版面检测 → 并发 OCR → 文本回填"""
  519. doc = fitz.open(str(pdf_path))
  520. try:
  521. total_pages = len(doc)
  522. target_pages = pages if pages is not None else list(range(total_pages))
  523. # 阶段1: 检测表格区域
  524. all_regions: List[TableRegion] = []
  525. for page_num in target_pages:
  526. page = doc.load_page(page_num)
  527. rect = page.rect
  528. clip_box = fitz.Rect(
  529. 0, self.clip_top,
  530. rect.width, rect.height - self.clip_bottom,
  531. )
  532. regions = self.ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
  533. for bbox, score, label in regions:
  534. all_regions.append(TableRegion(
  535. page_num=page_num + 1,
  536. page=page,
  537. bbox=bbox,
  538. score=score,
  539. label=label,
  540. ))
  541. table_count = sum(1 for r in all_regions if r.label == "table")
  542. figure_count = sum(1 for r in all_regions if r.label == "figure")
  543. if not all_regions:
  544. return {
  545. "status": "no_regions",
  546. "total_pages": total_pages,
  547. "message": "未检测到表格/图片区域",
  548. }
  549. # 阶段2: 并发 OCR
  550. ocr_start = time.perf_counter()
  551. ocr_results = extractor._process_ocr_concurrent(all_regions)
  552. ocr_elapsed = time.perf_counter() - ocr_start
  553. ocr_success = sum(1 for r in ocr_results if r.success and r.text.strip())
  554. ocr_fail = sum(1 for r in ocr_results if not r.success)
  555. ocr_empty = sum(1 for r in ocr_results if r.success and not r.text.strip())
  556. # 阶段3: 检查文本回填效果
  557. # 对每页对比 原始文本 vs OCR回填文本
  558. page_comparison = []
  559. for page_num in target_pages:
  560. page = doc.load_page(page_num)
  561. rect = page.rect
  562. clip_box = fitz.Rect(
  563. 0, self.clip_top,
  564. rect.width, rect.height - self.clip_bottom,
  565. )
  566. original_text = page.get_text("text", clip=clip_box)
  567. page_ocr_results = [
  568. {
  569. "region_index": i,
  570. "bbox": r.bbox,
  571. "score": r.score,
  572. "ocr_text": r.text,
  573. }
  574. for i, r in enumerate(ocr_results)
  575. if r.page_num == page_num + 1 and r.success
  576. ]
  577. replaced_text = extractor._replace_table_regions(
  578. page, original_text, page_ocr_results, clip_box,
  579. )
  580. has_replacement = replaced_text != original_text
  581. page_comparison.append({
  582. "page": page_num + 1,
  583. "original_length": len(original_text),
  584. "replaced_length": len(replaced_text),
  585. "has_replacement": has_replacement,
  586. "ocr_regions_on_page": len(page_ocr_results),
  587. "length_change": len(replaced_text) - len(original_text),
  588. })
  589. finally:
  590. doc.close()
  591. replaced_pages = sum(1 for p in page_comparison if p["has_replacement"])
  592. total_latencies = [r.latency_ms for r in ocr_results if r.success]
  593. return {
  594. "status": "ok",
  595. "total_pages": total_pages,
  596. "analyzed_pages": len(target_pages),
  597. "total_regions": len(all_regions),
  598. "table_count": table_count,
  599. "figure_count": figure_count,
  600. "ocr_results": {
  601. "total": len(ocr_results),
  602. "success_with_content": ocr_success,
  603. "empty_non_table": ocr_empty,
  604. "failed": ocr_fail,
  605. "content_rate": round(ocr_success / max(len(ocr_results), 1) * 100, 1),
  606. "ocr_total_time_s": round(ocr_elapsed, 2),
  607. "ocr_avg_latency_ms": round(statistics.mean(total_latencies), 0) if total_latencies else None,
  608. },
  609. "replacement": {
  610. "pages_with_replacement": replaced_pages,
  611. "replacement_rate": round(replaced_pages / max(len(target_pages), 1) * 100, 1),
  612. },
  613. "page_details": page_comparison[:30],
  614. }
  615. # ============================================================
  616. # 辅助方法
  617. # ============================================================
  618. @staticmethod
  619. def _percentile(data: List[float], p: float) -> float:
  620. if not data:
  621. return 0.0
  622. sorted_data = sorted(data)
  623. idx = max(0, min(len(sorted_data) - 1, int(len(sorted_data) * p / 100)))
  624. return round(sorted_data[idx], 0)
  625. def _save_detection_image(
  626. self,
  627. page: fitz.Page,
  628. clip_box: fitz.Rect,
  629. samples: List[DetectionSample],
  630. page_num: int,
  631. output_dir: Path,
  632. ):
  633. """保存带检测框的页面图片"""
  634. try:
  635. from PIL import Image, ImageDraw
  636. except ImportError:
  637. return
  638. pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
  639. img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
  640. draw = ImageDraw.Draw(img)
  641. # 还原原始图片尺寸(裁剪前)
  642. scale_x = pix.width / clip_box.width
  643. scale_y = pix.height / clip_box.height
  644. colors = {
  645. "table": (0, 255, 0),
  646. "figure": (255, 80, 80),
  647. }
  648. for s in samples:
  649. color = colors.get(s.label, (200, 200, 200))
  650. x1 = (s.bbox[0] - clip_box.x0) * scale_x
  651. y1 = (s.bbox[1] - clip_box.y0) * scale_y
  652. x2 = (s.bbox[2] - clip_box.x0) * scale_x
  653. y2 = (s.bbox[3] - clip_box.y0) * scale_y
  654. draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
  655. draw.text((x1 + 2, y1 + 2), f"{s.label} ({s.score:.2f})", fill=color)
  656. output_path = output_dir / f"page_{page_num:03d}_ocr_detection.jpg"
  657. img.save(str(output_path), quality=85)
  658. # ============================================================
  659. # 报告打印
  660. # ============================================================
  661. def print_env_report(env: Dict[str, bool]):
  662. print("\n" + "=" * 60)
  663. print(" 环境检查")
  664. print("=" * 60)
  665. for k, v in env.items():
  666. status = "✓ 可用" if v else "✗ 不可用"
  667. print(f" {k:30s}: {status}")
  668. if not env.get("rapid_layout_available"):
  669. print("\n ⚠ RapidLayout 未安装,版面检测功能受限")
  670. print(" 安装: pip install rapid-layout")
  671. print()
  672. def print_detection_report(result: Dict[str, Any]):
  673. if "error" in result:
  674. print(f" [错误] {result['error']}")
  675. return
  676. print("\n" + "=" * 70)
  677. print(" 版面检测效果报告 (RapidLayout)")
  678. print("=" * 70)
  679. print(f" 分析页数: {result['analyzed_pages']} / {result['total_pages']}")
  680. print(f" 检测区域总数: {result['total_regions']}")
  681. print(f" 表格数: {result['table_count']} (均 {result['tables_per_page_avg']}/页)")
  682. print(f" 图片数: {result['figure_count']} (均 {result['figures_per_page_avg']}/页)")
  683. print()
  684. if result["label_distribution"]:
  685. print(" 标签分布:")
  686. for label, count in result["label_distribution"].items():
  687. pct = count / max(result["total_regions"], 1) * 100
  688. bar = "█" * int(pct / 2)
  689. print(f" {label:12s}: {count:4d} ({pct:5.1f}%) {bar}")
  690. print()
  691. if result.get("table_score_avg"):
  692. print(f" 表格置信度: avg={result['table_score_avg']:.3f} "
  693. f"min={result['table_score_min']:.3f} max={result['table_score_max']:.3f}")
  694. if result.get("figure_score_avg"):
  695. print(f" 图片置信度: avg={result['figure_score_avg']:.3f}")
  696. if result.get("page_details"):
  697. print()
  698. print(" 逐页详情:")
  699. for p in result["page_details"]:
  700. regions_str = ", ".join(
  701. f"[{r['label']}]({r['score']:.2f})"
  702. for r in p["regions"]
  703. )
  704. print(f" 第{p['page']:3d}页: table={p['table_count']} figure={p['figure_count']} {regions_str}")
  705. def print_ocr_report(result: Dict[str, Any]):
  706. if "error" in result:
  707. print(f" [错误] {result['error']}")
  708. return
  709. if result.get("status") == "no_regions":
  710. print(f"\n [提示] {result['message']}")
  711. return
  712. print("\n" + "=" * 70)
  713. print(" OCR 识别效果报告 (GLM-OCR)")
  714. print("=" * 70)
  715. print(f" 总区域数: {result['total_regions']}")
  716. print(f" 识别成功: {result['success_count']} ({result['success_rate']}%)")
  717. print(f" 含表格内容: {result['table_with_content']} ({result['content_rate']}%)")
  718. print(f" Non-table(跳过): {result['non_table_count']}")
  719. if result.get("latency_ms_avg"):
  720. print(f"\n 延迟统计 (ms):")
  721. print(f" 平均: {result['latency_ms_avg']:.0f}")
  722. print(f" 最小: {result['latency_ms_min']:.0f}")
  723. print(f" 最大: {result['latency_ms_max']:.0f}")
  724. print(f" P50: {result['latency_ms_p50']:.0f}")
  725. print(f" P95: {result['latency_ms_p95']:.0f}")
  726. if result.get("text_length_avg"):
  727. print(f"\n 文本长度: avg={result['text_length_avg']:.0f} max={result['text_length_max']}")
  728. if result.get("retry_distribution"):
  729. print(f"\n 重试分布: {result['retry_distribution']}")
  730. if result.get("label_breakdown"):
  731. print(f"\n 按标签统计:")
  732. for label, stats in result["label_breakdown"].items():
  733. print(f" {label:8s}: 总数={stats['count']}, 成功={stats['success']}, "
  734. f"含内容={stats['with_content']}")
  735. if result.get("errors"):
  736. print(f"\n 错误 ({len(result['errors'])} 种):")
  737. for e in result["errors"]:
  738. print(f" - {e}")
  739. if result.get("samples"):
  740. print(f"\n 样本预览 (前20):")
  741. print(f" {'页':>4s} {'标签':>8s} {'置信度':>8s} {'耗时ms':>8s} {'重试':>4s} {'内容':>6s} {'预览'}")
  742. print(f" {'-'*60}")
  743. for s in result["samples"]:
  744. preview = (s["text_preview"][:50] + "..") if len(s.get("text_preview", "")) > 50 else s.get("text_preview", "")
  745. ok = "✓" if s["success"] else "✗"
  746. print(f" {s['page']:4d} {s['label']:>8s} {s['score']:.2f} {s['latency_ms']:6.0f} {s['retry_count']:3d} "
  747. f"{ok:>4s} {preview}")
  748. print()
  749. def print_stability_report(result: Dict[str, Any]):
  750. if result.get("status") == "no_regions":
  751. print(f"\n [提示] {result['message']}")
  752. return
  753. print("\n" + "=" * 70)
  754. print(" 稳定性测试报告")
  755. print("=" * 70)
  756. print(f" 并发数: {result['concurrency']}")
  757. print(f" 总请求数: {result['total_requests']}")
  758. print(f" 成功: {result['success_count']} ({result['success_rate']}%)")
  759. print(f" 失败: {result['fail_count']} ({result['error_rate']}%)")
  760. print(f" 含表格内容: {result['table_with_content']}")
  761. print(f" Non-table跳过: {result['non_table_count']}")
  762. if result.get("latency_ms_avg"):
  763. print(f"\n 延迟统计 (ms):")
  764. print(f" 平均: {result['latency_ms_avg']:.0f}")
  765. print(f" 最小: {result['latency_ms_min']:.0f}")
  766. print(f" 最大: {result['latency_ms_max']:.0f}")
  767. print(f" P50: {result['latency_ms_p50']:.0f}")
  768. print(f" P95: {result['latency_ms_p95']:.0f}")
  769. print(f" P99: {result['latency_ms_p99']:.0f}")
  770. if result.get("latency_ms_std"):
  771. print(f" 标准差: {result['latency_ms_std']:.0f}")
  772. if result.get("retry_distribution"):
  773. print(f"\n 重试分布: {result['retry_distribution']}")
  774. if result.get("errors"):
  775. print(f"\n 错误列表:")
  776. for e in result["errors"]:
  777. print(f" - {e}")
  778. print()
  779. def print_pipeline_report(result: Dict[str, Any]):
  780. if result.get("status") == "no_regions":
  781. print(f"\n [提示] {result['message']}")
  782. return
  783. print("\n" + "=" * 70)
  784. print(" 全链路测试报告 (检测 → OCR → 回填)")
  785. print("=" * 70)
  786. print(f" 总页数: {result['total_pages']}")
  787. print(f" 分析页数: {result['analyzed_pages']}")
  788. print(f" 检测区域: 表格={result['table_count']}, 图片={result['figure_count']}")
  789. ocr = result.get("ocr_results", {})
  790. print(f"\n OCR 识别:")
  791. print(f" 总区域: {ocr.get('total', 0)}")
  792. print(f" 含内容: {ocr.get('success_with_content', 0)} ({ocr.get('content_rate', 0)}%)")
  793. print(f" Non-table跳过: {ocr.get('empty_non_table', 0)}")
  794. print(f" 失败: {ocr.get('failed', 0)}")
  795. print(f" 总耗时: {ocr.get('ocr_total_time_s', 0)}s")
  796. if ocr.get("ocr_avg_latency_ms"):
  797. print(f" 平均延迟: {ocr['ocr_avg_latency_ms']:.0f}ms")
  798. repl = result.get("replacement", {})
  799. print(f"\n 文本回填:")
  800. print(f" 发生替换的页数: {repl.get('pages_with_replacement', 0)}/{result['analyzed_pages']} ({repl.get('replacement_rate', 0)}%)")
  801. print()
  802. def _save_ocr_region_images(
  803. pdf_path: Path,
  804. det_result: Dict[str, Any],
  805. table_img_dir: Path,
  806. figure_img_dir: Path,
  807. tester: OcrEffectivenessTester,
  808. ) -> Dict[str, int]:
  809. """将检测到的表格/图片区域截图和OCR识别内容分别保存到对应目录"""
  810. from PIL import Image
  811. count = {"table": 0, "figure": 0}
  812. page_details = det_result.get("page_details", [])
  813. if not page_details:
  814. return count
  815. doc = fitz.open(str(pdf_path))
  816. try:
  817. for page_info in page_details:
  818. page_num = page_info["page"] - 1
  819. page = doc.load_page(page_num)
  820. rect = page.rect
  821. clip_box = fitz.Rect(0, tester.clip_top, rect.width, rect.height - tester.clip_bottom)
  822. for region in page_info.get("regions", []):
  823. label = region["label"]
  824. bbox = region["bbox"]
  825. score = region["score"]
  826. # 确定保存目录
  827. if label == "table":
  828. target_dir = table_img_dir
  829. elif label == "figure":
  830. target_dir = figure_img_dir
  831. else:
  832. continue
  833. pdf_rect = fitz.Rect(bbox)
  834. pix = page.get_pixmap(dpi=tester.dpi, clip=pdf_rect)
  835. img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
  836. basename = f"{pdf_path.stem}_p{page_info['page']:03d}_{label}_{count[label]:02d}"
  837. img_path = target_dir / f"{basename}.jpg"
  838. img.save(str(img_path), quality=90)
  839. # 尝试 OCR 识别并保存文本内容
  840. try:
  841. ocr_text = tester.ocr_processor._ocr_table_region(page, bbox, max_retries=1)
  842. txt_path = target_dir / f"{basename}.txt"
  843. with open(txt_path, "w", encoding="utf-8") as f:
  844. f.write(ocr_text)
  845. except Exception as e:
  846. txt_path = target_dir / f"{basename}.txt"
  847. with open(txt_path, "w", encoding="utf-8") as f:
  848. f.write(f"[OCR识别失败] {e}")
  849. count[label] += 1
  850. finally:
  851. doc.close()
  852. return count
  853. def _test_catalog_ocr(
  854. pdf_path: Path,
  855. tester: OcrEffectivenessTester,
  856. catalog_dir: Path,
  857. detection_img_dir: Path,
  858. timestamp: str = "",
  859. ) -> Dict[str, Any]:
  860. """测试目录页(目录)OCR识别效果,使用项目实际的 _extract_catalog 链路
  861. 输出格式与项目 catalog JSON 一致:
  862. {"chapters": [...], "total_chapters": N, "raw_ocr_text": "...", "formatted_text": "..."}
  863. """
  864. from utils_test.minimal_pipeline._toc_detector import TOCCatalogExtractor
  865. file_content = pdf_path.read_bytes()
  866. # 文件名前缀(提前定义,供后续截图使用)
  867. suffix = f"_{timestamp}" if timestamp else ""
  868. stem = f"{pdf_path.stem}{suffix}"
  869. # 使用 YOLO + GLM-OCR 目录提取链路(不依赖 core/foundation)
  870. extractor = TOCCatalogExtractor(
  871. model_path=str(TEST_DIR / "best.pt"),
  872. ocr_api_url=tester.ocr_processor.ocr_api_url,
  873. ocr_api_key=tester.ocr_processor.ocr_api_key,
  874. ocr_timeout=tester.ocr_processor.ocr_timeout,
  875. )
  876. catalog = extractor.detect_and_extract(file_content)
  877. catalog = catalog or {}
  878. # 保存目录页截图(使用 YOLO 检测到的目录页码范围)
  879. from PIL import Image as PILImage
  880. try:
  881. doc = fitz.open(stream=file_content)
  882. try:
  883. toc_range = (catalog or {}).get("toc_page_range")
  884. if toc_range:
  885. for page_num in range(toc_range["start"] - 1, toc_range["end"]):
  886. page = doc.load_page(page_num)
  887. pix = page.get_pixmap(dpi=150)
  888. img = PILImage.frombytes("RGB", [pix.width, pix.height], pix.samples)
  889. img_path = catalog_dir / f"{stem}_catalog_page_{page_num + 1:03d}.jpg"
  890. img.save(str(img_path), quality=85)
  891. finally:
  892. doc.close()
  893. except Exception as e:
  894. print(f" [警告] 目录页截图保存失败: {e}")
  895. # 构造与项目格式一致的 catalog 输出
  896. # 格式: {"catalog": {"chapters": [...], "total_chapters": N}, "raw_ocr_text": "..."}
  897. # 不含 content/page_start/page_end 等后续流程才填充的字段
  898. catalog_output: Dict[str, Any] = {
  899. "catalog": {
  900. "chapters": [],
  901. "total_chapters": 0,
  902. },
  903. "raw_ocr_text": "",
  904. }
  905. raw_ocr_text = ""
  906. extract_status = "failed"
  907. if catalog:
  908. chapters = catalog.get("chapters", [])
  909. # 清理掉可能混入的 content 等字段(仅保留 catalog 原始字段)
  910. clean_chapters = []
  911. for ch in chapters:
  912. clean_ch = {
  913. "index": ch.get("index", 0),
  914. "title": ch.get("title", ""),
  915. "page": str(ch.get("page", "")),
  916. "original": ch.get("original", ""),
  917. }
  918. clean_subs = []
  919. for sub in ch.get("subsections", []):
  920. clean_subs.append({
  921. "title": sub.get("title", ""),
  922. "page": str(sub.get("page", "")),
  923. "level": sub.get("level", 2),
  924. "original": sub.get("original", ""),
  925. })
  926. clean_ch["subsections"] = clean_subs
  927. clean_chapters.append(clean_ch)
  928. catalog_output["catalog"]["chapters"] = clean_chapters
  929. catalog_output["catalog"]["total_chapters"] = len(clean_chapters)
  930. raw_ocr_text = catalog.get("raw_ocr_text", "") or ""
  931. catalog_output["raw_ocr_text"] = raw_ocr_text
  932. extract_status = "success"
  933. # 保存 catalog JSON(含 raw_ocr_text 字段替代单独的 txt)
  934. json_path = catalog_dir / f"{stem}_catalog.json"
  935. with open(json_path, "w", encoding="utf-8") as f:
  936. json.dump(catalog_output, f, ensure_ascii=False, indent=2)
  937. # 返回简洁统计信息,json_path 供主流程使用
  938. return {
  939. "extract_status": extract_status,
  940. "total_chapters": catalog_output["catalog"]["total_chapters"],
  941. "raw_ocr_length": len(raw_ocr_text),
  942. "json_path": str(json_path),
  943. }
  944. def _test_catalog_stability(
  945. pdf_path: Path,
  946. tester: OcrEffectivenessTester,
  947. catalog_dir: Path,
  948. iterations: int = 10,
  949. ) -> Dict[str, Any]:
  950. """目录OCR稳定性测试:多次执行目录提取,对比结果一致性"""
  951. from utils_test.minimal_pipeline._toc_detector import TOCCatalogExtractor
  952. file_content = pdf_path.read_bytes()
  953. timestamp = time.strftime("%Y%m%d_%H%M%S")
  954. results = []
  955. # 先快速提取一次获取目录页范围用于截图
  956. first_extractor = TOCCatalogExtractor(
  957. model_path=str(TEST_DIR / "best.pt"),
  958. ocr_api_url=tester.ocr_processor.ocr_api_url,
  959. ocr_api_key=tester.ocr_processor.ocr_api_key,
  960. ocr_timeout=tester.ocr_processor.ocr_timeout,
  961. )
  962. first_catalog = first_extractor.detect_and_extract(file_content) or {}
  963. # 保存目录页截图(使用 YOLO 检测到的目录页码范围)
  964. from PIL import Image as PILImage
  965. try:
  966. doc = fitz.open(stream=file_content)
  967. try:
  968. toc_range = first_catalog.get("toc_page_range")
  969. if toc_range:
  970. for page_num in range(toc_range["start"] - 1, toc_range["end"]):
  971. page = doc.load_page(page_num)
  972. pix = page.get_pixmap(dpi=150)
  973. img = PILImage.frombytes("RGB", [pix.width, pix.height], pix.samples)
  974. img_path = catalog_dir / f"{pdf_path.stem}_{timestamp}_catalog_page_{page_num + 1:03d}.jpg"
  975. img.save(str(img_path), quality=85)
  976. finally:
  977. doc.close()
  978. print(f" [保存] 目录页截图 → {catalog_dir}/")
  979. except Exception as e:
  980. print(f" [警告] 目录页截图保存失败: {e}")
  981. print(f"\n [目录稳定性] 开始 {iterations} 次目录提取...")
  982. for i in range(iterations):
  983. extractor = TOCCatalogExtractor(
  984. model_path=str(TEST_DIR / "best.pt"),
  985. ocr_api_url=tester.ocr_processor.ocr_api_url,
  986. ocr_api_key=tester.ocr_processor.ocr_api_key,
  987. ocr_timeout=tester.ocr_processor.ocr_timeout,
  988. )
  989. start = time.perf_counter()
  990. try:
  991. catalog = extractor.detect_and_extract(file_content)
  992. catalog = catalog or {}
  993. elapsed = (time.perf_counter() - start) * 1000
  994. if catalog:
  995. chapters = catalog.get("chapters", [])
  996. raw_text = catalog.get("raw_ocr_text", "") or ""
  997. chapter_titles = [ch.get("title", "") for ch in chapters]
  998. # 保存本次运行的独立 catalog JSON(含 raw_ocr_text)
  999. run_output = {
  1000. "run": i + 1,
  1001. "catalog": {
  1002. "chapters": [
  1003. {"index": ch.get("index"), "title": ch.get("title", ""),
  1004. "page": str(ch.get("page", "")), "original": ch.get("original", ""),
  1005. "subsections": [
  1006. {"title": s.get("title", ""), "page": str(s.get("page", "")),
  1007. "level": s.get("level", 2), "original": s.get("original", "")}
  1008. for s in (ch.get("subsections") or [])
  1009. ]}
  1010. for ch in chapters
  1011. ],
  1012. "total_chapters": len(chapters),
  1013. },
  1014. "raw_ocr_text": raw_text,
  1015. "elapsed_ms": round(elapsed, 0),
  1016. }
  1017. run_path = catalog_dir / f"{pdf_path.stem}_{timestamp}_{i + 1:03d}_catalog.json"
  1018. with open(run_path, "w", encoding="utf-8") as f:
  1019. json.dump(run_output, f, ensure_ascii=False, indent=2)
  1020. results.append({
  1021. "run": i + 1,
  1022. "success": True,
  1023. "elapsed_ms": round(elapsed, 0),
  1024. "chapter_count": len(chapters),
  1025. "chapter_titles": chapter_titles,
  1026. "raw_text_len": len(raw_text),
  1027. "raw_text_hash": hash(raw_text),
  1028. "json_path": str(run_path),
  1029. "raw_text": raw_text,
  1030. })
  1031. else:
  1032. results.append({
  1033. "run": i + 1, "success": False, "elapsed_ms": round(elapsed, 0),
  1034. "chapter_count": 0, "error": "catalog is None",
  1035. })
  1036. except Exception as e:
  1037. results.append({
  1038. "run": i + 1, "success": False,
  1039. "elapsed_ms": round((time.perf_counter() - start) * 1000, 0),
  1040. "error": str(e)[:200],
  1041. })
  1042. r = results[-1]
  1043. print(f" [{i + 1}/{iterations}] "
  1044. f"{'OK' if r['success'] else 'FAIL'} "
  1045. f"{r.get('chapter_count', 0):>2}章 "
  1046. f"{r.get('elapsed_ms', 0):.0f}ms", flush=True)
  1047. # ---- 一致性分析 ----
  1048. success_runs = [r for r in results if r["success"]]
  1049. fail_count = len(results) - len(success_runs)
  1050. title_sequences = [tuple(r["chapter_titles"]) for r in success_runs]
  1051. unique_sequences = set(title_sequences)
  1052. text_hashes = {r.get("raw_text_hash") for r in success_runs if "raw_text_hash" in r}
  1053. chapter_counts = [r["chapter_count"] for r in success_runs]
  1054. count_distribution = {}
  1055. for c in chapter_counts:
  1056. count_distribution[c] = count_distribution.get(c, 0) + 1
  1057. patterns = []
  1058. for seq in unique_sequences:
  1059. matching_runs = [r["run"] for r in success_runs if tuple(r["chapter_titles"]) == seq]
  1060. patterns.append({"titles": list(seq), "count": len(matching_runs), "runs": matching_runs})
  1061. patterns.sort(key=lambda x: -x["count"])
  1062. latencies = [r["elapsed_ms"] for r in success_runs]
  1063. stability = {
  1064. "total_runs": len(results),
  1065. "success_count": len(success_runs),
  1066. "fail_count": fail_count,
  1067. "success_rate": round(len(success_runs) / max(len(results), 1) * 100, 1),
  1068. "all_titles_identical": len(unique_sequences) <= 1,
  1069. "all_text_identical": len(text_hashes) <= 1,
  1070. "unique_title_patterns": len(unique_sequences),
  1071. "unique_text_hashes": len(text_hashes),
  1072. "chapter_count_distribution": count_distribution,
  1073. "most_common_chapter_count": max(set(chapter_counts), key=chapter_counts.count) if chapter_counts else 0,
  1074. "latency_ms_avg": round(statistics.mean(latencies), 0) if latencies else None,
  1075. "latency_ms_min": min(latencies) if latencies else None,
  1076. "latency_ms_max": max(latencies) if latencies else None,
  1077. "patterns": patterns,
  1078. }
  1079. output = {"stability": stability, "runs": results}
  1080. json_path = catalog_dir / f"{pdf_path.stem}_{timestamp}_catalog_stability.json"
  1081. with open(json_path, "w", encoding="utf-8") as f:
  1082. json.dump(output, f, ensure_ascii=False, indent=2)
  1083. print(f" [保存] 稳定性报告 → {json_path}")
  1084. return stability
  1085. # ============================================================
  1086. # 主入口
  1087. # ============================================================
  1088. def main():
  1089. parser = argparse.ArgumentParser(
  1090. description="OCR 模型效果与稳定性测试",
  1091. formatter_class=argparse.RawDescriptionHelpFormatter,
  1092. epilog="""
  1093. 示例:
  1094. # 默认:仅目录提取测试
  1095. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p test.pdf
  1096. # 版面检测测试(表格/图片检测 + OCR识别)
  1097. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p test.pdf --detection
  1098. # 全链路测试(检测+OCR+回填)
  1099. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p test.pdf --detection --full-pipeline
  1100. # 稳定性测试(20并发,50次调用)
  1101. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -p test.pdf --detection --stability --concurrency 20 --iterations 50
  1102. # 批量测试目录下所有 PDF
  1103. python utils_test/minimal_pipeline/test_ocr_effectiveness.py -d ./pdfs/
  1104. """,
  1105. )
  1106. parser.add_argument("-p", "--pdf", help="单个 PDF 文件路径")
  1107. parser.add_argument("-d", "--dir", help="批量: PDF 文件目录")
  1108. parser.add_argument("positional_pdf", nargs="?", metavar="PDF", help="也支持位置参数直接传 PDF 路径")
  1109. parser.add_argument("--pages", help="分析指定页码, 逗号分隔 (1-based)")
  1110. parser.add_argument("--detection", action="store_true", help="版面检测 + OCR 识别测试(默认仅目录OCR)")
  1111. parser.add_argument("--catalog-stability", action="store_true", help="目录OCR稳定性测试(多次提取对比一致性)")
  1112. parser.add_argument("--catalog-iterations", type=int, default=10, help="目录稳定性测试迭代次数(默认 10)")
  1113. parser.add_argument("--full-pipeline", action="store_true", help="全链路测试(需同时开启 --detection)")
  1114. parser.add_argument("--stability", action="store_true", help="稳定性测试(需同时开启 --detection)")
  1115. parser.add_argument("--concurrency", type=int, default=5, help="稳定性测试并发数")
  1116. parser.add_argument("--iterations", type=int, default=10, help="稳定性测试迭代次数")
  1117. parser.add_argument("--output-dir", help="输出目录(默认 utils_test/minimal_pipeline/temp/test_ocr_effectiveness/)")
  1118. parser.add_argument("--confidence", type=float, default=0.5, help="检测置信度阈值")
  1119. parser.add_argument("--json", action="store_true", help="以 JSON 格式输出结果")
  1120. parser.add_argument("--ocr-url", default=_OCR_CONFIG["GLM_OCR_API_URL"], help="OCR API 地址")
  1121. parser.add_argument("--ocr-key", default=_OCR_CONFIG["GLM_OCR_API_KEY"], help="OCR API 密钥")
  1122. parser.add_argument("--ocr-timeout", type=int, default=int(_OCR_CONFIG["GLM_OCR_TIMEOUT"]), help="OCR 超时秒数")
  1123. args = parser.parse_args()
  1124. if not args.pdf and not args.dir:
  1125. if args.positional_pdf:
  1126. args.pdf = args.positional_pdf
  1127. else:
  1128. parser.print_help()
  1129. return 1
  1130. tester = OcrEffectivenessTester(
  1131. ocr_api_url=args.ocr_url,
  1132. ocr_api_key=args.ocr_key,
  1133. ocr_timeout=args.ocr_timeout,
  1134. confidence_threshold=args.confidence,
  1135. concurrent_workers=args.concurrency if args.stability else 5,
  1136. )
  1137. pages = None
  1138. if args.pages:
  1139. pages = [int(p.strip()) - 1 for p in args.pages.split(",")]
  1140. # ---- 环境检查 ----
  1141. env = tester.check_environment()
  1142. if not args.json:
  1143. print_env_report(env)
  1144. # ---- 单个文件模式 ----
  1145. if args.pdf:
  1146. pdf_path = Path(args.pdf)
  1147. if not pdf_path.exists():
  1148. print(f"[错误] PDF 文件不存在: {pdf_path}")
  1149. return 1
  1150. # 输出目录结构: temp/test_ocr_effectiveness/{catalog, detection, table, figure, results}
  1151. base_output = Path(args.output_dir) if args.output_dir else TEST_DIR / "temp" / "test_ocr_effectiveness"
  1152. catalog_dir = base_output / "catalog"
  1153. detection_img_dir = base_output / "detection"
  1154. table_img_dir = base_output / "table"
  1155. figure_img_dir = base_output / "figure"
  1156. results_dir = base_output / "results"
  1157. for d in [catalog_dir, detection_img_dir, table_img_dir, figure_img_dir, results_dir]:
  1158. d.mkdir(parents=True, exist_ok=True)
  1159. timestamp = time.strftime("%Y%m%d_%H%M%S")
  1160. print(f"\n[测试] {pdf_path.name}")
  1161. print(f"[输出] 目录提取 → {catalog_dir}/")
  1162. print(f"[输出] 版面检测图 → {detection_img_dir}/")
  1163. print(f"[输出] 表格区域截图 → {table_img_dir}/")
  1164. print(f"[输出] 图片区域截图 → {figure_img_dir}/")
  1165. print(f"[输出] JSON 结果 → {results_dir}/")
  1166. print("=" * 70)
  1167. # 目录页 OCR 识别测试
  1168. # 有 --catalog-stability 时,稳定性编号文件已含完整结果,跳过单次提取
  1169. catalog_result = None
  1170. catalog_stability_result = None
  1171. if args.catalog_stability:
  1172. catalog_stability_result = _test_catalog_stability(
  1173. pdf_path, tester, catalog_dir, iterations=args.catalog_iterations,
  1174. )
  1175. s = catalog_stability_result
  1176. print(f" [目录稳定性] {s['total_runs']}次, "
  1177. f"一致={s['all_titles_identical']}, "
  1178. f"模式数={s['unique_title_patterns']}, "
  1179. f"延迟avg={s['latency_ms_avg']:.0f}ms")
  1180. catalog_result = {
  1181. "extract_status": "success" if s.get("success_count", 0) > 0 else "failed",
  1182. "total_chapters": s.get("most_common_chapter_count", 0),
  1183. "raw_ocr_length": 0,
  1184. "json_path": "",
  1185. }
  1186. else:
  1187. catalog_result = _test_catalog_ocr(pdf_path, tester, catalog_dir, detection_img_dir, timestamp)
  1188. cat_status = catalog_result.get("extract_status", "failed")
  1189. cat_chapters = catalog_result.get("total_chapters", 0)
  1190. cat_raw_len = catalog_result.get("raw_ocr_length", 0)
  1191. print(f" [目录OCR] status={cat_status}, chapters={cat_chapters}, raw_ocr_len={cat_raw_len}")
  1192. if cat_chapters > 0:
  1193. print(f" [保存] catalog JSON → {catalog_result.get('json_path', '')}")
  1194. # 版面检测 + OCR 识别(仅 --detection 时启用)
  1195. det_result = {"status": "skipped", "total_pages": 0}
  1196. ocr_result = {"status": "skipped"}
  1197. pipeline_result = None
  1198. stab_result = None
  1199. saved_table_count = {"table": 0, "figure": 0}
  1200. if args.detection:
  1201. det_result = tester.test_detection(pdf_path, pages=pages, save_images_dir=detection_img_dir)
  1202. if not args.json:
  1203. print_detection_report(det_result)
  1204. saved_table_count = _save_ocr_region_images(pdf_path, det_result, table_img_dir, figure_img_dir, tester)
  1205. print(f" [保存] 表格区域截图: {saved_table_count['table']} 张 → {table_img_dir}/")
  1206. print(f" [保存] 图片区域截图: {saved_table_count['figure']} 张 → {figure_img_dir}/")
  1207. ocr_result = tester.test_ocr_recognition(pdf_path, pages=pages)
  1208. if not args.json:
  1209. print_ocr_report(ocr_result)
  1210. if args.full_pipeline:
  1211. pipeline_result = tester.test_full_pipeline(pdf_path, pages=pages)
  1212. if not args.json:
  1213. print_pipeline_report(pipeline_result)
  1214. if args.stability:
  1215. stab_result = tester.test_stability(
  1216. pdf_path, concurrency=args.concurrency, iterations=args.iterations, pages=pages,
  1217. )
  1218. if not args.json:
  1219. print_stability_report(stab_result)
  1220. # 保存 JSON 结果到文件
  1221. output = {
  1222. "file": pdf_path.name,
  1223. "test_time": time.strftime("%Y-%m-%d %H:%M:%S"),
  1224. "environment": env,
  1225. "detection": det_result,
  1226. "ocr": ocr_result,
  1227. "catalog_ocr": {
  1228. "extract_status": catalog_result.get("extract_status"),
  1229. "total_chapters": catalog_result.get("total_chapters"),
  1230. "raw_ocr_length": catalog_result.get("raw_ocr_length"),
  1231. "json_path": catalog_result.get("json_path"),
  1232. },
  1233. }
  1234. if pipeline_result:
  1235. output["pipeline"] = pipeline_result
  1236. if stab_result:
  1237. output["stability"] = stab_result
  1238. if catalog_stability_result:
  1239. output["catalog_stability"] = {
  1240. "total_runs": catalog_stability_result.get("total_runs"),
  1241. "all_titles_identical": catalog_stability_result.get("all_titles_identical"),
  1242. "unique_title_patterns": catalog_stability_result.get("unique_title_patterns"),
  1243. "most_common_chapter_count": catalog_stability_result.get("most_common_chapter_count"),
  1244. }
  1245. json_path = results_dir / f"{pdf_path.stem}_ocr_test_result.json"
  1246. with open(json_path, "w", encoding="utf-8") as f:
  1247. json.dump(output, f, ensure_ascii=False, indent=2)
  1248. print(f"\n [保存] JSON 结果 → {json_path}")
  1249. if args.json:
  1250. print(json.dumps(output, ensure_ascii=False, indent=2))
  1251. return 0
  1252. # ---- 批量模式 ----
  1253. dir_path = Path(args.dir)
  1254. if not dir_path.is_dir():
  1255. print(f"[错误] 目录不存在: {dir_path}")
  1256. return 1
  1257. pdf_files = sorted(dir_path.glob("*.pdf"))
  1258. if not pdf_files:
  1259. print(f"[错误] 目录下无 PDF 文件: {dir_path}")
  1260. return 1
  1261. # 批量输出目录
  1262. base_output = Path(args.output_dir) if args.output_dir else TEST_DIR / "temp" / "test_ocr_effectiveness"
  1263. catalog_dir = base_output / "catalog"
  1264. detection_img_dir = base_output / "detection"
  1265. table_img_dir = base_output / "table"
  1266. figure_img_dir = base_output / "figure"
  1267. results_dir = base_output / "results"
  1268. for d in [catalog_dir, detection_img_dir, table_img_dir, figure_img_dir, results_dir]:
  1269. d.mkdir(parents=True, exist_ok=True)
  1270. batch_timestamp = time.strftime("%Y%m%d_%H%M%S")
  1271. print(f"\n[批量测试] 找到 {len(pdf_files)} 个 PDF 文件")
  1272. print(f"[批量测试] 目录: {dir_path}\n")
  1273. batch_results: List[Dict] = []
  1274. for idx, pdf_path in enumerate(pdf_files, 1):
  1275. print(f"[{idx}/{len(pdf_files)}] {pdf_path.name} ...", flush=True)
  1276. try:
  1277. # 目录提取(默认执行)
  1278. cat = _test_catalog_ocr(pdf_path, tester, catalog_dir, detection_img_dir, batch_timestamp)
  1279. cat_chap = cat.get("total_chapters", 0)
  1280. cat_ocr_len = cat.get("raw_ocr_length", 0)
  1281. file_result = {
  1282. "file": pdf_path.name,
  1283. "catalog_status": cat.get("extract_status"),
  1284. "catalog_chapters": cat_chap,
  1285. "raw_ocr_length": cat_ocr_len,
  1286. }
  1287. # 版面检测 + OCR(仅 --detection 时)
  1288. if args.detection:
  1289. det = tester.test_detection(pdf_path, pages=pages, save_images_dir=detection_img_dir)
  1290. _save_ocr_region_images(pdf_path, det, table_img_dir, figure_img_dir, tester)
  1291. ocr = tester.test_ocr_recognition(pdf_path, pages=pages)
  1292. file_result.update({
  1293. "pages": det.get("total_pages", 0),
  1294. "table_count": det.get("table_count", 0),
  1295. "figure_count": det.get("figure_count", 0),
  1296. "ocr_success_rate": ocr.get("success_rate"),
  1297. "ocr_content_rate": ocr.get("content_rate"),
  1298. "ocr_avg_latency": ocr.get("latency_ms_avg"),
  1299. })
  1300. batch_results.append(file_result)
  1301. print(f" → catalog={cat_chap}章, raw_ocr={cat_ocr_len}字符", flush=True)
  1302. except Exception as e:
  1303. print(f" → 失败: {e}", flush=True)
  1304. batch_results.append({"file": pdf_path.name, "error": str(e)})
  1305. # 保存批量汇总 JSON
  1306. batch_json_path = results_dir / f"batch_{batch_timestamp}_summary.json"
  1307. with open(batch_json_path, "w", encoding="utf-8") as f:
  1308. json.dump(batch_results, f, ensure_ascii=False, indent=2)
  1309. print(f"\n [保存] 批量汇总 → {batch_json_path}")
  1310. # 批量汇总报告
  1311. valid = [r for r in batch_results if "error" not in r]
  1312. errors = [r for r in batch_results if "error" in r]
  1313. if not args.json:
  1314. print("\n" + "=" * 90)
  1315. print(" 批量测试汇总报告")
  1316. print("=" * 90)
  1317. print(f" 文件数: {len(batch_results)} (成功={len(valid)}, 失败={len(errors)})")
  1318. if valid:
  1319. total_chapters = sum(r.get("catalog_chapters", 0) for r in valid)
  1320. total_ocr_len = sum(r.get("raw_ocr_length", 0) for r in valid)
  1321. print(f"\n 目录提取统计:")
  1322. print(f" 总章数: {total_chapters}")
  1323. print(f" 总OCR字符数: {total_ocr_len}")
  1324. print(f"\n 逐文件:")
  1325. print(f" {'文件':40s} {'章数':>6s} {'OCR字符':>8s}")
  1326. print(f" {'-'*55}")
  1327. for r in valid:
  1328. name = r["file"][:38] + ".." if len(r["file"]) > 38 else r["file"]
  1329. print(f" {name:40s} {r.get('catalog_chapters', 0):5d} {r.get('raw_ocr_length', 0):7d}")
  1330. # --detection 时额外输出检测统计
  1331. if valid[0].get("table_count") is not None:
  1332. total_tables = sum(r["table_count"] for r in valid)
  1333. total_figures = sum(r["figure_count"] for r in valid)
  1334. ocr_rates = [r["ocr_success_rate"] for r in valid if r["ocr_success_rate"] is not None]
  1335. ocr_latencies = [r["ocr_avg_latency"] for r in valid if r["ocr_avg_latency"] is not None]
  1336. print(f"\n 版面检测统计:")
  1337. print(f" 总表格数: {total_tables}")
  1338. print(f" 总图片数: {total_figures}")
  1339. if ocr_rates:
  1340. print(f" OCR成功率: avg={statistics.mean(ocr_rates):.1f}%")
  1341. if ocr_latencies:
  1342. print(f" OCR延迟(ms): avg={statistics.mean(ocr_latencies):.0f}")
  1343. if errors:
  1344. print(f"\n 失败文件:")
  1345. for e in errors:
  1346. print(f" - {e['file']}: {e.get('error', '')}")
  1347. print()
  1348. if args.json:
  1349. print(json.dumps(batch_results, ensure_ascii=False, indent=2))
  1350. return 0
# Script entry point: run main() only when executed directly (not on import)
# and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())