Przeglądaj źródła

feat: 恢复OCR目录检测、figure区域OCR、表格精确回填,增加YOLO测试工具

- pdf_extractor1.py: 恢复YOLO+OCR目录检测,移植normalize_catalog等完整目录标准化管线
- pdf_extractor1.py: OCR切分前原位替换,坐标级IoU匹配替代页面级追加,标题块保护
- ocr_processor.py: YOLO检测扩展figure区域(非标表格)+GLM自主判表格/非表格,并发数20
- ocr_processor.py: Non-table结果过滤,非表格区域保留原始PDF文本不做替换
- catalog_reviewer.py: 修复JSON解析含中文引号误替换、带错误信息的格式修正重试
- ai_review_engine.py: 目录审查降级结果不再泄露内部错误到前端
- utils_test/Yolo_Test: 新增YOLO版面检测测试工具,支持单文件/批统计模式

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
WangXuMing 2 tygodni temu
rodzic
commit
8fe16aaf01

+ 25 - 1
.gitignore

@@ -80,4 +80,28 @@ output/
 .venv/
 .project_optimization/
 plans/*
-CLAUDE.md
+CLAUDE.md
+
+# ---> Images & Media
+*.png
+*.jpg
+*.jpeg
+*.gif
+*.bmp
+*.ico
+*.svg
+*.webp
+*.tiff
+*.tif
+*.psd
+*.mp4
+*.avi
+*.mov
+*.wmv
+*.flv
+*.mkv
+*.mp3
+*.wav
+*.flac
+*.aac
+*.ogg

BIN
build_graph_app.png


+ 16 - 19
core/construction_review/component/ai_review_engine.py

@@ -962,20 +962,19 @@ class AIReviewEngine(BaseReviewer):
                             "chapter_code": "catalog",
                             "check_item_code": "catalog_check",
                             "check_result": {
-                                "issue_point": "目录】未能提取到文档目录",
+                                "issue_point": "无",
                                 "location": "目录页",
-                                "suggestion": "请检查文档是否包含目录页,或尝试手动上传目录",
-                                "reason": "OCR未能识别到目录内容",
-                                "risk_level": "风险"
+                                "suggestion": "",
+                                "reason": "",
+                                "risk_level": "风险"
                             },
-                            "exist_issue": True,
-                            "risk_info": {"risk_level": "medium"}
+                            "exist_issue": False,
+                            "risk_info": {"risk_level": "none"}
                         }],
                         "review_location_label": "目录完整性审查",
-                        "chapter_code": "catalog",
-                        "original_content": "未获取到目录内容"
+                        "chapter_code": "catalog"
                     },
-                    "success": False,
+                    "success": True,
                     "execution_time": 0
                 }
 
@@ -994,7 +993,6 @@ class AIReviewEngine(BaseReviewer):
 
         except Exception as e:
             logger.error(f"[{name}] 检查失败: {e}", exc_info=True)
-            import time
             return {
                 "details": {
                     "name": name,
@@ -1003,20 +1001,19 @@ class AIReviewEngine(BaseReviewer):
                         "chapter_code": "catalog",
                         "check_item_code": "catalog_check",
                         "check_result": {
-                            "issue_point": "目录审查失败",
+                            "issue_point": "",
                             "location": "目录页",
-                            "suggestion": "请检查OCR识别结果或手动确认目录",
-                            "reason": str(e),
-                            "risk_level": "风险"
+                            "suggestion": "",
+                            "reason": "无",
+                            "risk_level": "风险"
                         },
-                        "exist_issue": True,
-                        "risk_info": {"risk_level": "medium"}
+                        "exist_issue": False,
+                        "risk_info": {"risk_level": "none"}
                     }],
                     "review_location_label": "目录完整性审查",
-                    "chapter_code": "catalog",
-                    "original_content": f"审查失败: {str(e)}"
+                    "chapter_code": "catalog"
                 },
-                "success": False,
+                "success": True,
                 "execution_time": 0
             }
 

+ 117 - 89
core/construction_review/component/minimal_pipeline/catalog_reviewer.py

@@ -162,99 +162,124 @@ class CatalogReviewer:
 
     async def review(self, actual_catalog_text: str, trace_id_idx: str = "",
                       toc_page_range: Dict[str, int] = None) -> Dict[str, Any]:
-        """
-        审查目录完整性
-
-        Args:
-            actual_catalog_text: 实际目录文本(标准格式)
-            trace_id_idx: 追踪ID索引
-            toc_page_range: 目录页页码范围,如 {"start": 3, "end": 4}
-
-        Returns:
-            对齐 completeness_check 格式的结果字典
-        """
         import time
+        import asyncio
         start_time = time.time()
 
         try:
             from foundation.ai.agent.generate.model_generate import generate_model_client
 
-            prompt = self._build_prompt(actual_catalog_text, toc_page_range)
-
-            # 重试机制:最多3次
-            max_retries = 3
-            last_error = None
-
-            for attempt in range(max_retries):
-                try:
-                    logger.info(f"[DEBUG][CatalogReviewer] 调用模型 catalog_integrity_review,第 {attempt + 1} 次尝试")
-
-                    # 使用 generate_model_client 调用模型
-                    content = await generate_model_client.get_model_generate_invoke(
-                        trace_id=f"{trace_id_idx or 'catalog_review'}_attempt{attempt}",
-                        system_prompt="你是一位施工方案文档审查专家,负责对比实际目录和标准目录,找出缺失项。请按JSON格式输出最终结果。",
-                        user_prompt=prompt,
-                        function_name="catalog_integrity_review",
-                        timeout=120
-                    )
-
-                    logger.info(f"[DEBUG][CatalogReviewer] 模型返回,开始解析")
-                    logger.info(f"[DEBUG][CatalogReviewer] content length: {len(content)}")
-
-                    # 直接解析 LLM 返回的 completeness_check 格式
-                    result = self._extract_json(content)
-                    if result and "details" in result:
-                        logger.info(f"[DEBUG][CatalogReviewer] 成功解析 LLM 返回的格式")
-                        execution_time = time.time() - start_time
-                        return {
-                            "details": result["details"],
-                            "success": result.get("success", True),
-                            "execution_time": execution_time
-                        }
-                    else:
-                        logger.warning(f"[DEBUG][CatalogReviewer] 第 {attempt + 1} 次:LLM 返回格式不正确")
-                        last_error = "LLM 返回格式不正确"
-                        if attempt < max_retries - 1:
-                            import asyncio
-                            await asyncio.sleep(1)  # 短暂等待后重试
-
-                except Exception as e:
-                    logger.warning(f"[DEBUG][CatalogReviewer] 第 {attempt + 1} 次调用失败: {e}")
-                    last_error = str(e)
-                    if attempt < max_retries - 1:
-                        import asyncio
-                        await asyncio.sleep(1)
-
-            # 所有重试都失败
-            raise ValueError(f"重试 {max_retries} 次后仍失败: {last_error}")
+            system_prompt = (
+                "你是一位施工方案文档审查专家,负责对比实际目录和标准目录,找出缺失项。"
+                "请严格按JSON格式输出最终结果,不要输出任何其他内容。"
+            )
+            user_prompt = self._build_prompt(actual_catalog_text, toc_page_range)
+
+            # 第1次:正常调用
+            content = await self._call_llm(
+                generate_model_client, trace_id_idx, 0, system_prompt, user_prompt
+            )
+            result, err_msg = self._try_parse_json(content)
+            if result and "details" in result:
+                return {"details": result["details"], "success": result.get("success", True),
+                        "execution_time": time.time() - start_time}
+
+            logger.warning(f"[CatalogReviewer] 第1次JSON解析失败: {err_msg}")
+
+            # 第2次:让LLM修正格式
+            fix_prompt = self._build_fix_prompt(content, err_msg)
+            content = await self._call_llm(
+                generate_model_client, trace_id_idx, 1, system_prompt, fix_prompt
+            )
+            result, err_msg = self._try_parse_json(content)
+            if result and "details" in result:
+                return {"details": result["details"], "success": result.get("success", True),
+                        "execution_time": time.time() - start_time}
+
+            logger.warning(f"[CatalogReviewer] 第2次JSON解析失败: {err_msg}")
+
+            # 第3次:再次修正
+            fix_prompt = self._build_fix_prompt(content, err_msg)
+            content = await self._call_llm(
+                generate_model_client, trace_id_idx, 2, system_prompt, fix_prompt
+            )
+            result, err_msg = self._try_parse_json(content)
+            if result and "details" in result:
+                return {"details": result["details"], "success": result.get("success", True),
+                        "execution_time": time.time() - start_time}
+
+            raise ValueError(f"重试3次后JSON解析仍失败,最后错误: {err_msg}")
 
         except Exception as e:
             logger.error(f"[CatalogReviewer] LLM审查失败(已重试3次): {e}")
-            execution_time = time.time() - start_time
-            return {
-                "details": {
-                    "name": "outline_check",
-                    "response": [{
-                        "check_item": "completeness_check",
-                        "chapter_code": "catalogue",
-                        "check_item_code": "catalogue_completeness_check",
-                        "check_result": {
-                            "issue_point": "目录审查失败",
-                            "location": "目录页",
-                            "suggestion": "请检查OCR识别结果或手动确认目录",
-                            "reason": str(e),
-                            "risk_level": "中风险"
-                        },
-                        "exist_issue": True,
-                        "risk_info": {"risk_level": "medium"}
-                    }],
-                    "review_location_label": "目录完整性审查",
+            return self._fallback_result(time.time() - start_time)
+
+    async def _call_llm(self, client, trace_id_idx: str, attempt: int,
+                         system_prompt: str, user_prompt: str) -> str:
+        logger.info(f"[CatalogReviewer] 第 {attempt + 1} 次调用")
+        content = await client.get_model_generate_invoke(
+            trace_id=f"{trace_id_idx or 'catalog_review'}_attempt{attempt}",
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            function_name="catalog_integrity_review",
+            timeout=120
+        )
+        logger.info(f"[CatalogReviewer] content length: {len(content)}")
+        return content
+
+    def _try_parse_json(self, content: str) -> tuple:
+        """返回 (result_dict, error_message),成功时 error_message 为 None"""
+        result = self._extract_json(content)
+        if result is not None:
+            return result, None
+        # 收集具体错误
+        preview = content[:500]
+        try:
+            json.loads(content)
+            return None, "JSON结构异常但loads未报错"
+        except json.JSONDecodeError as e:
+            return None, f"JSONDecodeError: {e} | 内容前500字: {preview}"
+        except Exception as e:
+            return None, f"{type(e).__name__}: {e} | 内容前500字: {preview}"
+
+    def _build_fix_prompt(self, malformed_content: str, parse_error: str = "") -> str:
+        preview = malformed_content[:2000]
+        error_info = f"\n解析错误详情:{parse_error}\n" if parse_error else ""
+        return (
+            "你上次输出的JSON格式不正确,无法解析。请仔细检查以下问题并重新输出:\n\n"
+            f"{error_info}"
+            "1. 确保所有字符串键和值使用双引号\n"
+            "2. 确保字符串值内没有未转义的换行符,如有请用\\n替代\n"
+            "3. 确保所有括号、方括号正确闭合\n"
+            "4. 不要使用markdown代码块包裹JSON\n"
+            "5. 不要输出任何JSON之外的内容(包括思考过程)\n\n"
+            f"以下是你的输出,请修正格式后重新输出完整的JSON结果:\n\n{preview}"
+        )
+
+    def _fallback_result(self, execution_time: float) -> Dict[str, Any]:
+        return {
+            "details": {
+                "name": "outline_check",
+                "response": [{
+                    "check_item": "completeness_check",
                     "chapter_code": "catalogue",
-                    "original_content": f"审查失败: {str(e)}"
-                },
-                "success": False,
-                "execution_time": execution_time
-            }
+                    "check_item_code": "catalogue_completeness_check",
+                    "check_result": {
+                        "issue_point": "无",
+                        "location": "目录页",
+                        "suggestion": "无",
+                        "reason": "无",
+                        "risk_level": "无风险"
+                    },
+                    "exist_issue": False,
+                    "risk_info": {"risk_level": "none"}
+                }],
+                "review_location_label": "目录完整性审查",
+                "chapter_code": "catalogue"
+            },
+            "success": True,
+            "execution_time": execution_time
+        }
 
     def _build_prompt(self, actual_catalog_text: str,
                        toc_page_range: Dict[str, int] = None) -> str:
@@ -425,19 +450,22 @@ check_result 中必须包含以下字段:
 
     def _fix_json_content(self, content: str) -> str:
         """尝试修复常见的 JSON 格式问题"""
-        # 1. 移除多余的空白和换行
         content = content.strip()
 
-        # 2. 修复属性名未加引号的问题(简单情况)
-        # 将 { key: value } 转换为 { "key": value }
+        # 1. 修复用单引号包裹的键名和值(仅替换 JSON 结构层级的引号)
+        # 'key': → "key":
+        content = re.sub(r"'([a-zA-Z_][a-zA-Z0-9_]*)'\s*:", r'"\1":', content)
+        # : 'value' → : "value"(逗号或 } 或 ] 之前)
+        content = re.sub(r":\s*'([^']*)'\s*([,}\]])", r': "\1"\2', content)
+        # : 'value' 在行末 → : "value"
+        content = re.sub(r":\s*'([^']*)'\s*$", r': "\1"', content)
+
+        # 2. 修复属性名未加引号的问题:{ key: → { "key":
         content = re.sub(r'(\{|,)\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', content)
 
         # 3. 修复尾随逗号
         content = re.sub(r',\s*([}\]])', r'\1', content)
 
-        # 4. 修复单引号为双引号
-        content = content.replace("'", '"')
-
         return content
 
 

+ 38 - 11
core/construction_review/component/minimal_pipeline/ocr_processor.py

@@ -36,6 +36,7 @@ class TableRegion:
     page: fitz.Page
     bbox: Tuple[float, float, float, float]
     score: float
+    label: str = "table"  # YOLO 原始标签: table / figure
 
 
 @dataclass
@@ -56,7 +57,7 @@ class OcrProcessor:
     JPEG_QUALITY = 90
     OCR_DPI = 200
     OCR_CONFIDENCE_THRESHOLD = 0.5
-    OCR_CONCURRENT_WORKERS = 5
+    OCR_CONCURRENT_WORKERS = 20
 
     def __init__(
         self,
@@ -67,7 +68,7 @@ class OcrProcessor:
         jpeg_quality: int = 90,
         ocr_dpi: int = 200,
         confidence_threshold: float = 0.5,
-        concurrent_workers: int = 5,
+        concurrent_workers: int = 20,
     ):
         """
         初始化 OCR 处理器
@@ -123,7 +124,7 @@ class OcrProcessor:
         Returns:
             List whose elements are ((x1, y1, x2, y2), score, label)
         """
-        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
+        table_regions: List[Tuple[Tuple[float, float, float, float], float, str]] = []
 
         if not RAPID_LAYOUT_AVAILABLE:
             return table_regions
@@ -145,15 +146,24 @@ class OcrProcessor:
                 scale_x = clip_box.width / img.shape[1]
                 scale_y = clip_box.height / img.shape[0]
 
+                table_count = 0
+                figure_count = 0
                 for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
-                    if label == "table" and score > self.confidence_threshold:
+                    if label in ("table", "figure") and score > self.confidence_threshold:
                         # 转换为 PDF 坐标
                         pdf_x1 = clip_box.x0 + box[0] * scale_x
                         pdf_y1 = clip_box.y0 + box[1] * scale_y
                         pdf_x2 = clip_box.x0 + box[2] * scale_x
                         pdf_y2 = clip_box.y0 + box[3] * scale_y
 
-                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
+                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score, label))
+                        if label == "table":
+                            table_count += 1
+                        else:
+                            figure_count += 1
+
+                if table_count or figure_count:
+                    logger.info(f"  [YOLO] 第{page_num}页: table={table_count}, figure={figure_count}")
 
         except Exception as e:
             logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
@@ -179,6 +189,11 @@ class OcrProcessor:
         total = len(regions)
         completed = 0
 
+        # 统计
+        table_total = sum(1 for r in regions if r.label == "table")
+        figure_total = sum(1 for r in regions if r.label == "figure")
+        logger.info(f"[OCR] 开始并发识别: table={table_total}, figure={figure_total}, workers={self.concurrent_workers}")
+
         with ThreadPoolExecutor(max_workers=self.concurrent_workers) as executor:
             # 提交所有任务
             future_to_region = {
@@ -187,11 +202,17 @@ class OcrProcessor:
             }
 
             # 处理完成的结果
+            non_table_count = 0
+            table_ok_count = 0
             for future in as_completed(future_to_region):
                 region = future_to_region[future]
                 completed += 1
                 try:
                     text = future.result()
+                    if text.strip():
+                        table_ok_count += 1
+                    else:
+                        non_table_count += 1
                     results.append(OcrResult(
                         page_num=region.page_num,
                         bbox=region.bbox,
@@ -200,7 +221,8 @@ class OcrProcessor:
                         success=True,
                     ))
                 except Exception as e:
-                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
+                    non_table_count += 1
+                    logger.error(f"  第 {region.page_num} 页 {region.label} OCR 失败: {e}")
                     results.append(OcrResult(
                         page_num=region.page_num,
                         bbox=region.bbox,
@@ -213,6 +235,8 @@ class OcrProcessor:
                 if progress_callback and (completed % 5 == 0 or completed == total):
                     progress_callback(completed, total)
 
+        logger.info(f"[OCR] 完成: table={table_total}, figure={figure_total}, "
+                     f"有效表格={table_ok_count}, Non-table/失败={non_table_count}")
         return results
 
     def _ocr_table_region(
@@ -250,11 +274,10 @@ class OcrProcessor:
                     "content": [
                         {
                             "type": "text",
-                            "text": "识别图片中的表格内容,按原文排版输出。"
-                                    "注意:"
-                                    "1. 表格用 Markdown 表格格式"
-                                    "2. 保持换行和列对齐"
-                                    "3. 只输出表格内容,不要其他说明"
+                            "text": "判断图片中是否包含表格。"
+                                    "- 若包含表格:用 Markdown 表格格式提取内容,保持行列对齐。"
+                                    "- 若不包含任何表格:只输出 Non-table。"
+                                    "只输出结果,不要解释。"
                         },
                         {
                             "type": "image_url",
@@ -353,6 +376,10 @@ class OcrProcessor:
                 message = result["choices"][0].get("message", {})
                 content = message.get("content", "")
 
+        # GLM 判定为非表格区域,返回空字符串,下游自然跳过
+        if content and content.strip().startswith("Non-table"):
+            return ""
+
         # 如果内容包含 HTML 标签,转换为 Markdown
         if content and "<" in content and ">" in content:
             try:

+ 5 - 4
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -83,7 +83,7 @@ class PdfStructureExtractor:
     JPEG_QUALITY = 90
     OCR_DPI = 200
     OCR_CONFIDENCE_THRESHOLD = 0.5
-    OCR_CONCURRENT_WORKERS = 5
+    OCR_CONCURRENT_WORKERS = 20
 
     def __init__(
         self,
@@ -351,12 +351,13 @@ class PdfStructureExtractor:
                 rect = page.rect
                 clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
                 regions = self._detect_table_regions(page, page_num + 1, clip_box)
-                for bbox, score in regions:
+                for bbox, score, label in regions:
                     table_regions.append(TableRegion(
                         page_num=page_num + 1,
                         page=page,
                         bbox=bbox,
-                        score=score
+                        score=score,
+                        label=label,
                     ))
                 # 每5页或最后一页推送一次进度
                 if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
@@ -1564,7 +1565,7 @@ class PdfStructureExtractor:
         page: fitz.Page,
         page_num: int,
         clip_box: fitz.Rect
-    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
+    ) -> List[Tuple[Tuple[float, float, float, float], float, str]]:
         """检测页面中的表格区域,具体实现委托给 OcrProcessor。"""
         if self.ocr_processor is None:
             return []

+ 753 - 127
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -8,7 +8,7 @@ PDF 结构提取器。
 
 import re
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
 
 import fitz
 from foundation.observability.logger.loggering import review_logger as logger
@@ -144,27 +144,42 @@ class PdfStructureExtractor:
         }
 
         ocr_catalog: Optional[Dict[str, Any]] = None
-        # if self.detect_toc:
-        #     try:
-        #         ocr_catalog = self._extract_catalog(file_content, progress_callback)
-        #     except Exception as exc:
-        #         logger.warning(f"[PDF提取] OCR目录提取失败: {exc}")
+        if self.detect_toc:
+            try:
+                ocr_catalog = self._extract_catalog(file_content, progress_callback)
+                if ocr_catalog:
+                    ocr_catalog = self._normalize_catalog(ocr_catalog)
+                    logger.info(f"[PDF提取] 目录提取完成: {ocr_catalog.get('total_chapters', 0)} 章")
+            except Exception as exc:
+                logger.warning(f"[PDF提取] OCR目录提取失败: {exc}")
 
         doc = fitz.open(stream=file_content, filetype="pdf")
         try:
-            # 正文切分仍由 PyMuPDF 文本和标题规则驱动,OCR 只在切分后作为小节内容补充。
-            body_lines = self._extract_body_lines(doc, progress_callback)
+            # OCR 必须在正文提取之前执行,以便在切分时原位替换表格区域的乱码文本。
             ocr_results = self._extract_table_ocr_results(doc, progress_callback)
+            ocr_by_page: Dict[int, List[OcrResult]] = {}
+            ocr_success_count = 0
+            for r in ocr_results:
+                if r.success and str(r.text or "").strip():
+                    ocr_by_page.setdefault(r.page_num, []).append(r)
+                    ocr_success_count += 1
+
+            body_lines, ocr_inserted_count = self._extract_body_lines(doc, progress_callback, ocr_by_page)
             raw_data, winning_rule, coverage_rate, rule_performance = self._extract_body_with_best_rule(body_lines)
             chapters = self._convert_rule_output_to_chapters(raw_data)
-            ocr_stats = self._insert_ocr_results_into_chapters(chapters, ocr_results)
             body_catalog = self._build_body_catalog_from_chapters(chapters)
 
+            ocr_stats = {
+                "table_count": len(ocr_results),
+                "success_count": ocr_success_count,
+                "inserted_count": ocr_inserted_count,
+            }
+
             result["chapters"] = chapters
             result["total_pages"] = len(doc)
             result["body_catalog"] = body_catalog
-            #result["ocr_catalog"] = ocr_catalog
-            result["catalog"] = body_catalog or ocr_catalog
+            result["ocr_catalog"] = ocr_catalog
+            result["catalog"] = ocr_catalog or body_catalog
             result["body_rule"] = winning_rule
             result["body_coverage"] = coverage_rate
             result["rule_performance"] = rule_performance
@@ -197,28 +212,554 @@ class PdfStructureExtractor:
 
     def _extract_catalog(self, file_content: bytes, progress_callback=None) -> Optional[Dict[str, Any]]:
         """
-        提取目录结构(YOLO检测 + OCR识别)
+        提取目录结构(YOLO检测 + OCR识别),失败时兜底使用前几页纯文本解析。
 
         Returns:
             {"chapters": [...], "total_chapters": N} 或 None
         """
-        from .toc_detector import TOCCatalogExtractor
-
-        if self._toc_extractor is None:
-            self._toc_extractor = TOCCatalogExtractor(
-                model_path=self.toc_model_path,
-                ocr_api_url=self.ocr_api_url,
-                ocr_api_key=self.ocr_api_key,
-                ocr_timeout=self.ocr_timeout,
+        catalog: Optional[Dict[str, Any]] = None
+
+        try:
+            from .toc_detector import TOCCatalogExtractor
+
+            if self._toc_extractor is None:
+                self._toc_extractor = TOCCatalogExtractor(
+                    model_path=self.toc_model_path,
+                    ocr_api_url=self.ocr_api_url,
+                    ocr_api_key=self.ocr_api_key,
+                    ocr_timeout=self.ocr_timeout,
+                )
+
+            catalog = self._toc_extractor.detect_and_extract(file_content, progress_callback)
+        except Exception as exc:
+            logger.warning(f"[PDF提取] 目录检测器不可用,回退到纯文本目录解析: {exc}")
+
+        if catalog:
+            catalog_chapters = self._sanitize_catalog_chapters(catalog.get("chapters", []))
+            raw_text = (catalog.get("raw_ocr_text") or "").strip()
+            if catalog_chapters or raw_text:
+                catalog.setdefault("source", "ocr_toc")
+                return catalog
+
+        fallback_catalog = self._extract_catalog_from_front_pages_text(file_content)
+        if fallback_catalog:
+            logger.info(
+                f"[PDF提取] 使用前几页纯文本目录兜底成功: {fallback_catalog.get('total_chapters', 0)} 章"
             )
+        return fallback_catalog
 
-        catalog = self._toc_extractor.detect_and_extract(file_content, progress_callback)
+    def _normalize_catalog(self, catalog: Dict[str, Any]) -> Dict[str, Any]:
+        """统一目录来源并择优合并。"""
         if not catalog:
+            return {}
+
+        normalized = dict(catalog)
+        existing_chapters = self._sanitize_catalog_chapters(catalog.get("chapters", []))
+        raw_text = catalog.get("raw_ocr_text", "")
+        parsed_chapters = self._parse_catalog_from_raw_text(raw_text) if isinstance(raw_text, str) else []
+        selected_chapters = existing_chapters
+
+        if parsed_chapters:
+            if self._should_prefer_parsed_catalog(parsed_chapters, existing_chapters):
+                selected_chapters = parsed_chapters
+            elif existing_chapters:
+                logger.info(
+                    "[PDF提取] raw_ocr_text目录解析结果异常,保留原始目录骨架: "
+                    f"parsed={len(parsed_chapters)}, original={len(existing_chapters)}"
+                )
+            else:
+                selected_chapters = parsed_chapters
+
+        if selected_chapters:
+            selected_chapters = self._merge_catalog_chapters(
+                selected_chapters,
+                parsed_chapters,
+            )
+            normalized["chapters"] = selected_chapters
+            normalized["total_chapters"] = len(selected_chapters)
+            normalized["formatted_text"] = self._format_catalog_chapters(selected_chapters)
+        return normalized
+
+    def _parse_catalog_from_raw_text(self, text: str) -> List[Dict[str, Any]]:
+        """把目录页 OCR 原文解析成章节树。"""
+        if not text or not text.strip():
+            return []
+
+        chapters: List[Dict[str, Any]] = []
+        current_chapter: Optional[Dict[str, Any]] = None
+        active_l2_rule: Optional[str] = None
+        document_l1_rules: Optional[List[str]] = None
+
+        for raw_line in self._prepare_catalog_raw_lines(text):
+            title_text, page = self._split_catalog_entry(raw_line)
+            if not title_text:
+                continue
+
+            compact = re.sub(r"\s+", "", title_text)
+            if compact in {"目录", "目錄"}:
+                continue
+
+            chapter_matches = self._matching_rule_names(title_text, "l1", document_l1_rules)
+            if chapter_matches:
+                if document_l1_rules is None:
+                    document_l1_rules = chapter_matches
+                current_chapter = {
+                    "index": len(chapters) + 1,
+                    "title": self._clean_chapter_title(title_text),
+                    "page": str(page or 1),
+                    "original": raw_line.strip(),
+                    "subsections": [],
+                }
+                chapters.append(current_chapter)
+                active_l2_rule = None
+                continue
+
+            if current_chapter is None:
+                continue
+
+            section_matches = self._matching_rule_names(title_text, "l2")
+            if not section_matches:
+                numeric_section_title = self._coerce_numeric_catalog_section(
+                    title_text,
+                    document_l1_rules,
+                    active_l2_rule,
+                )
+                if numeric_section_title:
+                    section_key = self._normalize_heading_key(numeric_section_title)
+                    existing_keys = {
+                        self._normalize_heading_key(sub.get("title", ""))
+                        for sub in current_chapter.get("subsections", [])
+                    }
+                    if section_key not in existing_keys:
+                        current_chapter["subsections"].append({
+                            "title": numeric_section_title,
+                            "page": str(page or current_chapter.get("page", 1)),
+                            "level": 2,
+                            "original": raw_line.strip(),
+                        })
+                continue
+
+            if active_l2_rule is None:
+                active_l2_rule = section_matches[0]
+            if active_l2_rule not in section_matches:
+                continue
+
+            section_title = self._clean_section_title(title_text)
+            section_key = self._normalize_heading_key(section_title)
+            existing_keys = {
+                self._normalize_heading_key(sub.get("title", ""))
+                for sub in current_chapter.get("subsections", [])
+            }
+            if section_key in existing_keys:
+                continue
+
+            current_chapter["subsections"].append({
+                "title": section_title,
+                "page": str(page or current_chapter.get("page", 1)),
+                "level": 2,
+                "original": raw_line.strip(),
+            })
+
+        return chapters
+
+    @classmethod
+    def _sanitize_catalog_chapters(cls, chapters: Any) -> List[Dict[str, Any]]:
+        """Clean, de-duplicate and re-index a raw list of catalog chapters.
+
+        Args:
+            chapters: Candidate chapter list. Anything that is not a list
+                yields an empty result; non-dict entries are skipped.
+
+        Returns:
+            A new list of chapter dicts with the keys ``index``, ``title``,
+            ``page``, ``original`` and ``subsections``. Chapters, and the
+            subsections inside one chapter, are dropped when their
+            normalized heading key is empty or was already seen.
+        """
+        if not isinstance(chapters, list):
+            return []
+
+        cleaned_chapters: List[Dict[str, Any]] = []
+        used_chapter_keys: Set[str] = set()
+
+        for position, raw_chapter in enumerate(chapters, 1):
+            if not isinstance(raw_chapter, dict):
+                continue
+
+            title = cls._clean_chapter_title(str(raw_chapter.get("title", "") or ""))
+            title_key = cls._normalize_heading_key(title)
+            if not title_key or title_key in used_chapter_keys:
+                continue
+            used_chapter_keys.add(title_key)
+
+            # Fall back to the 1-based input position when no page is given.
+            page_label = str(raw_chapter.get("page") or position)
+
+            kept_sections: List[Dict[str, Any]] = []
+            used_section_keys: Set[str] = set()
+            for raw_section in raw_chapter.get("subsections", []) or []:
+                if not isinstance(raw_section, dict):
+                    continue
+
+                sub_title = cls._clean_section_title(str(raw_section.get("title", "") or ""))
+                sub_key = cls._normalize_heading_key(sub_title)
+                if not sub_key or sub_key in used_section_keys:
+                    continue
+                used_section_keys.add(sub_key)
+
+                # A subsection without its own page inherits the chapter page.
+                section_entry: Dict[str, Any] = {
+                    "title": sub_title,
+                    "page": str(raw_section.get("page") or page_label),
+                    "level": 2,
+                    "original": raw_section.get("original", "") or sub_title,
+                }
+                kept_sections.append(section_entry)
+
+            chapter_entry: Dict[str, Any] = {
+                # Indices are renumbered over the chapters that survive.
+                "index": len(cleaned_chapters) + 1,
+                "title": title,
+                "page": page_label,
+                "original": raw_chapter.get("original", "") or title,
+                "subsections": kept_sections,
+            }
+            cleaned_chapters.append(chapter_entry)
+
+        return cleaned_chapters
+
+    @classmethod
+    def _should_prefer_parsed_catalog(
+        cls,
+        parsed_chapters: List[Dict[str, Any]],
+        existing_chapters: List[Dict[str, Any]],
+    ) -> bool:
+        """Decide whether a freshly parsed catalog should replace the existing one.
+
+        Args:
+            parsed_chapters: Chapters obtained from the new parse.
+            existing_chapters: Chapters currently held; may be empty.
+
+        Returns:
+            True when the parsed catalog should be used, False otherwise.
+        """
+        if not parsed_chapters:
+            return False
+
+        parsed_is_suspicious = cls._catalog_has_suspicious_structure(parsed_chapters)
+        existing_is_suspicious = cls._catalog_has_suspicious_structure(existing_chapters)
+
+        if parsed_is_suspicious:
+            # A suspicious parse may only replace an existing catalog that is
+            # itself suspicious, and only when it is richer and overlaps it.
+            if not existing_chapters or not existing_is_suspicious:
+                return False
+
+            parsed_score = cls._catalog_structure_score(parsed_chapters)
+            existing_score = cls._catalog_structure_score(existing_chapters)
+            overlap_ratio = cls._catalog_chapter_overlap_ratio(parsed_chapters, existing_chapters)
+            return overlap_ratio >= 0.6 and parsed_score > existing_score
+
+        if not existing_chapters or existing_is_suspicious:
+            return True
+
+        if cls._should_prefer_single_level_parsed_catalog(parsed_chapters, existing_chapters):
+            return True
+
+        parsed_score = cls._catalog_structure_score(parsed_chapters)
+        existing_score = cls._catalog_structure_score(existing_chapters)
+        if parsed_score <= existing_score:
+            return False
+
+        # The existing catalog is known to be non-suspicious here (the
+        # suspicious case returned above), so the size sanity checks always
+        # apply; the previous re-evaluation of the suspicious check was dead.
+        existing_count = len(existing_chapters)
+        parsed_count = len(parsed_chapters)
+        if parsed_count > max(existing_count * 2, existing_count + 8):
+            return False
+        if existing_count >= 4 and parsed_count < max(2, existing_count // 2):
+            return False
+
+        return True
+
+    @classmethod
+    def _should_prefer_single_level_parsed_catalog(
+        cls,
+        parsed_chapters: List[Dict[str, Any]],
+        existing_chapters: List[Dict[str, Any]],
+    ) -> bool:
+        """Special-case a flat catalog that was misread as one chapter with many sections.
+
+        Returns True only when the parsed catalog has at least two chapters
+        and none of them has subsections, the existing catalog is a single
+        chapter with at least ``len(parsed_chapters) - 1`` subsections, and
+        the parsed chapter pages are in non-decreasing order.
+        """
+        parsed_total = len(parsed_chapters)
+        if parsed_total < 2 or len(existing_chapters) != 1:
+            return False
+
+        for parsed in parsed_chapters:
+            if parsed.get("subsections"):
+                return False
+
+        sole_existing = existing_chapters[0]
+        existing_section_total = len(sole_existing.get("subsections", []) or [])
+        if existing_section_total < parsed_total - 1:
+            return False
+
+        page_sequence = []
+        for parsed in parsed_chapters:
+            page_sequence.append(cls._safe_page_number(parsed.get("page"), 1))
+        return page_sequence == sorted(page_sequence)
+
+    @classmethod
+    def _catalog_has_suspicious_structure(cls, chapters: List[Dict[str, Any]]) -> bool:
+        """Heuristically flag a chapter list whose top-level headings look inconsistent.
+
+        A catalog is reported as suspicious when any of these holds:
+        "第N章"-style titles are mixed with enough bare numeric titles; enough
+        "第N章" titles carry a numeric prefix right after the chapter marker;
+        a chapter number repeats; or an Arabic chapter number goes backwards.
+
+        Args:
+            chapters: Chapter dicts; only the ``title`` value is inspected.
+
+        Returns:
+            True when the structure looks suspicious, False otherwise
+            (including for an empty list).
+        """
+        if not chapters:
+            return False
+
+        titles = [(chapter.get("title", "") or "").strip() for chapter in chapters]
+        title_total = len(titles)
+
+        chinese_chapter_count = 0
+        numeric_heading_count = 0
+        for heading in titles:
+            if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", heading):
+                chinese_chapter_count += 1
+            if re.match(r"^\d{1,2}(?:[\..。、])?\s+\S+", heading):
+                numeric_heading_count += 1
+
+        embedded_numeric_body_count = 0
+        repeated_chapter_no_count = 0
+        reversed_chapter_no_count = 0
+        numbers_seen: Set[str] = set()
+        last_arabic_no: Optional[int] = None
+
+        for heading in titles:
+            parts = re.match(
+                r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
+                heading,
+            )
+            if parts is None:
+                continue
+
+            number_text = re.sub(r"\s+", "", parts.group(1))
+            body_text = (parts.group(2) or "").strip()
+
+            if number_text in numbers_seen:
+                repeated_chapter_no_count += 1
+            numbers_seen.add(number_text)
+
+            # Ordering is only checked for chapter numbers written in digits.
+            if number_text.isdigit():
+                arabic_no = int(number_text)
+                if last_arabic_no is not None and arabic_no < last_arabic_no:
+                    reversed_chapter_no_count += 1
+                last_arabic_no = arabic_no
+
+            if re.match(r"^\d{1,2}(?:\.\d{1,2})*\.?(?:\s+|$)", body_text):
+                embedded_numeric_body_count += 1
+
+        mixed_styles = (
+            chinese_chapter_count >= 2
+            and numeric_heading_count >= max(3, chinese_chapter_count // 2)
+        )
+        mixed_styles_relative = (
+            chinese_chapter_count >= max(2, title_total // 3)
+            and numeric_heading_count >= max(2, title_total // 6)
+        )
+        many_embedded_numbers = embedded_numeric_body_count >= max(2, title_total // 5)
+        broken_numbering = repeated_chapter_no_count > 0 or reversed_chapter_no_count > 0
+
+        return bool(
+            mixed_styles
+            or mixed_styles_relative
+            or many_embedded_numbers
+            or broken_numbering
+        )
+
+    @staticmethod
+    def _catalog_structure_score(chapters: List[Dict[str, Any]]) -> int:
+        """Return the number of chapters plus the number of all their subsections."""
+        return sum(
+            1 + len(chapter.get("subsections", []) or [])
+            for chapter in chapters
+        )
+
+    @classmethod
+    def _catalog_chapter_overlap_ratio(
+        cls,
+        chapters_a: List[Dict[str, Any]],
+        chapters_b: List[Dict[str, Any]],
+    ) -> float:
+        """Return the share of chapter identity keys common to both catalogs.
+
+        The count of shared keys is divided by the size of the smaller key
+        set. Returns 0.0 when either catalog is empty or has no titled
+        chapter.
+        """
+        if not chapters_a or not chapters_b:
+            return 0.0
+
+        def _identity_keys(chapters: List[Dict[str, Any]]) -> Set[str]:
+            # Only chapters with a truthy title contribute a key.
+            collected: Set[str] = set()
+            for chapter in chapters:
+                if chapter.get("title"):
+                    collected.add(cls._catalog_chapter_identity_key(chapter.get("title", "")))
+            return collected
+
+        keys_a = _identity_keys(chapters_a)
+        keys_b = _identity_keys(chapters_b)
+        if not keys_a or not keys_b:
+            return 0.0
+
+        smaller_size = min(len(keys_a), len(keys_b))
+        return len(keys_a & keys_b) / max(1, smaller_size)
+
+    @classmethod
+    def _merge_catalog_chapters(
+        cls,
+        base_chapters: List[Dict[str, Any]],
+        supplemental_chapters: List[Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        """Enrich the base chapters with subsections from matching supplemental chapters.
+
+        Chapters are matched by their identity key. Only chapters present in
+        ``base_chapters`` appear in the result; supplemental chapters without
+        a match are ignored. When either input is empty, the other list is
+        returned as-is (not copied).
+
+        Returns:
+            Copies of the base chapters, each with merged ``subsections`` and
+            a 1-based ``index``.
+        """
+        if not base_chapters:
+            return supplemental_chapters or []
+        if not supplemental_chapters:
+            return base_chapters
+
+        # If several supplemental chapters share a key, the last one wins.
+        supplemental_lookup: Dict[str, Dict[str, Any]] = {}
+        for extra in supplemental_chapters:
+            if extra.get("title"):
+                supplemental_lookup[cls._catalog_chapter_identity_key(extra.get("title", ""))] = extra
+
+        result: List[Dict[str, Any]] = []
+        for position, base in enumerate(base_chapters, 1):
+            clone = dict(base)
+            clone["subsections"] = [dict(sub) for sub in base.get("subsections", []) or []]
+
+            match = supplemental_lookup.get(
+                cls._catalog_chapter_identity_key(clone.get("title", ""))
+            )
+            if match:
+                clone["subsections"] = cls._merge_catalog_subsections(
+                    clone.get("subsections", []),
+                    match.get("subsections", []) or [],
+                )
+
+            clone["index"] = position
+            result.append(clone)
+
+        return result
+
+    @classmethod
+    def _merge_catalog_subsections(
+        cls,
+        base_subsections: List[Dict[str, Any]],
+        supplemental_subsections: List[Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        """Combine two subsection lists of the same chapter.
+
+        If one list is empty, a copy of the other is returned. Otherwise both
+        lists are scored by how many titles they hold and how strongly those
+        titles look like numbered section headings. A strictly higher
+        supplemental score replaces the base list wholesale; otherwise the
+        base list is kept and supplemental entries with unseen heading keys
+        are appended.
+
+        Returns:
+            A new list of shallow-copied subsection dicts.
+        """
+        if not base_subsections:
+            return [dict(sub) for sub in supplemental_subsections]
+        if not supplemental_subsections:
+            return [dict(sub) for sub in base_subsections]
+
+        # Heading shapes worth 3 extra points each (checked in this order).
+        strong_patterns = (
+            r"^\d+\.\d+(?!\.\d)\.?\s*",
+            r"^(第\s*[一二三四五六七八九十百零两]+\s*节)",
+            r"^([一二三四五六七八九十百零两]+[、)\)\]])",
+            r"^[【\[]\s*\d+\s*[\]】]",
+        )
+        # Weaker single-number shape worth 1 extra point.
+        weak_pattern = r"^\d{1,2}[\..。、]\s*"
+
+        def _subsection_score(items: List[Dict[str, Any]]) -> int:
+            total = 0
+            for entry in items:
+                heading = (entry.get("title", "") or "").strip()
+                if not heading:
+                    continue
+                total += 1
+                if any(re.match(pattern, heading) for pattern in strong_patterns):
+                    total += 3
+                elif re.match(weak_pattern, heading):
+                    total += 1
+            return total
+
+        if _subsection_score(supplemental_subsections) > _subsection_score(base_subsections):
+            return [dict(sub) for sub in supplemental_subsections]
+
+        combined = [dict(sub) for sub in base_subsections]
+        known_keys = {
+            cls._normalize_heading_key(sub.get("title", ""))
+            for sub in combined
+            if sub.get("title")
+        }
+        for extra in supplemental_subsections:
+            extra_key = cls._normalize_heading_key(extra.get("title", ""))
+            if extra_key and extra_key not in known_keys:
+                combined.append(dict(extra))
+                known_keys.add(extra_key)
+        return combined
+
+    @classmethod
+    def _coerce_numeric_catalog_section(
+        cls,
+        title_text: str,
+        document_l1_rules: Optional[List[str]],
+        active_l2_rule: Optional[str],
+    ) -> Optional[str]:
+        """Treat a bare "N title" catalog line as a section title when that is safe.
+
+        Returns None when a level-2 rule is already active, when no level-1
+        rules are known, or when the level-1 rules include "Rule_1_纯数字派"
+        (presumably because such a line would then be a chapter heading —
+        confirm against the rule library). Otherwise the cleaned title is
+        returned if the line is a 1-2 digit number, optionally followed by
+        one separator, and then a CJK character or Latin letter.
+        """
+        if active_l2_rule is not None:
             return None
 
-        normalized_catalog = dict(catalog)
-        normalized_catalog.setdefault("source", "ocr_toc")
-        return normalized_catalog
+        if not document_l1_rules:
+            return None
+
+        if "Rule_1_纯数字派" in document_l1_rules:
+            return None
+
+        if re.match(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[一-龥A-Za-z].*", title_text.strip()):
+            return cls._clean_section_title(title_text)
+
+        return None
+
+    def _extract_catalog_from_front_pages_text(
+        self,
+        file_content: bytes,
+        max_pages: int = 12,
+    ) -> Optional[Dict[str, Any]]:
+        """Fallback: parse the catalog from the plain text of the first pages.
+
+        Scans at most ``max_pages`` pages. Collection starts at the first page
+        whose text looks like a catalog and stops at the first later page
+        that is blank or no longer looks like one.
+
+        Args:
+            file_content: Raw PDF bytes.
+            max_pages: Upper bound on the number of leading pages to scan.
+
+        Returns:
+            A catalog dict (``chapters``, ``total_chapters``, ``raw_ocr_text``,
+            ``formatted_text``, ``source``), or None when no catalog text was
+            found or it could not be parsed into chapters.
+        """
+        doc = fitz.open(stream=file_content, filetype="pdf")
+        try:
+            collected_pages: List[str] = []
+            in_catalog = False
+
+            for page_index in range(min(max_pages, len(doc))):
+                page = doc.load_page(page_index)
+                bounds = page.rect
+                body_clip = fitz.Rect(0, self.clip_top, bounds.width, bounds.height - self.clip_bottom)
+                text = page.get_text("text", clip=body_clip)
+
+                if not text or not text.strip():
+                    # A blank page ends the catalog once it has started.
+                    if in_catalog:
+                        break
+                    continue
+
+                has_marker, toc_like_count, page_suffix_count = self._catalog_text_signals(text)
+                if in_catalog:
+                    # Continuation pages only need a marker or one page-number suffix.
+                    if not (has_marker or page_suffix_count >= 1):
+                        break
+                elif (
+                    has_marker
+                    or page_suffix_count >= 2
+                    or (page_suffix_count >= 1 and toc_like_count >= 6)
+                ):
+                    in_catalog = True
+                else:
+                    continue
+
+                collected_pages.append(text)
+
+            raw_text = "\n".join(collected_pages).strip()
+            if not raw_text:
+                return None
+
+            chapters = self._parse_catalog_from_raw_text(raw_text)
+            if not chapters:
+                return None
+
+            return {
+                "chapters": chapters,
+                "total_chapters": len(chapters),
+                "raw_ocr_text": raw_text,
+                "formatted_text": self._format_catalog_chapters(chapters),
+                "source": "front_pages_text",
+            }
+        finally:
+            doc.close()
+
+    @classmethod
+    def _catalog_text_signals(cls, text: str) -> Tuple[bool, int, int]:
+        """Measure how strongly a page's text looks like a table of contents.
+
+        Args:
+            text: Plain text of one page; None or empty is treated as "".
+
+        Returns:
+            A tuple ``(has_marker, toc_like_count, page_suffix_count)``:
+            whether the text contains "目录"/"目錄" once whitespace is
+            removed; the number of lines that are a catalog title, carry a
+            page-number suffix, or match a level-1/level-2 heading rule; and
+            the number of lines that carry a page-number suffix.
+        """
+        # Normalize once so the marker check and the line scan agree on
+        # None/empty input (the line splitter calls ``.splitlines()``).
+        text = text or ""
+        compact_text = re.sub(r"\s+", "", text)
+        has_marker = "目录" in compact_text or "目錄" in compact_text
+        toc_like_count = 0
+        page_suffix_count = 0
+
+        for raw_line in cls._prepare_catalog_raw_lines(text):
+            title_text, page = cls._split_catalog_entry(raw_line)
+            if not title_text:
+                continue
+
+            compact_title = re.sub(r"\s+", "", title_text)
+            if compact_title in {"目录", "目錄"}:
+                toc_like_count += 1
+                continue
+
+            if page is not None:
+                page_suffix_count += 1
+                toc_like_count += 1
+                continue
+
+            if cls._matching_rule_names(title_text, "l1") or cls._matching_rule_names(title_text, "l2"):
+                toc_like_count += 1
+
+        return has_marker, toc_like_count, page_suffix_count
 
     def _extract_table_ocr_results(self, doc: fitz.Document, progress_callback=None) -> List[OcrResult]:
         """在 OCR 启用时检测 PDF 表格区域,并发执行表格识别。"""
@@ -244,12 +785,13 @@ class PdfStructureExtractor:
             clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
             regions = self.ocr_processor.detect_table_regions(page, page_index + 1, clip_box)
             # 保存页面对象和区域坐标,便于 OcrProcessor 后续精确渲染表格裁剪区域。
-            for bbox, score in regions:
+            for bbox, score, label in regions:
                 table_regions.append(TableRegion(
                     page_num=page_index + 1,
                     page=page,
                     bbox=bbox,
                     score=score,
+                    label=label,
                 ))
 
             if page_index + 1 == total_pages or (page_index + 1) % 5 == 0:
@@ -272,100 +814,36 @@ class PdfStructureExtractor:
             progress_callback=_progress_adapter,
         )
 
-    def _insert_ocr_results_into_chapters(
-        self,
-        chapters: Dict[str, Dict[str, Dict[str, Any]]],
-        ocr_results: List[OcrResult],
-    ) -> Dict[str, int]:
-        """把成功识别的表格 OCR 文本追加到同页最可能的小节正文中。"""
-
-        stats = {
-            "table_count": len(ocr_results),
-            "success_count": 0,
-            "inserted_count": 0,
-        }
-        if not chapters or not ocr_results:
-            return stats
-
-        successful_results = [
-            result for result in ocr_results
-            if getattr(result, "success", False) and str(getattr(result, "text", "") or "").strip()
-        ]
-        stats["success_count"] = len(successful_results)
-
-        for ocr_result in sorted(successful_results, key=lambda item: (item.page_num, item.bbox[1], item.bbox[0])):
-            # 轻量提取器在切分后不再保留文本块坐标,因此使用页码范围作为 OCR 回填的稳定定位信号。
-            target = self._find_ocr_target_section(chapters, ocr_result.page_num)
-            if target is None:
-                continue
-
-            _, _, payload = target
-            original_content = str(payload.get("content", "") or "").strip()
-            if original_content == EMPTY_SECTION_PLACEHOLDER:
-                original_content = ""
-
-            ocr_text = str(ocr_result.text or "").strip()
-            table_text = f"{TABLE_OCR_START}\n{ocr_text}\n{TABLE_OCR_END}"
-            payload["content"] = f"{original_content}\n\n{table_text}".strip()
-            payload["page_start"] = min(
-                self._safe_page_number(payload.get("page_start"), ocr_result.page_num),
-                ocr_result.page_num,
-            )
-            payload["page_end"] = max(
-                self._safe_page_number(payload.get("page_end"), ocr_result.page_num),
-                ocr_result.page_num,
-            )
-            stats["inserted_count"] += 1
-
-        return stats
-
-    def _find_ocr_target_section(
+    def _extract_body_lines(
         self,
-        chapters: Dict[str, Dict[str, Dict[str, Any]]],
-        page_num: int,
-    ) -> Optional[Tuple[str, str, Dict[str, Any]]]:
-        """查找页码范围最能覆盖 OCR 表格所在页的小节。"""
-
-        candidates: List[Tuple[int, int, str, str, Dict[str, Any]]] = []
-        fallback: Optional[Tuple[str, str, Dict[str, Any]]] = None
-
-        for chapter_title, sections in chapters.items():
-            if not isinstance(sections, dict):
-                continue
-
-            for section_title, payload in sections.items():
-                if not isinstance(payload, dict):
-                    continue
+        doc: fitz.Document,
+        progress_callback=None,
+        ocr_by_page: Dict[int, List[OcrResult]] = None,
+    ) -> Tuple[List[BodyLine], int]:
+        """读取页面正文文本,规范化正文行,并移除重复的非标题噪声。
 
-                page_start = self._safe_page_number(payload.get("page_start"), page_num)
-                page_end = self._safe_page_number(payload.get("page_end"), page_start)
-                if section_title == SECTION_TITLE_KEY:
-                    if fallback is None and page_start <= page_num <= page_end:
-                        fallback = (chapter_title, section_title, payload)
-                    continue
-
-                # 优先选择页码范围最窄的小节,过宽的范围通常是章节级内容外溢。
-                if page_start <= page_num <= page_end:
-                    span = max(page_end - page_start, 0)
-                    candidates.append((span, -page_start, chapter_title, section_title, payload))
-                elif page_start <= page_num:
-                    fallback = (chapter_title, section_title, payload)
+        Args:
+            ocr_by_page: 按页码分组的 OCR 结果,用于原位替换表格乱码文本。
 
-        if candidates:
-            _, _, chapter_title, section_title, payload = min(candidates, key=lambda item: (item[0], item[1]))
-            return chapter_title, section_title, payload
-        return fallback
-
-    def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
-        """读取页面正文文本,规范化正文行,并移除重复的非标题噪声。"""
+        Returns:
+            (body_lines, ocr_inserted_count)
+        """
+        if ocr_by_page is None:
+            ocr_by_page = {}
 
         page_lines_by_page: List[Tuple[int, List[str]]] = []
         total_pages = len(doc)
         repeated_margin_keys = self._find_repeated_margin_block_lines(doc)
+        ocr_inserted_count = 0
 
         for page_index in range(total_pages):
             page = doc.load_page(page_index)
-            page_lines = self._extract_page_lines_with_margin_filter(page, repeated_margin_keys)
+            page_num = page_index + 1
+            page_ocr_results = ocr_by_page.get(page_num, [])
+            page_lines, page_inserted = self._extract_page_lines_with_margin_filter(
+                page, repeated_margin_keys, page_ocr_results,
+            )
+            ocr_inserted_count += page_inserted
 
             recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
             if clipped_fragment_keys:
@@ -377,19 +855,19 @@ class PdfStructureExtractor:
             if recovered_headings:
                 page_lines = recovered_headings + page_lines
 
-            page_lines_by_page.append((page_index + 1, page_lines))
+            page_lines_by_page.append((page_num, page_lines))
 
-            if progress_callback and (page_index + 1 == total_pages or (page_index + 1) % 10 == 0):
+            if progress_callback and (page_num == total_pages or page_num % 10 == 0):
                 try:
                     progress_callback(
                         "正文抽取",
-                        int((page_index + 1) / max(total_pages, 1) * 60),
-                        f"读取正文页 {page_index + 1}/{total_pages}",
+                        int(page_num / max(total_pages, 1) * 60),
+                        f"读取正文页 {page_num}/{total_pages}",
                     )
                 except Exception:
                     pass
 
-        # 页眉页脚往往跨页重复,但真实标题不能被误删,所以只移除“不像标题”的重复行。
+        # 页眉页脚往往跨页重复,但真实标题不能被误删,所以只移除"不像标题"的重复行。
         repeated_noise_keys = self._find_repeated_non_heading_lines(page_lines_by_page, total_pages)
         body_lines: List[BodyLine] = []
         for page, lines in page_lines_by_page:
@@ -397,29 +875,35 @@ class PdfStructureExtractor:
                 if self._normalize_repeated_line_key(line) in repeated_noise_keys:
                     continue
                 body_lines.append(BodyLine(page=page, text=line))
-        return body_lines
+        return body_lines, ocr_inserted_count
 
     def _extract_page_lines_with_margin_filter(
         self,
         page: fitz.Page,
         repeated_margin_keys: set[str],
-    ) -> List[str]:
-        """按文本块读取页面,并过滤跨页重复的页边页眉/页脚行。"""
+        ocr_results_for_page: List[OcrResult] = None,
+    ) -> Tuple[List[str], int]:
+        """按文本块读取页面,过滤页眉页脚,并原位替换表格区域的 OCR 文本。
 
+        Returns:
+            (page_lines, ocr_inserted_count)
+        """
         rect = page.rect
         body_top = self.clip_top
         body_bottom = rect.height - self.clip_bottom
+        ocr_results_for_page = ocr_results_for_page or []
 
         try:
             page_dict = page.get_text("dict")
         except Exception:
             clip_box = fitz.Rect(0, body_top, rect.width, body_bottom)
             text = page.get_text("text", clip=clip_box)
-            return [
+            lines = [
                 stripped
                 for stripped in (line.strip() for line in self._prepare_page_lines(text))
                 if stripped and not self._is_header_footer(stripped)
             ]
+            return lines, 0
 
         page_lines: List[str] = []
         blocks = sorted(
@@ -429,7 +913,39 @@ class PdfStructureExtractor:
                 item.get("bbox", [0, 0, 0, 0])[0],
             ),
         )
-        for block in blocks:
+
+        # 预计算每个 block 匹配到的 OCR 结果索引
+        ocr_match_by_block: Dict[int, int] = {}
+        ocr_used: Set[int] = set()
+
+        if ocr_results_for_page:
+            for block_idx, block in enumerate(blocks):
+                bbox = block.get("bbox") or ()
+                if len(bbox) != 4:
+                    continue
+                _, y0, _, y1 = bbox
+                if y1 <= body_top or y0 >= body_bottom:
+                    continue
+
+                block_text = self._extract_text_block_text(block)
+                if not block_text or self._matches_any_heading(block_text):
+                    continue
+
+                bx0, by0, bx1, by1 = bbox
+                for ocr_idx, ocr_result in enumerate(ocr_results_for_page):
+                    if ocr_idx in ocr_used:
+                        continue
+                    rx0, ry0, rx1, ry1 = ocr_result.bbox
+                    overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
+                    overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
+                    overlap_area = overlap_x * overlap_y
+                    block_area = max((bx1 - bx0) * (by1 - by0), 1)
+                    if overlap_area / block_area > 0.5:
+                        ocr_match_by_block[block_idx] = ocr_idx
+                        ocr_used.add(ocr_idx)
+                        break
+
+        for block_idx, block in enumerate(blocks):
             bbox = block.get("bbox") or ()
             if len(bbox) != 4:
                 continue
@@ -438,19 +954,27 @@ class PdfStructureExtractor:
             if y1 <= body_top or y0 >= body_bottom:
                 continue
 
-            block_text = self._extract_text_block_text(block)
-            if not block_text:
+            in_margin = self._is_margin_band(y0, y1, rect.height)
+
+            if block_idx in ocr_match_by_block:
+                ocr_result = ocr_results_for_page[ocr_match_by_block[block_idx]]
+                ocr_text = str(ocr_result.text or "").strip()
+                source_text = f"{TABLE_OCR_START}\n{ocr_text}\n{TABLE_OCR_END}"
+            else:
+                source_text = self._extract_text_block_text(block)
+
+            if not source_text:
                 continue
 
-            in_margin = self._is_margin_band(y0, y1, rect.height)
-            for line in self._prepare_page_lines(block_text):
+            for line in self._prepare_page_lines(source_text):
                 stripped = line.strip()
                 if not stripped or self._is_header_footer(stripped):
                     continue
                 if in_margin and self._is_repeated_margin_noise(stripped, repeated_margin_keys):
                     continue
                 page_lines.append(stripped)
-        return page_lines
+
+        return page_lines, len(ocr_match_by_block)
 
     def _find_repeated_margin_block_lines(self, doc: fitz.Document) -> set[str]:
         """统计顶部/底部页边区域中跨页重复出现、且不像标题的文本行。"""
@@ -766,7 +1290,7 @@ class PdfStructureExtractor:
         }
 
         for index, item in enumerate(body_lines):
-            # 先处理跨行标题碎片,再进入章/节识别,避免“第X章”单独成行时丢标题。
+            # 先处理跨行标题碎片,再进入章/节识别,避免"第X章"单独成行时丢标题。
             original_line = item.text.strip()
             page = item.page
             if not original_line or original_line.isdigit():
@@ -997,7 +1521,7 @@ class PdfStructureExtractor:
 
     @classmethod
     def _has_stable_explicit_chapter_headings(cls, body_lines: List[BodyLine]) -> bool:
-        """判断正文前段是否已经存在稳定的“第X章”显式章节结构。"""
+        """判断正文前段是否已经存在稳定的"第X章"显式章节结构。"""
 
         chapter_numbers: List[int] = []
 
@@ -1057,7 +1581,7 @@ class PdfStructureExtractor:
 
     @classmethod
     def _detect_cn_order_l2_style(cls, line: str) -> Optional[str]:
-        """识别中文序号小节标题的样式,区分“ 一)”和“ 一、/一 空格”。"""
+        """识别中文序号小节标题的样式,区分" 一)"和" 一、/一 空格"。"""
 
         cleaned = cls._strip_catalog_page_suffix(line)
         cleaned = re.sub(r"\s+", " ", str(cleaned or "").strip())
@@ -1387,6 +1911,108 @@ class PdfStructureExtractor:
 
         return re.sub(r"\s+", "", str(line or "").strip())
 
+    @staticmethod
+    def _normalize_heading_key(text: str) -> str:
+        """Build a comparison key for a heading.
+
+        The catalog page suffix is stripped, full-width brackets, parentheses
+        and full stops are mapped to their ASCII forms, and all whitespace is
+        removed.
+
+        Args:
+            text: Heading text; None or empty yields an empty key.
+
+        Returns:
+            The normalized key string.
+        """
+        normalized = PdfStructureExtractor._strip_catalog_page_suffix((text or "").strip())
+        # The wide characters are written as escapes so they cannot collapse
+        # into their ASCII look-alikes (which would make the replacement a
+        # no-op) if this file is ever Unicode-normalized.
+        replacements = (
+            ("\u3010", "["),  # left black lenticular bracket
+            ("\u3011", "]"),  # right black lenticular bracket
+            ("\uff08", "("),  # full-width left parenthesis
+            ("\uff09", ")"),  # full-width right parenthesis
+            ("\uff0e", "."),  # full-width full stop
+            ("\u3002", "."),  # ideographic full stop
+        )
+        for wide, narrow in replacements:
+            normalized = normalized.replace(wide, narrow)
+        normalized = re.sub(r"\s+", "", normalized)
+        return normalized
+
+    @classmethod
+    def _matching_rule_names(
+        cls,
+        line: str,
+        level: str,
+        rule_names: Optional[List[str]] = None,
+    ) -> List[str]:
+        """List the names of the rules whose pattern for ``level`` matches ``line``.
+
+        Args:
+            line: Candidate heading text; it is stripped, and for level "l1"
+                a leading page number is removed first.
+            level: Key selecting the pattern inside each rule (e.g. "l1", "l2").
+            rule_names: Rules to try; None or an empty list means every rule
+                in ``RULE_LIB``.
+
+        Returns:
+            Matching rule names, in the order they were tried.
+        """
+        candidate = line.strip()
+        if level == "l1":
+            candidate = cls._strip_leading_page_number_from_heading(candidate)
+
+        selected_rules = rule_names or list(cls.RULE_LIB.keys())
+        hits: List[str] = []
+        for name in selected_rules:
+            if cls.RULE_LIB[name][level].match(candidate):
+                hits.append(name)
+        return hits
+
+    @staticmethod
+    def _split_catalog_entry(line: str) -> Tuple[str, Optional[int]]:
+        """Split one catalog line into its title and trailing page number.
+
+        A page number is recognized only when it ends the line and is
+        preceded by a run of at least two leader characters (dots, ellipses,
+        middle dots, bullets or spaces); dashes around the number are allowed.
+
+        Returns:
+            ``(title, page)``. ``page`` is None when no page suffix is found,
+            and the result is ``("", None)`` for a blank line.
+        """
+        cleaned = line.strip()
+        if not cleaned:
+            return "", None
+
+        # Collapse every whitespace run to one space before matching.
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+        suffix = re.search(
+            r"(?:[.…·•·• ]{2,})[-–— ]*(\d+)\s*[-–— ]*$",
+            cleaned,
+        )
+        if suffix is None:
+            return cleaned, None
+
+        heading = cleaned[:suffix.start()].strip()
+        # Drop any leader characters still attached to the end of the title.
+        heading = re.sub(r"[.…·• ]+$", "", heading).strip()
+        return heading, int(suffix.group(1))
+
+    @classmethod
+    def _prepare_catalog_raw_lines(cls, text: str) -> List[str]:
+        """Split catalog text into non-empty lines, re-joining entries broken across two lines.
+
+        Two kinds of joins are made: a catalog title whose two characters sit
+        on separate lines ("目" + "录"/"錄"), and an incomplete heading
+        fragment followed by a line that together match a level-1/level-2
+        rule or end in a page number.
+
+        Returns:
+            The prepared lines, in document order.
+        """
+        source_lines = [
+            stripped
+            for stripped in (raw.strip() for raw in text.splitlines())
+            if stripped
+        ]
+        line_total = len(source_lines)
+        result: List[str] = []
+        cursor = 0
+
+        while cursor < line_total:
+            current = source_lines[cursor].strip()
+            compact_current = re.sub(r"\s+", "", current)
+            has_next = cursor + 1 < line_total
+
+            if compact_current in {"目", "錄", "录"} and has_next:
+                joined_title = compact_current + re.sub(r"\s+", "", source_lines[cursor + 1].strip())
+                if joined_title in {"目录", "目錄"}:
+                    result.append(joined_title)
+                    cursor += 2
+                    continue
+
+            if cls._is_incomplete_heading_fragment(current) and has_next:
+                following = source_lines[cursor + 1].strip()
+                candidate = f"{current} {following}".strip()
+                _, candidate_page = cls._split_catalog_entry(candidate)
+                # Join only when the combined line is a recognizable entry.
+                if (
+                    cls._matching_rule_names(candidate, "l1")
+                    or cls._matching_rule_names(candidate, "l2")
+                    or candidate_page is not None
+                ):
+                    result.append(candidate)
+                    cursor += 2
+                    continue
+
+            result.append(current)
+            cursor += 1
+
+        return result
+
+    @classmethod
+    def _catalog_chapter_identity_key(cls, title: str) -> str:
+        """Build a key that identifies a chapter regardless of its numbering style.
+
+        The chapter title is cleaned, then a leading "第N章"-style marker or
+        a leading 1-2 digit number is dropped and the remaining body is
+        normalized. If the body is empty, the whole cleaned title is
+        normalized instead.
+
+        Args:
+            title: Raw chapter title.
+
+        Returns:
+            The identity key, or "" when the cleaned title is empty.
+        """
+        cleaned = cls._clean_chapter_title(title)
+        if not cleaned:
+            return ""
+
+        # "部分" is tried as a whole word before the single-character
+        # markers; as a bare character class it consumed only "部" and left
+        # "分" glued to the front of the chapter body.
+        chapter_match = re.match(
+            r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*(?:部分|[章节部分篇])\s*(.*)$",
+            cleaned,
+        )
+        if chapter_match:
+            chapter_body = cls._normalize_heading_key(chapter_match.group(1))
+            if chapter_body:
+                return chapter_body
+
+        numeric_match = re.match(r"^\d{1,2}(?:[\..。、])?\s*(.*)$", cleaned)
+        if numeric_match:
+            numeric_body = cls._normalize_heading_key(numeric_match.group(1))
+            if numeric_body:
+                return numeric_body
+
+        return cls._normalize_heading_key(cleaned)
+
     @classmethod
     def _matches_any_heading(cls, line: str) -> bool:
         """判断文本是否命中任意一套章/节标题规则。"""

BIN
utils_test/MinerU_Test/sgfa_mineru_testimage.png


+ 451 - 0
utils_test/Yolo_Test/test_yolo_layout.py

@@ -0,0 +1,451 @@
+"""
+YOLO 版面检测模型测试脚本
+
+测试 RapidLayout 对表格(table)、图片(image)的识别情况,
+确认非标准表格是否被误判为 image,以及纯图片的分类标签。
+
+用法:
+    python utils_test/Yolo_Test/test_yolo_layout.py -p <pdf_path>
+    python utils_test/Yolo_Test/test_yolo_layout.py -p <pdf_path> --save-images
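+    python utils_test/Yolo_Test/test_yolo_layout.py -p <pdf_path> --pages 0,1,2
+    python utils_test/Yolo_Test/test_yolo_layout.py -d <pdf_dir>
+    python utils_test/Yolo_Test/test_yolo_layout.py -d <pdf_dir> --json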
+
+依赖:
+    pip install rapid-layout pymupdf numpy Pillow
+"""
+
+import argparse
+import json
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import fitz
+import numpy as np
+
+try:
+    from rapid_layout import RapidLayout
+    RAPID_LAYOUT_AVAILABLE = True
+except ImportError:
+    RAPID_LAYOUT_AVAILABLE = False
+    RapidLayout = None
+
+
+class YoloLayoutTester:
+    """Run RapidLayout (YOLO) layout detection on PDF pages and tally labels.
+
+    Each page is rasterised with PyMuPDF after cutting ``clip_top`` and
+    ``clip_bottom`` off the page, handed to the layout engine, and every
+    detection scoring at least ``confidence_threshold`` is recorded with its
+    box converted from image pixels back to page coordinates.
+    """
+
+    def __init__(
+        self,
+        dpi: int = 200,
+        clip_top: float = 60,
+        clip_bottom: float = 60,
+        confidence_threshold: float = 0.3,
+    ):
+        """Store rendering and filtering settings; the engine is built lazily.
+
+        Args:
+            dpi: Resolution passed to ``page.get_pixmap`` when rendering.
+            clip_top: Amount cut from the top of each page before rendering.
+            clip_bottom: Amount cut from the bottom of each page.
+            confidence_threshold: Detections scoring below this are ignored.
+        """
+        self.dpi = dpi
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+        self.confidence_threshold = confidence_threshold
+        self._engine: Optional[RapidLayout] = None
+
+    def _get_engine(self) -> Optional[RapidLayout]:
+        """Return the shared RapidLayout engine, or ``None`` if unavailable."""
+        if not RAPID_LAYOUT_AVAILABLE:
+            return None
+        if self._engine is None:
+            self._engine = RapidLayout()
+        return self._engine
+
+    def _iter_detections(self, layout_output):
+        """Yield ``(box, label, score)`` for detections that pass the threshold.
+
+        Nothing is yielded when the engine output lacks ``boxes``,
+        ``class_names`` or ``scores``, or when any of them is ``None``.
+        """
+        boxes = getattr(layout_output, "boxes", None)
+        labels = getattr(layout_output, "class_names", None)
+        scores = getattr(layout_output, "scores", None)
+        # Compare against None explicitly: these may be arrays, whose
+        # truthiness is ambiguous.
+        if boxes is None or labels is None or scores is None:
+            return
+        for box, label, score in zip(boxes, labels, scores):
+            if score < self.confidence_threshold:
+                continue
+            yield box, label, score
+
+    def analyze_pdf(
+        self,
+        pdf_path: Path,
+        pages: Optional[List[int]] = None,
+        save_images_dir: Optional[Path] = None,
+    ) -> Dict:
+        """Run layout detection on a PDF and summarise the detected regions.
+
+        Args:
+            pdf_path: PDF file to open.
+            pages: 0-based page numbers to analyse; ``None`` means every page.
+                Numbers at or beyond the page count are skipped.
+            save_images_dir: When given, an annotated JPEG of each analysed
+                page is written into this directory.
+
+        Returns:
+            ``{"error": ...}`` when RapidLayout is unavailable. Otherwise a
+            dict with ``total_pages``, ``analyzed_pages`` (pages actually
+            processed), ``total_regions``, ``label_distribution``,
+            ``table_count``, ``image_count``, ``figure_count`` and
+            ``page_details`` (per page: 1-based ``page``, ``regions``,
+            ``counts``).
+        """
+        if not RAPID_LAYOUT_AVAILABLE:
+            return {"error": "RapidLayout 未安装,请执行: pip install rapid-layout"}
+
+        engine = self._get_engine()
+        if engine is None:
+            return {"error": "RapidLayout 初始化失败"}
+
+        doc = fitz.open(str(pdf_path))
+        try:
+            total_pages = len(doc)
+            target_pages = pages if pages is not None else list(range(total_pages))
+
+            all_labels: List[str] = []
+            page_details: List[Dict] = []
+
+            for page_num in target_pages:
+                if page_num >= total_pages:
+                    continue
+
+                page = doc.load_page(page_num)
+                rect = page.rect
+                # Render only the band between the top and bottom margins.
+                clip_box = fitz.Rect(
+                    0, self.clip_top,
+                    rect.width, rect.height - self.clip_bottom,
+                )
+
+                pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
+                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
+                    pix.height, pix.width, 3,
+                )
+
+                layout_output = engine(img)
+
+                # Page units per rendered pixel; used to map detection boxes
+                # (treated as pixel coordinates of ``img``) back onto the page.
+                scale_x = clip_box.width / img.shape[1]
+                scale_y = clip_box.height / img.shape[0]
+
+                page_regions: List[Dict] = []
+                page_labels: List[str] = []
+
+                for box, label, score in self._iter_detections(layout_output):
+                    pdf_x1 = clip_box.x0 + box[0] * scale_x
+                    pdf_y1 = clip_box.y0 + box[1] * scale_y
+                    pdf_x2 = clip_box.x0 + box[2] * scale_x
+                    pdf_y2 = clip_box.y0 + box[3] * scale_y
+
+                    width = pdf_x2 - pdf_x1
+                    height = pdf_y2 - pdf_y1
+
+                    page_regions.append({
+                        "label": label,
+                        "score": round(float(score), 4),
+                        "bbox": [round(pdf_x1, 1), round(pdf_y1, 1),
+                                  round(pdf_x2, 1), round(pdf_y2, 1)],
+                        "size": [round(width, 1), round(height, 1)],
+                    })
+                    page_labels.append(label)
+
+                all_labels.extend(page_labels)
+                page_details.append({
+                    "page": page_num + 1,
+                    "regions": page_regions,
+                    "counts": dict(Counter(page_labels)),
+                })
+
+                if save_images_dir:
+                    self._save_annotated_image(
+                        img, layout_output, page_num + 1,
+                        scale_x, scale_y, save_images_dir,
+                    )
+
+        finally:
+            doc.close()
+
+        label_counter = Counter(all_labels)
+        return {
+            "total_pages": total_pages,
+            # Count the pages that were really processed; requested pages
+            # beyond the end of the document are skipped above.
+            "analyzed_pages": len(page_details),
+            "total_regions": len(all_labels),
+            "label_distribution": dict(label_counter.most_common()),
+            "table_count": label_counter.get("table", 0),
+            "image_count": label_counter.get("image", 0),
+            "figure_count": label_counter.get("figure", 0),
+            "page_details": page_details,
+        }
+
+    def _save_annotated_image(
+        self,
+        img: np.ndarray,
+        layout_output,
+        page_num: int,
+        scale_x: float,
+        scale_y: float,
+        output_dir: Path,
+    ):
+        """Save the rendered page with a labelled frame around each detection.
+
+        Args:
+            img: Rendered page image the detections were computed on.
+            layout_output: Engine output for ``img``.
+            page_num: 1-based page number, used in the output file name.
+            scale_x: Unused; kept so existing callers keep working.
+            scale_y: Unused; kept so existing callers keep working.
+            output_dir: Directory receiving ``page_NNN_layout.jpg``.
+        """
+        try:
+            from PIL import Image, ImageDraw
+        except ImportError:
+            print("  [跳过] Pillow 未安装,无法保存标注图片")
+            return
+
+        pil_img = Image.fromarray(img)
+        draw = ImageDraw.Draw(pil_img)
+
+        label_colors = {
+            "table": (0, 255, 0),           # green
+            "figure": (255, 80, 80),        # red - key label: non-standard tables may land here
+            "figure_caption": (255, 165, 0),# orange
+            "table_caption": (200, 200, 0), # yellow-green
+            "text": (0, 0, 255),            # blue
+            "title": (255, 255, 0),         # yellow
+            "header": (128, 0, 128),        # purple
+            "footer": (128, 128, 0),        # olive
+            "reference": (0, 128, 128),
+            "equation": (0, 200, 200),
+        }
+        default_color = (200, 200, 200)
+
+        for box, label, score in self._iter_detections(layout_output):
+            # The boxes are already in pixels of ``img`` (analyze_pdf multiplies
+            # them by the scale to reach page units), so draw them unchanged.
+            # Dividing by the page-units-per-pixel scale would push every
+            # frame away from the region it is supposed to mark.
+            x1_img, y1_img, x2_img, y2_img = (float(v) for v in box[:4])
+
+            color = label_colors.get(label, default_color)
+            draw.rectangle([x1_img, y1_img, x2_img, y2_img], outline=color, width=2)
+            draw.text(
+                (x1_img + 2, y1_img + 2),
+                f"{label} ({score:.2f})",
+                fill=color,
+            )
+
+        output_path = output_dir / f"page_{page_num:03d}_layout.jpg"
+        pil_img.save(str(output_path), quality=85)
+        print(f"  [保存] {output_path}")
+
+
+def print_report(result: Dict):
+    """Print the single-file layout detection report to stdout.
+
+    When ``result`` contains an ``"error"`` key only that message is printed.
+    Otherwise the report shows the page and region totals, the label
+    distribution with a text bar per label, the table/image/figure counts,
+    and the regions of every page that has at least one.
+
+    Args:
+        result: Mapping with the keys read below (``total_pages``,
+            ``analyzed_pages``, ``total_regions``, ``label_distribution``,
+            ``table_count``, ``image_count``, ``figure_count``,
+            ``page_details``), or a mapping holding ``error``.
+    """
+    if "error" in result:
+        print(f"[错误] {result['error']}")
+        return
+
+    banner = "=" * 70
+    divider = "-" * 50
+
+    print()
+    print(banner)
+    print("YOLO 版面检测报告")
+    print(banner)
+    print(f"总页数: {result['total_pages']}")
+    print(f"分析页数: {result['analyzed_pages']}")
+    print(f"检测区域总数: {result['total_regions']}")
+    print()
+
+    # Label distribution: one line per label, bar is one block per 2 %.
+    print("标签分布:")
+    print(divider)
+    denominator = max(result["total_regions"], 1)
+    for label, count in result["label_distribution"].items():
+        pct = count / denominator * 100
+        blocks = "█" * int(pct / 2)
+        print(f"  {label:15s}: {count:4d} ({pct:5.1f}%) {blocks}")
+    print()
+
+    # Headline counts.
+    print("关键指标:")
+    print(f"  table  : {result['table_count']}")
+    print(f"  image  : {result['image_count']}")
+    print(f"  figure : {result['figure_count']}")
+    print()
+
+    # Per-page listing; pages without regions are omitted.
+    print("逐页详情:")
+    print(divider)
+    for page_info in result["page_details"]:
+        page_num = page_info["page"]
+        regions = page_info["regions"]
+        if regions:
+            print(f"\n  --- 第 {page_num} 页 ({len(regions)} 个区域) ---")
+            for region in regions:
+                size = region["size"]
+                size_str = f"{size[0]}x{size[1]}"
+                head = f"    [{region['label']:12s}] score={region['score']:.3f}  "
+                bbox = region["bbox"]
+                middle = f"bbox=({bbox[0]:.0f},{bbox[1]:.0f},{bbox[2]:.0f},{bbox[3]:.0f})  "
+                print(head + middle + f"size={size_str}")
+    print()
+
+
+def _print_batch_failures(errors: List[Dict]):
+    """Print the list of files whose analysis failed, with the reason."""
+    print()
+    print(f"失败文件 ({len(errors)}):")
+    for e in errors:
+        print(f"  - {e.get('file_name', '?')}: {e['error']}")
+
+
+def print_batch_report(batch_results: List[Dict]):
+    """Print aggregate statistics for a batch of analysed PDF files.
+
+    Entries containing an ``"error"`` key are treated as failures; all other
+    entries must carry ``file_name``, ``total_pages``, ``total_regions`` and
+    ``label_distribution``. The report shows the global label distribution,
+    a per-file summary table, per-page averages, and finally the failed
+    files. Failed files are listed even when no file succeeded.
+
+    Args:
+        batch_results: One result dict per processed file.
+    """
+    valid = [r for r in batch_results if "error" not in r]
+    errors = [r for r in batch_results if "error" in r]
+
+    if not valid:
+        print("[错误] 没有成功分析任何 PDF 文件")
+        # Still show why each file failed instead of dropping that information.
+        if errors:
+            _print_batch_failures(errors)
+            print()
+        return
+
+    print()
+    print("=" * 80)
+    print("YOLO 版面检测 — 批统计报告")
+    print("=" * 80)
+    print(f"分析文件数: {len(batch_results)} (成功 {len(valid)}, 失败 {len(errors)})")
+
+    # Accumulate label counts over all files and build the per-file rows.
+    all_labels: Counter = Counter()
+    file_summaries: List[Dict] = []
+
+    for r in valid:
+        file_labels = r["label_distribution"]
+        all_labels.update(file_labels)
+        total = r["total_regions"]
+        table_count = file_labels.get("table", 0)
+        figure_count = file_labels.get("figure", 0)
+        file_summaries.append({
+            "file": r["file_name"],
+            "pages": r["total_pages"],
+            "regions": total,
+            "table_pct": table_count / max(total, 1) * 100,
+            "figure_pct": figure_count / max(total, 1) * 100,
+            "table_count": table_count,
+            "figure_count": figure_count,
+        })
+
+    total_regions = sum(s["regions"] for s in file_summaries)
+    total_pages = sum(s["pages"] for s in file_summaries)
+
+    print(f"总页数: {total_pages}")
+    print(f"总区域数: {total_regions}")
+    print()
+
+    # Global label distribution.
+    print("全局标签分布:")
+    print("-" * 55)
+    for label, count in all_labels.most_common():
+        pct = count / max(total_regions, 1) * 100
+        # One block per 2 %, the same scale print_report uses, so a dominant
+        # label does not produce a 100-character bar.
+        bar = "█" * int(pct / 2)
+        print(f"  {label:15s}: {count:5d} ({pct:5.1f}%) {bar}")
+    print()
+
+    # Per-file summary table.
+    print("逐文件摘要:")
+    print("-" * 80)
+    print(f"  {'文件':40s} {'页':>4s} {'区域':>5s} {'table%':>7s} {'figure%':>7s} {'table':>6s} {'figure':>6s}")
+    print("  " + "-" * 76)
+    for s in file_summaries:
+        name = s["file"][:38] + ".." if len(s["file"]) > 40 else s["file"]
+        # Count columns are 6 wide to line up with the 6-wide header cells.
+        print(f"  {name:40s} {s['pages']:4d} {s['regions']:5d} "
+              f"{s['table_pct']:6.1f}% {s['figure_pct']:6.1f}% "
+              f"{s['table_count']:6d} {s['figure_count']:6d}")
+
+    # Averages: percentages are averaged per file, counts per page.
+    avg_table_pct = sum(s["table_pct"] for s in file_summaries) / len(file_summaries)
+    avg_figure_pct = sum(s["figure_pct"] for s in file_summaries) / len(file_summaries)
+    avg_regions_per_page = total_regions / max(total_pages, 1)
+    avg_table_per_page = sum(s["table_count"] for s in file_summaries) / max(total_pages, 1)
+    avg_figure_per_page = sum(s["figure_count"] for s in file_summaries) / max(total_pages, 1)
+
+    print()
+    print("平均统计 (按页):")
+    print("-" * 40)
+    print(f"  平均区域/页:    {avg_regions_per_page:.1f}")
+    print(f"  平均 table/页:  {avg_table_per_page:.2f}")
+    print(f"  平均 figure/页: {avg_figure_per_page:.2f}")
+    print(f"  平均 table 占比: {avg_table_pct:.1f}%")
+    print(f"  平均 figure 占比:{avg_figure_pct:.1f}%")
+    print(f"  table+figure/页: {avg_table_per_page + avg_figure_per_page:.2f}")
+
+    if errors:
+        _print_batch_failures(errors)
+    print()
+
+
+def main():
+    """Command-line entry point for single-file or batch layout analysis.
+
+    ``-p`` analyses one PDF and prints a report (or JSON); ``-d`` analyses
+    every ``*.pdf`` directly inside a directory and prints batch statistics.
+    When both are given, ``-p`` takes precedence.
+
+    Returns:
+        Process exit code: ``0`` on success; ``1`` for invalid arguments, a
+        missing input, or a single-file analysis that produced an error.
+    """
+    parser = argparse.ArgumentParser(description="YOLO 版面检测模型测试")
+    parser.add_argument("-p", "--pdf", default=None, help="单个 PDF 文件路径")
+    parser.add_argument("-d", "--dir", default=None, help="批量: 扫描目录下所有 PDF 文件")
+    parser.add_argument("--pages", default=None, help="分析指定页码, 逗号分隔, 如 0,1,2 (0-based)")
+    parser.add_argument("--save-images", action="store_true", help="保存标注图片 (批模式不生效)")
+    parser.add_argument("--output-dir", default=None, help="输出目录 (默认 PDF 所在目录下的 yolo_layout_output)")
+    parser.add_argument("--dpi", type=int, default=200, help="渲染 DPI (默认 200)")
+    parser.add_argument("--confidence", type=float, default=0.3, help="置信度阈值 (默认 0.3)")
+    parser.add_argument("--clip-top", type=float, default=60, help="顶部裁剪 (默认 60)")
+    parser.add_argument("--clip-bottom", type=float, default=60, help="底部裁剪 (默认 60)")
+    parser.add_argument("--json", action="store_true", help="输出 JSON 格式 (批模式输出每个文件的关键统计)")
+    args = parser.parse_args()
+
+    if not args.pdf and not args.dir:
+        print("[错误] 请指定 -p <pdf文件> 或 -d <pdf目录>")
+        return 1
+
+    pages = None
+    if args.pages:
+        try:
+            # Tolerate stray separators such as "0,1," by dropping empty items.
+            pages = [int(p) for p in (item.strip() for item in args.pages.split(",")) if p]
+        except ValueError:
+            print(f"[错误] --pages 参数无效: {args.pages}")
+            return 1
+
+    tester = YoloLayoutTester(
+        dpi=args.dpi,
+        clip_top=args.clip_top,
+        clip_bottom=args.clip_bottom,
+        confidence_threshold=args.confidence,
+    )
+
+    # ---- single-file mode ----
+    if args.pdf:
+        pdf_path = Path(args.pdf)
+        if not pdf_path.exists():
+            print(f"[错误] PDF 不存在: {pdf_path}")
+            return 1
+
+        output_dir = None
+        if args.save_images:
+            output_dir = Path(args.output_dir) if args.output_dir else pdf_path.parent / "yolo_layout_output"
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+        print(f"[分析] {pdf_path}")
+        result = tester.analyze_pdf(pdf_path, pages=pages, save_images_dir=output_dir)
+        result["file_name"] = pdf_path.name
+
+        if args.json:
+            print(json.dumps(result, ensure_ascii=False, indent=2))
+        else:
+            print_report(result)
+        # An error result (e.g. RapidLayout missing) must not look like success.
+        return 1 if "error" in result else 0
+
+    # ---- batch mode ----
+    dir_path = Path(args.dir)
+    if not dir_path.is_dir():
+        print(f"[错误] 目录不存在: {dir_path}")
+        return 1
+
+    pdf_files = sorted(dir_path.glob("*.pdf"))
+    if not pdf_files:
+        print(f"[错误] 目录下无 PDF 文件: {dir_path}")
+        return 1
+
+    print(f"[批分析] 找到 {len(pdf_files)} 个 PDF 文件")
+    print(f"[批分析] 目录: {dir_path}")
+    print()
+
+    batch_results: List[Dict] = []
+    for idx, pdf_path in enumerate(pdf_files, 1):
+        print(f"[{idx}/{len(pdf_files)}] {pdf_path.name} ...", end=" ", flush=True)
+        try:
+            result = tester.analyze_pdf(pdf_path, pages=pages)
+        except Exception as e:
+            # One unreadable PDF must not abort the remaining files.
+            print(f"失败: {e}")
+            batch_results.append({"file_name": pdf_path.name, "error": str(e)})
+            continue
+
+        result["file_name"] = pdf_path.name
+        batch_results.append(result)
+        if "error" in result:
+            # analyze_pdf reports some failures as a dict without statistics;
+            # reading the statistic keys would raise KeyError and record the
+            # file a second time with a misleading message.
+            print(f"失败: {result['error']}")
+            continue
+
+        table_total = result.get("table_count", 0)
+        figure_total = result.get("figure_count", 0)
+        print(f"OK ({result['total_pages']}页, {result['total_regions']}区域, "
+              f"table={table_total}, figure={figure_total})")
+
+    if args.json:
+        summary = []
+        for r in batch_results:
+            if "error" in r:
+                summary.append(r)
+            else:
+                summary.append({
+                    "file": r["file_name"],
+                    "pages": r["total_pages"],
+                    "regions": r["total_regions"],
+                    "label_distribution": r["label_distribution"],
+                    "table_count": r.get("table_count", 0),
+                    "figure_count": r.get("figure_count", 0),
+                })
+        print(json.dumps(summary, ensure_ascii=False, indent=2))
+    else:
+        print_batch_report(batch_results)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

Plik diff jest za duży
+ 0 - 0
utils_test/standard_new_Test/施工方案审查流程图.svg


Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików