Просмотр исходного кода

fix(正则提取目录和内容)

tangle 12 часов назад
Родитель
Commit
32c8eb873b

+ 393 - 29
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 """
 PDF 结构提取器。
+不依赖ocr的目录提取,使用基于规则的正文结构切分。
 
 """
 
@@ -10,16 +11,9 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple
 
 import fitz
+from foundation.observability.logger.loggering import review_logger as logger
 
-try:
-    from .ocr_processor import OcrProcessor, OcrResult, TableRegion
-except ImportError:  # pragma: no cover - direct script-style imports
-    try:
-        from ocr_processor import OcrProcessor, OcrResult, TableRegion  # type: ignore
-    except ImportError:  # pragma: no cover - OCR dependencies are optional
-        OcrProcessor = None  # type: ignore
-        OcrResult = Any  # type: ignore
-        TableRegion = Any  # type: ignore
+from .ocr_processor import OcrProcessor, OcrResult, TableRegion
 
 
 SECTION_TITLE_KEY = "章节标题"
@@ -44,7 +38,10 @@ class PdfStructureExtractor:
 
     RULE_LIB = {
         "Rule_1_纯数字派": {
-            "l1": re.compile(r"^\d{1,2}(?:[\..。])?\s+(?!\d)[\u4e00-\u9fa5A-Za-z].*"),
+            "l1": re.compile(
+                r"^\d{1,2}(?:[\..。])?\s+"
+                r"(?:(?!\d)[\u4e00-\u9fa5A-Za-z].*|[、,,]\s*[\u4e00-\u9fa5A-Za-z0-9].*)"
+            ),
             "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
         },
         "Rule_2_混合章派": {
@@ -103,7 +100,7 @@ class PdfStructureExtractor:
         ocr_timeout: int = 600,
         ocr_api_key: str = "",
         detect_toc: bool = True,
-        toc_model_path: str = "",
+        toc_model_path: str = "config/yolo/best.pt",
     ):
         """初始化提取参数,并在依赖可用时启用 OCR。"""
 
@@ -120,11 +117,12 @@ class PdfStructureExtractor:
                 ocr_api_key=ocr_api_key,
             )
             self.use_ocr = self.ocr_processor.is_available()
-        self.detect_toc = False
+        self.detect_toc = detect_toc
         self.ocr_api_url = ocr_api_url
         self.ocr_timeout = ocr_timeout
         self.ocr_api_key = ocr_api_key
         self.toc_model_path = toc_model_path
+        self._toc_extractor = None
 
     def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
         """提取章节、正文派生目录、规则诊断信息,以及可选的表格 OCR 内容。"""
@@ -135,7 +133,7 @@ class PdfStructureExtractor:
             "catalog": None,
             "body_catalog": None,
             "ocr_catalog": None,
-            "catalog_mode": "testc_body_only",
+            "catalog_mode": "none",
             "body_rule": None,
             "body_coverage": 0.0,
             "rule_performance": {},
@@ -145,6 +143,13 @@ class PdfStructureExtractor:
             "ocr_inserted_count": 0,
         }
 
+        ocr_catalog: Optional[Dict[str, Any]] = None
+        # if self.detect_toc:
+        #     try:
+        #         ocr_catalog = self._extract_catalog(file_content, progress_callback)
+        #     except Exception as exc:
+        #         logger.warning(f"[PDF提取] OCR目录提取失败: {exc}")
+
         doc = fitz.open(stream=file_content, filetype="pdf")
         try:
             # 正文切分仍由 PyMuPDF 文本和标题规则驱动,OCR 只在切分后作为小节内容补充。
@@ -157,14 +162,21 @@ class PdfStructureExtractor:
 
             result["chapters"] = chapters
             result["total_pages"] = len(doc)
-            result["catalog"] = body_catalog
             result["body_catalog"] = body_catalog
+            #result["ocr_catalog"] = ocr_catalog
+            result["catalog"] = body_catalog or ocr_catalog
             result["body_rule"] = winning_rule
             result["body_coverage"] = coverage_rate
             result["rule_performance"] = rule_performance
             result["ocr_table_count"] = ocr_stats["table_count"]
             result["ocr_success_count"] = ocr_stats["success_count"]
             result["ocr_inserted_count"] = ocr_stats["inserted_count"]
+            if body_catalog and ocr_catalog:
+                result["catalog_mode"] = "body_and_ocr"
+            elif body_catalog:
+                result["catalog_mode"] = "body_only"
+            elif ocr_catalog:
+                result["catalog_mode"] = "ocr_only"
             # 记录 OCR 是否实际影响输出,方便批处理统计时判断 OCR 状态。
             # disabled:默认值,表示本次没有请求 OCR。
             # unavailable:请求了 OCR,但依赖不可用,例如 rapid_layout 未安装或检测器不可用。
@@ -183,6 +195,31 @@ class PdfStructureExtractor:
         finally:
             doc.close()
 
+    def _extract_catalog(self, file_content: bytes, progress_callback=None) -> Optional[Dict[str, Any]]:
+        """
+        Extract the table-of-contents structure (YOLO detection + OCR recognition).
+
+        Returns:
+            {"chapters": [...], "total_chapters": N} or None when no TOC is found.
+        """
+        # Imported lazily so the YOLO/OCR stack is only required when TOC
+        # detection is actually used.
+        from .toc_detector import TOCCatalogExtractor
+
+        # Cache the extractor on the instance so repeated extract() calls reuse
+        # the already-initialized detector instead of rebuilding it.
+        if self._toc_extractor is None:
+            self._toc_extractor = TOCCatalogExtractor(
+                model_path=self.toc_model_path,
+                ocr_api_url=self.ocr_api_url,
+                ocr_api_key=self.ocr_api_key,
+                ocr_timeout=self.ocr_timeout,
+            )
+
+        catalog = self._toc_extractor.detect_and_extract(file_content, progress_callback)
+        if not catalog:
+            return None
+
+        # Copy before mutating so the extractor's own result dict stays untouched,
+        # then tag the catalog with its origin for downstream diagnostics.
+        normalized_catalog = dict(catalog)
+        normalized_catalog.setdefault("source", "ocr_toc")
+        return normalized_catalog
+
     def _extract_table_ocr_results(self, doc: fitz.Document, progress_callback=None) -> List[OcrResult]:
         """在 OCR 启用时检测 PDF 表格区域,并发执行表格识别。"""
 
@@ -338,6 +375,16 @@ class PdfStructureExtractor:
                     continue
                 page_lines.append(stripped)
 
+            recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
+            if clipped_fragment_keys:
+                page_lines = [
+                    line
+                    for line in page_lines
+                    if self._normalize_repeated_line_key(line) not in clipped_fragment_keys
+                ]
+            if recovered_headings:
+                page_lines = recovered_headings + page_lines
+
             page_lines_by_page.append((page_index + 1, page_lines))
 
             if progress_callback and (page_index + 1 == total_pages or (page_index + 1) % 10 == 0):
@@ -360,6 +407,127 @@ class PdfStructureExtractor:
                 body_lines.append(BodyLine(page=page, text=line))
         return body_lines
 
+    def _recover_top_clipped_l1_headings(
+        self,
+        page: fitz.Page,
+        page_lines: List[str],
+    ) -> Tuple[List[str], set[str]]:
+        """Recover level-1 headings that were cut in half by the top clip line.
+
+        Returns:
+            A tuple ``(recovered_headings, fragment_keys)`` where
+            ``recovered_headings`` are full heading lines to prepend to the page
+            and ``fragment_keys`` are normalized keys of clipped leftovers that
+            should be removed from ``page_lines``.
+        """
+
+        try:
+            page_dict = page.get_text("dict")
+        except Exception:
+            # Best effort: if PyMuPDF cannot produce the dict layout, skip recovery.
+            return [], set()
+
+        recovered_headings: List[str] = []
+        fragment_keys: set[str] = set()
+        existing_keys = {self._normalize_repeated_line_key(line) for line in page_lines}
+        # Only look in a narrow band just below the clip line (clip_top + 40pt).
+        top_band_limit = min(page.rect.height, self.clip_top + 40)
+        # Process text blocks top-to-bottom by their bbox y0.
+        sorted_blocks = sorted(
+            (block for block in page_dict.get("blocks", []) if block.get("type") == 0),
+            key=lambda item: item.get("bbox", [0, 0, 0, 0])[1],
+        )
+
+        for block in sorted_blocks:
+            bbox = block.get("bbox") or ()
+            if len(bbox) != 4:
+                continue
+
+            x0, y0, x1, y1 = bbox
+            # The block must straddle the clip line, start at most 35pt above it,
+            # and end within the top band — i.e. a heading sliced by clip_top.
+            if not (y0 < self.clip_top < y1):
+                continue
+            if y0 < max(0.0, self.clip_top - 35):
+                continue
+            if y1 > top_band_limit:
+                continue
+
+            full_text = self._extract_text_block_text(block)
+            if not full_text:
+                continue
+
+            # Find the first line in the unclipped block that is a valid L1 heading.
+            full_lines = [line.strip() for line in self._prepare_page_lines(full_text) if line.strip()]
+            full_heading = next(
+                (
+                    line
+                    for line in full_lines
+                    if self._matches_any_l1_heading(line) and self._is_valid_heading_strict(line, is_l1=True)
+                ),
+                None,
+            )
+            if not full_heading:
+                continue
+
+            full_key = self._normalize_repeated_line_key(full_heading)
+            if full_key in existing_keys:
+                continue
+
+            # Re-read only the part of the block below the clip line; if the
+            # heading survives there, or the leftovers do not look like fragments
+            # of the full heading, nothing was actually lost.
+            clipped_rect = fitz.Rect(x0, self.clip_top, x1, min(y1, page.rect.height))
+            clipped_text = page.get_text("text", clip=clipped_rect)
+            clipped_lines = [line.strip() for line in self._prepare_page_lines(clipped_text) if line.strip()]
+            if any(self._matches_any_l1_heading(line) for line in clipped_lines):
+                continue
+            if not self._looks_like_clipped_heading_loss(full_heading, clipped_lines):
+                continue
+
+            recovered_headings.append(full_heading)
+            existing_keys.add(full_key)
+            # Mark the clipped leftovers for removal so the heading is not duplicated.
+            fragment_keys.update(
+                self._normalize_repeated_line_key(line)
+                for line in clipped_lines
+                if line and self._normalize_repeated_line_key(line) != full_key
+            )
+
+        return recovered_headings, fragment_keys
+
+    @classmethod
+    def _extract_text_block_text(cls, block: Dict[str, Any]) -> str:
+        """Reassemble a text block from a PyMuPDF ``dict`` block, line by line.
+
+        Joins each line's span texts, drops empty lines, and returns the lines
+        joined with newlines.
+        """
+
+        block_lines: List[str] = []
+        for line in block.get("lines", []) or []:
+            spans = line.get("spans", []) or []
+            line_text = "".join(str(span.get("text", "") or "") for span in spans).strip()
+            if line_text:
+                block_lines.append(line_text)
+        return "\n".join(block_lines)
+
+    @classmethod
+    def _matches_any_l1_heading(cls, line: str) -> bool:
+        """Return True if the line matches any rule's level-1 heading pattern.
+
+        Leading page numbers are stripped first, and TOC lines never match.
+        """
+
+        clean_line = cls._strip_leading_page_number_from_heading(str(line or "").strip())
+        if not clean_line or cls._is_toc_line(clean_line):
+            return False
+        return any(rule["l1"].match(clean_line) for rule in cls.RULE_LIB.values())
+
+    @classmethod
+    def _looks_like_clipped_heading_loss(cls, full_heading: str, clipped_lines: List[str]) -> bool:
+        """Decide whether the clipped text is merely fragments of the full L1 heading.
+
+        Compares normalized keys: the clipped leftovers must be empty, or a small
+        number (<= 3) of fragments that are all substrings of the full heading's key.
+        """
+
+        full_key = cls._normalize_repeated_line_key(full_heading)
+        if not full_key:
+            return False
+
+        clipped_keys: List[str] = []
+        for line in clipped_lines:
+            key = cls._normalize_repeated_line_key(line)
+            if key:
+                clipped_keys.append(key)
+
+        # Nothing survived the clip: the heading was clearly lost.
+        if not clipped_keys:
+            return True
+        # Too many leftover lines: this is real content, not heading fragments.
+        if len(clipped_keys) > 3:
+            return False
+        # The full heading itself survived, so nothing needs recovering.
+        if any(key == full_key for key in clipped_keys):
+            return False
+
+        combined_key = "".join(clipped_keys)
+        if combined_key == full_key:
+            return True
+        if combined_key and combined_key in full_key:
+            return True
+        # Fall back: every fragment individually appears inside the full heading.
+        return all(key in full_key for key in clipped_keys)
+
     def _extract_body_with_best_rule(
         self,
         body_lines: List[BodyLine],
@@ -367,10 +535,12 @@ class PdfStructureExtractor:
         """运行所有候选标题规则,并返回评分最高的正文结构。"""
 
         total_raw_chars = sum(len(item.text.strip()) for item in body_lines if item.text.strip())
+        preferred_cn_l2_style = self._detect_document_cn_order_l2_style(body_lines)
         best_score = -9999
         best_rule_name: Optional[str] = None
         best_data: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
         best_coverage = 0.0
+        best_rule_style_preference = 0
         rule_performance: Dict[str, Any] = {}
 
         for rule_name, rule_set in self.RULE_LIB.items():
@@ -381,24 +551,49 @@ class PdfStructureExtractor:
                 len([key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY])
                 for sections in data.values()
             )
+            rule_guard_reason: Optional[str] = None
             if (
                 rule_name == CN_LIST_L1_NUMERIC_L2_RULE
-                and not self._is_viable_cn_list_l1_numeric_l2_structure(data, l1_count, l2_count)
             ):
-                score -= 1500
+                is_viable, rule_guard_reason = self._inspect_cn_list_l1_numeric_l2_structure(
+                    body_lines,
+                    data,
+                    l1_count,
+                    l2_count,
+                )
+                if not is_viable:
+                    score -= 1500
+            rule_style_preference = self._score_rule_cn_l2_style_preference(rule_name, preferred_cn_l2_style)
             rule_performance[rule_name] = {
                 "score": score,
                 "coverage_rate": f"{coverage_rate * 100:.1f}%",
                 "l1_count": l1_count,
                 "l2_count": l2_count,
             }
+            if rule_guard_reason:
+                rule_performance[rule_name]["guard_reason"] = rule_guard_reason
+            if rule_style_preference > 0:
+                rule_performance[rule_name]["style_preference"] = rule_style_preference
 
             # 规则选择以综合得分为主,覆盖率保留用于兜底过滤和诊断输出。
-            if score > best_score:
+            if (
+                score > best_score
+                or (
+                    score == best_score
+                    and rule_style_preference > best_rule_style_preference
+                    and abs(coverage_rate - best_coverage) <= 0.03
+                )
+                or (
+                    score == best_score
+                    and rule_style_preference == best_rule_style_preference
+                    and coverage_rate > best_coverage
+                )
+            ):
                 best_score = score
                 best_rule_name = rule_name
                 best_data = data
                 best_coverage = coverage_rate
+                best_rule_style_preference = rule_style_preference
 
         if best_score <= 0 or best_coverage < 0.15:
             return {}, best_rule_name, best_coverage, rule_performance
@@ -420,6 +615,8 @@ class PdfStructureExtractor:
         pending_prefix: Optional[str] = None
         pending_page: Optional[int] = None
         last_l2_sub_num = 0
+        chapter_l2_style_hint: Optional[str] = None
+        chapter_line_offset = 0
 
         backup_l1: Optional[str] = None
         backup_l1_num = 0
@@ -497,6 +694,8 @@ class PdfStructureExtractor:
                                 current_l1_num = l1_candidate_num
                                 current_l2 = None
                                 last_l2_sub_num = 0
+                                chapter_l2_style_hint = None
+                                chapter_line_offset = 0
                             continue
 
                     backup_l1 = current_l1
@@ -509,8 +708,20 @@ class PdfStructureExtractor:
                     structured_data.setdefault(current_l1, {"_chapter_page": page})  # type: ignore[assignment]
                     current_l2 = None
                     last_l2_sub_num = 0
+                    chapter_l2_style_hint = None
+                    chapter_line_offset = 0
                     continue
 
+            if current_l1 and not has_toc:
+                chapter_line_offset += 1
+                if (
+                    chapter_l2_style_hint is None
+                    and chapter_line_offset <= 30
+                    and rule_name in {"Rule_4_传统公文派", "Rule_5_单边括号派"}
+                    and self._is_valid_heading_strict(line, is_l1=False)
+                ):
+                    chapter_l2_style_hint = self._detect_cn_order_l2_style(line)
+
             match_l2 = rule_set["l2"].match(line)
             if current_l1 and match_l2 and not has_toc:
                 if self._is_valid_heading_strict(line, is_l1=False):
@@ -550,14 +761,22 @@ class PdfStructureExtractor:
                             self._ensure_section_node(structured_data, current_l1, current_l2, page)
                             continue
                     else:
-                        l2_sub_num = self._extract_non_numeric_l2_number(match_l2.group(1))
-                        if l2_sub_num <= last_l2_sub_num:
+                        candidate_l2_style = self._detect_cn_order_l2_style(line)
+                        if (
+                            chapter_l2_style_hint is not None
+                            and candidate_l2_style is not None
+                            and candidate_l2_style != chapter_l2_style_hint
+                        ):
                             pass
                         else:
-                            current_l2 = self._clean_section_title(line)
-                            last_l2_sub_num = l2_sub_num
-                            self._ensure_section_node(structured_data, current_l1, current_l2, page)
-                            continue
+                            l2_sub_num = self._extract_non_numeric_l2_number(match_l2.group(1))
+                            if l2_sub_num <= last_l2_sub_num:
+                                pass
+                            else:
+                                current_l2 = self._clean_section_title(line)
+                                last_l2_sub_num = l2_sub_num
+                                self._ensure_section_node(structured_data, current_l1, current_l2, page)
+                                continue
 
             if current_l1 and not has_toc:
                 target_key = current_l2 or SECTION_TITLE_KEY
@@ -613,23 +832,166 @@ class PdfStructureExtractor:
 
         return False
 
-    @staticmethod
-    def _is_viable_cn_list_l1_numeric_l2_structure(
+    def _inspect_cn_list_l1_numeric_l2_structure(
+        self,
+        body_lines: List[BodyLine],
         raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
         l1_count: int,
         l2_count: int,
-    ) -> bool:
-        """限制新规则只在真正形成“中文章 + 数字小节”结构时参与竞争。"""
+    ) -> Tuple[bool, Optional[str]]:
+        """限制 Rule_8 只在真正缺少显式章节结构时作为兜底参与竞争。"""
 
         if l1_count < 2 or l2_count < 3:
-            return False
+            return False, "insufficient_structure"
+
+        if self._has_stable_explicit_chapter_headings(body_lines):
+            return False, "explicit_chapter_structure"
+
+        if self._has_excessive_cn_list_l1_resets(raw_data):
+            return False, "cn_list_l1_resets"
 
         chapters_with_l2 = sum(
             1
             for sections in raw_data.values()
             if any(key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY)
         )
-        return chapters_with_l2 >= max(2, (l1_count + 1) // 2)
+        if chapters_with_l2 < max(2, (l1_count + 1) // 2):
+            return False, "too_few_chapters_with_l2"
+
+        return True, None
+
+    @classmethod
+    def _has_stable_explicit_chapter_headings(cls, body_lines: List[BodyLine]) -> bool:
+        """Return True when the early body already has a stable explicit
+        "第X章"-style chapter structure (at least two distinct chapter numbers
+        among the first four detected).
+        """
+
+        chapter_numbers: List[int] = []
+
+        for item in body_lines:
+            line = cls._strip_leading_page_number_from_heading(item.text.strip())
+            if not line or cls._is_toc_line(line):
+                continue
+
+            # NOTE(review): the character class [章节部部分篇] duplicates 部 and
+            # splits 部分 into single characters, so a lone 分 also matches —
+            # confirm whether "第X分" should really count as a chapter marker.
+            chapter_match = re.match(
+                r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部部分篇]",
+                line,
+            )
+            if not chapter_match:
+                continue
+
+            # Accept both Arabic and Chinese numerals for the chapter number.
+            token = chapter_match.group(1)
+            chapter_num = int(token) if token.isdigit() else cls._cn_to_int(token)
+            if chapter_num <= 0:
+                continue
+            # Skip immediate repeats (e.g. running headers duplicating a chapter line).
+            if chapter_numbers and chapter_numbers[-1] == chapter_num:
+                continue
+
+            chapter_numbers.append(chapter_num)
+            # Four samples are enough to judge stability; stop scanning early.
+            if len(chapter_numbers) >= 4:
+                break
+
+        return len(set(chapter_numbers)) >= 2
+
+    @classmethod
+    def _has_excessive_cn_list_l1_resets(
+        cls,
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+    ) -> bool:
+        """Detect obvious backward jumps in Rule_8's level-1 Chinese ordinals,
+        so in-chapter headings are not promoted to top level.
+
+        Returns True when the L1 ordinal sequence resets severely (e.g. drops
+        from >=3 back to <=2) at least once, or jumps backward twice.
+        """
+
+        # Collect the Chinese ordinal at the start of each chapter title, in order.
+        l1_sequence: List[int] = []
+        for chapter_title in raw_data.keys():
+            match = re.match(r"^([一二三四五六七八九十百零两]+)[、))\]]", str(chapter_title or "").strip())
+            if not match:
+                continue
+            chapter_num = cls._cn_to_int(match.group(1))
+            if chapter_num > 0:
+                l1_sequence.append(chapter_num)
+
+        # Too few ordinals to judge: assume the structure is fine.
+        if len(l1_sequence) < 3:
+            return False
+
+        backward_jumps = 0
+        severe_resets = 0
+        for prev_num, curr_num in zip(l1_sequence, l1_sequence[1:]):
+            if curr_num < prev_num:
+                backward_jumps += 1
+                # A drop from >=3 back to <=2 looks like a fresh list restarting.
+                if prev_num >= 3 and curr_num <= 2:
+                    severe_resets += 1
+
+        return severe_resets >= 1 or backward_jumps >= 2
+
+    @classmethod
+    def _detect_cn_order_l2_style(cls, line: str) -> Optional[str]:
+        """Classify the style of a Chinese-ordinal section heading.
+
+        Returns "bracket" for the "一)" form, "plain" for the "一、" / "一 " form,
+        or None when the line is not a Chinese-ordinal heading.
+        """
+
+        # Normalize: drop any trailing TOC page suffix and collapse whitespace.
+        cleaned = cls._strip_catalog_page_suffix(line)
+        cleaned = re.sub(r"\s+", " ", str(cleaned or "").strip())
+        if not cleaned:
+            return None
+
+        bracket_match = re.match(
+            r"^[一二三四五六七八九十百零两]+[))\]]\s*[\u4e00-\u9fa5A-Za-z].*",
+            cleaned,
+        )
+        if bracket_match:
+            return "bracket"
+
+        plain_match = re.match(
+            r"^[一二三四五六七八九十百零两]+(?:、|\s+)\s*[\u4e00-\u9fa5A-Za-z].*",
+            cleaned,
+        )
+        if plain_match:
+            return "plain"
+
+        return None
+
+    def _detect_document_cn_order_l2_style(self, body_lines: List[BodyLine]) -> Optional[str]:
+        """Scan early lines after each chapter heading to determine the
+        document-wide L2 heading style, used as a stable tie-breaker between
+        Rule_4 and Rule_5.
+
+        Returns "plain", "bracket", or None when the counts are tied.
+        """
+
+        plain_count = 0
+        bracket_count = 0
+        # -1 means "not currently inside the first 30 lines after a chapter heading".
+        lines_since_chapter = -1
+
+        for item in body_lines:
+            line = self._strip_leading_page_number_from_heading(item.text.strip())
+            if not line or self._is_toc_line(line):
+                continue
+
+            # NOTE(review): this chapter pattern uses [章部部分篇] without 节,
+            # unlike the similar check in _has_stable_explicit_chapter_headings —
+            # confirm whether the omission is intentional.
+            if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]", line):
+                lines_since_chapter = 0
+                continue
+
+            if lines_since_chapter < 0:
+                continue
+
+            # Only sample within 30 lines of the chapter start, then give up.
+            lines_since_chapter += 1
+            if lines_since_chapter > 30:
+                lines_since_chapter = -1
+                continue
+
+            style = self._detect_cn_order_l2_style(line)
+            if style is None or not self._is_valid_heading_strict(line, is_l1=False):
+                continue
+
+            if style == "plain":
+                plain_count += 1
+            elif style == "bracket":
+                bracket_count += 1
+            # One sample per chapter: stop scanning until the next chapter heading.
+            lines_since_chapter = -1
+
+        if plain_count == bracket_count:
+            return None
+        return "plain" if plain_count > bracket_count else "bracket"
+
+    @staticmethod
+    def _score_rule_cn_l2_style_preference(rule_name: str, preferred_style: Optional[str]) -> int:
+        """Map the document-level L2 style preference to a tie-breaking bonus
+        (1 when the rule matches the preferred style, else 0) for rule selection.
+        """
+
+        if preferred_style == "plain" and rule_name == "Rule_4_传统公文派":
+            return 1
+        if preferred_style == "bracket" and rule_name == "Rule_5_单边括号派":
+            return 1
+        return 0
 
     def _convert_rule_output_to_chapters(
         self,
@@ -1064,6 +1426,8 @@ class PdfStructureExtractor:
             "设计",
             "部署",
             "安排",
+            "方法",
+            "参数",
         )
         return not any(keyword in compact for keyword in chapter_keywords)
 

+ 112 - 5
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -60,6 +60,8 @@ CATALOG_L2_PATTERNS = (
 )
 CATALOG_CN_LIST_PATTERN = re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].*")
 CATALOG_NUMERIC_SECTION_PATTERN = re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*[\u4e00-\u9fa5A-Za-z].*")
+CATALOG_SPLIT_NUMERIC_L1_PATTERN = re.compile(r"^\d{1,2}(?:[\..。、])?\s*$")
+CATALOG_SPLIT_NUMERIC_L2_PATTERN = re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*$")
 
 
 class _SilentLogger:
@@ -358,12 +360,44 @@ def _merge_split_catalog_heading_lines(lines: List[str]) -> List[str]:
                 merged.append(f"{line} {next_line}")
                 index += 2
                 continue
+        if index + 1 < len(lines) and (
+            CATALOG_SPLIT_NUMERIC_L1_PATTERN.match(line) or CATALOG_SPLIT_NUMERIC_L2_PATTERN.match(line)
+        ):
+            next_line = lines[index + 1].strip()
+            # 目录页常把“1.”、“1.1.”单独放一行,下一行才是标题。
+            # 这里只在下一行明显不像目录噪声、也不像另一个编号时做合并,
+            # 尽量只修评分基准中的“分行目录”问题,不影响已有正文抽取逻辑。
+            if _looks_like_split_catalog_title(next_line):
+                merged.append(f"{line} {next_line}")
+                index += 2
+                continue
         merged.append(line)
         index += 1
 
     return merged
 
 
+def _looks_like_split_catalog_title(line: str) -> bool:
+    """Return True when the line looks like the title half of a split TOC entry.
+
+    Rejects TOC noise ("目录" fragments, Roman-numeral page markers, dot-leader
+    lines), lines that are themselves numbered headings or bare numbers, and
+    anything not starting with a CJK character or a Latin letter.
+    """
+    cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+    if not cleaned:
+        return False
+
+    compact = re.sub(r"\s+", "", cleaned)
+    # "目录" (possibly split across lines) is the TOC page header, not a title.
+    if compact in {"目录", "目", "录"}:
+        return False
+    # Roman numerals are front-matter page numbers.
+    if re.fullmatch(r"[IVXLCDM]+", compact, re.IGNORECASE):
+        return False
+    if TOC_LINE_PATTERN.search(cleaned) or TOC_PAGE_SUFFIX_PATTERN.search(cleaned):
+        return False
+    # A line that already matches a heading pattern is its own entry, not a
+    # continuation of the previous bare number.
+    if any(pattern.match(cleaned) for pattern in CATALOG_L1_PATTERNS):
+        return False
+    if any(pattern.match(cleaned) for pattern in CATALOG_L2_PATTERNS):
+        return False
+    if CATALOG_SPLIT_NUMERIC_L1_PATTERN.match(cleaned) or CATALOG_SPLIT_NUMERIC_L2_PATTERN.match(cleaned):
+        return False
+    return bool(re.match(r"^[\u4e00-\u9fa5A-Za-z]", cleaned))
+
+
 def _classify_catalog_line_level(
     line: str,
     next_line: str,
@@ -454,6 +488,10 @@ def compute_extracted_char_count(result: Dict[str, Any]) -> int:
 
 
 def compute_quality_rate(raw_char_count: int, extracted_char_count: int) -> Tuple[float, str]:
+    # 内容合格率规则:
+    # 1. 分母使用原 PDF 正文范围内的字符数(会跳过前置目录页)。
+    # 2. 分子使用提取结果中的章节标题、节标题和正文 content 字符数总和。
+    # 3. 最终得分 = extracted_char_count / raw_char_count,并截断到 [0, 1]。
     if raw_char_count <= 0:
         return 0.0, "0.0%"
 
@@ -581,6 +619,11 @@ def _strip_catalog_heading_prefix(text: str) -> str:
 
 
 def _catalog_title_similarity(left: str, right: str) -> float:
+    # 目录标题相似度规则:
+    # 1. 先比较完整标题(统一去空白、常见标点、页码尾缀后);
+    # 2. 再比较去掉“第X章 / 1.2 / 一、”等编号前缀后的标题主体;
+    # 3. 如果主体互相包含且长度足够,给一个接近命中的 0.95;
+    # 4. 取多种比较方式中的最高分,尽量容忍编号体系差异和轻微 OCR 噪声。
     left_full = _normalize_catalog_title(left)
     right_full = _normalize_catalog_title(right)
     if not left_full or not right_full:
@@ -623,6 +666,9 @@ def _longest_increasing_subsequence_length(values: List[int]) -> int:
 
 
 def _catalog_count_score(original_count: int, extracted_count: int) -> float:
+    # 数量得分只看“提取出来的条数是否接近原 PDF 目录条数”:
+    # score = min(original_count, extracted_count) / max(original_count, extracted_count)
+    # 两边数量完全一致时为 1.0,差距越大分越低。
     max_count = max(original_count, extracted_count)
     if max_count <= 0:
         return 1.0
@@ -634,6 +680,11 @@ def _match_catalog_level(
     extracted_items: List[Dict[str, Any]],
     level: int,
 ) -> Dict[str, Any]:
+    # 分层匹配规则(一级目录和二级目录分别独立计算):
+    # 1. 每个原始目录项只匹配一个未占用的提取项,采用贪心“最高相似度”匹配;
+    # 2. 一级目录阈值 0.82,二级目录阈值 0.78,低于阈值视为未命中;
+    # 3. 命中后统计 precision / recall / F1,作为标题匹配质量;
+    # 4. 再根据命中项的相对顺序计算 order_score,衡量目录顺序是否被保留。
     originals = [item for item in original_items if item.get("level") == level]
     extracted = [item for item in extracted_items if item.get("level") == level]
     used_extracted_indexes: set[int] = set()
@@ -688,6 +739,9 @@ def _match_catalog_level(
 
 
 def _weighted_catalog_score(level_details: Dict[str, Dict[str, Any]], metric: str) -> float:
+    # 目录总分会把一级、二级拆开算,再按权重合并:
+    # 一级目录权重 0.35,二级目录权重 0.65。
+    # 这样做是因为二级目录数量通常更多,也更能反映目录结构是否完整。
     weighted_scores: List[Tuple[float, float]] = []
     if max(level_details["chapter"]["original"], level_details["chapter"]["extracted"]) > 0:
         weighted_scores.append((0.35, float(level_details["chapter"][metric])))
@@ -713,6 +767,13 @@ def compute_catalog_quality_rate_from_items(
     original_items: List[Dict[str, Any]],
     extracted_items: List[Dict[str, Any]],
 ) -> Tuple[float, str, Dict[str, Any]]:
+    # 目录合格率总规则:
+    # 1. title_score:标题匹配 F1,权重 70%
+    # 2. count_score:目录数量接近程度,权重 20%
+    # 3. order_score:目录顺序保持程度,权重 10%
+    #
+    # 其中每个分项都先按“一级 35% + 二级 65%”合并,再做总加权:
+    # final = 0.70 * title_score + 0.20 * count_score + 0.10 * order_score
     level_details = {
         "chapter": _match_catalog_level(original_items, extracted_items, 1),
         "section": _match_catalog_level(original_items, extracted_items, 2),
@@ -733,6 +794,42 @@ def compute_catalog_quality_rate_from_items(
     return rate, f"{rate * 100:.1f}%", _round_catalog_detail(detail)
 
 
+def compute_overall_quality_rate(
+    content_rate: float,
+    catalog_rate: float,
+    original_catalog_count: int,
+) -> Tuple[float, str, Dict[str, Any]]:
+    """Combine content and catalog quality into one overall rate.
+
+    Rules:
+    1. If the original PDF has no usable catalog, fall back to the content rate alone.
+    2. Otherwise take a weighted geometric mean: final = content_rate**0.7 * catalog_rate**0.3.
+    3. The geometric mean sharply penalizes a badly wrong catalog, so a run that
+       extracts many characters but gets the structure wrong cannot score high.
+
+    Returns (rate, "NN.N%" text, rounded detail dict).
+    """
+    # Clamp both inputs into [0, 1] before combining.
+    content_rate = max(0.0, min(content_rate, 1.0))
+    catalog_rate = max(0.0, min(catalog_rate, 1.0))
+
+    if original_catalog_count <= 0:
+        rate = content_rate
+        detail = {
+            "score_model": "content_only_no_original_catalog",
+            "content_rate": content_rate,
+            "catalog_rate": catalog_rate,
+            "original_catalog_count": original_catalog_count,
+        }
+        return rate, f"{rate * 100:.1f}%", _round_catalog_detail(detail)
+
+    rate = (content_rate ** 0.7) * (catalog_rate ** 0.3)
+    rate = max(0.0, min(rate, 1.0))
+    detail = {
+        "score_model": "geometric_mean_content_70_catalog_30",
+        "content_rate": content_rate,
+        "catalog_rate": catalog_rate,
+        "content_weight": 0.7,
+        "catalog_weight": 0.3,
+        "original_catalog_count": original_catalog_count,
+    }
+    return rate, f"{rate * 100:.1f}%", _round_catalog_detail(detail)
+
+
 def append_static_record(
     stat_path: Path,
     pdf_path: Path,
@@ -839,13 +936,18 @@ def process_pdf(
     file_content = pdf_path.read_bytes()
     extractor_result = extractor.extract(file_content)
     extracted_char_count = compute_extracted_char_count(extractor_result)
-    _, quality_rate_text = compute_quality_rate(raw_char_count, extracted_char_count)
+    content_rate, content_quality_rate_text = compute_quality_rate(raw_char_count, extracted_char_count)
     extracted_catalog_items = extract_result_catalog_items(extractor_result)
     extracted_l1_count, extracted_l2_count = count_catalog_item_levels(extracted_catalog_items)
-    _, catalog_quality_rate_text, catalog_quality_detail = compute_catalog_quality_rate_from_items(
+    catalog_rate, catalog_quality_rate_text, catalog_quality_detail = compute_catalog_quality_rate_from_items(
         original_items=original_catalog_items,
         extracted_items=extracted_catalog_items,
     )
+    _, quality_rate_text, quality_rate_detail = compute_overall_quality_rate(
+        content_rate=content_rate,
+        catalog_rate=catalog_rate,
+        original_catalog_count=original_l1_count + original_l2_count,
+    )
 
     payload = build_output_payload(
         pdf_path=pdf_path,
@@ -859,6 +961,10 @@ def process_pdf(
             extractor_name=extractor_name,
     )
     payload["metadata"].update({
+        "overall_quality_rate": quality_rate_text,
+        "content_quality_rate": content_quality_rate_text,
+        "quality_rate_model": quality_rate_detail.get("score_model"),
+        "quality_rate_detail": quality_rate_detail,
         "original_catalog_chapter_count": original_l1_count,
         "original_catalog_section_count": original_l2_count,
         "extracted_catalog_chapter_count": extracted_l1_count,
@@ -880,7 +986,7 @@ def process_pdf(
         original_l2_count=original_l2_count,
         extracted_l2_count=extracted_l2_count,
         catalog_quality_rate_text=catalog_quality_rate_text,
-        content_quality_rate_text=quality_rate_text,
+        content_quality_rate_text=content_quality_rate_text,
     )
     return output_path, quality_rate_text
 
@@ -900,6 +1006,7 @@ def main() -> int:
         return 1
 
     PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
+    effective_detect_toc = (not args.disable_toc) and args.extractor != "pdf_extractor1"
     extractor = PdfStructureExtractor(
         clip_top=args.clip_top,
         clip_bottom=args.clip_bottom,
@@ -907,7 +1014,7 @@ def main() -> int:
         ocr_api_url=args.ocr_api_url,
         ocr_timeout=args.ocr_timeout,
         ocr_api_key=args.ocr_api_key,
-        detect_toc=not args.disable_toc,
+        detect_toc=effective_detect_toc,
         toc_model_path=args.toc_model_path,
     )
 
@@ -928,7 +1035,7 @@ def main() -> int:
                 clip_top=args.clip_top,
                 clip_bottom=args.clip_bottom,
                 use_ocr=args.use_ocr,
-                detect_toc=not args.disable_toc,
+                detect_toc=effective_detect_toc,
                 extractor_name=args.extractor,
             )
             success_count += 1