Просмотр исходного кода

fix(正则提取目录和内容)

tangle 12 часов назад
Родитель
Commit
32c8eb873b

+ 393 - 29
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 """
 PDF 结构提取器。
+不依赖ocr的目录提取,使用基于规则的正文结构切分。
 
 """
 
@@ -10,16 +11,9 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple
 
 import fitz
+from foundation.observability.logger.loggering import review_logger as logger
 
-try:
-    from .ocr_processor import OcrProcessor, OcrResult, TableRegion
-except ImportError:  # pragma: no cover - direct script-style imports
-    try:
-        from ocr_processor import OcrProcessor, OcrResult, TableRegion  # type: ignore
-    except ImportError:  # pragma: no cover - OCR dependencies are optional
-        OcrProcessor = None  # type: ignore
-        OcrResult = Any  # type: ignore
-        TableRegion = Any  # type: ignore
+from .ocr_processor import OcrProcessor, OcrResult, TableRegion
 
 
 SECTION_TITLE_KEY = "章节标题"
@@ -44,7 +38,10 @@ class PdfStructureExtractor:
 
     RULE_LIB = {
         "Rule_1_纯数字派": {
-            "l1": re.compile(r"^\d{1,2}(?:[\..。])?\s+(?!\d)[\u4e00-\u9fa5A-Za-z].*"),
+            "l1": re.compile(
+                r"^\d{1,2}(?:[\..。])?\s+"
+                r"(?:(?!\d)[\u4e00-\u9fa5A-Za-z].*|[、,,]\s*[\u4e00-\u9fa5A-Za-z0-9].*)"
+            ),
             "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
         },
         "Rule_2_混合章派": {
@@ -103,7 +100,7 @@ class PdfStructureExtractor:
         ocr_timeout: int = 600,
         ocr_api_key: str = "",
         detect_toc: bool = True,
-        toc_model_path: str = "",
+        toc_model_path: str = "config/yolo/best.pt",
     ):
         """初始化提取参数,并在依赖可用时启用 OCR。"""
 
@@ -120,11 +117,12 @@ class PdfStructureExtractor:
                 ocr_api_key=ocr_api_key,
             )
             self.use_ocr = self.ocr_processor.is_available()
-        self.detect_toc = False
+        self.detect_toc = detect_toc
         self.ocr_api_url = ocr_api_url
         self.ocr_timeout = ocr_timeout
         self.ocr_api_key = ocr_api_key
         self.toc_model_path = toc_model_path
+        self._toc_extractor = None
 
     def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
         """提取章节、正文派生目录、规则诊断信息,以及可选的表格 OCR 内容。"""
@@ -135,7 +133,7 @@ class PdfStructureExtractor:
             "catalog": None,
             "body_catalog": None,
             "ocr_catalog": None,
-            "catalog_mode": "testc_body_only",
+            "catalog_mode": "none",
             "body_rule": None,
             "body_coverage": 0.0,
             "rule_performance": {},
@@ -145,6 +143,13 @@ class PdfStructureExtractor:
             "ocr_inserted_count": 0,
         }
 
+        ocr_catalog: Optional[Dict[str, Any]] = None
+        # if self.detect_toc:
+        #     try:
+        #         ocr_catalog = self._extract_catalog(file_content, progress_callback)
+        #     except Exception as exc:
+        #         logger.warning(f"[PDF提取] OCR目录提取失败: {exc}")
+
         doc = fitz.open(stream=file_content, filetype="pdf")
         try:
             # 正文切分仍由 PyMuPDF 文本和标题规则驱动,OCR 只在切分后作为小节内容补充。
@@ -157,14 +162,21 @@ class PdfStructureExtractor:
 
             result["chapters"] = chapters
             result["total_pages"] = len(doc)
-            result["catalog"] = body_catalog
             result["body_catalog"] = body_catalog
+            #result["ocr_catalog"] = ocr_catalog
+            result["catalog"] = body_catalog or ocr_catalog
             result["body_rule"] = winning_rule
             result["body_coverage"] = coverage_rate
             result["rule_performance"] = rule_performance
             result["ocr_table_count"] = ocr_stats["table_count"]
             result["ocr_success_count"] = ocr_stats["success_count"]
             result["ocr_inserted_count"] = ocr_stats["inserted_count"]
+            if body_catalog and ocr_catalog:
+                result["catalog_mode"] = "body_and_ocr"
+            elif body_catalog:
+                result["catalog_mode"] = "body_only"
+            elif ocr_catalog:
+                result["catalog_mode"] = "ocr_only"
             # 记录 OCR 是否实际影响输出,方便批处理统计时判断 OCR 状态。
             # disabled:默认值,表示本次没有请求 OCR。
             # unavailable:请求了 OCR,但依赖不可用,例如 rapid_layout 未安装或检测器不可用。
@@ -183,6 +195,31 @@ class PdfStructureExtractor:
         finally:
             doc.close()
 
+    def _extract_catalog(self, file_content: bytes, progress_callback=None) -> Optional[Dict[str, Any]]:
+        """
+        Extract the table-of-contents structure (YOLO detection + OCR recognition).
+
+        Returns:
+            {"chapters": [...], "total_chapters": N} or None when no TOC is found.
+        """
+        # Imported lazily so the YOLO/OCR stack is only required when TOC
+        # detection is actually used.
+        from .toc_detector import TOCCatalogExtractor
+
+        # Cache the extractor on the instance so repeated extract() calls reuse
+        # the already-initialized detector instead of rebuilding it.
+        if self._toc_extractor is None:
+            self._toc_extractor = TOCCatalogExtractor(
+                model_path=self.toc_model_path,
+                ocr_api_url=self.ocr_api_url,
+                ocr_api_key=self.ocr_api_key,
+                ocr_timeout=self.ocr_timeout,
+            )
+
+        catalog = self._toc_extractor.detect_and_extract(file_content, progress_callback)
+        if not catalog:
+            return None
+
+        # Copy before mutating so the extractor's own result dict stays untouched,
+        # then tag the catalog with its origin for downstream diagnostics.
+        normalized_catalog = dict(catalog)
+        normalized_catalog.setdefault("source", "ocr_toc")
+        return normalized_catalog
+
     def _extract_table_ocr_results(self, doc: fitz.Document, progress_callback=None) -> List[OcrResult]:
         """在 OCR 启用时检测 PDF 表格区域,并发执行表格识别。"""
 
@@ -338,6 +375,16 @@ class PdfStructureExtractor:
                     continue
                 page_lines.append(stripped)
 
+            recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
+            if clipped_fragment_keys:
+                page_lines = [
+                    line
+                    for line in page_lines
+                    if self._normalize_repeated_line_key(line) not in clipped_fragment_keys
+                ]
+            if recovered_headings:
+                page_lines = recovered_headings + page_lines
+
             page_lines_by_page.append((page_index + 1, page_lines))
 
             if progress_callback and (page_index + 1 == total_pages or (page_index + 1) % 10 == 0):
@@ -360,6 +407,127 @@ class PdfStructureExtractor:
                 body_lines.append(BodyLine(page=page, text=line))
         return body_lines
 
+    def _recover_top_clipped_l1_headings(
+        self,
+        page: fitz.Page,
+        page_lines: List[str],
+    ) -> Tuple[List[str], set[str]]:
+        """Recover level-1 headings that were cut in half by the top clip line.
+
+        Returns:
+            A tuple ``(recovered_headings, fragment_keys)`` where
+            ``recovered_headings`` are full heading lines to prepend to the page
+            and ``fragment_keys`` are normalized keys of clipped leftovers that
+            should be removed from ``page_lines``.
+        """
+
+        try:
+            page_dict = page.get_text("dict")
+        except Exception:
+            # Best effort: if PyMuPDF cannot produce the dict layout, skip recovery.
+            return [], set()
+
+        recovered_headings: List[str] = []
+        fragment_keys: set[str] = set()
+        existing_keys = {self._normalize_repeated_line_key(line) for line in page_lines}
+        # Only look in a narrow band just below the clip line (clip_top + 40pt).
+        top_band_limit = min(page.rect.height, self.clip_top + 40)
+        # Process text blocks top-to-bottom by their bbox y0.
+        sorted_blocks = sorted(
+            (block for block in page_dict.get("blocks", []) if block.get("type") == 0),
+            key=lambda item: item.get("bbox", [0, 0, 0, 0])[1],
+        )
+
+        for block in sorted_blocks:
+            bbox = block.get("bbox") or ()
+            if len(bbox) != 4:
+                continue
+
+            x0, y0, x1, y1 = bbox
+            # The block must straddle the clip line, start at most 35pt above it,
+            # and end within the top band — i.e. a heading sliced by clip_top.
+            if not (y0 < self.clip_top < y1):
+                continue
+            if y0 < max(0.0, self.clip_top - 35):
+                continue
+            if y1 > top_band_limit:
+                continue
+
+            full_text = self._extract_text_block_text(block)
+            if not full_text:
+                continue
+
+            # Find the first line in the unclipped block that is a valid L1 heading.
+            full_lines = [line.strip() for line in self._prepare_page_lines(full_text) if line.strip()]
+            full_heading = next(
+                (
+                    line
+                    for line in full_lines
+                    if self._matches_any_l1_heading(line) and self._is_valid_heading_strict(line, is_l1=True)
+                ),
+                None,
+            )
+            if not full_heading:
+                continue
+
+            full_key = self._normalize_repeated_line_key(full_heading)
+            if full_key in existing_keys:
+                continue
+
+            # Re-read only the part of the block below the clip line; if the
+            # heading survives there, or the leftovers do not look like fragments
+            # of the full heading, nothing was actually lost.
+            clipped_rect = fitz.Rect(x0, self.clip_top, x1, min(y1, page.rect.height))
+            clipped_text = page.get_text("text", clip=clipped_rect)
+            clipped_lines = [line.strip() for line in self._prepare_page_lines(clipped_text) if line.strip()]
+            if any(self._matches_any_l1_heading(line) for line in clipped_lines):
+                continue
+            if not self._looks_like_clipped_heading_loss(full_heading, clipped_lines):
+                continue
+
+            recovered_headings.append(full_heading)
+            existing_keys.add(full_key)
+            # Mark the clipped leftovers for removal so the heading is not duplicated.
+            fragment_keys.update(
+                self._normalize_repeated_line_key(line)
+                for line in clipped_lines
+                if line and self._normalize_repeated_line_key(line) != full_key
+            )
+
+        return recovered_headings, fragment_keys
+
+    @classmethod
+    def _extract_text_block_text(cls, block: Dict[str, Any]) -> str:
+        """Reassemble a text block from a PyMuPDF ``dict`` block, line by line.
+
+        Joins each line's span texts, drops empty lines, and returns the lines
+        joined with newlines.
+        """
+
+        block_lines: List[str] = []
+        for line in block.get("lines", []) or []:
+            spans = line.get("spans", []) or []
+            line_text = "".join(str(span.get("text", "") or "") for span in spans).strip()
+            if line_text:
+                block_lines.append(line_text)
+        return "\n".join(block_lines)
+
+    @classmethod
+    def _matches_any_l1_heading(cls, line: str) -> bool:
+        """Return True if the line matches any rule's level-1 heading pattern.
+
+        Leading page numbers are stripped first, and TOC lines never match.
+        """
+
+        clean_line = cls._strip_leading_page_number_from_heading(str(line or "").strip())
+        if not clean_line or cls._is_toc_line(clean_line):
+            return False
+        return any(rule["l1"].match(clean_line) for rule in cls.RULE_LIB.values())
+
+    @classmethod
+    def _looks_like_clipped_heading_loss(cls, full_heading: str, clipped_lines: List[str]) -> bool:
+        """Decide whether the clipped text is merely fragments of the full L1 heading.
+
+        Compares normalized keys: the clipped leftovers must be empty, or a small
+        number (<= 3) of fragments that are all substrings of the full heading's key.
+        """
+
+        full_key = cls._normalize_repeated_line_key(full_heading)
+        if not full_key:
+            return False
+
+        clipped_keys: List[str] = []
+        for line in clipped_lines:
+            key = cls._normalize_repeated_line_key(line)
+            if key:
+                clipped_keys.append(key)
+
+        # Nothing survived the clip: the heading was clearly lost.
+        if not clipped_keys:
+            return True
+        # Too many leftover lines: this is real content, not heading fragments.
+        if len(clipped_keys) > 3:
+            return False
+        # The full heading itself survived, so nothing needs recovering.
+        if any(key == full_key for key in clipped_keys):
+            return False
+
+        combined_key = "".join(clipped_keys)
+        if combined_key == full_key:
+            return True
+        if combined_key and combined_key in full_key:
+            return True
+        # Fall back: every fragment individually appears inside the full heading.
+        return all(key in full_key for key in clipped_keys)
+
     def _extract_body_with_best_rule(
         self,
         body_lines: List[BodyLine],
@@ -367,10 +535,12 @@ class PdfStructureExtractor:
         """运行所有候选标题规则,并返回评分最高的正文结构。"""
 
         total_raw_chars = sum(len(item.text.strip()) for item in body_lines if item.text.strip())
+        preferred_cn_l2_style = self._detect_document_cn_order_l2_style(body_lines)
         best_score = -9999
         best_rule_name: Optional[str] = None
         best_data: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
         best_coverage = 0.0
+        best_rule_style_preference = 0
         rule_performance: Dict[str, Any] = {}
 
         for rule_name, rule_set in self.RULE_LIB.items():
@@ -381,24 +551,49 @@ class PdfStructureExtractor:
                 len([key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY])
                 for sections in data.values()
             )
+            rule_guard_reason: Optional[str] = None
             if (
                 rule_name == CN_LIST_L1_NUMERIC_L2_RULE
-                and not self._is_viable_cn_list_l1_numeric_l2_structure(data, l1_count, l2_count)
             ):
-                score -= 1500
+                is_viable, rule_guard_reason = self._inspect_cn_list_l1_numeric_l2_structure(
+                    body_lines,
+                    data,
+                    l1_count,
+                    l2_count,
+                )
+                if not is_viable:
+                    score -= 1500
+            rule_style_preference = self._score_rule_cn_l2_style_preference(rule_name, preferred_cn_l2_style)
             rule_performance[rule_name] = {
                 "score": score,
                 "coverage_rate": f"{coverage_rate * 100:.1f}%",
                 "l1_count": l1_count,
                 "l2_count": l2_count,
             }
+            if rule_guard_reason:
+                rule_performance[rule_name]["guard_reason"] = rule_guard_reason
+            if rule_style_preference > 0:
+                rule_performance[rule_name]["style_preference"] = rule_style_preference
 
             # 规则选择以综合得分为主,覆盖率保留用于兜底过滤和诊断输出。
-            if score > best_score:
+            if (
+                score > best_score
+                or (
+                    score == best_score
+                    and rule_style_preference > best_rule_style_preference
+                    and abs(coverage_rate - best_coverage) <= 0.03
+                )
+                or (
+                    score == best_score
+                    and rule_style_preference == best_rule_style_preference
+                    and coverage_rate > best_coverage
+                )
+            ):
                 best_score = score
                 best_rule_name = rule_name
                 best_data = data
                 best_coverage = coverage_rate
+                best_rule_style_preference = rule_style_preference
 
         if best_score <= 0 or best_coverage < 0.15:
             return {}, best_rule_name, best_coverage, rule_performance
@@ -420,6 +615,8 @@ class PdfStructureExtractor:
         pending_prefix: Optional[str] = None
         pending_page: Optional[int] = None
         last_l2_sub_num = 0
+        chapter_l2_style_hint: Optional[str] = None
+        chapter_line_offset = 0
 
         backup_l1: Optional[str] = None
         backup_l1_num = 0
@@ -497,6 +694,8 @@ class PdfStructureExtractor:
                                 current_l1_num = l1_candidate_num
                                 current_l2 = None
                                 last_l2_sub_num = 0
+                                chapter_l2_style_hint = None
+                                chapter_line_offset = 0
                             continue
 
                     backup_l1 = current_l1
@@ -509,8 +708,20 @@ class PdfStructureExtractor:
                     structured_data.setdefault(current_l1, {"_chapter_page": page})  # type: ignore[assignment]
                     current_l2 = None
                     last_l2_sub_num = 0
+                    chapter_l2_style_hint = None
+                    chapter_line_offset = 0
                     continue
 
+            if current_l1 and not has_toc:
+                chapter_line_offset += 1
+                if (
+                    chapter_l2_style_hint is None
+                    and chapter_line_offset <= 30
+                    and rule_name in {"Rule_4_传统公文派", "Rule_5_单边括号派"}
+                    and self._is_valid_heading_strict(line, is_l1=False)
+                ):
+                    chapter_l2_style_hint = self._detect_cn_order_l2_style(line)
+
             match_l2 = rule_set["l2"].match(line)
             if current_l1 and match_l2 and not has_toc:
                 if self._is_valid_heading_strict(line, is_l1=False):
@@ -550,14 +761,22 @@ class PdfStructureExtractor:
                             self._ensure_section_node(structured_data, current_l1, current_l2, page)
                             continue
                     else:
-                        l2_sub_num = self._extract_non_numeric_l2_number(match_l2.group(1))
-                        if l2_sub_num <= last_l2_sub_num:
+                        candidate_l2_style = self._detect_cn_order_l2_style(line)
+                        if (
+                            chapter_l2_style_hint is not None
+                            and candidate_l2_style is not None
+                            and candidate_l2_style != chapter_l2_style_hint
+                        ):
                             pass
                         else:
-                            current_l2 = self._clean_section_title(line)
-                            last_l2_sub_num = l2_sub_num
-                            self._ensure_section_node(structured_data, current_l1, current_l2, page)
-                            continue
+                            l2_sub_num = self._extract_non_numeric_l2_number(match_l2.group(1))
+                            if l2_sub_num <= last_l2_sub_num:
+                                pass
+                            else:
+                                current_l2 = self._clean_section_title(line)
+                                last_l2_sub_num = l2_sub_num
+                                self._ensure_section_node(structured_data, current_l1, current_l2, page)
+                                continue
 
             if current_l1 and not has_toc:
                 target_key = current_l2 or SECTION_TITLE_KEY
@@ -613,23 +832,166 @@ class PdfStructureExtractor:
 
         return False
 
-    @staticmethod
-    def _is_viable_cn_list_l1_numeric_l2_structure(
+    def _inspect_cn_list_l1_numeric_l2_structure(
+        self,
+        body_lines: List[BodyLine],
         raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
         l1_count: int,
         l2_count: int,
-    ) -> bool:
-        """限制新规则只在真正形成“中文章 + 数字小节”结构时参与竞争。"""
+    ) -> Tuple[bool, Optional[str]]:
+        """限制 Rule_8 只在真正缺少显式章节结构时作为兜底参与竞争。"""
 
         if l1_count < 2 or l2_count < 3:
-            return False
+            return False, "insufficient_structure"
+
+        if self._has_stable_explicit_chapter_headings(body_lines):
+            return False, "explicit_chapter_structure"
+
+        if self._has_excessive_cn_list_l1_resets(raw_data):
+            return False, "cn_list_l1_resets"
 
         chapters_with_l2 = sum(
             1
             for sections in raw_data.values()
             if any(key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY)
         )
-        return chapters_with_l2 >= max(2, (l1_count + 1) // 2)
+        if chapters_with_l2 < max(2, (l1_count + 1) // 2):
+            return False, "too_few_chapters_with_l2"
+
+        return True, None
+
+    @classmethod
+    def _has_stable_explicit_chapter_headings(cls, body_lines: List[BodyLine]) -> bool:
+        """Return True when the early body already has a stable explicit
+        "第X章"-style chapter structure (at least two distinct chapter numbers
+        among the first four detected).
+        """
+
+        chapter_numbers: List[int] = []
+
+        for item in body_lines:
+            line = cls._strip_leading_page_number_from_heading(item.text.strip())
+            if not line or cls._is_toc_line(line):
+                continue
+
+            # NOTE(review): the character class [章节部部分篇] duplicates 部 and
+            # splits 部分 into single characters, so a lone 分 also matches —
+            # confirm whether "第X分" should really count as a chapter marker.
+            chapter_match = re.match(
+                r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部部分篇]",
+                line,
+            )
+            if not chapter_match:
+                continue
+
+            # Accept both Arabic and Chinese numerals for the chapter number.
+            token = chapter_match.group(1)
+            chapter_num = int(token) if token.isdigit() else cls._cn_to_int(token)
+            if chapter_num <= 0:
+                continue
+            # Skip immediate repeats (e.g. running headers duplicating a chapter line).
+            if chapter_numbers and chapter_numbers[-1] == chapter_num:
+                continue
+
+            chapter_numbers.append(chapter_num)
+            # Four samples are enough to judge stability; stop scanning early.
+            if len(chapter_numbers) >= 4:
+                break
+
+        return len(set(chapter_numbers)) >= 2
+
+    @classmethod
+    def _has_excessive_cn_list_l1_resets(
+        cls,
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+    ) -> bool:
+        """Detect obvious backward jumps in Rule_8's level-1 Chinese ordinals,
+        so in-chapter headings are not promoted to top level.
+
+        Returns True when the L1 ordinal sequence resets severely (e.g. drops
+        from >=3 back to <=2) at least once, or jumps backward twice.
+        """
+
+        # Collect the Chinese ordinal at the start of each chapter title, in order.
+        l1_sequence: List[int] = []
+        for chapter_title in raw_data.keys():
+            match = re.match(r"^([一二三四五六七八九十百零两]+)[、))\]]", str(chapter_title or "").strip())
+            if not match:
+                continue
+            chapter_num = cls._cn_to_int(match.group(1))
+            if chapter_num > 0:
+                l1_sequence.append(chapter_num)
+
+        # Too few ordinals to judge: assume the structure is fine.
+        if len(l1_sequence) < 3:
+            return False
+
+        backward_jumps = 0
+        severe_resets = 0
+        for prev_num, curr_num in zip(l1_sequence, l1_sequence[1:]):
+            if curr_num < prev_num:
+                backward_jumps += 1
+                # A drop from >=3 back to <=2 looks like a fresh list restarting.
+                if prev_num >= 3 and curr_num <= 2:
+                    severe_resets += 1
+
+        return severe_resets >= 1 or backward_jumps >= 2
+
+    @classmethod
+    def _detect_cn_order_l2_style(cls, line: str) -> Optional[str]:
+        """Classify the style of a Chinese-ordinal section heading.
+
+        Returns "bracket" for the "一)" form, "plain" for the "一、" / "一 " form,
+        or None when the line is not a Chinese-ordinal heading.
+        """
+
+        # Normalize: drop any trailing TOC page suffix and collapse whitespace.
+        cleaned = cls._strip_catalog_page_suffix(line)
+        cleaned = re.sub(r"\s+", " ", str(cleaned or "").strip())
+        if not cleaned:
+            return None
+
+        bracket_match = re.match(
+            r"^[一二三四五六七八九十百零两]+[))\]]\s*[\u4e00-\u9fa5A-Za-z].*",
+            cleaned,
+        )
+        if bracket_match:
+            return "bracket"
+
+        plain_match = re.match(
+            r"^[一二三四五六七八九十百零两]+(?:、|\s+)\s*[\u4e00-\u9fa5A-Za-z].*",
+            cleaned,
+        )
+        if plain_match:
+            return "plain"
+
+        return None
+
+    def _detect_document_cn_order_l2_style(self, body_lines: List[BodyLine]) -> Optional[str]:
+        """Scan early lines after each chapter heading to determine the
+        document-wide L2 heading style, used as a stable tie-breaker between
+        Rule_4 and Rule_5.
+
+        Returns "plain", "bracket", or None when the counts are tied.
+        """
+
+        plain_count = 0
+        bracket_count = 0
+        # -1 means "not currently inside the first 30 lines after a chapter heading".
+        lines_since_chapter = -1
+
+        for item in body_lines:
+            line = self._strip_leading_page_number_from_heading(item.text.strip())
+            if not line or self._is_toc_line(line):
+                continue
+
+            # NOTE(review): this chapter pattern uses [章部部分篇] without 节,
+            # unlike the similar check in _has_stable_explicit_chapter_headings —
+            # confirm whether the omission is intentional.
+            if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]", line):
+                lines_since_chapter = 0
+                continue
+
+            if lines_since_chapter < 0:
+                continue
+
+            # Only sample within 30 lines of the chapter start, then give up.
+            lines_since_chapter += 1
+            if lines_since_chapter > 30:
+                lines_since_chapter = -1
+                continue
+
+            style = self._detect_cn_order_l2_style(line)
+            if style is None or not self._is_valid_heading_strict(line, is_l1=False):
+                continue
+
+            if style == "plain":
+                plain_count += 1
+            elif style == "bracket":
+                bracket_count += 1
+            # One sample per chapter: stop scanning until the next chapter heading.
+            lines_since_chapter = -1
+
+        if plain_count == bracket_count:
+            return None
+        return "plain" if plain_count > bracket_count else "bracket"
+
+    @staticmethod
+    def _score_rule_cn_l2_style_preference(rule_name: str, preferred_style: Optional[str]) -> int:
+        """Map the document-level L2 style preference to a tie-breaking bonus
+        (1 when the rule matches the preferred style, else 0) for rule selection.
+        """
+
+        if preferred_style == "plain" and rule_name == "Rule_4_传统公文派":
+            return 1
+        if preferred_style == "bracket" and rule_name == "Rule_5_单边括号派":
+            return 1
+        return 0
 
     def _convert_rule_output_to_chapters(
         self,
@@ -1064,6 +1426,8 @@ class PdfStructureExtractor:
             "设计",
             "部署",
             "安排",
+            "方法",
+            "参数",
         )
         return not any(keyword in compact for keyword in chapter_keywords)
 

+ 112 - 5
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -60,6 +60,8 @@ CATALOG_L2_PATTERNS = (
 )
 CATALOG_CN_LIST_PATTERN = re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].*")
 CATALOG_NUMERIC_SECTION_PATTERN = re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*[\u4e00-\u9fa5A-Za-z].*")
+CATALOG_SPLIT_NUMERIC_L1_PATTERN = re.compile(r"^\d{1,2}(?:[\..。、])?\s*$")
+CATALOG_SPLIT_NUMERIC_L2_PATTERN = re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*$")
 
 
 class _SilentLogger:
@@ -358,12 +360,44 @@ def _merge_split_catalog_heading_lines(lines: List[str]) -> List[str]:
                 merged.append(f"{line} {next_line}")
                 index += 2
                 continue
+        if index + 1 < len(lines) and (
+            CATALOG_SPLIT_NUMERIC_L1_PATTERN.match(line) or CATALOG_SPLIT_NUMERIC_L2_PATTERN.match(line)
+        ):
+            next_line = lines[index + 1].strip()
+            # 目录页常把“1.”、“1.1.”单独放一行,下一行才是标题。
+            # 这里只在下一行明显不像目录噪声、也不像另一个编号时做合并,
+            # 尽量只修评分基准中的“分行目录”问题,不影响已有正文抽取逻辑。
+            if _looks_like_split_catalog_title(next_line):
+                merged.append(f"{line} {next_line}")
+                index += 2
+                continue
         merged.append(line)
         index += 1
 
     return merged
 
 
+def _looks_like_split_catalog_title(line: str) -> bool:
+    """Return True when the line looks like the title half of a split TOC entry.
+
+    Rejects TOC noise ("目录" fragments, Roman-numeral page markers, dot-leader
+    lines), lines that are themselves numbered headings or bare numbers, and
+    anything not starting with a CJK character or a Latin letter.
+    """
+    cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+    if not cleaned:
+        return False
+
+    compact = re.sub(r"\s+", "", cleaned)
+    # "目录" (possibly split across lines) is the TOC page header, not a title.
+    if compact in {"目录", "目", "录"}:
+        return False
+    # Roman numerals are front-matter page numbers.
+    if re.fullmatch(r"[IVXLCDM]+", compact, re.IGNORECASE):
+        return False
+    if TOC_LINE_PATTERN.search(cleaned) or TOC_PAGE_SUFFIX_PATTERN.search(cleaned):
+        return False
+    # A line that already matches a heading pattern is its own entry, not a
+    # continuation of the previous bare number.
+    if any(pattern.match(cleaned) for pattern in CATALOG_L1_PATTERNS):
+        return False
+    if any(pattern.match(cleaned) for pattern in CATALOG_L2_PATTERNS):
+        return False
+    if CATALOG_SPLIT_NUMERIC_L1_PATTERN.match(cleaned) or CATALOG_SPLIT_NUMERIC_L2_PATTERN.match(cleaned):
+        return False
+    return bool(re.match(r"^[\u4e00-\u9fa5A-Za-z]", cleaned))
+
+
 def _classify_catalog_line_level(
     line: str,
     next_line: str,
@@ -454,6 +488,10 @@ def compute_extracted_char_count(result: Dict[str, Any]) -> int:
 
 
 def compute_quality_rate(raw_char_count: int, extracted_char_count: int) -> Tuple[float, str]:
+    # 内容合格率规则:
+    # 1. 分母使用原 PDF 正文范围内的字符数(会跳过前置目录页)。
+    # 2. 分子使用提取结果中的章节标题、节标题和正文 content 字符数总和。
+    # 3. 最终得分 = extracted_char_count / raw_char_count,并截断到 [0, 1]。
     if raw_char_count <= 0:
         return 0.0, "0.0%"
 
@@ -581,6 +619,11 @@ def _strip_catalog_heading_prefix(text: str) -> str:
 
 
 def _catalog_title_similarity(left: str, right: str) -> float:
+    # 目录标题相似度规则:
+    # 1. 先比较完整标题(统一去空白、常见标点、页码尾缀后);
+    # 2. 再比较去掉“第X章 / 1.2 / 一、”等编号前缀后的标题主体;
+    # 3. 如果主体互相包含且长度足够,给一个接近命中的 0.95;
+    # 4. 取多种比较方式中的最高分,尽量容忍编号体系差异和轻微 OCR 噪声。
     left_full = _normalize_catalog_title(left)
     right_full = _normalize_catalog_title(right)
     if not left_full or not right_full:
@@ -623,6 +666,9 @@ def _longest_increasing_subsequence_length(values: List[int]) -> int:
 
 
 def _catalog_count_score(original_count: int, extracted_count: int) -> float:
+    # 数量得分只看“提取出来的条数是否接近原 PDF 目录条数”:
+    # score = min(original_count, extracted_count) / max(original_count, extracted_count)
+    # 两边数量完全一致时为 1.0,差距越大分越低。
     max_count = max(original_count, extracted_count)
     if max_count <= 0:
         return 1.0
@@ -634,6 +680,11 @@ def _match_catalog_level(
     extracted_items: List[Dict[str, Any]],
     level: int,
 ) -> Dict[str, Any]:
+    # 分层匹配规则(一级目录和二级目录分别独立计算):
+    # 1. 每个原始目录项只匹配一个未占用的提取项,采用贪心“最高相似度”匹配;
+    # 2. 一级目录阈值 0.82,二级目录阈值 0.78,低于阈值视为未命中;
+    # 3. 命中后统计 precision / recall / F1,作为标题匹配质量;
+    # 4. 再根据命中项的相对顺序计算 order_score,衡量目录顺序是否被保留。
     originals = [item for item in original_items if item.get("level") == level]
     extracted = [item for item in extracted_items if item.get("level") == level]
     used_extracted_indexes: set[int] = set()
@@ -688,6 +739,9 @@ def _match_catalog_level(
 
 
 def _weighted_catalog_score(level_details: Dict[str, Dict[str, Any]], metric: str) -> float:
+    # 目录总分会把一级、二级拆开算,再按权重合并:
+    # 一级目录权重 0.35,二级目录权重 0.65。
+    # 这样做是因为二级目录数量通常更多,也更能反映目录结构是否完整。
     weighted_scores: List[Tuple[float, float]] = []
     if max(level_details["chapter"]["original"], level_details["chapter"]["extracted"]) > 0:
         weighted_scores.append((0.35, float(level_details["chapter"][metric])))
@@ -713,6 +767,13 @@ def compute_catalog_quality_rate_from_items(
     original_items: List[Dict[str, Any]],
     extracted_items: List[Dict[str, Any]],
 ) -> Tuple[float, str, Dict[str, Any]]:
+    # 目录合格率总规则:
+    # 1. title_score:标题匹配 F1,权重 70%
+    # 2. count_score:目录数量接近程度,权重 20%
+    # 3. order_score:目录顺序保持程度,权重 10%
+    #
+    # 其中每个分项都先按“一级 35% + 二级 65%”合并,再做总加权:
+    # final = 0.70 * title_score + 0.20 * count_score + 0.10 * order_score
     level_details = {
         "chapter": _match_catalog_level(original_items, extracted_items, 1),
         "section": _match_catalog_level(original_items, extracted_items, 2),
@@ -733,6 +794,42 @@ def compute_catalog_quality_rate_from_items(
     return rate, f"{rate * 100:.1f}%", _round_catalog_detail(detail)
 
 
+def compute_overall_quality_rate(
+    content_rate: float,
+    catalog_rate: float,
+    original_catalog_count: int,
+) -> Tuple[float, str, Dict[str, Any]]:
+    """Combine content and catalog quality into one overall rate.
+
+    Rules:
+    1. If the original PDF has no usable catalog, fall back to the content rate alone.
+    2. Otherwise take a weighted geometric mean: final = content_rate**0.7 * catalog_rate**0.3.
+    3. The geometric mean sharply penalizes a badly wrong catalog, so a run that
+       extracts many characters but gets the structure wrong cannot score high.
+
+    Returns (rate, "NN.N%" text, rounded detail dict).
+    """
+    # Clamp both inputs into [0, 1] before combining.
+    content_rate = max(0.0, min(content_rate, 1.0))
+    catalog_rate = max(0.0, min(catalog_rate, 1.0))
+
+    if original_catalog_count <= 0:
+        rate = content_rate
+        detail = {
+            "score_model": "content_only_no_original_catalog",
+            "content_rate": content_rate,
+            "catalog_rate": catalog_rate,
+            "original_catalog_count": original_catalog_count,
+        }
+        return rate, f"{rate * 100:.1f}%", _round_catalog_detail(detail)
+
+    rate = (content_rate ** 0.7) * (catalog_rate ** 0.3)
+    rate = max(0.0, min(rate, 1.0))
+    detail = {
+        "score_model": "geometric_mean_content_70_catalog_30",
+        "content_rate": content_rate,
+        "catalog_rate": catalog_rate,
+        "content_weight": 0.7,
+        "catalog_weight": 0.3,
+        "original_catalog_count": original_catalog_count,
+    }
+    return rate, f"{rate * 100:.1f}%", _round_catalog_detail(detail)
+
+
 def append_static_record(
     stat_path: Path,
     pdf_path: Path,
@@ -839,13 +936,18 @@ def process_pdf(
     file_content = pdf_path.read_bytes()
     extractor_result = extractor.extract(file_content)
     extracted_char_count = compute_extracted_char_count(extractor_result)
-    _, quality_rate_text = compute_quality_rate(raw_char_count, extracted_char_count)
+    content_rate, content_quality_rate_text = compute_quality_rate(raw_char_count, extracted_char_count)
     extracted_catalog_items = extract_result_catalog_items(extractor_result)
     extracted_l1_count, extracted_l2_count = count_catalog_item_levels(extracted_catalog_items)
-    _, catalog_quality_rate_text, catalog_quality_detail = compute_catalog_quality_rate_from_items(
+    catalog_rate, catalog_quality_rate_text, catalog_quality_detail = compute_catalog_quality_rate_from_items(
         original_items=original_catalog_items,
         extracted_items=extracted_catalog_items,
     )
+    _, quality_rate_text, quality_rate_detail = compute_overall_quality_rate(
+        content_rate=content_rate,
+        catalog_rate=catalog_rate,
+        original_catalog_count=original_l1_count + original_l2_count,
+    )
 
     payload = build_output_payload(
         pdf_path=pdf_path,
@@ -859,6 +961,10 @@ def process_pdf(
             extractor_name=extractor_name,
     )
     payload["metadata"].update({
+        "overall_quality_rate": quality_rate_text,
+        "content_quality_rate": content_quality_rate_text,
+        "quality_rate_model": quality_rate_detail.get("score_model"),
+        "quality_rate_detail": quality_rate_detail,
         "original_catalog_chapter_count": original_l1_count,
         "original_catalog_section_count": original_l2_count,
         "extracted_catalog_chapter_count": extracted_l1_count,
@@ -880,7 +986,7 @@ def process_pdf(
         original_l2_count=original_l2_count,
         extracted_l2_count=extracted_l2_count,
         catalog_quality_rate_text=catalog_quality_rate_text,
-        content_quality_rate_text=quality_rate_text,
+        content_quality_rate_text=content_quality_rate_text,
     )
     return output_path, quality_rate_text
 
@@ -900,6 +1006,7 @@ def main() -> int:
         return 1
 
     PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
+    effective_detect_toc = (not args.disable_toc) and args.extractor != "pdf_extractor1"
     extractor = PdfStructureExtractor(
         clip_top=args.clip_top,
         clip_bottom=args.clip_bottom,
@@ -907,7 +1014,7 @@ def main() -> int:
         ocr_api_url=args.ocr_api_url,
         ocr_timeout=args.ocr_timeout,
         ocr_api_key=args.ocr_api_key,
-        detect_toc=not args.disable_toc,
+        detect_toc=effective_detect_toc,
         toc_model_path=args.toc_model_path,
     )
 
@@ -928,7 +1035,7 @@ def main() -> int:
                 clip_top=args.clip_top,
                 clip_bottom=args.clip_bottom,
                 use_ocr=args.use_ocr,
-                detect_toc=not args.disable_toc,
+                detect_toc=effective_detect_toc,
                 extractor_name=args.extractor,
             )
             success_count += 1