ソースを参照

fix(不使用ocr提取目录)

tangle 1 日 前
コミット
2c2f6a9753

+ 1325 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -0,0 +1,1325 @@
+from __future__ import annotations
+
+"""
+PDF 结构提取器。
+
+"""
+
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import fitz
+
+try:
+    from .ocr_processor import OcrProcessor, OcrResult, TableRegion
+except ImportError:  # pragma: no cover - direct script-style imports
+    try:
+        from ocr_processor import OcrProcessor, OcrResult, TableRegion  # type: ignore
+    except ImportError:  # pragma: no cover - OCR dependencies are optional
+        OcrProcessor = None  # type: ignore
+        OcrResult = Any  # type: ignore
+        TableRegion = Any  # type: ignore
+
+
# Key used inside each chapter dict for chapter-level preface text
# (content that appears before the first recognized subsection heading).
SECTION_TITLE_KEY = "章节标题"
# Placeholder stored when a section yields no plain text (the source region
# was likely image- or table-only).
EMPTY_SECTION_PLACEHOLDER = "[本节无纯文本,原文档中可能为纯图片或表格]"


# Markers wrapped around table OCR text when it is appended to a section body.
TABLE_OCR_START = "[表格OCR识别结果]:"
TABLE_OCR_END = "[/表格]"
# Name of the heading rule for "Chinese-ordinal level-1 chapter + numeric
# x.y level-2 section" documents (referenced throughout as Rule_8).
CN_LIST_L1_NUMERIC_L2_RULE = "Rule_8_中文序号章数字小节派"
+
+
@dataclass(frozen=True)
class BodyLine:
    """A normalized body-text line together with the PDF page it came from."""

    # 1-based page number of the line in the source PDF.
    page: int
    # Stripped text content of the line.
    text: str
+
+
+class PdfStructureExtractor:
+    """基于规则的 PDF 正文结构提取器,可选增强表格 OCR 内容。"""
+
    # Candidate heading rule library. Each rule supplies an "l1" (chapter) and
    # an "l2" (section) regex; every rule is run over the body lines and the
    # best-scoring result wins (see _extract_body_with_best_rule).
    RULE_LIB = {
        # Plain numeric chapters ("1 标题") with numeric "x.y" sections.
        "Rule_1_纯数字派": {
            "l1": re.compile(r"^\d{1,2}(?:[\..。])?\s+(?!\d)[\u4e00-\u9fa5A-Za-z].*"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # "第1章" style chapters with numeric "x.y" sections.
        "Rule_2_混合章派": {
            "l1": re.compile(r"^第\s*(\d+)\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # "第一章" style chapters with numeric "x.y" sections.
        "Rule_3_中英混血派": {
            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # "第一章" chapters with "一、" style Chinese-ordinal sections.
        "Rule_4_传统公文派": {
            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[、\s]+([\u4e00-\u9fa5]+.*)"),
        },
        # Chapters with "一)" half-bracket Chinese-ordinal sections.
        "Rule_5_单边括号派": {
            "l1": re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[))\]]\s*([\u4e00-\u9fa5]+.*)"),
        },
        # Chapters with explicit "第X节" sections.
        "Rule_6_小节派": {
            "l1": re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*节\s*[、]?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # Chapters with bracketed numeric sections like "【1】标题".
        "Rule_7_粗体括号派": {
            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^[【\[]\s*(\d+)\s*[\]】]\s*([\u4e00-\u9fa5]+.*)"),
        },
        # Rule_8: bare Chinese-ordinal chapters ("一、标题") with numeric "x.y"
        # sections; only competes when extra viability checks pass.
        CN_LIST_L1_NUMERIC_L2_RULE: {
            "l1": re.compile(r"^([一二三四五六七八九十百零两]+)[、))\]]\s*([\u4e00-\u9fa5A-Za-z].*)"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
    }

    # Single Chinese numeral character -> integer digit mapping.
    CN_NUM_MAP = {
        "零": 0,
        "〇": 0,
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
    }

    # Long dot/dash/ellipsis leaders mark table-of-contents lines.
    TOC_PATTERN = re.compile(r"\.{3,}|…{2,}|-{3,}|·{3,}|•{3,}")
+
    def __init__(
        self,
        clip_top: float = 60,
        clip_bottom: float = 60,
        use_ocr: bool = False,
        ocr_api_url: str = "",
        ocr_timeout: int = 600,
        ocr_api_key: str = "",
        detect_toc: bool = True,
        toc_model_path: str = "",
    ):
        """Initialize extraction parameters; enable OCR only when available.

        Args:
            clip_top / clip_bottom: vertical margins (points) cropped from each
                page before text extraction, to drop running headers/footers.
            use_ocr: request the optional table-OCR enhancement.
            ocr_api_url / ocr_timeout / ocr_api_key: forwarded to OcrProcessor.
            detect_toc: accepted for interface compatibility only — see the
                note below where it is overridden.
            toc_model_path: stored but not used by the code visible here.
        """

        self.clip_top = clip_top
        self.clip_bottom = clip_bottom
        self.ocr_requested = bool(use_ocr)
        self.ocr_processor = None
        self.use_ocr = False
        # OCR is an optional enhancement: when rapid_layout or the OCR
        # dependencies are missing, rule-based body extraction still runs.
        if use_ocr and OcrProcessor is not None:
            self.ocr_processor = OcrProcessor(
                ocr_api_url=ocr_api_url,
                ocr_timeout=ocr_timeout,
                ocr_api_key=ocr_api_key,
            )
            self.use_ocr = self.ocr_processor.is_available()
        # NOTE(review): TOC detection is forced off regardless of the
        # detect_toc argument — apparently intentional (the catalog is derived
        # from body headings only; cf. catalog_mode "testc_body_only"), but
        # worth confirming against callers that pass detect_toc=True.
        self.detect_toc = False
        self.ocr_api_url = ocr_api_url
        self.ocr_timeout = ocr_timeout
        self.ocr_api_key = ocr_api_key
        self.toc_model_path = toc_model_path
+
+    def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
+        """提取章节、正文派生目录、规则诊断信息,以及可选的表格 OCR 内容。"""
+
+        result: Dict[str, Any] = {
+            "chapters": {},
+            "total_pages": 0,
+            "catalog": None,
+            "body_catalog": None,
+            "ocr_catalog": None,
+            "catalog_mode": "testc_body_only",
+            "body_rule": None,
+            "body_coverage": 0.0,
+            "rule_performance": {},
+            "ocr_content_mode": "disabled",
+            "ocr_table_count": 0,
+            "ocr_success_count": 0,
+            "ocr_inserted_count": 0,
+        }
+
+        doc = fitz.open(stream=file_content, filetype="pdf")
+        try:
+            # 正文切分仍由 PyMuPDF 文本和标题规则驱动,OCR 只在切分后作为小节内容补充。
+            body_lines = self._extract_body_lines(doc, progress_callback)
+            ocr_results = self._extract_table_ocr_results(doc, progress_callback)
+            raw_data, winning_rule, coverage_rate, rule_performance = self._extract_body_with_best_rule(body_lines)
+            chapters = self._convert_rule_output_to_chapters(raw_data)
+            ocr_stats = self._insert_ocr_results_into_chapters(chapters, ocr_results)
+            body_catalog = self._build_body_catalog_from_chapters(chapters)
+
+            result["chapters"] = chapters
+            result["total_pages"] = len(doc)
+            result["catalog"] = body_catalog
+            result["body_catalog"] = body_catalog
+            result["body_rule"] = winning_rule
+            result["body_coverage"] = coverage_rate
+            result["rule_performance"] = rule_performance
+            result["ocr_table_count"] = ocr_stats["table_count"]
+            result["ocr_success_count"] = ocr_stats["success_count"]
+            result["ocr_inserted_count"] = ocr_stats["inserted_count"]
+            # 记录 OCR 是否实际影响输出,方便批处理统计时判断 OCR 状态。
+            # disabled:默认值,表示本次没有请求 OCR。
+            # unavailable:请求了 OCR,但依赖不可用,例如 rapid_layout 未安装或检测器不可用。
+            # enabled_no_table:OCR 已启用,但没有检测到可识别的表格区域。
+            # table_regions_inserted:OCR 已启用,并且表格识别结果已经成功回填到正文小节。
+            # enabled_no_insert:OCR 已启用,但没有成功回填,通常是 OCR 失败或未定位到合适小节。
+            if self.ocr_requested and not self.use_ocr:
+                result["ocr_content_mode"] = "unavailable"
+            elif self.use_ocr and ocr_stats["table_count"] == 0:
+                result["ocr_content_mode"] = "enabled_no_table"
+            elif self.use_ocr and ocr_stats["inserted_count"] > 0:
+                result["ocr_content_mode"] = "table_regions_inserted"
+            elif self.use_ocr:
+                result["ocr_content_mode"] = "enabled_no_insert"
+            return result
+        finally:
+            doc.close()
+
+    def _extract_table_ocr_results(self, doc: fitz.Document, progress_callback=None) -> List[OcrResult]:
+        """在 OCR 启用时检测 PDF 表格区域,并发执行表格识别。"""
+
+        if not self.use_ocr or self.ocr_processor is None:
+            return []
+
+        def _emit_progress(stage: str, current: int, message: str) -> None:
+            """转发 OCR 进度,同时避免回调异常中断提取流程。"""
+
+            if not progress_callback:
+                return
+            try:
+                progress_callback(stage, current, message)
+            except Exception:
+                pass
+
+        table_regions: List[TableRegion] = []
+        total_pages = len(doc)
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            regions = self.ocr_processor.detect_table_regions(page, page_index + 1, clip_box)
+            # 保存页面对象和区域坐标,便于 OcrProcessor 后续精确渲染表格裁剪区域。
+            for bbox, score in regions:
+                table_regions.append(TableRegion(
+                    page_num=page_index + 1,
+                    page=page,
+                    bbox=bbox,
+                    score=score,
+                ))
+
+            if page_index + 1 == total_pages or (page_index + 1) % 5 == 0:
+                progress = int((page_index + 1) / max(total_pages, 1) * 30)
+                _emit_progress("ocr_layout", progress, f"scan tables {page_index + 1}/{total_pages}")
+
+        if not table_regions:
+            return []
+
+        _emit_progress("ocr", 35, f"ocr tables 0/{len(table_regions)}")
+
+        def _progress_adapter(completed: int, total: int) -> None:
+            """把 OcrProcessor 的 completed/total 进度转换为提取器统一的进度格式。"""
+
+            progress = 35 + int(completed / max(total, 1) * 15)
+            _emit_progress("ocr", progress, f"ocr tables {completed}/{total}")
+
+        return self.ocr_processor.process_ocr_concurrent(
+            table_regions,
+            progress_callback=_progress_adapter,
+        )
+
+    def _insert_ocr_results_into_chapters(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+        ocr_results: List[OcrResult],
+    ) -> Dict[str, int]:
+        """把成功识别的表格 OCR 文本追加到同页最可能的小节正文中。"""
+
+        stats = {
+            "table_count": len(ocr_results),
+            "success_count": 0,
+            "inserted_count": 0,
+        }
+        if not chapters or not ocr_results:
+            return stats
+
+        successful_results = [
+            result for result in ocr_results
+            if getattr(result, "success", False) and str(getattr(result, "text", "") or "").strip()
+        ]
+        stats["success_count"] = len(successful_results)
+
+        for ocr_result in sorted(successful_results, key=lambda item: (item.page_num, item.bbox[1], item.bbox[0])):
+            # 轻量提取器在切分后不再保留文本块坐标,因此使用页码范围作为 OCR 回填的稳定定位信号。
+            target = self._find_ocr_target_section(chapters, ocr_result.page_num)
+            if target is None:
+                continue
+
+            _, _, payload = target
+            original_content = str(payload.get("content", "") or "").strip()
+            if original_content == EMPTY_SECTION_PLACEHOLDER:
+                original_content = ""
+
+            ocr_text = str(ocr_result.text or "").strip()
+            table_text = f"{TABLE_OCR_START}\n{ocr_text}\n{TABLE_OCR_END}"
+            payload["content"] = f"{original_content}\n\n{table_text}".strip()
+            payload["page_start"] = min(
+                self._safe_page_number(payload.get("page_start"), ocr_result.page_num),
+                ocr_result.page_num,
+            )
+            payload["page_end"] = max(
+                self._safe_page_number(payload.get("page_end"), ocr_result.page_num),
+                ocr_result.page_num,
+            )
+            stats["inserted_count"] += 1
+
+        return stats
+
+    def _find_ocr_target_section(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+        page_num: int,
+    ) -> Optional[Tuple[str, str, Dict[str, Any]]]:
+        """查找页码范围最能覆盖 OCR 表格所在页的小节。"""
+
+        candidates: List[Tuple[int, int, str, str, Dict[str, Any]]] = []
+        fallback: Optional[Tuple[str, str, Dict[str, Any]]] = None
+
+        for chapter_title, sections in chapters.items():
+            if not isinstance(sections, dict):
+                continue
+
+            for section_title, payload in sections.items():
+                if not isinstance(payload, dict):
+                    continue
+
+                page_start = self._safe_page_number(payload.get("page_start"), page_num)
+                page_end = self._safe_page_number(payload.get("page_end"), page_start)
+                if section_title == SECTION_TITLE_KEY:
+                    if fallback is None and page_start <= page_num <= page_end:
+                        fallback = (chapter_title, section_title, payload)
+                    continue
+
+                # 优先选择页码范围最窄的小节,过宽的范围通常是章节级内容外溢。
+                if page_start <= page_num <= page_end:
+                    span = max(page_end - page_start, 0)
+                    candidates.append((span, -page_start, chapter_title, section_title, payload))
+                elif page_start <= page_num:
+                    fallback = (chapter_title, section_title, payload)
+
+        if candidates:
+            _, _, chapter_title, section_title, payload = min(candidates, key=lambda item: (item[0], item[1]))
+            return chapter_title, section_title, payload
+        return fallback
+
+    def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
+        """读取裁剪后的页面文本,规范化正文行,并移除重复的非标题噪声。"""
+
+        page_lines_by_page: List[Tuple[int, List[str]]] = []
+        total_pages = len(doc)
+
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            text = page.get_text("text", clip=clip_box)
+
+            page_lines: List[str] = []
+            for line in self._prepare_page_lines(text):
+                stripped = line.strip()
+                if not stripped or self._is_header_footer(stripped):
+                    continue
+                page_lines.append(stripped)
+
+            page_lines_by_page.append((page_index + 1, page_lines))
+
+            if progress_callback and (page_index + 1 == total_pages or (page_index + 1) % 10 == 0):
+                try:
+                    progress_callback(
+                        "正文抽取",
+                        int((page_index + 1) / max(total_pages, 1) * 60),
+                        f"读取正文页 {page_index + 1}/{total_pages}",
+                    )
+                except Exception:
+                    pass
+
+        # 页眉页脚往往跨页重复,但真实标题不能被误删,所以只移除“不像标题”的重复行。
+        repeated_noise_keys = self._find_repeated_non_heading_lines(page_lines_by_page, total_pages)
+        body_lines: List[BodyLine] = []
+        for page, lines in page_lines_by_page:
+            for line in lines:
+                if self._normalize_repeated_line_key(line) in repeated_noise_keys:
+                    continue
+                body_lines.append(BodyLine(page=page, text=line))
+        return body_lines
+
+    def _extract_body_with_best_rule(
+        self,
+        body_lines: List[BodyLine],
+    ) -> Tuple[Dict[str, Dict[str, List[Dict[str, Any]]]], Optional[str], float, Dict[str, Any]]:
+        """运行所有候选标题规则,并返回评分最高的正文结构。"""
+
+        total_raw_chars = sum(len(item.text.strip()) for item in body_lines if item.text.strip())
+        best_score = -9999
+        best_rule_name: Optional[str] = None
+        best_data: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
+        best_coverage = 0.0
+        rule_performance: Dict[str, Any] = {}
+
+        for rule_name, rule_set in self.RULE_LIB.items():
+            data = self._extract_with_rule(body_lines, rule_name, rule_set)
+            score, coverage_rate = self._evaluate_extraction(data, total_raw_chars)
+            l1_count = len(data)
+            l2_count = sum(
+                len([key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY])
+                for sections in data.values()
+            )
+            if (
+                rule_name == CN_LIST_L1_NUMERIC_L2_RULE
+                and not self._is_viable_cn_list_l1_numeric_l2_structure(data, l1_count, l2_count)
+            ):
+                score -= 1500
+            rule_performance[rule_name] = {
+                "score": score,
+                "coverage_rate": f"{coverage_rate * 100:.1f}%",
+                "l1_count": l1_count,
+                "l2_count": l2_count,
+            }
+
+            # 规则选择以综合得分为主,覆盖率保留用于兜底过滤和诊断输出。
+            if score > best_score:
+                best_score = score
+                best_rule_name = rule_name
+                best_data = data
+                best_coverage = coverage_rate
+
+        if best_score <= 0 or best_coverage < 0.15:
+            return {}, best_rule_name, best_coverage, rule_performance
+
+        return best_data, best_rule_name, best_coverage, rule_performance
+
    def _extract_with_rule(
        self,
        body_lines: List[BodyLine],
        rule_name: str,
        rule_set: Dict[str, re.Pattern],
    ) -> Dict[str, Dict[str, List[Dict[str, Any]]]]:
        """Split body lines into chapter/section buckets using one heading rule.

        Runs a small state machine over the lines: the current L1/L2 titles, a
        pending heading fragment awaiting its continuation line, and a one-level
        backup of the previous chapter used to roll back mis-detected L1
        headings. Returns ``{chapter: {section: [{"text", "page"}, ...],
        "_chapter_page": int}}``.
        """

        structured_data: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
        current_l1: Optional[str] = None
        current_l1_num = 0
        current_l2: Optional[str] = None
        pending_prefix: Optional[str] = None
        pending_page: Optional[int] = None
        last_l2_sub_num = 0

        # Backup of the previous chapter state, used to undo a false L1 match.
        backup_l1: Optional[str] = None
        backup_l1_num = 0
        backup_l2: Optional[str] = None
        backup_l2_sub_num = 0

        # Rules whose L2 headings are numeric "x.y" need number-sequence checks.
        is_numeric_l2 = rule_name in {
            "Rule_1_纯数字派",
            "Rule_2_混合章派",
            "Rule_3_中英混血派",
            CN_LIST_L1_NUMERIC_L2_RULE,
        }

        for index, item in enumerate(body_lines):
            # Handle cross-line heading fragments before chapter/section
            # matching, so a bare "第X章" on its own line keeps its title.
            original_line = item.text.strip()
            page = item.page
            if not original_line or original_line.isdigit():
                continue

            line = self._strip_leading_page_number_from_heading(original_line)
            if pending_prefix:
                line = f"{pending_prefix} {line}".strip()
                original_line = line
                page = pending_page or page
                pending_prefix = None
                pending_page = None

            if self._is_incomplete_heading_fragment(line) and len(line) <= 15:
                pending_prefix = line
                pending_page = page
                continue

            has_toc = self._is_toc_line(line)

            match_l1 = rule_set["l1"].match(line)
            if match_l1 and not has_toc:
                core_text = self._blind_strip(line)
                if len(core_text) < 2:
                    # Too little real text: treat as a fragment and wait for more.
                    pending_prefix = line
                    pending_page = page
                    continue

                if self._is_valid_heading_strict(line, is_l1=True):
                    l1_candidate_num = self._extract_l1_number(line, rule_name, match_l1, current_l1_num)

                    if rule_name == CN_LIST_L1_NUMERIC_L2_RULE:
                        if not self._has_expected_numeric_l2_ahead(body_lines, index, l1_candidate_num):
                            continue

                    if rule_name == "Rule_1_纯数字派":
                        # Bare-number L1 headings easily mis-match table rows or
                        # ordered lists, so extra sequence and noise checks apply.
                        if current_l1 is None and l1_candidate_num not in {1, 2}:
                            continue
                        if self._looks_like_plain_numeric_l1_noise(line):
                            continue

                    if rule_name in {"Rule_1_纯数字派", "Rule_2_混合章派"} and current_l1 is not None:
                        if l1_candidate_num < current_l1_num:
                            continue
                        if l1_candidate_num - current_l1_num > 2:
                            continue
                        if l1_candidate_num == current_l1_num:
                            # When the same chapter number reappears and the old
                            # chapter has no sections yet, treat the old one as a
                            # false positive and reclaim its content.
                            if not self._chapter_has_l2(structured_data.get(current_l1, {})):
                                old_preface = structured_data[current_l1].get(SECTION_TITLE_KEY, [])
                                old_page = self._safe_page_number(structured_data[current_l1].get("_chapter_page"), page)
                                restored = [{"text": current_l1, "page": old_page}] + old_preface
                                del structured_data[current_l1]

                                current_l1 = self._clean_chapter_title(line)
                                structured_data[current_l1] = {"_chapter_page": page}  # type: ignore[assignment]
                                if restored:
                                    structured_data[current_l1][SECTION_TITLE_KEY] = restored
                                current_l1_num = l1_candidate_num
                                current_l2 = None
                                last_l2_sub_num = 0
                            continue

                    backup_l1 = current_l1
                    backup_l1_num = current_l1_num
                    backup_l2 = current_l2
                    backup_l2_sub_num = last_l2_sub_num

                    current_l1 = self._clean_chapter_title(line)
                    current_l1_num = l1_candidate_num
                    structured_data.setdefault(current_l1, {"_chapter_page": page})  # type: ignore[assignment]
                    current_l2 = None
                    last_l2_sub_num = 0
                    continue

            match_l2 = rule_set["l2"].match(line)
            if current_l1 and match_l2 and not has_toc:
                if self._is_valid_heading_strict(line, is_l1=False):
                    if is_numeric_l2:
                        l2_main_num = int(match_l2.group(1))
                        l2_sub_num = int(match_l2.group(2))

                        if l2_main_num != current_l1_num and l2_main_num == backup_l1_num and backup_l1 is not None:
                            # A section number pointing at the previous chapter
                            # suggests the current L1 was a false heading;
                            # roll back to the backed-up chapter first.
                            has_l2 = self._chapter_has_l2(structured_data.get(current_l1, {}))
                            if not has_l2 and current_l1 in structured_data:
                                fake_preface = structured_data[current_l1].get(SECTION_TITLE_KEY, [])
                                chapter_page = self._safe_page_number(
                                    structured_data[current_l1].get("_chapter_page"),
                                    page,
                                )
                                text_to_restore = [{"text": current_l1, "page": chapter_page}] + fake_preface
                                target_node = backup_l2 or SECTION_TITLE_KEY
                                structured_data.setdefault(backup_l1, {"_chapter_page": chapter_page})  # type: ignore[arg-type]
                                structured_data[backup_l1].setdefault(target_node, []).extend(text_to_restore)
                                del structured_data[current_l1]
                                current_l1 = backup_l1
                                current_l1_num = backup_l1_num
                                current_l2 = backup_l2
                                last_l2_sub_num = backup_l2_sub_num

                        if l2_main_num != current_l1_num:
                            pass
                        elif l2_sub_num <= last_l2_sub_num:
                            pass
                        elif self._is_suspicious_numeric_l2_jump(l2_sub_num, last_l2_sub_num):
                            # Large jumps (e.g. 1.2 straight to 1.9) are usually
                            # in-text references, not a genuine new section.
                            pass
                        else:
                            current_l2 = self._clean_section_title(line)
                            last_l2_sub_num = l2_sub_num
                            self._ensure_section_node(structured_data, current_l1, current_l2, page)
                            continue
                    else:
                        l2_sub_num = self._extract_non_numeric_l2_number(match_l2.group(1))
                        if l2_sub_num <= last_l2_sub_num:
                            pass
                        else:
                            current_l2 = self._clean_section_title(line)
                            last_l2_sub_num = l2_sub_num
                            self._ensure_section_node(structured_data, current_l1, current_l2, page)
                            continue

            if current_l1 and not has_toc:
                # Any non-heading line accumulates into the current section
                # (or the chapter preface when no section is open yet).
                target_key = current_l2 or SECTION_TITLE_KEY
                self._ensure_section_node(structured_data, current_l1, target_key, page)
                structured_data[current_l1][target_key].append({"text": original_line, "page": page})

        # Drop chapters that ended up holding only the bookkeeping key.
        for chapter_title in list(structured_data.keys()):
            chapter_sections = structured_data[chapter_title]
            if list(chapter_sections.keys()) == ["_chapter_page"]:
                del structured_data[chapter_title]

        return structured_data
+
+    def _has_expected_numeric_l2_ahead(
+        self,
+        body_lines: List[BodyLine],
+        current_index: int,
+        chapter_number: int,
+    ) -> bool:
+        """校验中文序号一级标题后,是否跟着同主序号的数字二级标题。"""
+
+        if chapter_number <= 0 or current_index >= len(body_lines):
+            return False
+
+        start_page = body_lines[current_index].page
+        max_index = min(len(body_lines), current_index + 40)
+        max_page = start_page + 3
+        expected_pattern = re.compile(
+            rf"^{chapter_number}\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"
+        )
+
+        for next_index in range(current_index + 1, max_index):
+            candidate_item = body_lines[next_index]
+            if candidate_item.page > max_page:
+                break
+
+            candidate_line = self._strip_leading_page_number_from_heading(candidate_item.text.strip())
+            if not candidate_line or self._is_toc_line(candidate_line):
+                continue
+
+            if (
+                expected_pattern.match(candidate_line)
+                and self._is_valid_heading_strict(candidate_line, is_l1=False)
+            ):
+                return True
+
+            if next_index > current_index + 1 and any(
+                rule["l1"].match(candidate_line)
+                for name, rule in self.RULE_LIB.items()
+                if name != CN_LIST_L1_NUMERIC_L2_RULE
+            ):
+                break
+
+        return False
+
+    @staticmethod
+    def _is_viable_cn_list_l1_numeric_l2_structure(
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+        l1_count: int,
+        l2_count: int,
+    ) -> bool:
+        """限制新规则只在真正形成“中文章 + 数字小节”结构时参与竞争。"""
+
+        if l1_count < 2 or l2_count < 3:
+            return False
+
+        chapters_with_l2 = sum(
+            1
+            for sections in raw_data.values()
+            if any(key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY)
+        )
+        return chapters_with_l2 >= max(2, (l1_count + 1) // 2)
+
+    def _convert_rule_output_to_chapters(
+        self,
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
+        """把规则提取出的临时结构转换为最终 chapters JSON 结构。"""
+
+        chapters: Dict[str, Dict[str, Dict[str, Any]]] = {}
+
+        for chapter_title, sections in raw_data.items():
+            chapter_page = self._safe_page_number(sections.get("_chapter_page"), 1)
+            chapter_payloads: Dict[str, Dict[str, Any]] = {}
+
+            for section_title, entries in sections.items():
+                if section_title.startswith("_"):
+                    continue
+
+                if entries:
+                    page_start = self._safe_page_number(entries[0].get("page"), chapter_page)
+                    page_end = self._safe_page_number(entries[-1].get("page"), page_start)
+                    content = "\n".join(str(entry.get("text", "") or "") for entry in entries).strip()
+                else:
+                    page_start = chapter_page
+                    page_end = chapter_page
+                    content = ""
+
+                chapter_payloads[section_title] = {
+                    "content": content or EMPTY_SECTION_PLACEHOLDER,
+                    "page_start": page_start,
+                    "page_end": page_end,
+                }
+
+            chapter_payloads.setdefault(
+                SECTION_TITLE_KEY,
+                {"content": "", "page_start": chapter_page, "page_end": chapter_page},
+            )
+            chapters[chapter_title] = chapter_payloads
+
+        return chapters
+
+    def _evaluate_extraction(
+        self,
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+        total_raw_chars: int,
+    ) -> Tuple[int, float]:
+        """根据章节数量、小节数量、空章节比例和正文覆盖率评估规则效果。"""
+
+        if not raw_data or total_raw_chars == 0:
+            return 0, 0.0
+
+        l1_count = len(raw_data)
+        l2_total_count = sum(
+            len([key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY])
+            for sections in raw_data.values()
+        )
+
+        extracted_chars = 0
+        empty_l1_count = 0
+
+        for chapter_title, sections in raw_data.items():
+            extracted_chars += len(chapter_title)
+            chapter_has_content = False
+
+            for section_title, entries in sections.items():
+                if section_title.startswith("_"):
+                    continue
+                if section_title != SECTION_TITLE_KEY:
+                    extracted_chars += len(section_title)
+                content = "\n".join(str(entry.get("text", "") or "") for entry in entries).strip()
+                if content:
+                    extracted_chars += len(content)
+                    chapter_has_content = True
+
+            if not chapter_has_content:
+                empty_l1_count += 1
+
+        raw_coverage_rate = extracted_chars / total_raw_chars if total_raw_chars > 0 else 0.0
+        coverage_rate = min(raw_coverage_rate, 1.0)
+        score = 0
+
+        if 2 <= l1_count <= 25:
+            score += l1_count * 15
+        elif l1_count > 25:
+            score += 100
+
+        score += l2_total_count * 5
+
+        if l1_count > 0 and (empty_l1_count / l1_count) > 0.8:
+            score -= 500
+
+        if raw_coverage_rate > 0.8:
+            score += int(min(raw_coverage_rate, 1.0) * 1000)
+        elif raw_coverage_rate < 0.5:
+            score -= 1000
+
+        return score, coverage_rate
+
+    def _build_body_catalog_from_chapters(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+    ) -> Optional[Dict[str, Any]]:
+        """Reverse-build a body_catalog/catalog structure from body-split chapter results.
+
+        Returns None when ``chapters`` is empty or yields no usable chapter entries.
+        """
+
+        if not chapters:
+            return None
+
+        catalog_chapters: List[Dict[str, Any]] = []
+        for chapter_title, sections in chapters.items():
+            if not isinstance(sections, dict):
+                continue
+
+            page_start, page_end = self._resolve_chapter_page_span(sections)
+            # SECTION_TITLE_KEY carries the chapter-title payload, not a real subsection.
+            title_payload = sections.get(SECTION_TITLE_KEY, {})
+            catalog_chapter = {
+                "index": len(catalog_chapters) + 1,
+                "title": chapter_title,
+                "page": str(page_start),
+                "original": chapter_title,
+                "content": title_payload.get("content", "") if isinstance(title_payload, dict) else "",
+                "page_start": page_start,
+                "page_end": page_end,
+                "subsections": [],
+            }
+
+            for section_title, payload in sections.items():
+                if section_title == SECTION_TITLE_KEY or not isinstance(payload, dict):
+                    continue
+                subsection_page_start = self._safe_page_number(payload.get("page_start"), page_start)
+                subsection_page_end = self._safe_page_number(payload.get("page_end"), subsection_page_start)
+                catalog_chapter["subsections"].append({
+                    "title": section_title,
+                    "page": str(subsection_page_start),
+                    "level": 2,
+                    "original": section_title,
+                    "content": payload.get("content", ""),
+                    "page_start": subsection_page_start,
+                    "page_end": subsection_page_end,
+                })
+
+            catalog_chapters.append(catalog_chapter)
+
+        if not catalog_chapters:
+            return None
+
+        return {
+            "chapters": catalog_chapters,
+            "total_chapters": len(catalog_chapters),
+            "source": "body_titles",
+            "formatted_text": self._format_catalog_chapters(catalog_chapters),
+        }
+
+    @classmethod
+    def _prepare_page_lines(cls, text: str) -> List[str]:
+        """Split raw page text into lines, pre-merging headings split across line breaks."""
+
+        raw_lines = [line.strip() for line in (text or "").splitlines() if line.strip()]
+        prepared: List[str] = []
+        index = 0
+
+        while index < len(raw_lines):
+            # _merge_heading_fragment may consume 2-3 lines at once.
+            merged_line, consumed = cls._merge_heading_fragment(raw_lines, index)
+            if merged_line:
+                prepared.append(merged_line)
+                index += consumed
+                continue
+            prepared.append(raw_lines[index])
+            index += 1
+
+        return prepared
+
+    @classmethod
+    def _merge_heading_fragment(cls, lines: List[str], start_index: int) -> Tuple[Optional[str], int]:
+        """Try to merge the 2-3 lines starting at ``start_index`` into one complete heading.
+
+        Returns (merged_text, consumed_line_count), or (None, 1) when no merge applies.
+        """
+
+        first_line = lines[start_index].strip()
+        if not first_line:
+            return None, 1
+
+        first_normalized = cls._strip_leading_page_number_from_heading(first_line)
+        first_is_heading = cls._matches_any_heading(first_normalized)
+        first_is_incomplete = cls._is_incomplete_heading_fragment(first_normalized)
+        max_span = min(3, len(lines) - start_index)
+
+        for span in range(2, max_span + 1):
+            candidate_lines = [
+                cls._strip_leading_page_number_from_heading(lines[start_index + offset])
+                for offset in range(span)
+            ]
+            candidate_text = " ".join(item for item in candidate_lines if item).strip()
+            if not candidate_text or cls.TOC_PATTERN.search(candidate_text):
+                continue
+            if cls._looks_like_table_fragment(lines, start_index, span):
+                continue
+            if not cls._matches_any_heading(candidate_text):
+                continue
+            # Merge only when the first line alone is not already a complete heading.
+            if first_is_incomplete or not first_is_heading:
+                return candidate_text, span
+
+        return None, 1
+
+    @classmethod
+    def _looks_like_table_fragment(cls, lines: List[str], start_index: int, span: int) -> bool:
+        """Return True when the multi-line heading candidate looks like table-cell fragments."""
+
+        # Only numeric-only first lines (e.g. "3" or "3.2") are suspect table cells.
+        first_line = lines[start_index].strip()
+        if not re.fullmatch(r"\d{1,2}(?:\.\d{1,2})?", first_line):
+            return False
+
+        next_lines = [lines[idx].strip() for idx in range(start_index + 1, min(len(lines), start_index + 5))]
+        if next_lines and cls._is_short_table_cell(next_lines[0]):
+            return True
+        return sum(1 for item in next_lines if cls._is_short_table_cell(item) or cls._looks_like_quantity_cell(item)) >= 2
+
+    @classmethod
+    def _find_repeated_non_heading_lines(
+        cls,
+        page_lines_by_page: List[Tuple[int, List[str]]],
+        total_pages: int,
+    ) -> set[str]:
+        """Find header/footer-like noise lines repeated across pages that are not headings."""
+
+        if total_pages < 3:
+            return set()
+
+        pages_by_key: Dict[str, set[int]] = {}
+        for page, lines in page_lines_by_page:
+            for line in lines:
+                key = cls._normalize_repeated_line_key(line)
+                if not key or not (4 <= len(key) <= 80):
+                    continue
+                normalized = cls._strip_leading_page_number_from_heading(line)
+                if cls._matches_any_heading(normalized) or cls._is_toc_line(normalized):
+                    continue
+                pages_by_key.setdefault(key, set()).add(page)
+
+        # Roughly one occurrence per 12 pages, but never fewer than 3 distinct pages.
+        threshold = max(3, (total_pages + 11) // 12)
+        return {key for key, pages in pages_by_key.items() if len(pages) >= threshold}
+
+    @staticmethod
+    def _normalize_repeated_line_key(line: str) -> str:
+        """Build the whitespace-free key used for repeated-line detection."""
+
+        return re.sub(r"\s+", "", str(line or "").strip())
+
+    @classmethod
+    def _matches_any_heading(cls, line: str) -> bool:
+        """Return True when the text matches any chapter/section rule in RULE_LIB."""
+
+        clean_line = line.strip()
+        return any(rule["l1"].match(clean_line) or rule["l2"].match(clean_line) for rule in cls.RULE_LIB.values())
+
+    @classmethod
+    def _is_incomplete_heading_fragment(cls, line: str) -> bool:
+        """Detect a number-only/prefix-only heading fragment that awaits the next line to complete it."""
+
+        clean_line = re.sub(r"\s+", "", str(line or "").strip())
+        if not clean_line:
+            return False
+
+        # NOTE(review): the class [章部部分篇] matches single characters, so "第X部分"
+        # (two chars) cannot match as written — confirm this mirrors RULE_LIB on purpose.
+        fragment_patterns = (
+            r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部部分篇]$",
+            r"^\d{1,2}(?:[\..。、])?$",
+            r"^\d{1,2}\.\d{1,2}(?!\.\d)\.?$",
+            r"^[一二三四五六七八九十百零两]+[、))\]]$",
+            r"^第(?:\d+|[一二三四五六七八九十百零两]+)节$",
+            r"^[【\[]\d+[\]】]$",
+        )
+        return any(re.match(pattern, clean_line) for pattern in fragment_patterns)
+
+    @classmethod
+    def _is_toc_line(cls, line: str) -> bool:
+        """Return True when a text line looks like a table-of-contents entry."""
+
+        clean_line = str(line or "").strip()
+        if cls.TOC_PATTERN.search(clean_line):
+            return True
+        # Fallback: a trailing page number separated by wide whitespace.
+        return bool(re.search(r"\s{2,}\d{1,3}$", clean_line))
+
+    @classmethod
+    def _is_header_footer(cls, line: str) -> bool:
+        """Filter page numbers, headers/footers and repeated plan names out of body text."""
+
+        compact = re.sub(r"\s+", "", str(line or "").strip())
+        if not compact:
+            return False
+        if compact.isdigit():
+            return True
+        if re.fullmatch(r"第\d+页(?:共\d+页)?", compact):
+            return True
+        if re.fullmatch(r"第\d+页/共\d+页", compact):
+            return True
+        # Roman-numeral front-matter page numbers.
+        if compact.upper() in {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"}:
+            return True
+        # Keep TOC markers so the caller can still detect the contents page.
+        if compact in {"目录", "目", "录"}:
+            return False
+        normalized = cls._strip_leading_page_number_from_heading(line)
+        return not cls._matches_any_heading(normalized) and compact in {"专项方案", "施工方案"}
+
+    @classmethod
+    def _is_valid_heading_strict(cls, line: str, is_l1: bool = False) -> bool:
+        """Reject suspected false-positive headings via length, punctuation, unit and quantity rules."""
+
+        clean_line = str(line or "").strip()
+        if not (2 <= len(clean_line) <= 60):
+            return False
+        if clean_line.endswith(("、", ",", "。", ";", ":", ",", ";", ":")):
+            return False
+        if len(clean_line.split()) > 3:
+            return False
+        if re.search(r"[\((][A-Za-z\*/]+[\))]\s*$", clean_line):
+            return False
+
+        # A leading number followed by a measurement unit marks data, not a heading.
+        unit_pattern = (
+            r"(?:版|版本|年一遇|倍|t|m|kg|cm|mm|km|m2|m3|㎡|m³|L|ml|MPa|kPa|kN|Hz|kW|KV|"
+            r"千克|公斤|千米|公里|平方米|立方米|平方|立方|分钟|小时|秒|工日|台班|台套|万元|亿元)"
+        )
+        if re.match(rf"^\d+(?:\.\d+)?\s*{unit_pattern}(?:\s|$|[\u4e00-\u9fa5])", clean_line, re.IGNORECASE):
+            return False
+
+        quantity_pattern = (
+            r"(?:人|名|位|个|组|班|件|项|把|根|台|套|辆|部|架|座|块|片|张|卷|桶|袋|车|"
+            r"号|步|天|吨|箱|艘|磅|米|升|斤|两|次|条|孔|跨|排|层)"
+        )
+        if re.match(rf"^\d+(?:\.\d+)?\s*{quantity_pattern}(?:\s|$)", clean_line, re.IGNORECASE):
+            return False
+
+        if is_l1:
+            # Level-1 headings face stricter rules: small chapter numbers, no sentence punctuation.
+            if re.match(r"^0\d+", clean_line):
+                return False
+            number_match = re.search(r"^\d+|第\s*(\d+)", clean_line)
+            if number_match:
+                raw_number = number_match.group(1) or number_match.group(0)
+                if raw_number.isdigit() and int(raw_number) > 30:
+                    return False
+            if re.search(r"[,。!!,??;;::]", clean_line):
+                return False
+            if len(clean_line) > 35:
+                return False
+            if cls._looks_like_numbered_quantity_line(clean_line):
+                return False
+
+        return True
+
+    @staticmethod
+    def _looks_like_numbered_quantity_line(line: str) -> bool:
+        """Return True when a digit-led line reads like a quantity list rather than a level-1 heading."""
+
+        clean_line = re.sub(r"\s+", "", str(line or "").strip())
+        return bool(
+            re.match(
+                r"^\d+(?:号|步|天|吨|套|件|箱|把|根|辆|部|艘|块|片|张|卷|桶|袋|车|磅|米|升|斤|两|秒)",
+                clean_line,
+            )
+        )
+
+    @staticmethod
+    def _looks_like_plain_numeric_l1_noise(line: str) -> bool:
+        """Detect figure captions, standard names and role names mis-hit by the plain-numeric L1 rule."""
+
+        clean_line = re.sub(r"\s+", " ", str(line or "").strip())
+        match = re.match(r"^\d{1,2}(?:[\..。、])?\s+(.+)$", clean_line)
+        if not match:
+            return False
+
+        title = match.group(1).strip()
+        compact = re.sub(r"\s+", "", title)
+        if not compact:
+            return True
+
+        # Figure captions ("... diagram") are never chapter headings.
+        figure_terms = (
+            "示意图",
+            "布置图",
+            "断面图",
+            "构造图",
+            "大样图",
+            "详图",
+            "平面图",
+            "立面图",
+            "剖面图",
+            "流程图",
+            "曲线图",
+        )
+        if any(term in compact for term in figure_terms):
+            return True
+
+        # Cited standards/regulations, org roles, and load/seismic parameters are noise too.
+        if re.search(r"(规范|标准|规程|指南|办法|条例|规定|导则|手册|文件)$", compact):
+            return True
+
+        if re.search(r"(部|室|经理|总工|部长|主任|办公室|试验室)$", compact):
+            return True
+
+        if re.search(r"(地震动|峰值加速度|反应谱|特征周期|场地类别|荷载组合|荷载标准值|分项系数)", compact):
+            return True
+
+        chapter_keywords = (
+            "工程",
+            "编制",
+            "施工",
+            "安全",
+            "质量",
+            "环保",
+            "水保",
+            "文明",
+            "应急",
+            "验收",
+            "计算",
+            "附件",
+            "附录",
+            "总体",
+            "计划",
+            "组织",
+            "管理",
+            "保证",
+            "措施",
+            "方案",
+            "工艺",
+            "技术",
+            "要求",
+            "概况",
+            "依据",
+            "原则",
+            "资源",
+            "设备",
+            "材料",
+            "人员",
+            "进度",
+            "监测",
+            "风险",
+            "分析",
+            "检查",
+            "图纸",
+            "设计",
+            "部署",
+            "安排",
+        )
+        # Treat as noise unless the title contains a typical chapter keyword.
+        return not any(keyword in compact for keyword in chapter_keywords)
+
+    @staticmethod
+    def _is_suspicious_numeric_l2_jump(l2_sub_num: int, last_l2_sub_num: int) -> bool:
+        """Return True when a numeric subsection number jumps suspiciously far ahead (gap > 3)."""
+
+        if last_l2_sub_num <= 0:
+            return False
+        return l2_sub_num - last_l2_sub_num > 3
+
+    @staticmethod
+    def _is_short_table_cell(text: str) -> bool:
+        """Return True when text looks like a short table cell (<=4 CJK/latin chars or a bare number)."""
+
+        clean = str(text or "").strip()
+        if not clean:
+            return False
+        if len(clean) <= 4 and re.fullmatch(r"[\u4e00-\u9fa5A-Za-z]{1,4}", clean):
+            return True
+        return bool(re.fullmatch(r"\d+(?:\.\d+)?", clean))
+
+    @staticmethod
+    def _looks_like_quantity_cell(text: str) -> bool:
+        """Return True when text looks like a quantity, unit, or status table cell."""
+
+        clean = str(text or "").strip()
+        if not clean:
+            return False
+        # Common status words seen in equipment/material tables.
+        if clean in {"正常", "可使用", "若干", "大量"}:
+            return True
+        return bool(
+            re.match(
+                r"^\d+(?:\.\d+)?\s*(?:台|套|辆|部|架|座|个|件|人|m|km|cm|mm|kg|t|%)",
+                clean,
+                re.IGNORECASE,
+            )
+        )
+
+    @staticmethod
+    def _blind_strip(text: str) -> str:
+        """Crudely strip heading-number prefixes to gauge the remaining heading-core length."""
+
+        return re.sub(
+            r"^[第的一二三四五六七八九十百零两\d\.\s、))\]】\[((章节部部分篇]+",
+            "",
+            str(text or ""),
+        ).strip()
+
+    @classmethod
+    def _strip_leading_page_number_from_heading(cls, line: str) -> str:
+        """Remove a page number accidentally glued in front of a heading line."""
+
+        cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+        if not cleaned:
+            return ""
+
+        # Drop a 1-3 digit prefix only when what follows is a recognizable heading shape.
+        return re.sub(
+            r"^\d{1,3}\s+(?="
+            r"(?:第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇])|"
+            r"(?:\d{1,2}\.\d{1,2}(?!\.\d)\.?\s*[\u4e00-\u9fa5])|"
+            r"(?:\d{1,2}\s+[\u4e00-\u9fa5])|"
+            r"(?:[一二三四五六七八九十百零两]+[、))\]]\s*[\u4e00-\u9fa5])|"
+            r"(?:[【\[]\s*\d+\s*[\]】]\s*[\u4e00-\u9fa5])"
+            r")",
+            "",
+            cleaned,
+            count=1,
+        ).strip()
+
+    @classmethod
+    def _extract_l1_number(
+        cls,
+        line: str,
+        rule_name: str,
+        match_l1: re.Match[str],
+        current_l1_num: int,
+    ) -> int:
+        """Extract the chapter number from a level-1 heading; fall back to ``current_l1_num + 1``."""
+
+        if rule_name == "Rule_1_纯数字派":
+            number_match = re.match(r"^(\d+)", line)
+            # 999 acts as an out-of-range sentinel when no leading digits are found.
+            return int(number_match.group(1)) if number_match else 999
+
+        if rule_name == "Rule_2_混合章派":
+            return int(match_l1.group(1))
+
+        if rule_name == CN_LIST_L1_NUMERIC_L2_RULE:
+            cn_match = re.match(r"^([一二三四五六七八九十百零两]+)[、))\]]", line)
+            if cn_match:
+                return cls._cn_to_int(cn_match.group(1))
+
+        chapter_match = re.search(r"^第\s*(\d+|[一二三四五六七八九十百零两]+)", line)
+        if chapter_match:
+            chapter_number = chapter_match.group(1)
+            return int(chapter_number) if chapter_number.isdigit() else cls._cn_to_int(chapter_number)
+
+        return current_l1_num + 1
+
+    @classmethod
+    def _extract_non_numeric_l2_number(cls, prefix: str) -> int:
+        """Convert a non-numeric subsection prefix into an integer usable for ordering checks."""
+
+        prefix = str(prefix or "").strip()
+        if prefix.isdigit():
+            return int(prefix)
+        return cls._cn_to_int(prefix)
+
+    @classmethod
+    def _cn_to_int(cls, text: str) -> int:
+        """Convert Chinese-numeral text to an integer (0 when unparsable)."""
+
+        # "两" is a colloquial variant of "二".
+        normalized = str(text or "").replace("两", "二").strip()
+        if not normalized:
+            return 0
+        if normalized.isdigit():
+            return int(normalized)
+        if normalized == "十":
+            return 10
+        if "百" in normalized:
+            left, right = normalized.split("百", 1)
+            hundreds = cls.CN_NUM_MAP.get(left, 1) if left else 1
+            return hundreds * 100 + cls._cn_to_int(right)
+        if "十" in normalized:
+            left, right = normalized.split("十", 1)
+            tens = cls.CN_NUM_MAP.get(left, 1) if left else 1
+            ones = cls.CN_NUM_MAP.get(right, 0) if right else 0
+            return tens * 10 + ones
+        return cls.CN_NUM_MAP.get(normalized, 0)
+
+    @staticmethod
+    def _ensure_section_node(
+        structured_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+        chapter_title: str,
+        section_title: str,
+        page: int,
+    ) -> None:
+        """Ensure the chapter node and its section list exist, recording the chapter's first page."""
+
+        # "_chapter_page" is a metadata key living alongside section entries.
+        structured_data.setdefault(chapter_title, {"_chapter_page": page})  # type: ignore[assignment]
+        structured_data[chapter_title].setdefault(section_title, [])
+
+    @staticmethod
+    def _chapter_has_l2(chapter_data: Dict[str, Any]) -> bool:
+        """Return True when the temporary chapter dict already holds a real level-2 section."""
+
+        # Keys starting with "_" are metadata; SECTION_TITLE_KEY is the chapter-title slot.
+        return any(key for key in chapter_data.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY)
+
+    @staticmethod
+    def _strip_catalog_page_suffix(text: str) -> str:
+        """Strip trailing dot leaders and page numbers from a catalog line."""
+
+        cleaned = re.sub(r"\s+", " ", str(text or "").strip())
+        if not cleaned:
+            return ""
+        cleaned = re.sub(r"(?:[.\u2026\u00b7\u2022]{2,})[-\u2013\u2014 ]*\d+\s*$", "", cleaned).strip()
+        return re.sub(r"\s+\d{1,3}\s*$", "", cleaned).strip()
+
+    @classmethod
+    def _clean_chapter_title(cls, line: str) -> str:
+        """Normalize level-1 heading text, keeping the numbering prefix and title body."""
+
+        cleaned = cls._strip_catalog_page_suffix(line)
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+        # "第X章/部/篇 ..." style.
+        cn_match = re.match(r"^(第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇])[\s、::.-]*(.*)$", cleaned)
+        if cn_match:
+            prefix = re.sub(r"\s+", "", cn_match.group(1))
+            title = cn_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "一、..." Chinese-list style.
+        cn_list_match = re.match(r"^([一二三四五六七八九十百零两]+[、))\]])\s*(.*)$", cleaned)
+        if cn_list_match:
+            prefix = cn_list_match.group(1).strip()
+            title = cn_list_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "1. ..." plain-numeric style.
+        num_match = re.match(r"^(\d{1,2})(?:[\..。、])?\s*(.*)$", cleaned)
+        if num_match:
+            prefix = num_match.group(1)
+            title = num_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        return cleaned
+
+    @classmethod
+    def _clean_section_title(cls, line: str) -> str:
+        """Normalize level-2 heading text, keeping the subsection numbering prefix and title body."""
+
+        cleaned = cls._strip_catalog_page_suffix(line)
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+        # "1.2 ..." numeric subsection style.
+        numeric_match = re.match(r"^(\d+\.\d+)(?!\.\d)\.?\s*(.*)$", cleaned)
+        if numeric_match:
+            prefix = numeric_match.group(1)
+            title = numeric_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "第X节 ..." style.
+        cn_section_match = re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)[\s、::.-]*(.*)$", cleaned)
+        if cn_section_match:
+            prefix = re.sub(r"\s+", "", cn_section_match.group(1))
+            title = cn_section_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "一、..." Chinese-list style.
+        cn_list_match = re.match(r"^([一二三四五六七八九十百零两]+[、))\]])\s*(.*)$", cleaned)
+        if cn_list_match:
+            prefix = cn_list_match.group(1).strip()
+            title = cn_list_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "【1】..." bracketed style.
+        bracket_match = re.match(r"^([【\[]\s*\d+\s*[\]】])\s*(.*)$", cleaned)
+        if bracket_match:
+            prefix = re.sub(r"\s+", "", bracket_match.group(1))
+            title = bracket_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        return cleaned
+
+    @staticmethod
+    def _resolve_chapter_page_span(sections: Dict[str, Dict[str, Any]]) -> Tuple[int, int]:
+        """Compute the chapter's overall (page_start, page_end) from all of its section spans."""
+
+        page_starts: List[int] = []
+        page_ends: List[int] = []
+        for payload in sections.values():
+            if not isinstance(payload, dict):
+                continue
+            page_start = PdfStructureExtractor._safe_page_number(payload.get("page_start"), 1)
+            page_end = PdfStructureExtractor._safe_page_number(payload.get("page_end"), page_start)
+            page_starts.append(page_start)
+            page_ends.append(page_end)
+        if not page_starts:
+            return 1, 1
+        return min(page_starts), max(page_ends)
+
+    @staticmethod
+    def _format_catalog_chapters(chapters: List[Dict[str, Any]]) -> str:
+        """Format the catalog chapter structure as human-readable plain text."""
+
+        lines: List[str] = []
+        for chapter in chapters:
+            title = str(chapter.get("title", "") or "").strip()
+            if not title:
+                continue
+            lines.append(title)
+            for subsection in chapter.get("subsections", []) or []:
+                sub_title = str(subsection.get("title", "") or "").strip()
+                if sub_title:
+                    lines.append(f"  {sub_title}")
+        return "\n".join(lines)
+
+    @staticmethod
+    def _safe_page_number(value: Any, default: int = 1) -> int:
+        """Safely coerce a page value to an int no smaller than 1, falling back to ``default``."""
+
+        try:
+            return max(1, int(str(value).strip()))
+        except Exception:
+            return default

+ 946 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -0,0 +1,946 @@
+from __future__ import annotations
+
+r"""
+Batch runner for PDF structure extraction.
+
+Example commands:
+
+1. Run the original extractor:
+python core\construction_review\component\minimal_pipeline\pdf_extractor_batch_runner.py c:\work\桥梁公司施工规范\最终 --output-dir c:\work\桥梁公司施工规范\结果_v1 --recursive --extractor pdf_extractor
+
+2. Run the testc-style body-only extractor:
+python core\construction_review\component\minimal_pipeline\pdf_extractor_batch_runner.py c:\work\桥梁公司施工规范\最终 --output-dir c:\work\桥梁公司施工规范\结果_v1 --recursive --extractor pdf_extractor1
+
+3. Run the split-catalog extractor:
+python core\construction_review\component\minimal_pipeline\pdf_extractor_batch_runner.py c:\work\桥梁公司施工规范\最终 --output-dir c:\work\桥梁公司施工规范\结果_v2 --recursive --extractor pdf_extractor2
+"""
+
+import argparse
+import importlib.util
+import json
+import re
+import sys
+import types
+from datetime import datetime
+from difflib import SequenceMatcher
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple
+
+import fitz
+
+
+REPO_ROOT = Path(__file__).resolve().parents[4]
+MODULE_DIR = Path(__file__).resolve().parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+
+# Pseudo-section keys injected by the extractor, not real document sections.
+SPECIAL_SECTION_KEYS = {"章节标题", "默认部分"}
+STAT_FILE_NAME = "static.text"
+# Dot/dash leaders ending in a page number mark TOC-style lines.
+TOC_LINE_PATTERN = re.compile(r"(?:[.\u2026·•…]{2,}|-{3,}).{0,30}\d+\s*$")
+TOC_PAGE_SUFFIX_PATTERN = re.compile(
+    r"(?:[.\u2026\u00b7\u2022·•…]{2,}|-{3,})[-\u2013\u2014 ]*(?:-\s*)?\d{1,3}(?:\s*-)?\s*$"
+)
+# Heading shapes used to spot the first body page after the TOC.
+BODY_HEADING_PATTERNS = (
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇][\s、::.\-]*\S+"),
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇]\s*$"),
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、::.\-]*\S+"),
+    re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].{1,40}$"),
+    re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].{1,40}$"),
+)
+# Level-1 / level-2 catalog-line shapes for parsing a front-matter TOC.
+CATALOG_L1_PATTERNS = (
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇][\s、::.\-]*\S+"),
+    re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].{1,60}$"),
+)
+CATALOG_L2_PATTERNS = (
+    re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*[\u4e00-\u9fa5A-Za-z].*"),
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、::.\-]*\S+"),
+    re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].*"),
+    re.compile(r"^[【\[]\s*\d+\s*[\]】]\s*[\u4e00-\u9fa5A-Za-z].*"),
+)
+CATALOG_CN_LIST_PATTERN = re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].*")
+CATALOG_NUMERIC_SECTION_PATTERN = re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*[\u4e00-\u9fa5A-Za-z].*")
+
+
+class _SilentLogger:
+    """No-op logger standing in for the project logger during batch runs."""
+
+    def debug(self, *args, **kwargs) -> None:
+        pass
+
+    def info(self, *args, **kwargs) -> None:
+        pass
+
+    def warning(self, *args, **kwargs) -> None:
+        pass
+
+    def error(self, *args, **kwargs) -> None:
+        pass
+
+    def exception(self, *args, **kwargs) -> None:
+        pass
+
+    def critical(self, *args, **kwargs) -> None:
+        pass
+
+
+def install_silent_logger_module() -> None:
+    """Register stub ``foundation.observability.logger.loggering`` modules in sys.modules.
+
+    Lets the extractor import the project logger without pulling in the real
+    logging stack; idempotent if the module is already present.
+    """
+
+    module_name = "foundation.observability.logger.loggering"
+    if module_name in sys.modules:
+        return
+
+    silent_logger = _SilentLogger()
+
+    # Build the parent package chain foundation -> observability -> logger as needed.
+    foundation_module = sys.modules.get("foundation")
+    if foundation_module is None:
+        foundation_module = types.ModuleType("foundation")
+        foundation_module.__path__ = [str(REPO_ROOT / "foundation")]
+        sys.modules["foundation"] = foundation_module
+
+    observability_module = sys.modules.get("foundation.observability")
+    if observability_module is None:
+        observability_module = types.ModuleType("foundation.observability")
+        observability_module.__path__ = []
+        sys.modules["foundation.observability"] = observability_module
+        foundation_module.observability = observability_module
+
+    logger_package = sys.modules.get("foundation.observability.logger")
+    if logger_package is None:
+        logger_package = types.ModuleType("foundation.observability.logger")
+        logger_package.__path__ = []
+        sys.modules["foundation.observability.logger"] = logger_package
+        observability_module.logger = logger_package
+
+    loggering_module = types.ModuleType(module_name)
+    loggering_module.review_logger = silent_logger
+    loggering_module.server_logger = silent_logger
+    loggering_module.CompatibleLogger = _SilentLogger
+    sys.modules[module_name] = loggering_module
+    logger_package.loggering = loggering_module
+
+
+def ensure_local_package(package_name: str) -> None:
+    """Register a synthetic package rooted at this module's directory, if not already present."""
+
+    if package_name in sys.modules:
+        return
+
+    package_module = types.ModuleType(package_name)
+    package_module.__path__ = [str(MODULE_DIR)]
+    sys.modules[package_name] = package_module
+
+
+def load_module_from_file(module_name: str, file_path: Path) -> Any:
+    """Import ``file_path`` as ``module_name`` via importlib and return the module.
+
+    Raises ImportError when no loader can be built for the file.
+    """
+
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Unable to load module from: {file_path}")
+
+    module = importlib.util.module_from_spec(spec)
+    # Register before exec so intra-module relative imports resolve.
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def load_pdf_structure_extractor(extractor_module: str = "pdf_extractor") -> Any:
+    """Load the PdfStructureExtractor class from the chosen extractor module.
+
+    Installs the silent logger stub and a local package shim first, and loads
+    ocr_processor ahead of the extractor. Raises FileNotFoundError when the
+    requested extractor file does not exist.
+    """
+
+    install_silent_logger_module()
+
+    package_name = "_batch_local_minimal_pipeline"
+    ensure_local_package(package_name)
+
+    ocr_module_name = f"{package_name}.ocr_processor"
+    if ocr_module_name not in sys.modules:
+        load_module_from_file(ocr_module_name, MODULE_DIR / "ocr_processor.py")
+
+    module_stem = extractor_module.removesuffix(".py")
+    module_file = MODULE_DIR / f"{module_stem}.py"
+    if not module_file.exists():
+        raise FileNotFoundError(f"Extractor module not found: {module_file}")
+
+    pdf_module_name = f"{package_name}.{module_stem}"
+    if pdf_module_name not in sys.modules:
+        load_module_from_file(pdf_module_name, module_file)
+
+    return sys.modules[pdf_module_name].PdfStructureExtractor
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for the batch extraction run."""
+
+    parser = argparse.ArgumentParser(
+        description="Batch extract PDF structure with PdfStructureExtractor."
+    )
+    parser.add_argument(
+        "input_dir",
+        nargs="?",
+        default=".",
+        help="Directory containing PDF files. Default: current directory.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        default=None,
+        help="Directory for JSON outputs. Default: same directory as each PDF.",
+    )
+    parser.add_argument(
+        "--recursive",
+        action="store_true",
+        help="Scan PDF files recursively.",
+    )
+    parser.add_argument(
+        "--use-ocr",
+        action="store_true",
+        help="Enable OCR for table regions.",
+    )
+    parser.add_argument(
+        "--disable-toc",
+        action="store_true",
+        help="Disable TOC detection.",
+    )
+    parser.add_argument(
+        "--clip-top",
+        type=float,
+        default=60.0,
+        help="Top clip margin in points. Default: 60.",
+    )
+    parser.add_argument(
+        "--clip-bottom",
+        type=float,
+        default=60.0,
+        help="Bottom clip margin in points. Default: 60.",
+    )
+    parser.add_argument(
+        "--ocr-api-url",
+        default="http://183.220.37.46:25429/v1/chat/completions",
+        help="OCR API URL.",
+    )
+    parser.add_argument(
+        "--ocr-timeout",
+        type=int,
+        default=600,
+        help="OCR timeout in seconds. Default: 600.",
+    )
+    parser.add_argument(
+        "--ocr-api-key",
+        default="",
+        help="OCR API key.",
+    )
+    parser.add_argument(
+        "--toc-model-path",
+        default="config/yolo/best.pt",
+        help="TOC detector model path.",
+    )
+    parser.add_argument(
+        "--extractor",
+        default="pdf_extractor",
+        choices=["pdf_extractor", "pdf_extractor1", "pdf_extractor2"],
+        help="Extractor implementation to run. Default: pdf_extractor.",
+    )
+    return parser.parse_args()
+
+
+def iter_pdf_files(input_dir: Path, recursive: bool) -> List[Path]:
+    """Return all PDF files under ``input_dir`` (optionally recursive), sorted by path."""
+
+    pattern = "*.pdf"
+    files: Iterable[Path] = input_dir.rglob(pattern) if recursive else input_dir.glob(pattern)
+    return sorted(path for path in files if path.is_file())
+
+
+def _count_text_chars(text: str) -> int:
+    """Count characters over non-blank, stripped lines of ``text``."""
+    return sum(len(line.strip()) for line in text.splitlines() if line.strip())
+
+
+def _looks_like_toc_page(text: str) -> bool:
+    """Return True when a page's text looks like a table-of-contents page."""
+
+    lines = [line.strip() for line in (text or "").splitlines() if line.strip()]
+    if not lines:
+        return False
+
+    # Explicit TOC marker (simplified or traditional).
+    compact = re.sub(r"\s+", "", "\n".join(lines))
+    if "目录" in compact or "目錄" in compact:
+        return True
+
+    # Otherwise require at least 3 leader-dot / page-number style lines.
+    toc_like_count = sum(
+        1
+        for line in lines
+        if TOC_LINE_PATTERN.search(line) or TOC_PAGE_SUFFIX_PATTERN.search(line)
+    )
+    return toc_like_count >= 3
+
+
+def _strip_leading_page_number_for_heading(line: str) -> str:
+    """Strip a page number accidentally prefixed to a heading-like line."""
+
+    cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+    if not cleaned:
+        return ""
+
+    # Drop a 1-3 digit prefix only when a recognizable heading shape follows.
+    return re.sub(
+        r"^\d{1,3}\s+(?="
+        r"(?:第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])|"
+        r"(?:第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节)|"
+        r"(?:\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z])|"
+        r"(?:[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z])"
+        r")",
+        "",
+        cleaned,
+        count=1,
+    ).strip()
+
+
+def _looks_like_body_start_page(text: str) -> bool:
+    """Return True when the page text contains a body-style heading line (non-TOC)."""
+
+    for raw_line in (text or "").splitlines():
+        line = _strip_leading_page_number_for_heading(raw_line)
+        if not line or TOC_LINE_PATTERN.search(line):
+            continue
+        if any(pattern.match(line) for pattern in BODY_HEADING_PATTERNS):
+            return True
+    return False
+
+
+def _find_count_start_page_index(page_texts: List[str]) -> int:
+    """Locate the index of the first body-like page after the TOC (0 when no TOC is found)."""
+
+    toc_start_index: int | None = None
+    for index, text in enumerate(page_texts):
+        if _looks_like_toc_page(text):
+            toc_start_index = index
+            break
+
+    # Start scanning for body pages right after the first TOC page, if any.
+    search_start = toc_start_index + 1 if toc_start_index is not None else 0
+    for index, text in enumerate(page_texts[search_start:], search_start):
+        if _looks_like_toc_page(text):
+            continue
+        if _looks_like_body_start_page(text):
+            return index
+
+    return search_start if toc_start_index is not None else 0
+
+
+def _strip_catalog_count_line(line: str) -> str:
+    """Strip leaders/page-number suffixes from a catalog line (used before counting entries)."""
+
+    cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+    if not cleaned:
+        return ""
+
+    page_match = TOC_PAGE_SUFFIX_PATTERN.search(cleaned)
+    if page_match:
+        return cleaned[:page_match.start()].strip(" .\u2026\u00b7\u2022-\u2013\u2014")
+
+    # Fallback: trailing page number separated by wide whitespace.
+    return re.sub(r"\s{2,}\d{1,3}\s*$", "", cleaned).strip()
+
+
+def _iter_front_catalog_lines(pdf_path: Path, clip_top: float, clip_bottom: float) -> List[str]:
+    catalog_texts: List[str] = []
+    saw_catalog = False
+
+    with fitz.open(pdf_path) as doc:
+        for page_index in range(min(len(doc), 12)):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, clip_top, rect.width, rect.height - clip_bottom)
+            text = page.get_text("text", clip=clip_box)
+            if _looks_like_toc_page(text):
+                saw_catalog = True
+                catalog_texts.append(text)
+                continue
+            if saw_catalog:
+                break
+
+    lines: List[str] = []
+    for text in catalog_texts:
+        for raw_line in (text or "").splitlines():
+            line = _strip_catalog_count_line(raw_line)
+            if not line:
+                continue
+            compact = re.sub(r"\s+", "", line)
+            if compact in {"目录", "目", "录"}:
+                continue
+            lines.append(line)
+    return _merge_split_catalog_heading_lines(lines)
+
+
+def _merge_split_catalog_heading_lines(lines: List[str]) -> List[str]:
+    """Merge catalog lines where a bare "第X章" prefix was split from its title text."""
+
+    merged: List[str] = []
+    index = 0
+    split_chapter_pattern = re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*$")
+
+    while index < len(lines):
+        line = lines[index]
+        if split_chapter_pattern.match(line) and index + 1 < len(lines):
+            next_line = lines[index + 1].strip()
+            # Only merge when the next line is not itself a level-2 entry.
+            if next_line and not any(pattern.match(next_line) for pattern in CATALOG_L2_PATTERNS):
+                merged.append(f"{line} {next_line}")
+                index += 2
+                continue
+        merged.append(line)
+        index += 1
+
+    return merged
+
+
def _classify_catalog_line_level(
    line: str,
    next_line: str,
    saw_explicit_l1: bool,
) -> int | None:
    """Return 1 or 2 for a chapter/section catalog line, or None if unmatched.

    A Chinese enumerator line ("一、…") is promoted to level 1 only while no
    explicit chapter heading has been seen yet and the next line is a numeric
    section ("1.1 …"); otherwise it is treated as level 2.
    """
    if any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
        return 1

    if CATALOG_CN_LIST_PATTERN.match(line):
        promote = not saw_explicit_l1 and CATALOG_NUMERIC_SECTION_PATTERN.match(next_line)
        return 1 if promote else 2

    is_section = any(pattern.match(line) for pattern in CATALOG_L2_PATTERNS)
    return 2 if is_section else None
+
+
def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom: float) -> List[Dict[str, Any]]:
    """Parse the PDF's printed table of contents into {"level", "title"} items."""
    catalog_lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
    entries: List[Dict[str, Any]] = []
    has_explicit_l1 = False

    for position, current_line in enumerate(catalog_lines):
        following = catalog_lines[position + 1] if position + 1 < len(catalog_lines) else ""
        level = _classify_catalog_line_level(current_line, following, has_explicit_l1)
        if level is None:
            continue

        entries.append({"level": level, "title": current_line})
        # Only an explicit chapter pattern (not a promoted enumerator) flips the flag.
        if level == 1 and any(pattern.match(current_line) for pattern in CATALOG_L1_PATTERNS):
            has_explicit_l1 = True

    return entries
+
+
def count_catalog_item_levels(items: List[Dict[str, Any]]) -> Tuple[int, int]:
    """Return a (chapter_count, section_count) pair for catalog entries."""
    chapter_total = 0
    section_total = 0
    for entry in items:
        level = entry.get("level")
        if level == 1:
            chapter_total += 1
        elif level == 2:
            section_total += 1
    return chapter_total, section_total
+
+
def compute_original_catalog_counts(pdf_path: Path, clip_top: float, clip_bottom: float) -> Tuple[int, int]:
    """Count (chapter, section) entries in the PDF's own printed catalog."""
    items = extract_original_catalog_items(pdf_path, clip_top, clip_bottom)
    return count_catalog_item_levels(items)
+
+
def compute_raw_char_count_with_scope(pdf_path: Path, clip_top: float, clip_bottom: float) -> Tuple[int, int]:
    """Count body characters in the PDF, skipping leading cover/TOC pages.

    Returns (character_count, 1-based page number where counting started).
    """
    page_texts: List[str] = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            bounds = page.rect
            clip = fitz.Rect(0, clip_top, bounds.width, bounds.height - clip_bottom)
            page_texts.append(page.get_text("text", clip=clip))

    first_index = _find_count_start_page_index(page_texts)
    char_total = sum(_count_text_chars(text) for text in page_texts[first_index:])
    return char_total, first_index + 1
+
+
def compute_raw_char_count(pdf_path: Path, clip_top: float, clip_bottom: float) -> int:
    """Body character count only; the counting-scope start page is discarded."""
    char_total, _start_page = compute_raw_char_count_with_scope(pdf_path, clip_top, clip_bottom)
    return char_total
+
+
def compute_extracted_char_count(result: Dict[str, Any]) -> int:
    """Sum the characters captured by extraction: titles plus section content.

    Special bookkeeping keys (``SPECIAL_SECTION_KEYS``) do not count as
    section titles; section payloads may be dicts with a "content" field or
    plain strings.
    """
    total_chars = 0
    for raw_chapter_title, sections in (result.get("chapters", {}) or {}).items():
        # len("") == 0, so adding unconditionally matches the guarded original.
        total_chars += len(str(raw_chapter_title or "").strip())

        if not isinstance(sections, dict):
            continue

        for raw_section_title, payload in sections.items():
            title = str(raw_section_title or "").strip()
            if title and title not in SPECIAL_SECTION_KEYS:
                total_chars += len(title)

            if isinstance(payload, dict):
                body = str(payload.get("content", "") or "").strip()
            else:
                body = str(payload or "").strip()
            total_chars += _count_text_chars(body)
    return total_chars
+
+
def compute_quality_rate(raw_char_count: int, extracted_char_count: int) -> Tuple[float, str]:
    """Return the extraction ratio clamped to [0, 1] plus its percent label."""
    if raw_char_count <= 0:
        return 0.0, "0.0%"

    ratio = min(1.0, max(0.0, extracted_char_count / raw_char_count))
    return ratio, f"{ratio * 100:.1f}%"
+
+
def count_sections(result: Dict[str, Any]) -> int:
    """Count real (non-special) section entries across all chapters."""
    section_total = 0
    for sections in (result.get("chapters", {}) or {}).values():
        if isinstance(sections, dict):
            section_total += sum(1 for name in sections if name not in SPECIAL_SECTION_KEYS)
    return section_total
+
+
+def _catalog_title_from_entry(entry: Any) -> str:
+    if isinstance(entry, str):
+        return entry.strip()
+    if not isinstance(entry, dict):
+        return ""
+
+    for key in ("title", "name", "text", "chapter_title", "section_title", "heading", "original"):
+        value = str(entry.get(key, "") or "").strip()
+        if value:
+            return value
+    return ""
+
+
def _iter_catalog_subsections(chapter: Dict[str, Any]) -> Iterable[Any]:
    """Yield a chapter's subsection entries from either list or dict shape.

    Dict-shaped subsections are normalized: titled dict values pass through,
    everything else becomes ``{"title": key}``.
    """
    raw = chapter.get("subsections")
    if raw is None:
        raw = chapter.get("sections")

    if isinstance(raw, list):
        return raw
    if isinstance(raw, dict):
        normalized: List[Any] = []
        for key, value in raw.items():
            if isinstance(value, dict) and _catalog_title_from_entry(value):
                normalized.append(value)
            else:
                normalized.append({"title": key})
        return normalized
    return []
+
+
def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten the extractor's catalog into {"level", "title"} items.

    Prefers an explicit ``catalog``/``body_catalog`` chapter list; otherwise
    derives items from the ``chapters`` mapping, skipping special keys.
    """
    flattened: List[Dict[str, Any]] = []
    catalog = result.get("catalog") or result.get("body_catalog") or {}

    if isinstance(catalog, dict) and isinstance(catalog.get("chapters"), list):
        for chapter in catalog.get("chapters") or []:
            if not isinstance(chapter, dict):
                continue

            heading = _catalog_title_from_entry(chapter)
            if heading:
                flattened.append({"level": 1, "title": heading})

            for subsection in _iter_catalog_subsections(chapter):
                sub_heading = _catalog_title_from_entry(subsection)
                if sub_heading:
                    flattened.append({"level": 2, "title": sub_heading})

        return flattened

    # Fallback: derive the catalog from the extracted chapters mapping.
    chapters = result.get("chapters", {}) or {}
    if not isinstance(chapters, dict):
        return flattened

    for raw_chapter_title, sections in chapters.items():
        heading = str(raw_chapter_title or "").strip()
        if heading:
            flattened.append({"level": 1, "title": heading})

        if not isinstance(sections, dict):
            continue
        for raw_section_title in sections:
            sub_heading = str(raw_section_title or "").strip()
            if sub_heading and sub_heading not in SPECIAL_SECTION_KEYS:
                flattened.append({"level": 2, "title": sub_heading})

    return flattened
+
+
def count_extracted_catalog_items(result: Dict[str, Any]) -> Tuple[int, int]:
    """(chapter, section) counts for the extractor's reconstructed catalog."""
    items = extract_result_catalog_items(result)
    return count_catalog_item_levels(items)
+
+
def _normalize_catalog_title(text: str) -> str:
    """Canonicalize a heading for fuzzy comparison.

    Full-width punctuation is mapped to ASCII, all whitespace and
    leader/markup characters are removed, and the result is lower-cased.
    """
    fullwidth_map = str.maketrans({
        "\uff08": "(",
        "\uff09": ")",
        "\uff0c": ",",
        "\uff0e": ".",
        "\u3002": ".",
        "\u3001": ",",
        "\uff1a": ":",
        "\uff1b": ";",
        "\u3000": " ",
    })
    normalized = _strip_catalog_count_line(str(text or "")).translate(fullwidth_map)
    normalized = re.sub(r"\s+", "", normalized)
    normalized = re.sub(r"[\u00b7\u2022\u2026.,:;_\-()\[\]{}<>/\\]+", "", normalized)
    return normalized.lower()
+
+
def _strip_catalog_heading_prefix(text: str) -> str:
    """Drop a leading chapter/section numbering prefix from a heading.

    Tries each known prefix pattern once, in order, and returns as soon as
    one of them matched; otherwise returns the cleaned heading unchanged.
    """
    base = _strip_catalog_count_line(str(text or "")).strip()
    cn_num = r"\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u96f6\u4e24"
    prefix_patterns = (
        rf"^第\s*(?:\d+|[{cn_num}]+)\s*[章部部分篇]\s*[、,,..。::\-\s]*",
        rf"^第\s*(?:\d+|[{cn_num}]+)\s*节\s*[、,,..。::\-\s]*",
        r"^\d+(?:\.\d+){0,3}\.?\s*",
        rf"^[{cn_num}]+[、,,))\]]\s*",
        r"^[【\[]\s*\d+\s*[】\]]\s*",
    )
    for prefix_pattern in prefix_patterns:
        candidate = re.sub(prefix_pattern, "", base, count=1)
        if candidate != base:
            return candidate.strip()
    return base
+
+
def _catalog_title_similarity(left: str, right: str) -> float:
    """Fuzzy similarity in [0, 1] between two catalog headings.

    Both the fully normalized titles and their prefix-stripped bodies are
    compared; exact matches score 1.0, containment of reasonably long
    strings 0.95, and otherwise the best SequenceMatcher ratio wins.
    """
    norm_left = _normalize_catalog_title(left)
    norm_right = _normalize_catalog_title(right)
    if not norm_left or not norm_right:
        return 0.0
    if norm_left == norm_right:
        return 1.0

    candidates = [SequenceMatcher(None, norm_left, norm_right).ratio()]

    body_left = _normalize_catalog_title(_strip_catalog_heading_prefix(left))
    body_right = _normalize_catalog_title(_strip_catalog_heading_prefix(right))
    if body_left and body_right:
        if body_left == body_right:
            candidates.append(1.0)
        elif min(len(body_left), len(body_right)) >= 4 and (
            body_left in body_right or body_right in body_left
        ):
            candidates.append(0.95)
        else:
            candidates.append(SequenceMatcher(None, body_left, body_right).ratio())

    if min(len(norm_left), len(norm_right)) >= 4 and (
        norm_left in norm_right or norm_right in norm_left
    ):
        candidates.append(0.95)

    return max(candidates)
+
+
+def _longest_increasing_subsequence_length(values: List[int]) -> int:
+    if not values:
+        return 0
+
+    lengths = [1] * len(values)
+    for index, value in enumerate(values):
+        for prev_index in range(index):
+            if values[prev_index] < value:
+                lengths[index] = max(lengths[index], lengths[prev_index] + 1)
+    return max(lengths)
+
+
+def _catalog_count_score(original_count: int, extracted_count: int) -> float:
+    max_count = max(original_count, extracted_count)
+    if max_count <= 0:
+        return 1.0
+    return min(original_count, extracted_count) / max_count
+
+
def _match_catalog_level(
    original_items: List[Dict[str, Any]],
    extracted_items: List[Dict[str, Any]],
    level: int,
) -> Dict[str, Any]:
    """Greedily match original vs. extracted catalog titles at one level.

    Each original title claims its best-scoring unused extracted title when
    similarity clears the level threshold (0.82 for chapters, 0.78 for
    sections).  Returns counts, precision/recall/F1, a count-agreement score
    and an order score (how well matched extracted indexes preserve original
    order, measured via longest increasing subsequence).
    """
    level_originals = [entry for entry in original_items if entry.get("level") == level]
    level_extracted = [entry for entry in extracted_items if entry.get("level") == level]
    min_score = 0.82 if level == 1 else 0.78

    claimed: set[int] = set()
    pairs: List[Dict[str, Any]] = []
    for source_pos, source in enumerate(level_originals):
        source_title = str(source.get("title", "") or "")
        top_score = 0.0
        top_pos = -1
        for candidate_pos, candidate in enumerate(level_extracted):
            if candidate_pos in claimed:
                continue
            similarity = _catalog_title_similarity(source_title, str(candidate.get("title", "") or ""))
            if similarity > top_score:
                top_score = similarity
                top_pos = candidate_pos

        if top_pos >= 0 and top_score >= min_score:
            claimed.add(top_pos)
            pairs.append({
                "original_index": source_pos,
                "extracted_index": top_pos,
                "score": top_score,
            })

    n_original = len(level_originals)
    n_extracted = len(level_extracted)
    n_matched = len(pairs)
    precision = n_matched / n_extracted if n_extracted else (1.0 if n_original == 0 else 0.0)
    recall = n_matched / n_original if n_original else (1.0 if n_extracted == 0 else 0.0)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    matched_sequence = [
        pair["extracted_index"]
        for pair in sorted(pairs, key=lambda pair: pair["original_index"])
    ]
    if n_matched:
        order_score = _longest_increasing_subsequence_length(matched_sequence) / n_matched
    else:
        order_score = 1.0 if n_original == 0 and n_extracted == 0 else 0.0

    return {
        "original": n_original,
        "extracted": n_extracted,
        "matched": n_matched,
        "precision": precision,
        "recall": recall,
        "title_f1": f1,
        "count_score": _catalog_count_score(n_original, n_extracted),
        "order_score": order_score,
    }
+
+
+def _weighted_catalog_score(level_details: Dict[str, Dict[str, Any]], metric: str) -> float:
+    weighted_scores: List[Tuple[float, float]] = []
+    if max(level_details["chapter"]["original"], level_details["chapter"]["extracted"]) > 0:
+        weighted_scores.append((0.35, float(level_details["chapter"][metric])))
+    if max(level_details["section"]["original"], level_details["section"]["extracted"]) > 0:
+        weighted_scores.append((0.65, float(level_details["section"][metric])))
+
+    if not weighted_scores:
+        return 0.0
+
+    total_weight = sum(weight for weight, _ in weighted_scores)
+    return sum(weight * score for weight, score in weighted_scores) / total_weight
+
+
+def _round_catalog_detail(value: Any) -> Any:
+    if isinstance(value, float):
+        return round(value, 4)
+    if isinstance(value, dict):
+        return {key: _round_catalog_detail(item) for key, item in value.items()}
+    return value
+
+
def compute_catalog_quality_rate_from_items(
    original_items: List[Dict[str, Any]],
    extracted_items: List[Dict[str, Any]],
) -> Tuple[float, str, Dict[str, Any]]:
    """Score catalog extraction quality: 70% title F1, 20% counts, 10% order.

    Returns (rate clamped to [0, 1], "xx.x%" label, rounded diagnostic detail).
    """
    level_details = {
        "chapter": _match_catalog_level(original_items, extracted_items, 1),
        "section": _match_catalog_level(original_items, extracted_items, 2),
    }
    component_scores = {
        metric: _weighted_catalog_score(level_details, metric)
        for metric in ("title_f1", "count_score", "order_score")
    }
    combined = (
        0.70 * component_scores["title_f1"]
        + 0.20 * component_scores["count_score"]
        + 0.10 * component_scores["order_score"]
    )
    combined = min(1.0, max(0.0, combined))

    detail = {
        "score_model": "title_f1_70_count_20_order_10",
        "title_score": component_scores["title_f1"],
        "count_score": component_scores["count_score"],
        "order_score": component_scores["order_score"],
        "level_details": level_details,
    }
    return combined, f"{combined * 100:.1f}%", _round_catalog_detail(detail)
+
+
def append_static_record(
    stat_path: Path,
    pdf_path: Path,
    original_l1_count: int,
    extracted_l1_count: int,
    original_l2_count: int,
    extracted_l2_count: int,
    catalog_quality_rate_text: str,
    content_quality_rate_text: str,
) -> None:
    """Append one TSV stats row for this PDF.

    The header row is written first when the stats file is new or empty.
    """
    stat_path.parent.mkdir(parents=True, exist_ok=True)
    write_header = not stat_path.exists() or stat_path.stat().st_size == 0
    row = "\t".join((
        pdf_path.name,
        f"{original_l1_count}/{extracted_l1_count}",
        f"{original_l2_count}/{extracted_l2_count}",
        catalog_quality_rate_text,
        content_quality_rate_text,
    ))
    with stat_path.open("a", encoding="utf-8", newline="") as file:
        if write_header:
            file.write("文件名\t一级目录(原PDF/提取)\t二级目录(原PDF/提取)\t目录合格率\t内容合格率\n")
        file.write(row + "\n")
+
+
def sanitize_filename_component(value: str) -> str:
    """Replace filesystem-unsafe characters with "_"; empty input -> "output"."""
    forbidden = '<>:"/\\|?*'
    sanitized = "".join("_" if char in forbidden else char for char in value.strip())
    return sanitized or "output"
+
+
def build_output_path(
    pdf_path: Path,
    input_dir: Path,
    output_dir: Path | None,
    quality_rate_text: str,
) -> Path:
    """Choose the JSON output path for one PDF.

    With no ``output_dir`` the JSON lands beside the PDF; otherwise the
    PDF's path relative to ``input_dir`` is flattened with "__" so nested
    inputs map to unique filenames.  The quality-rate label is prefixed.
    """
    if output_dir is None:
        destination = pdf_path.parent
        raw_stem = pdf_path.stem
    else:
        destination = output_dir
        try:
            relative = pdf_path.relative_to(input_dir).with_suffix("")
            raw_stem = "__".join(relative.parts)
        except ValueError:
            # PDF lives outside input_dir; fall back to its bare stem.
            raw_stem = pdf_path.stem

    destination.mkdir(parents=True, exist_ok=True)
    return destination / f"{quality_rate_text}_{sanitize_filename_component(raw_stem)}.json"
+
+
def build_output_payload(
    pdf_path: Path,
    extractor_result: Dict[str, Any],
    raw_char_count: int,
    raw_char_count_start_page: int,
    extracted_char_count: int,
    quality_rate_text: str,
    use_ocr: bool,
    detect_toc: bool,
    extractor_name: str,
) -> Dict[str, Any]:
    """Wrap the extractor result with a metadata header for JSON output."""
    chapters = extractor_result.get("chapters", {}) or {}
    catalog = extractor_result.get("catalog") or {}
    catalog_chapter_count = catalog.get("total_chapters", 0) if isinstance(catalog, dict) else 0
    metadata = {
        "filename": pdf_path.name,
        "source_path": str(pdf_path),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "quality_rate": quality_rate_text,
        "raw_char_count": raw_char_count,
        "raw_char_count_scope": "body_only",
        "raw_char_count_start_page": raw_char_count_start_page,
        "extracted_char_count": extracted_char_count,
        "chapter_count": len(chapters),
        "section_count": count_sections(extractor_result),
        "total_pages": extractor_result.get("total_pages", 0),
        "catalog_chapter_count": catalog_chapter_count,
        "use_ocr": use_ocr,
        "detect_toc": detect_toc,
        "extractor": extractor_name,
    }
    return {"metadata": metadata, "extracted_result": extractor_result}
+
+
def process_pdf(
    pdf_path: Path,
    input_dir: Path,
    output_dir: Path | None,
    extractor: Any,
    clip_top: float,
    clip_bottom: float,
    use_ocr: bool,
    detect_toc: bool,
    extractor_name: str,
) -> Tuple[Path, str]:
    """Run extraction on one PDF and persist the JSON payload plus a stats row.

    Returns (output JSON path, content quality-rate label such as "95.1%").
    Filesystem and extractor errors propagate; the caller handles them.
    """
    # Ground-truth measurements taken directly from the PDF.
    raw_char_count, raw_char_count_start_page = compute_raw_char_count_with_scope(pdf_path, clip_top, clip_bottom)
    original_catalog_items = extract_original_catalog_items(pdf_path, clip_top, clip_bottom)
    original_l1_count, original_l2_count = count_catalog_item_levels(original_catalog_items)

    # Run the structural extractor and score its output against the ground truth.
    file_content = pdf_path.read_bytes()
    extractor_result = extractor.extract(file_content)
    extracted_char_count = compute_extracted_char_count(extractor_result)
    _, quality_rate_text = compute_quality_rate(raw_char_count, extracted_char_count)
    extracted_catalog_items = extract_result_catalog_items(extractor_result)
    extracted_l1_count, extracted_l2_count = count_catalog_item_levels(extracted_catalog_items)
    _, catalog_quality_rate_text, catalog_quality_detail = compute_catalog_quality_rate_from_items(
        original_items=original_catalog_items,
        extracted_items=extracted_catalog_items,
    )

    payload = build_output_payload(
        pdf_path=pdf_path,
        extractor_result=extractor_result,
        raw_char_count=raw_char_count,
        raw_char_count_start_page=raw_char_count_start_page,
        extracted_char_count=extracted_char_count,
        quality_rate_text=quality_rate_text,
        use_ocr=use_ocr,
        detect_toc=detect_toc,
        extractor_name=extractor_name,
    )
    payload["metadata"].update({
        "original_catalog_chapter_count": original_l1_count,
        "original_catalog_section_count": original_l2_count,
        "extracted_catalog_chapter_count": extracted_l1_count,
        "extracted_catalog_section_count": extracted_l2_count,
        "catalog_quality_rate": catalog_quality_rate_text,
        "catalog_quality_detail": catalog_quality_detail,
    })

    output_path = build_output_path(pdf_path, input_dir, output_dir, quality_rate_text)
    output_path.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    append_static_record(
        stat_path=output_path.parent / STAT_FILE_NAME,
        pdf_path=pdf_path,
        original_l1_count=original_l1_count,
        extracted_l1_count=extracted_l1_count,
        original_l2_count=original_l2_count,
        extracted_l2_count=extracted_l2_count,
        catalog_quality_rate_text=catalog_quality_rate_text,
        content_quality_rate_text=quality_rate_text,
    )
    return output_path, quality_rate_text
+
+
def main() -> int:
    """CLI entry point: extract every PDF found under the input directory.

    Exit codes: 0 when all files succeed, 1 for bad input, 2 when some fail.
    """
    args = parse_args()
    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve() if args.output_dir else None

    if not (input_dir.exists() and input_dir.is_dir()):
        print(f"[ERROR] Input directory does not exist: {input_dir}")
        return 1

    pdf_files = iter_pdf_files(input_dir, args.recursive)
    if not pdf_files:
        print(f"[ERROR] No PDF files found in: {input_dir}")
        return 1

    extractor_cls = load_pdf_structure_extractor(args.extractor)
    extractor = extractor_cls(
        clip_top=args.clip_top,
        clip_bottom=args.clip_bottom,
        use_ocr=args.use_ocr,
        ocr_api_url=args.ocr_api_url,
        ocr_timeout=args.ocr_timeout,
        ocr_api_key=args.ocr_api_key,
        detect_toc=not args.disable_toc,
        toc_model_path=args.toc_model_path,
    )

    banner = "=" * 80
    print(banner)
    print(f"Found {len(pdf_files)} PDF file(s) in: {input_dir}")
    print(f"Extractor: {args.extractor}")
    print(banner)

    success_count = 0
    total = len(pdf_files)
    for position, pdf_path in enumerate(pdf_files, 1):
        print(f"[{position}/{total}] Processing: {pdf_path.name}")
        try:
            output_path, quality_rate_text = process_pdf(
                pdf_path=pdf_path,
                input_dir=input_dir,
                output_dir=output_dir,
                extractor=extractor,
                clip_top=args.clip_top,
                clip_bottom=args.clip_bottom,
                use_ocr=args.use_ocr,
                detect_toc=not args.disable_toc,
                extractor_name=args.extractor,
            )
        except Exception as exc:
            # Keep going on per-file failures; the exit code reflects them.
            print(f"  [FAILED] {exc}")
        else:
            success_count += 1
            print(f"  [OK] quality={quality_rate_text} -> {output_path}")

    print(banner)
    print(f"Finished. Success: {success_count}/{total}")
    print(banner)
    return 0 if success_count == total else 2
+
+
if __name__ == "__main__":
    # Script entry point: propagate main()'s integer exit code to the shell.
    raise SystemExit(main())

+ 1 - 1
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -17,7 +17,7 @@ from typing import Dict, Any, Optional, Tuple, List
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.cachefiles import cache, CacheBaseDir
 
-from .pdf_extractor import PdfStructureExtractor
+from .pdf_extractor1 import PdfStructureExtractor
 from .toc_builder import build_toc_items_from_structure
 from .chunk_assembler import assemble_chunks
 from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier