ソースを参照

fix(不使用ocr提取目录)

tangle 1 日 前
コミット
2c2f6a9753

+ 1325 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -0,0 +1,1325 @@
+from __future__ import annotations
+
+"""
+PDF 结构提取器。
+
+"""
+
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import fitz
+
+try:
+    from .ocr_processor import OcrProcessor, OcrResult, TableRegion
+except ImportError:  # pragma: no cover - direct script-style imports
+    try:
+        from ocr_processor import OcrProcessor, OcrResult, TableRegion  # type: ignore
+    except ImportError:  # pragma: no cover - OCR dependencies are optional
+        OcrProcessor = None  # type: ignore
+        OcrResult = Any  # type: ignore
+        TableRegion = Any  # type: ignore
+
+
# Key used inside each chapter dict for chapter-level preface text
# (content that appears before the first recognized subsection heading).
SECTION_TITLE_KEY = "章节标题"
# Placeholder stored when a section yields no plain text (the source region
# was likely image- or table-only).
EMPTY_SECTION_PLACEHOLDER = "[本节无纯文本,原文档中可能为纯图片或表格]"


# Markers wrapped around table OCR text when it is appended to a section body.
TABLE_OCR_START = "[表格OCR识别结果]:"
TABLE_OCR_END = "[/表格]"
# Name of the heading rule for "Chinese-ordinal level-1 chapter + numeric
# x.y level-2 section" documents (referenced throughout as Rule_8).
CN_LIST_L1_NUMERIC_L2_RULE = "Rule_8_中文序号章数字小节派"
+
+
@dataclass(frozen=True)
class BodyLine:
    """A normalized body-text line together with the PDF page it came from."""

    # 1-based page number of the line in the source PDF.
    page: int
    # Stripped text content of the line.
    text: str
+
+
+class PdfStructureExtractor:
+    """基于规则的 PDF 正文结构提取器,可选增强表格 OCR 内容。"""
+
    # Candidate heading rule library. Each rule supplies an "l1" (chapter) and
    # an "l2" (section) regex; every rule is run over the body lines and the
    # best-scoring result wins (see _extract_body_with_best_rule).
    RULE_LIB = {
        # Plain numeric chapters ("1 标题") with numeric "x.y" sections.
        "Rule_1_纯数字派": {
            "l1": re.compile(r"^\d{1,2}(?:[\..。])?\s+(?!\d)[\u4e00-\u9fa5A-Za-z].*"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # "第1章" style chapters with numeric "x.y" sections.
        "Rule_2_混合章派": {
            "l1": re.compile(r"^第\s*(\d+)\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # "第一章" style chapters with numeric "x.y" sections.
        "Rule_3_中英混血派": {
            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # "第一章" chapters with "一、" style Chinese-ordinal sections.
        "Rule_4_传统公文派": {
            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[、\s]+([\u4e00-\u9fa5]+.*)"),
        },
        # Chapters with "一)" half-bracket Chinese-ordinal sections.
        "Rule_5_单边括号派": {
            "l1": re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[))\]]\s*([\u4e00-\u9fa5]+.*)"),
        },
        # Chapters with explicit "第X节" sections.
        "Rule_6_小节派": {
            "l1": re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*节\s*[、]?\s*([\u4e00-\u9fa5]+.*)"),
        },
        # Chapters with bracketed numeric sections like "【1】标题".
        "Rule_7_粗体括号派": {
            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部部分篇]\s*[、]?\s*(.*)"),
            "l2": re.compile(r"^[【\[]\s*(\d+)\s*[\]】]\s*([\u4e00-\u9fa5]+.*)"),
        },
        # Rule_8: bare Chinese-ordinal chapters ("一、标题") with numeric "x.y"
        # sections; only competes when extra viability checks pass.
        CN_LIST_L1_NUMERIC_L2_RULE: {
            "l1": re.compile(r"^([一二三四五六七八九十百零两]+)[、))\]]\s*([\u4e00-\u9fa5A-Za-z].*)"),
            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5]+.*)"),
        },
    }

    # Single Chinese numeral character -> integer digit mapping.
    CN_NUM_MAP = {
        "零": 0,
        "〇": 0,
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
    }

    # Long dot/dash/ellipsis leaders mark table-of-contents lines.
    TOC_PATTERN = re.compile(r"\.{3,}|…{2,}|-{3,}|·{3,}|•{3,}")
+
    def __init__(
        self,
        clip_top: float = 60,
        clip_bottom: float = 60,
        use_ocr: bool = False,
        ocr_api_url: str = "",
        ocr_timeout: int = 600,
        ocr_api_key: str = "",
        detect_toc: bool = True,
        toc_model_path: str = "",
    ):
        """Initialize extraction parameters; enable OCR only when available.

        Args:
            clip_top / clip_bottom: vertical margins (points) cropped from each
                page before text extraction, to drop running headers/footers.
            use_ocr: request the optional table-OCR enhancement.
            ocr_api_url / ocr_timeout / ocr_api_key: forwarded to OcrProcessor.
            detect_toc: accepted for interface compatibility only — see the
                note below where it is overridden.
            toc_model_path: stored but not used by the code visible here.
        """

        self.clip_top = clip_top
        self.clip_bottom = clip_bottom
        self.ocr_requested = bool(use_ocr)
        self.ocr_processor = None
        self.use_ocr = False
        # OCR is an optional enhancement: when rapid_layout or the OCR
        # dependencies are missing, rule-based body extraction still runs.
        if use_ocr and OcrProcessor is not None:
            self.ocr_processor = OcrProcessor(
                ocr_api_url=ocr_api_url,
                ocr_timeout=ocr_timeout,
                ocr_api_key=ocr_api_key,
            )
            self.use_ocr = self.ocr_processor.is_available()
        # NOTE(review): TOC detection is forced off regardless of the
        # detect_toc argument — apparently intentional (the catalog is derived
        # from body headings only; cf. catalog_mode "testc_body_only"), but
        # worth confirming against callers that pass detect_toc=True.
        self.detect_toc = False
        self.ocr_api_url = ocr_api_url
        self.ocr_timeout = ocr_timeout
        self.ocr_api_key = ocr_api_key
        self.toc_model_path = toc_model_path
+
+    def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
+        """提取章节、正文派生目录、规则诊断信息,以及可选的表格 OCR 内容。"""
+
+        result: Dict[str, Any] = {
+            "chapters": {},
+            "total_pages": 0,
+            "catalog": None,
+            "body_catalog": None,
+            "ocr_catalog": None,
+            "catalog_mode": "testc_body_only",
+            "body_rule": None,
+            "body_coverage": 0.0,
+            "rule_performance": {},
+            "ocr_content_mode": "disabled",
+            "ocr_table_count": 0,
+            "ocr_success_count": 0,
+            "ocr_inserted_count": 0,
+        }
+
+        doc = fitz.open(stream=file_content, filetype="pdf")
+        try:
+            # 正文切分仍由 PyMuPDF 文本和标题规则驱动,OCR 只在切分后作为小节内容补充。
+            body_lines = self._extract_body_lines(doc, progress_callback)
+            ocr_results = self._extract_table_ocr_results(doc, progress_callback)
+            raw_data, winning_rule, coverage_rate, rule_performance = self._extract_body_with_best_rule(body_lines)
+            chapters = self._convert_rule_output_to_chapters(raw_data)
+            ocr_stats = self._insert_ocr_results_into_chapters(chapters, ocr_results)
+            body_catalog = self._build_body_catalog_from_chapters(chapters)
+
+            result["chapters"] = chapters
+            result["total_pages"] = len(doc)
+            result["catalog"] = body_catalog
+            result["body_catalog"] = body_catalog
+            result["body_rule"] = winning_rule
+            result["body_coverage"] = coverage_rate
+            result["rule_performance"] = rule_performance
+            result["ocr_table_count"] = ocr_stats["table_count"]
+            result["ocr_success_count"] = ocr_stats["success_count"]
+            result["ocr_inserted_count"] = ocr_stats["inserted_count"]
+            # 记录 OCR 是否实际影响输出,方便批处理统计时判断 OCR 状态。
+            # disabled:默认值,表示本次没有请求 OCR。
+            # unavailable:请求了 OCR,但依赖不可用,例如 rapid_layout 未安装或检测器不可用。
+            # enabled_no_table:OCR 已启用,但没有检测到可识别的表格区域。
+            # table_regions_inserted:OCR 已启用,并且表格识别结果已经成功回填到正文小节。
+            # enabled_no_insert:OCR 已启用,但没有成功回填,通常是 OCR 失败或未定位到合适小节。
+            if self.ocr_requested and not self.use_ocr:
+                result["ocr_content_mode"] = "unavailable"
+            elif self.use_ocr and ocr_stats["table_count"] == 0:
+                result["ocr_content_mode"] = "enabled_no_table"
+            elif self.use_ocr and ocr_stats["inserted_count"] > 0:
+                result["ocr_content_mode"] = "table_regions_inserted"
+            elif self.use_ocr:
+                result["ocr_content_mode"] = "enabled_no_insert"
+            return result
+        finally:
+            doc.close()
+
+    def _extract_table_ocr_results(self, doc: fitz.Document, progress_callback=None) -> List[OcrResult]:
+        """在 OCR 启用时检测 PDF 表格区域,并发执行表格识别。"""
+
+        if not self.use_ocr or self.ocr_processor is None:
+            return []
+
+        def _emit_progress(stage: str, current: int, message: str) -> None:
+            """转发 OCR 进度,同时避免回调异常中断提取流程。"""
+
+            if not progress_callback:
+                return
+            try:
+                progress_callback(stage, current, message)
+            except Exception:
+                pass
+
+        table_regions: List[TableRegion] = []
+        total_pages = len(doc)
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            regions = self.ocr_processor.detect_table_regions(page, page_index + 1, clip_box)
+            # 保存页面对象和区域坐标,便于 OcrProcessor 后续精确渲染表格裁剪区域。
+            for bbox, score in regions:
+                table_regions.append(TableRegion(
+                    page_num=page_index + 1,
+                    page=page,
+                    bbox=bbox,
+                    score=score,
+                ))
+
+            if page_index + 1 == total_pages or (page_index + 1) % 5 == 0:
+                progress = int((page_index + 1) / max(total_pages, 1) * 30)
+                _emit_progress("ocr_layout", progress, f"scan tables {page_index + 1}/{total_pages}")
+
+        if not table_regions:
+            return []
+
+        _emit_progress("ocr", 35, f"ocr tables 0/{len(table_regions)}")
+
+        def _progress_adapter(completed: int, total: int) -> None:
+            """把 OcrProcessor 的 completed/total 进度转换为提取器统一的进度格式。"""
+
+            progress = 35 + int(completed / max(total, 1) * 15)
+            _emit_progress("ocr", progress, f"ocr tables {completed}/{total}")
+
+        return self.ocr_processor.process_ocr_concurrent(
+            table_regions,
+            progress_callback=_progress_adapter,
+        )
+
+    def _insert_ocr_results_into_chapters(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+        ocr_results: List[OcrResult],
+    ) -> Dict[str, int]:
+        """把成功识别的表格 OCR 文本追加到同页最可能的小节正文中。"""
+
+        stats = {
+            "table_count": len(ocr_results),
+            "success_count": 0,
+            "inserted_count": 0,
+        }
+        if not chapters or not ocr_results:
+            return stats
+
+        successful_results = [
+            result for result in ocr_results
+            if getattr(result, "success", False) and str(getattr(result, "text", "") or "").strip()
+        ]
+        stats["success_count"] = len(successful_results)
+
+        for ocr_result in sorted(successful_results, key=lambda item: (item.page_num, item.bbox[1], item.bbox[0])):
+            # 轻量提取器在切分后不再保留文本块坐标,因此使用页码范围作为 OCR 回填的稳定定位信号。
+            target = self._find_ocr_target_section(chapters, ocr_result.page_num)
+            if target is None:
+                continue
+
+            _, _, payload = target
+            original_content = str(payload.get("content", "") or "").strip()
+            if original_content == EMPTY_SECTION_PLACEHOLDER:
+                original_content = ""
+
+            ocr_text = str(ocr_result.text or "").strip()
+            table_text = f"{TABLE_OCR_START}\n{ocr_text}\n{TABLE_OCR_END}"
+            payload["content"] = f"{original_content}\n\n{table_text}".strip()
+            payload["page_start"] = min(
+                self._safe_page_number(payload.get("page_start"), ocr_result.page_num),
+                ocr_result.page_num,
+            )
+            payload["page_end"] = max(
+                self._safe_page_number(payload.get("page_end"), ocr_result.page_num),
+                ocr_result.page_num,
+            )
+            stats["inserted_count"] += 1
+
+        return stats
+
+    def _find_ocr_target_section(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+        page_num: int,
+    ) -> Optional[Tuple[str, str, Dict[str, Any]]]:
+        """查找页码范围最能覆盖 OCR 表格所在页的小节。"""
+
+        candidates: List[Tuple[int, int, str, str, Dict[str, Any]]] = []
+        fallback: Optional[Tuple[str, str, Dict[str, Any]]] = None
+
+        for chapter_title, sections in chapters.items():
+            if not isinstance(sections, dict):
+                continue
+
+            for section_title, payload in sections.items():
+                if not isinstance(payload, dict):
+                    continue
+
+                page_start = self._safe_page_number(payload.get("page_start"), page_num)
+                page_end = self._safe_page_number(payload.get("page_end"), page_start)
+                if section_title == SECTION_TITLE_KEY:
+                    if fallback is None and page_start <= page_num <= page_end:
+                        fallback = (chapter_title, section_title, payload)
+                    continue
+
+                # 优先选择页码范围最窄的小节,过宽的范围通常是章节级内容外溢。
+                if page_start <= page_num <= page_end:
+                    span = max(page_end - page_start, 0)
+                    candidates.append((span, -page_start, chapter_title, section_title, payload))
+                elif page_start <= page_num:
+                    fallback = (chapter_title, section_title, payload)
+
+        if candidates:
+            _, _, chapter_title, section_title, payload = min(candidates, key=lambda item: (item[0], item[1]))
+            return chapter_title, section_title, payload
+        return fallback
+
+    def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
+        """读取裁剪后的页面文本,规范化正文行,并移除重复的非标题噪声。"""
+
+        page_lines_by_page: List[Tuple[int, List[str]]] = []
+        total_pages = len(doc)
+
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+            text = page.get_text("text", clip=clip_box)
+
+            page_lines: List[str] = []
+            for line in self._prepare_page_lines(text):
+                stripped = line.strip()
+                if not stripped or self._is_header_footer(stripped):
+                    continue
+                page_lines.append(stripped)
+
+            page_lines_by_page.append((page_index + 1, page_lines))
+
+            if progress_callback and (page_index + 1 == total_pages or (page_index + 1) % 10 == 0):
+                try:
+                    progress_callback(
+                        "正文抽取",
+                        int((page_index + 1) / max(total_pages, 1) * 60),
+                        f"读取正文页 {page_index + 1}/{total_pages}",
+                    )
+                except Exception:
+                    pass
+
+        # 页眉页脚往往跨页重复,但真实标题不能被误删,所以只移除“不像标题”的重复行。
+        repeated_noise_keys = self._find_repeated_non_heading_lines(page_lines_by_page, total_pages)
+        body_lines: List[BodyLine] = []
+        for page, lines in page_lines_by_page:
+            for line in lines:
+                if self._normalize_repeated_line_key(line) in repeated_noise_keys:
+                    continue
+                body_lines.append(BodyLine(page=page, text=line))
+        return body_lines
+
+    def _extract_body_with_best_rule(
+        self,
+        body_lines: List[BodyLine],
+    ) -> Tuple[Dict[str, Dict[str, List[Dict[str, Any]]]], Optional[str], float, Dict[str, Any]]:
+        """运行所有候选标题规则,并返回评分最高的正文结构。"""
+
+        total_raw_chars = sum(len(item.text.strip()) for item in body_lines if item.text.strip())
+        best_score = -9999
+        best_rule_name: Optional[str] = None
+        best_data: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
+        best_coverage = 0.0
+        rule_performance: Dict[str, Any] = {}
+
+        for rule_name, rule_set in self.RULE_LIB.items():
+            data = self._extract_with_rule(body_lines, rule_name, rule_set)
+            score, coverage_rate = self._evaluate_extraction(data, total_raw_chars)
+            l1_count = len(data)
+            l2_count = sum(
+                len([key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY])
+                for sections in data.values()
+            )
+            if (
+                rule_name == CN_LIST_L1_NUMERIC_L2_RULE
+                and not self._is_viable_cn_list_l1_numeric_l2_structure(data, l1_count, l2_count)
+            ):
+                score -= 1500
+            rule_performance[rule_name] = {
+                "score": score,
+                "coverage_rate": f"{coverage_rate * 100:.1f}%",
+                "l1_count": l1_count,
+                "l2_count": l2_count,
+            }
+
+            # 规则选择以综合得分为主,覆盖率保留用于兜底过滤和诊断输出。
+            if score > best_score:
+                best_score = score
+                best_rule_name = rule_name
+                best_data = data
+                best_coverage = coverage_rate
+
+        if best_score <= 0 or best_coverage < 0.15:
+            return {}, best_rule_name, best_coverage, rule_performance
+
+        return best_data, best_rule_name, best_coverage, rule_performance
+
    def _extract_with_rule(
        self,
        body_lines: List[BodyLine],
        rule_name: str,
        rule_set: Dict[str, re.Pattern],
    ) -> Dict[str, Dict[str, List[Dict[str, Any]]]]:
        """Split body lines into chapter/section buckets using one heading rule.

        Runs a small state machine over the lines: the current L1/L2 titles, a
        pending heading fragment awaiting its continuation line, and a one-level
        backup of the previous chapter used to roll back mis-detected L1
        headings. Returns ``{chapter: {section: [{"text", "page"}, ...],
        "_chapter_page": int}}``.
        """

        structured_data: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
        current_l1: Optional[str] = None
        current_l1_num = 0
        current_l2: Optional[str] = None
        pending_prefix: Optional[str] = None
        pending_page: Optional[int] = None
        last_l2_sub_num = 0

        # Backup of the previous chapter state, used to undo a false L1 match.
        backup_l1: Optional[str] = None
        backup_l1_num = 0
        backup_l2: Optional[str] = None
        backup_l2_sub_num = 0

        # Rules whose L2 headings are numeric "x.y" need number-sequence checks.
        is_numeric_l2 = rule_name in {
            "Rule_1_纯数字派",
            "Rule_2_混合章派",
            "Rule_3_中英混血派",
            CN_LIST_L1_NUMERIC_L2_RULE,
        }

        for index, item in enumerate(body_lines):
            # Handle cross-line heading fragments before chapter/section
            # matching, so a bare "第X章" on its own line keeps its title.
            original_line = item.text.strip()
            page = item.page
            if not original_line or original_line.isdigit():
                continue

            line = self._strip_leading_page_number_from_heading(original_line)
            if pending_prefix:
                line = f"{pending_prefix} {line}".strip()
                original_line = line
                page = pending_page or page
                pending_prefix = None
                pending_page = None

            if self._is_incomplete_heading_fragment(line) and len(line) <= 15:
                pending_prefix = line
                pending_page = page
                continue

            has_toc = self._is_toc_line(line)

            match_l1 = rule_set["l1"].match(line)
            if match_l1 and not has_toc:
                core_text = self._blind_strip(line)
                if len(core_text) < 2:
                    # Too little real text: treat as a fragment and wait for more.
                    pending_prefix = line
                    pending_page = page
                    continue

                if self._is_valid_heading_strict(line, is_l1=True):
                    l1_candidate_num = self._extract_l1_number(line, rule_name, match_l1, current_l1_num)

                    if rule_name == CN_LIST_L1_NUMERIC_L2_RULE:
                        if not self._has_expected_numeric_l2_ahead(body_lines, index, l1_candidate_num):
                            continue

                    if rule_name == "Rule_1_纯数字派":
                        # Bare-number L1 headings easily mis-match table rows or
                        # ordered lists, so extra sequence and noise checks apply.
                        if current_l1 is None and l1_candidate_num not in {1, 2}:
                            continue
                        if self._looks_like_plain_numeric_l1_noise(line):
                            continue

                    if rule_name in {"Rule_1_纯数字派", "Rule_2_混合章派"} and current_l1 is not None:
                        if l1_candidate_num < current_l1_num:
                            continue
                        if l1_candidate_num - current_l1_num > 2:
                            continue
                        if l1_candidate_num == current_l1_num:
                            # When the same chapter number reappears and the old
                            # chapter has no sections yet, treat the old one as a
                            # false positive and reclaim its content.
                            if not self._chapter_has_l2(structured_data.get(current_l1, {})):
                                old_preface = structured_data[current_l1].get(SECTION_TITLE_KEY, [])
                                old_page = self._safe_page_number(structured_data[current_l1].get("_chapter_page"), page)
                                restored = [{"text": current_l1, "page": old_page}] + old_preface
                                del structured_data[current_l1]

                                current_l1 = self._clean_chapter_title(line)
                                structured_data[current_l1] = {"_chapter_page": page}  # type: ignore[assignment]
                                if restored:
                                    structured_data[current_l1][SECTION_TITLE_KEY] = restored
                                current_l1_num = l1_candidate_num
                                current_l2 = None
                                last_l2_sub_num = 0
                            continue

                    backup_l1 = current_l1
                    backup_l1_num = current_l1_num
                    backup_l2 = current_l2
                    backup_l2_sub_num = last_l2_sub_num

                    current_l1 = self._clean_chapter_title(line)
                    current_l1_num = l1_candidate_num
                    structured_data.setdefault(current_l1, {"_chapter_page": page})  # type: ignore[assignment]
                    current_l2 = None
                    last_l2_sub_num = 0
                    continue

            match_l2 = rule_set["l2"].match(line)
            if current_l1 and match_l2 and not has_toc:
                if self._is_valid_heading_strict(line, is_l1=False):
                    if is_numeric_l2:
                        l2_main_num = int(match_l2.group(1))
                        l2_sub_num = int(match_l2.group(2))

                        if l2_main_num != current_l1_num and l2_main_num == backup_l1_num and backup_l1 is not None:
                            # A section number pointing at the previous chapter
                            # suggests the current L1 was a false heading;
                            # roll back to the backed-up chapter first.
                            has_l2 = self._chapter_has_l2(structured_data.get(current_l1, {}))
                            if not has_l2 and current_l1 in structured_data:
                                fake_preface = structured_data[current_l1].get(SECTION_TITLE_KEY, [])
                                chapter_page = self._safe_page_number(
                                    structured_data[current_l1].get("_chapter_page"),
                                    page,
                                )
                                text_to_restore = [{"text": current_l1, "page": chapter_page}] + fake_preface
                                target_node = backup_l2 or SECTION_TITLE_KEY
                                structured_data.setdefault(backup_l1, {"_chapter_page": chapter_page})  # type: ignore[arg-type]
                                structured_data[backup_l1].setdefault(target_node, []).extend(text_to_restore)
                                del structured_data[current_l1]
                                current_l1 = backup_l1
                                current_l1_num = backup_l1_num
                                current_l2 = backup_l2
                                last_l2_sub_num = backup_l2_sub_num

                        if l2_main_num != current_l1_num:
                            pass
                        elif l2_sub_num <= last_l2_sub_num:
                            pass
                        elif self._is_suspicious_numeric_l2_jump(l2_sub_num, last_l2_sub_num):
                            # Large jumps (e.g. 1.2 straight to 1.9) are usually
                            # in-text references, not a genuine new section.
                            pass
                        else:
                            current_l2 = self._clean_section_title(line)
                            last_l2_sub_num = l2_sub_num
                            self._ensure_section_node(structured_data, current_l1, current_l2, page)
                            continue
                    else:
                        l2_sub_num = self._extract_non_numeric_l2_number(match_l2.group(1))
                        if l2_sub_num <= last_l2_sub_num:
                            pass
                        else:
                            current_l2 = self._clean_section_title(line)
                            last_l2_sub_num = l2_sub_num
                            self._ensure_section_node(structured_data, current_l1, current_l2, page)
                            continue

            if current_l1 and not has_toc:
                # Any non-heading line accumulates into the current section
                # (or the chapter preface when no section is open yet).
                target_key = current_l2 or SECTION_TITLE_KEY
                self._ensure_section_node(structured_data, current_l1, target_key, page)
                structured_data[current_l1][target_key].append({"text": original_line, "page": page})

        # Drop chapters that ended up holding only the bookkeeping key.
        for chapter_title in list(structured_data.keys()):
            chapter_sections = structured_data[chapter_title]
            if list(chapter_sections.keys()) == ["_chapter_page"]:
                del structured_data[chapter_title]

        return structured_data
+
+    def _has_expected_numeric_l2_ahead(
+        self,
+        body_lines: List[BodyLine],
+        current_index: int,
+        chapter_number: int,
+    ) -> bool:
+        """校验中文序号一级标题后,是否跟着同主序号的数字二级标题。"""
+
+        if chapter_number <= 0 or current_index >= len(body_lines):
+            return False
+
+        start_page = body_lines[current_index].page
+        max_index = min(len(body_lines), current_index + 40)
+        max_page = start_page + 3
+        expected_pattern = re.compile(
+            rf"^{chapter_number}\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"
+        )
+
+        for next_index in range(current_index + 1, max_index):
+            candidate_item = body_lines[next_index]
+            if candidate_item.page > max_page:
+                break
+
+            candidate_line = self._strip_leading_page_number_from_heading(candidate_item.text.strip())
+            if not candidate_line or self._is_toc_line(candidate_line):
+                continue
+
+            if (
+                expected_pattern.match(candidate_line)
+                and self._is_valid_heading_strict(candidate_line, is_l1=False)
+            ):
+                return True
+
+            if next_index > current_index + 1 and any(
+                rule["l1"].match(candidate_line)
+                for name, rule in self.RULE_LIB.items()
+                if name != CN_LIST_L1_NUMERIC_L2_RULE
+            ):
+                break
+
+        return False
+
+    @staticmethod
+    def _is_viable_cn_list_l1_numeric_l2_structure(
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+        l1_count: int,
+        l2_count: int,
+    ) -> bool:
+        """限制新规则只在真正形成“中文章 + 数字小节”结构时参与竞争。"""
+
+        if l1_count < 2 or l2_count < 3:
+            return False
+
+        chapters_with_l2 = sum(
+            1
+            for sections in raw_data.values()
+            if any(key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY)
+        )
+        return chapters_with_l2 >= max(2, (l1_count + 1) // 2)
+
+    def _convert_rule_output_to_chapters(
+        self,
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
+        """把规则提取出的临时结构转换为最终 chapters JSON 结构。"""
+
+        chapters: Dict[str, Dict[str, Dict[str, Any]]] = {}
+
+        for chapter_title, sections in raw_data.items():
+            chapter_page = self._safe_page_number(sections.get("_chapter_page"), 1)
+            chapter_payloads: Dict[str, Dict[str, Any]] = {}
+
+            for section_title, entries in sections.items():
+                if section_title.startswith("_"):
+                    continue
+
+                if entries:
+                    page_start = self._safe_page_number(entries[0].get("page"), chapter_page)
+                    page_end = self._safe_page_number(entries[-1].get("page"), page_start)
+                    content = "\n".join(str(entry.get("text", "") or "") for entry in entries).strip()
+                else:
+                    page_start = chapter_page
+                    page_end = chapter_page
+                    content = ""
+
+                chapter_payloads[section_title] = {
+                    "content": content or EMPTY_SECTION_PLACEHOLDER,
+                    "page_start": page_start,
+                    "page_end": page_end,
+                }
+
+            chapter_payloads.setdefault(
+                SECTION_TITLE_KEY,
+                {"content": "", "page_start": chapter_page, "page_end": chapter_page},
+            )
+            chapters[chapter_title] = chapter_payloads
+
+        return chapters
+
+    def _evaluate_extraction(
+        self,
+        raw_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+        total_raw_chars: int,
+    ) -> Tuple[int, float]:
+        """根据章节数量、小节数量、空章节比例和正文覆盖率评估规则效果。"""
+
+        if not raw_data or total_raw_chars == 0:
+            return 0, 0.0
+
+        l1_count = len(raw_data)
+        l2_total_count = sum(
+            len([key for key in sections.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY])
+            for sections in raw_data.values()
+        )
+
+        extracted_chars = 0
+        empty_l1_count = 0
+
+        for chapter_title, sections in raw_data.items():
+            extracted_chars += len(chapter_title)
+            chapter_has_content = False
+
+            for section_title, entries in sections.items():
+                if section_title.startswith("_"):
+                    continue
+                if section_title != SECTION_TITLE_KEY:
+                    extracted_chars += len(section_title)
+                content = "\n".join(str(entry.get("text", "") or "") for entry in entries).strip()
+                if content:
+                    extracted_chars += len(content)
+                    chapter_has_content = True
+
+            if not chapter_has_content:
+                empty_l1_count += 1
+
+        raw_coverage_rate = extracted_chars / total_raw_chars if total_raw_chars > 0 else 0.0
+        coverage_rate = min(raw_coverage_rate, 1.0)
+        score = 0
+
+        if 2 <= l1_count <= 25:
+            score += l1_count * 15
+        elif l1_count > 25:
+            score += 100
+
+        score += l2_total_count * 5
+
+        if l1_count > 0 and (empty_l1_count / l1_count) > 0.8:
+            score -= 500
+
+        if raw_coverage_rate > 0.8:
+            score += int(min(raw_coverage_rate, 1.0) * 1000)
+        elif raw_coverage_rate < 0.5:
+            score -= 1000
+
+        return score, coverage_rate
+
+    def _build_body_catalog_from_chapters(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+    ) -> Optional[Dict[str, Any]]:
+        """Reverse-build a body_catalog/catalog structure from body-split chapter results.
+
+        Returns None when ``chapters`` is empty or yields no usable chapter entries.
+        """
+
+        if not chapters:
+            return None
+
+        catalog_chapters: List[Dict[str, Any]] = []
+        for chapter_title, sections in chapters.items():
+            if not isinstance(sections, dict):
+                continue
+
+            page_start, page_end = self._resolve_chapter_page_span(sections)
+            # SECTION_TITLE_KEY carries the chapter-title payload, not a real subsection.
+            title_payload = sections.get(SECTION_TITLE_KEY, {})
+            catalog_chapter = {
+                "index": len(catalog_chapters) + 1,
+                "title": chapter_title,
+                "page": str(page_start),
+                "original": chapter_title,
+                "content": title_payload.get("content", "") if isinstance(title_payload, dict) else "",
+                "page_start": page_start,
+                "page_end": page_end,
+                "subsections": [],
+            }
+
+            for section_title, payload in sections.items():
+                if section_title == SECTION_TITLE_KEY or not isinstance(payload, dict):
+                    continue
+                subsection_page_start = self._safe_page_number(payload.get("page_start"), page_start)
+                subsection_page_end = self._safe_page_number(payload.get("page_end"), subsection_page_start)
+                catalog_chapter["subsections"].append({
+                    "title": section_title,
+                    "page": str(subsection_page_start),
+                    "level": 2,
+                    "original": section_title,
+                    "content": payload.get("content", ""),
+                    "page_start": subsection_page_start,
+                    "page_end": subsection_page_end,
+                })
+
+            catalog_chapters.append(catalog_chapter)
+
+        if not catalog_chapters:
+            return None
+
+        return {
+            "chapters": catalog_chapters,
+            "total_chapters": len(catalog_chapters),
+            "source": "body_titles",
+            "formatted_text": self._format_catalog_chapters(catalog_chapters),
+        }
+
+    @classmethod
+    def _prepare_page_lines(cls, text: str) -> List[str]:
+        """Split raw page text into lines, pre-merging headings split across line breaks."""
+
+        raw_lines = [line.strip() for line in (text or "").splitlines() if line.strip()]
+        prepared: List[str] = []
+        index = 0
+
+        while index < len(raw_lines):
+            # _merge_heading_fragment may consume 2-3 lines at once.
+            merged_line, consumed = cls._merge_heading_fragment(raw_lines, index)
+            if merged_line:
+                prepared.append(merged_line)
+                index += consumed
+                continue
+            prepared.append(raw_lines[index])
+            index += 1
+
+        return prepared
+
+    @classmethod
+    def _merge_heading_fragment(cls, lines: List[str], start_index: int) -> Tuple[Optional[str], int]:
+        """Try to merge the 2-3 lines starting at ``start_index`` into one complete heading.
+
+        Returns (merged_text, consumed_line_count), or (None, 1) when no merge applies.
+        """
+
+        first_line = lines[start_index].strip()
+        if not first_line:
+            return None, 1
+
+        first_normalized = cls._strip_leading_page_number_from_heading(first_line)
+        first_is_heading = cls._matches_any_heading(first_normalized)
+        first_is_incomplete = cls._is_incomplete_heading_fragment(first_normalized)
+        max_span = min(3, len(lines) - start_index)
+
+        for span in range(2, max_span + 1):
+            candidate_lines = [
+                cls._strip_leading_page_number_from_heading(lines[start_index + offset])
+                for offset in range(span)
+            ]
+            candidate_text = " ".join(item for item in candidate_lines if item).strip()
+            if not candidate_text or cls.TOC_PATTERN.search(candidate_text):
+                continue
+            if cls._looks_like_table_fragment(lines, start_index, span):
+                continue
+            if not cls._matches_any_heading(candidate_text):
+                continue
+            # Merge only when the first line alone is not already a complete heading.
+            if first_is_incomplete or not first_is_heading:
+                return candidate_text, span
+
+        return None, 1
+
+    @classmethod
+    def _looks_like_table_fragment(cls, lines: List[str], start_index: int, span: int) -> bool:
+        """Return True when the multi-line heading candidate looks like table-cell fragments."""
+
+        # Only numeric-only first lines (e.g. "3" or "3.2") are suspect table cells.
+        first_line = lines[start_index].strip()
+        if not re.fullmatch(r"\d{1,2}(?:\.\d{1,2})?", first_line):
+            return False
+
+        next_lines = [lines[idx].strip() for idx in range(start_index + 1, min(len(lines), start_index + 5))]
+        if next_lines and cls._is_short_table_cell(next_lines[0]):
+            return True
+        return sum(1 for item in next_lines if cls._is_short_table_cell(item) or cls._looks_like_quantity_cell(item)) >= 2
+
+    @classmethod
+    def _find_repeated_non_heading_lines(
+        cls,
+        page_lines_by_page: List[Tuple[int, List[str]]],
+        total_pages: int,
+    ) -> set[str]:
+        """Find header/footer-like noise lines repeated across pages that are not headings."""
+
+        if total_pages < 3:
+            return set()
+
+        pages_by_key: Dict[str, set[int]] = {}
+        for page, lines in page_lines_by_page:
+            for line in lines:
+                key = cls._normalize_repeated_line_key(line)
+                if not key or not (4 <= len(key) <= 80):
+                    continue
+                normalized = cls._strip_leading_page_number_from_heading(line)
+                if cls._matches_any_heading(normalized) or cls._is_toc_line(normalized):
+                    continue
+                pages_by_key.setdefault(key, set()).add(page)
+
+        # Roughly one occurrence per 12 pages, but never fewer than 3 distinct pages.
+        threshold = max(3, (total_pages + 11) // 12)
+        return {key for key, pages in pages_by_key.items() if len(pages) >= threshold}
+
+    @staticmethod
+    def _normalize_repeated_line_key(line: str) -> str:
+        """Build the whitespace-free key used for repeated-line detection."""
+
+        return re.sub(r"\s+", "", str(line or "").strip())
+
+    @classmethod
+    def _matches_any_heading(cls, line: str) -> bool:
+        """Return True when the text matches any chapter/section rule in RULE_LIB."""
+
+        clean_line = line.strip()
+        return any(rule["l1"].match(clean_line) or rule["l2"].match(clean_line) for rule in cls.RULE_LIB.values())
+
+    @classmethod
+    def _is_incomplete_heading_fragment(cls, line: str) -> bool:
+        """Detect a number-only/prefix-only heading fragment that awaits the next line to complete it."""
+
+        clean_line = re.sub(r"\s+", "", str(line or "").strip())
+        if not clean_line:
+            return False
+
+        # NOTE(review): the class [章部部分篇] matches single characters, so "第X部分"
+        # (two chars) cannot match as written — confirm this mirrors RULE_LIB on purpose.
+        fragment_patterns = (
+            r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部部分篇]$",
+            r"^\d{1,2}(?:[\..。、])?$",
+            r"^\d{1,2}\.\d{1,2}(?!\.\d)\.?$",
+            r"^[一二三四五六七八九十百零两]+[、))\]]$",
+            r"^第(?:\d+|[一二三四五六七八九十百零两]+)节$",
+            r"^[【\[]\d+[\]】]$",
+        )
+        return any(re.match(pattern, clean_line) for pattern in fragment_patterns)
+
+    @classmethod
+    def _is_toc_line(cls, line: str) -> bool:
+        """Return True when a text line looks like a table-of-contents entry."""
+
+        clean_line = str(line or "").strip()
+        if cls.TOC_PATTERN.search(clean_line):
+            return True
+        # Fallback: a trailing page number separated by wide whitespace.
+        return bool(re.search(r"\s{2,}\d{1,3}$", clean_line))
+
+    @classmethod
+    def _is_header_footer(cls, line: str) -> bool:
+        """Filter page numbers, headers/footers and repeated plan names out of body text."""
+
+        compact = re.sub(r"\s+", "", str(line or "").strip())
+        if not compact:
+            return False
+        if compact.isdigit():
+            return True
+        if re.fullmatch(r"第\d+页(?:共\d+页)?", compact):
+            return True
+        if re.fullmatch(r"第\d+页/共\d+页", compact):
+            return True
+        # Roman-numeral front-matter page numbers.
+        if compact.upper() in {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"}:
+            return True
+        # Keep TOC markers so the caller can still detect the contents page.
+        if compact in {"目录", "目", "录"}:
+            return False
+        normalized = cls._strip_leading_page_number_from_heading(line)
+        return not cls._matches_any_heading(normalized) and compact in {"专项方案", "施工方案"}
+
+    @classmethod
+    def _is_valid_heading_strict(cls, line: str, is_l1: bool = False) -> bool:
+        """Reject suspected false-positive headings via length, punctuation, unit and quantity rules."""
+
+        clean_line = str(line or "").strip()
+        if not (2 <= len(clean_line) <= 60):
+            return False
+        if clean_line.endswith(("、", ",", "。", ";", ":", ",", ";", ":")):
+            return False
+        if len(clean_line.split()) > 3:
+            return False
+        if re.search(r"[\((][A-Za-z\*/]+[\))]\s*$", clean_line):
+            return False
+
+        # A leading number followed by a measurement unit marks data, not a heading.
+        unit_pattern = (
+            r"(?:版|版本|年一遇|倍|t|m|kg|cm|mm|km|m2|m3|㎡|m³|L|ml|MPa|kPa|kN|Hz|kW|KV|"
+            r"千克|公斤|千米|公里|平方米|立方米|平方|立方|分钟|小时|秒|工日|台班|台套|万元|亿元)"
+        )
+        if re.match(rf"^\d+(?:\.\d+)?\s*{unit_pattern}(?:\s|$|[\u4e00-\u9fa5])", clean_line, re.IGNORECASE):
+            return False
+
+        quantity_pattern = (
+            r"(?:人|名|位|个|组|班|件|项|把|根|台|套|辆|部|架|座|块|片|张|卷|桶|袋|车|"
+            r"号|步|天|吨|箱|艘|磅|米|升|斤|两|次|条|孔|跨|排|层)"
+        )
+        if re.match(rf"^\d+(?:\.\d+)?\s*{quantity_pattern}(?:\s|$)", clean_line, re.IGNORECASE):
+            return False
+
+        if is_l1:
+            # Level-1 headings face stricter rules: small chapter numbers, no sentence punctuation.
+            if re.match(r"^0\d+", clean_line):
+                return False
+            number_match = re.search(r"^\d+|第\s*(\d+)", clean_line)
+            if number_match:
+                raw_number = number_match.group(1) or number_match.group(0)
+                if raw_number.isdigit() and int(raw_number) > 30:
+                    return False
+            if re.search(r"[,。!!,??;;::]", clean_line):
+                return False
+            if len(clean_line) > 35:
+                return False
+            if cls._looks_like_numbered_quantity_line(clean_line):
+                return False
+
+        return True
+
+    @staticmethod
+    def _looks_like_numbered_quantity_line(line: str) -> bool:
+        """Return True when a digit-led line reads like a quantity list rather than a level-1 heading."""
+
+        clean_line = re.sub(r"\s+", "", str(line or "").strip())
+        return bool(
+            re.match(
+                r"^\d+(?:号|步|天|吨|套|件|箱|把|根|辆|部|艘|块|片|张|卷|桶|袋|车|磅|米|升|斤|两|秒)",
+                clean_line,
+            )
+        )
+
+    @staticmethod
+    def _looks_like_plain_numeric_l1_noise(line: str) -> bool:
+        """Detect figure captions, standard names and role names mis-hit by the plain-numeric L1 rule."""
+
+        clean_line = re.sub(r"\s+", " ", str(line or "").strip())
+        match = re.match(r"^\d{1,2}(?:[\..。、])?\s+(.+)$", clean_line)
+        if not match:
+            return False
+
+        title = match.group(1).strip()
+        compact = re.sub(r"\s+", "", title)
+        if not compact:
+            return True
+
+        # Figure captions ("... diagram") are never chapter headings.
+        figure_terms = (
+            "示意图",
+            "布置图",
+            "断面图",
+            "构造图",
+            "大样图",
+            "详图",
+            "平面图",
+            "立面图",
+            "剖面图",
+            "流程图",
+            "曲线图",
+        )
+        if any(term in compact for term in figure_terms):
+            return True
+
+        # Cited standards/regulations, org roles, and load/seismic parameters are noise too.
+        if re.search(r"(规范|标准|规程|指南|办法|条例|规定|导则|手册|文件)$", compact):
+            return True
+
+        if re.search(r"(部|室|经理|总工|部长|主任|办公室|试验室)$", compact):
+            return True
+
+        if re.search(r"(地震动|峰值加速度|反应谱|特征周期|场地类别|荷载组合|荷载标准值|分项系数)", compact):
+            return True
+
+        chapter_keywords = (
+            "工程",
+            "编制",
+            "施工",
+            "安全",
+            "质量",
+            "环保",
+            "水保",
+            "文明",
+            "应急",
+            "验收",
+            "计算",
+            "附件",
+            "附录",
+            "总体",
+            "计划",
+            "组织",
+            "管理",
+            "保证",
+            "措施",
+            "方案",
+            "工艺",
+            "技术",
+            "要求",
+            "概况",
+            "依据",
+            "原则",
+            "资源",
+            "设备",
+            "材料",
+            "人员",
+            "进度",
+            "监测",
+            "风险",
+            "分析",
+            "检查",
+            "图纸",
+            "设计",
+            "部署",
+            "安排",
+        )
+        # Treat as noise unless the title contains a typical chapter keyword.
+        return not any(keyword in compact for keyword in chapter_keywords)
+
+    @staticmethod
+    def _is_suspicious_numeric_l2_jump(l2_sub_num: int, last_l2_sub_num: int) -> bool:
+        """Return True when a numeric subsection number jumps suspiciously far ahead (gap > 3)."""
+
+        if last_l2_sub_num <= 0:
+            return False
+        return l2_sub_num - last_l2_sub_num > 3
+
+    @staticmethod
+    def _is_short_table_cell(text: str) -> bool:
+        """Return True when text looks like a short table cell (<=4 CJK/latin chars or a bare number)."""
+
+        clean = str(text or "").strip()
+        if not clean:
+            return False
+        if len(clean) <= 4 and re.fullmatch(r"[\u4e00-\u9fa5A-Za-z]{1,4}", clean):
+            return True
+        return bool(re.fullmatch(r"\d+(?:\.\d+)?", clean))
+
+    @staticmethod
+    def _looks_like_quantity_cell(text: str) -> bool:
+        """Return True when text looks like a quantity, unit, or status table cell."""
+
+        clean = str(text or "").strip()
+        if not clean:
+            return False
+        # Common status words seen in equipment/material tables.
+        if clean in {"正常", "可使用", "若干", "大量"}:
+            return True
+        return bool(
+            re.match(
+                r"^\d+(?:\.\d+)?\s*(?:台|套|辆|部|架|座|个|件|人|m|km|cm|mm|kg|t|%)",
+                clean,
+                re.IGNORECASE,
+            )
+        )
+
+    @staticmethod
+    def _blind_strip(text: str) -> str:
+        """Crudely strip heading-number prefixes to gauge the remaining heading-core length."""
+
+        return re.sub(
+            r"^[第的一二三四五六七八九十百零两\d\.\s、))\]】\[((章节部部分篇]+",
+            "",
+            str(text or ""),
+        ).strip()
+
+    @classmethod
+    def _strip_leading_page_number_from_heading(cls, line: str) -> str:
+        """Remove a page number accidentally glued in front of a heading line."""
+
+        cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+        if not cleaned:
+            return ""
+
+        # Drop a 1-3 digit prefix only when what follows is a recognizable heading shape.
+        return re.sub(
+            r"^\d{1,3}\s+(?="
+            r"(?:第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇])|"
+            r"(?:\d{1,2}\.\d{1,2}(?!\.\d)\.?\s*[\u4e00-\u9fa5])|"
+            r"(?:\d{1,2}\s+[\u4e00-\u9fa5])|"
+            r"(?:[一二三四五六七八九十百零两]+[、))\]]\s*[\u4e00-\u9fa5])|"
+            r"(?:[【\[]\s*\d+\s*[\]】]\s*[\u4e00-\u9fa5])"
+            r")",
+            "",
+            cleaned,
+            count=1,
+        ).strip()
+
+    @classmethod
+    def _extract_l1_number(
+        cls,
+        line: str,
+        rule_name: str,
+        match_l1: re.Match[str],
+        current_l1_num: int,
+    ) -> int:
+        """Extract the chapter number from a level-1 heading; fall back to ``current_l1_num + 1``."""
+
+        if rule_name == "Rule_1_纯数字派":
+            number_match = re.match(r"^(\d+)", line)
+            # 999 acts as an out-of-range sentinel when no leading digits are found.
+            return int(number_match.group(1)) if number_match else 999
+
+        if rule_name == "Rule_2_混合章派":
+            return int(match_l1.group(1))
+
+        if rule_name == CN_LIST_L1_NUMERIC_L2_RULE:
+            cn_match = re.match(r"^([一二三四五六七八九十百零两]+)[、))\]]", line)
+            if cn_match:
+                return cls._cn_to_int(cn_match.group(1))
+
+        chapter_match = re.search(r"^第\s*(\d+|[一二三四五六七八九十百零两]+)", line)
+        if chapter_match:
+            chapter_number = chapter_match.group(1)
+            return int(chapter_number) if chapter_number.isdigit() else cls._cn_to_int(chapter_number)
+
+        return current_l1_num + 1
+
+    @classmethod
+    def _extract_non_numeric_l2_number(cls, prefix: str) -> int:
+        """Convert a non-numeric subsection prefix into an integer usable for ordering checks."""
+
+        prefix = str(prefix or "").strip()
+        if prefix.isdigit():
+            return int(prefix)
+        return cls._cn_to_int(prefix)
+
+    @classmethod
+    def _cn_to_int(cls, text: str) -> int:
+        """Convert Chinese-numeral text to an integer (0 when unparsable)."""
+
+        # "两" is a colloquial variant of "二".
+        normalized = str(text or "").replace("两", "二").strip()
+        if not normalized:
+            return 0
+        if normalized.isdigit():
+            return int(normalized)
+        if normalized == "十":
+            return 10
+        if "百" in normalized:
+            left, right = normalized.split("百", 1)
+            hundreds = cls.CN_NUM_MAP.get(left, 1) if left else 1
+            return hundreds * 100 + cls._cn_to_int(right)
+        if "十" in normalized:
+            left, right = normalized.split("十", 1)
+            tens = cls.CN_NUM_MAP.get(left, 1) if left else 1
+            ones = cls.CN_NUM_MAP.get(right, 0) if right else 0
+            return tens * 10 + ones
+        return cls.CN_NUM_MAP.get(normalized, 0)
+
+    @staticmethod
+    def _ensure_section_node(
+        structured_data: Dict[str, Dict[str, List[Dict[str, Any]]]],
+        chapter_title: str,
+        section_title: str,
+        page: int,
+    ) -> None:
+        """Ensure the chapter node and its section list exist, recording the chapter's first page."""
+
+        # "_chapter_page" is a metadata key living alongside section entries.
+        structured_data.setdefault(chapter_title, {"_chapter_page": page})  # type: ignore[assignment]
+        structured_data[chapter_title].setdefault(section_title, [])
+
+    @staticmethod
+    def _chapter_has_l2(chapter_data: Dict[str, Any]) -> bool:
+        """Return True when the temporary chapter dict already holds a real level-2 section."""
+
+        # Keys starting with "_" are metadata; SECTION_TITLE_KEY is the chapter-title slot.
+        return any(key for key in chapter_data.keys() if not key.startswith("_") and key != SECTION_TITLE_KEY)
+
+    @staticmethod
+    def _strip_catalog_page_suffix(text: str) -> str:
+        """Strip trailing dot leaders and page numbers from a catalog line."""
+
+        cleaned = re.sub(r"\s+", " ", str(text or "").strip())
+        if not cleaned:
+            return ""
+        cleaned = re.sub(r"(?:[.\u2026\u00b7\u2022]{2,})[-\u2013\u2014 ]*\d+\s*$", "", cleaned).strip()
+        return re.sub(r"\s+\d{1,3}\s*$", "", cleaned).strip()
+
+    @classmethod
+    def _clean_chapter_title(cls, line: str) -> str:
+        """Normalize level-1 heading text, keeping the numbering prefix and title body."""
+
+        cleaned = cls._strip_catalog_page_suffix(line)
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+        # "第X章/部/篇 ..." style.
+        cn_match = re.match(r"^(第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇])[\s、::.-]*(.*)$", cleaned)
+        if cn_match:
+            prefix = re.sub(r"\s+", "", cn_match.group(1))
+            title = cn_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "一、..." Chinese-list style.
+        cn_list_match = re.match(r"^([一二三四五六七八九十百零两]+[、))\]])\s*(.*)$", cleaned)
+        if cn_list_match:
+            prefix = cn_list_match.group(1).strip()
+            title = cn_list_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "1. ..." plain-numeric style.
+        num_match = re.match(r"^(\d{1,2})(?:[\..。、])?\s*(.*)$", cleaned)
+        if num_match:
+            prefix = num_match.group(1)
+            title = num_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        return cleaned
+
+    @classmethod
+    def _clean_section_title(cls, line: str) -> str:
+        """Normalize level-2 heading text, keeping the subsection numbering prefix and title body."""
+
+        cleaned = cls._strip_catalog_page_suffix(line)
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+        # "1.2 ..." numeric subsection style.
+        numeric_match = re.match(r"^(\d+\.\d+)(?!\.\d)\.?\s*(.*)$", cleaned)
+        if numeric_match:
+            prefix = numeric_match.group(1)
+            title = numeric_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "第X节 ..." style.
+        cn_section_match = re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)[\s、::.-]*(.*)$", cleaned)
+        if cn_section_match:
+            prefix = re.sub(r"\s+", "", cn_section_match.group(1))
+            title = cn_section_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "一、..." Chinese-list style.
+        cn_list_match = re.match(r"^([一二三四五六七八九十百零两]+[、))\]])\s*(.*)$", cleaned)
+        if cn_list_match:
+            prefix = cn_list_match.group(1).strip()
+            title = cn_list_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        # "【1】..." bracketed style.
+        bracket_match = re.match(r"^([【\[]\s*\d+\s*[\]】])\s*(.*)$", cleaned)
+        if bracket_match:
+            prefix = re.sub(r"\s+", "", bracket_match.group(1))
+            title = bracket_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        return cleaned
+
+    @staticmethod
+    def _resolve_chapter_page_span(sections: Dict[str, Dict[str, Any]]) -> Tuple[int, int]:
+        """Compute the chapter's overall (page_start, page_end) from all of its section spans."""
+
+        page_starts: List[int] = []
+        page_ends: List[int] = []
+        for payload in sections.values():
+            if not isinstance(payload, dict):
+                continue
+            page_start = PdfStructureExtractor._safe_page_number(payload.get("page_start"), 1)
+            page_end = PdfStructureExtractor._safe_page_number(payload.get("page_end"), page_start)
+            page_starts.append(page_start)
+            page_ends.append(page_end)
+        if not page_starts:
+            return 1, 1
+        return min(page_starts), max(page_ends)
+
+    @staticmethod
+    def _format_catalog_chapters(chapters: List[Dict[str, Any]]) -> str:
+        """Format the catalog chapter structure as human-readable plain text."""
+
+        lines: List[str] = []
+        for chapter in chapters:
+            title = str(chapter.get("title", "") or "").strip()
+            if not title:
+                continue
+            lines.append(title)
+            for subsection in chapter.get("subsections", []) or []:
+                sub_title = str(subsection.get("title", "") or "").strip()
+                if sub_title:
+                    lines.append(f"  {sub_title}")
+        return "\n".join(lines)
+
+    @staticmethod
+    def _safe_page_number(value: Any, default: int = 1) -> int:
+        """Safely coerce a page value to an int no smaller than 1, falling back to ``default``."""
+
+        try:
+            return max(1, int(str(value).strip()))
+        except Exception:
+            return default

+ 946 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -0,0 +1,946 @@
+from __future__ import annotations
+
+r"""
+Batch runner for PDF structure extraction.
+
+Example commands:
+
+1. Run the original extractor:
+python core\construction_review\component\minimal_pipeline\pdf_extractor_batch_runner.py c:\work\桥梁公司施工规范\最终 --output-dir c:\work\桥梁公司施工规范\结果_v1 --recursive --extractor pdf_extractor
+
+2. Run the testc-style body-only extractor:
+python core\construction_review\component\minimal_pipeline\pdf_extractor_batch_runner.py c:\work\桥梁公司施工规范\最终 --output-dir c:\work\桥梁公司施工规范\结果_v1 --recursive --extractor pdf_extractor1
+
+3. Run the split-catalog extractor:
+python core\construction_review\component\minimal_pipeline\pdf_extractor_batch_runner.py c:\work\桥梁公司施工规范\最终 --output-dir c:\work\桥梁公司施工规范\结果_v2 --recursive --extractor pdf_extractor2
+"""
+
+import argparse
+import importlib.util
+import json
+import re
+import sys
+import types
+from datetime import datetime
+from difflib import SequenceMatcher
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple
+
+import fitz
+
+
+REPO_ROOT = Path(__file__).resolve().parents[4]
+MODULE_DIR = Path(__file__).resolve().parent
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+
+# Pseudo-section keys injected by the extractor, not real document sections.
+SPECIAL_SECTION_KEYS = {"章节标题", "默认部分"}
+STAT_FILE_NAME = "static.text"
+# Dot/dash leaders ending in a page number mark TOC-style lines.
+TOC_LINE_PATTERN = re.compile(r"(?:[.\u2026·•…]{2,}|-{3,}).{0,30}\d+\s*$")
+TOC_PAGE_SUFFIX_PATTERN = re.compile(
+    r"(?:[.\u2026\u00b7\u2022·•…]{2,}|-{3,})[-\u2013\u2014 ]*(?:-\s*)?\d{1,3}(?:\s*-)?\s*$"
+)
+# Heading shapes used to spot the first body page after the TOC.
+BODY_HEADING_PATTERNS = (
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇][\s、::.\-]*\S+"),
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇]\s*$"),
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、::.\-]*\S+"),
+    re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].{1,40}$"),
+    re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].{1,40}$"),
+)
+# Level-1 / level-2 catalog-line shapes for parsing a front-matter TOC.
+CATALOG_L1_PATTERNS = (
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇][\s、::.\-]*\S+"),
+    re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].{1,60}$"),
+)
+CATALOG_L2_PATTERNS = (
+    re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*[\u4e00-\u9fa5A-Za-z].*"),
+    re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、::.\-]*\S+"),
+    re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].*"),
+    re.compile(r"^[【\[]\s*\d+\s*[\]】]\s*[\u4e00-\u9fa5A-Za-z].*"),
+)
+CATALOG_CN_LIST_PATTERN = re.compile(r"^[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z].*")
+CATALOG_NUMERIC_SECTION_PATTERN = re.compile(r"^\d+\.\d+(?!\.\d)\.?\s*[\u4e00-\u9fa5A-Za-z].*")
+
+
+class _SilentLogger:
+    """No-op logger standing in for the project logger during batch runs."""
+
+    def debug(self, *args, **kwargs) -> None:
+        pass
+
+    def info(self, *args, **kwargs) -> None:
+        pass
+
+    def warning(self, *args, **kwargs) -> None:
+        pass
+
+    def error(self, *args, **kwargs) -> None:
+        pass
+
+    def exception(self, *args, **kwargs) -> None:
+        pass
+
+    def critical(self, *args, **kwargs) -> None:
+        pass
+
+
+def install_silent_logger_module() -> None:
+    """Register stub ``foundation.observability.logger.loggering`` modules in sys.modules.
+
+    Lets the extractor import the project logger without pulling in the real
+    logging stack; idempotent if the module is already present.
+    """
+
+    module_name = "foundation.observability.logger.loggering"
+    if module_name in sys.modules:
+        return
+
+    silent_logger = _SilentLogger()
+
+    # Build the parent package chain foundation -> observability -> logger as needed.
+    foundation_module = sys.modules.get("foundation")
+    if foundation_module is None:
+        foundation_module = types.ModuleType("foundation")
+        foundation_module.__path__ = [str(REPO_ROOT / "foundation")]
+        sys.modules["foundation"] = foundation_module
+
+    observability_module = sys.modules.get("foundation.observability")
+    if observability_module is None:
+        observability_module = types.ModuleType("foundation.observability")
+        observability_module.__path__ = []
+        sys.modules["foundation.observability"] = observability_module
+        foundation_module.observability = observability_module
+
+    logger_package = sys.modules.get("foundation.observability.logger")
+    if logger_package is None:
+        logger_package = types.ModuleType("foundation.observability.logger")
+        logger_package.__path__ = []
+        sys.modules["foundation.observability.logger"] = logger_package
+        observability_module.logger = logger_package
+
+    loggering_module = types.ModuleType(module_name)
+    loggering_module.review_logger = silent_logger
+    loggering_module.server_logger = silent_logger
+    loggering_module.CompatibleLogger = _SilentLogger
+    sys.modules[module_name] = loggering_module
+    logger_package.loggering = loggering_module
+
+
+def ensure_local_package(package_name: str) -> None:
+    """Register a synthetic package rooted at this module's directory, if not already present."""
+
+    if package_name in sys.modules:
+        return
+
+    package_module = types.ModuleType(package_name)
+    package_module.__path__ = [str(MODULE_DIR)]
+    sys.modules[package_name] = package_module
+
+
+def load_module_from_file(module_name: str, file_path: Path) -> Any:
+    """Import ``file_path`` as ``module_name`` via importlib and return the module.
+
+    Raises ImportError when no loader can be built for the file.
+    """
+
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Unable to load module from: {file_path}")
+
+    module = importlib.util.module_from_spec(spec)
+    # Register before exec so intra-module relative imports resolve.
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+def load_pdf_structure_extractor(extractor_module: str = "pdf_extractor") -> Any:
+    """Load the PdfStructureExtractor class from the chosen extractor module.
+
+    Installs the silent logger stub and a local package shim first, and loads
+    ocr_processor ahead of the extractor. Raises FileNotFoundError when the
+    requested extractor file does not exist.
+    """
+
+    install_silent_logger_module()
+
+    package_name = "_batch_local_minimal_pipeline"
+    ensure_local_package(package_name)
+
+    ocr_module_name = f"{package_name}.ocr_processor"
+    if ocr_module_name not in sys.modules:
+        load_module_from_file(ocr_module_name, MODULE_DIR / "ocr_processor.py")
+
+    module_stem = extractor_module.removesuffix(".py")
+    module_file = MODULE_DIR / f"{module_stem}.py"
+    if not module_file.exists():
+        raise FileNotFoundError(f"Extractor module not found: {module_file}")
+
+    pdf_module_name = f"{package_name}.{module_stem}"
+    if pdf_module_name not in sys.modules:
+        load_module_from_file(pdf_module_name, module_file)
+
+    return sys.modules[pdf_module_name].PdfStructureExtractor
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments for the batch extraction run."""
+
+    parser = argparse.ArgumentParser(
+        description="Batch extract PDF structure with PdfStructureExtractor."
+    )
+    parser.add_argument(
+        "input_dir",
+        nargs="?",
+        default=".",
+        help="Directory containing PDF files. Default: current directory.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        default=None,
+        help="Directory for JSON outputs. Default: same directory as each PDF.",
+    )
+    parser.add_argument(
+        "--recursive",
+        action="store_true",
+        help="Scan PDF files recursively.",
+    )
+    parser.add_argument(
+        "--use-ocr",
+        action="store_true",
+        help="Enable OCR for table regions.",
+    )
+    parser.add_argument(
+        "--disable-toc",
+        action="store_true",
+        help="Disable TOC detection.",
+    )
+    parser.add_argument(
+        "--clip-top",
+        type=float,
+        default=60.0,
+        help="Top clip margin in points. Default: 60.",
+    )
+    parser.add_argument(
+        "--clip-bottom",
+        type=float,
+        default=60.0,
+        help="Bottom clip margin in points. Default: 60.",
+    )
+    parser.add_argument(
+        "--ocr-api-url",
+        default="http://183.220.37.46:25429/v1/chat/completions",
+        help="OCR API URL.",
+    )
+    parser.add_argument(
+        "--ocr-timeout",
+        type=int,
+        default=600,
+        help="OCR timeout in seconds. Default: 600.",
+    )
+    parser.add_argument(
+        "--ocr-api-key",
+        default="",
+        help="OCR API key.",
+    )
+    parser.add_argument(
+        "--toc-model-path",
+        default="config/yolo/best.pt",
+        help="TOC detector model path.",
+    )
+    parser.add_argument(
+        "--extractor",
+        default="pdf_extractor",
+        choices=["pdf_extractor", "pdf_extractor1", "pdf_extractor2"],
+        help="Extractor implementation to run. Default: pdf_extractor.",
+    )
+    return parser.parse_args()
+
+
+def iter_pdf_files(input_dir: Path, recursive: bool) -> List[Path]:
+    """Return all PDF files under ``input_dir`` (optionally recursive), sorted by path."""
+
+    pattern = "*.pdf"
+    files: Iterable[Path] = input_dir.rglob(pattern) if recursive else input_dir.glob(pattern)
+    return sorted(path for path in files if path.is_file())
+
+
+def _count_text_chars(text: str) -> int:
+    """Count characters over non-blank, stripped lines of ``text``."""
+    return sum(len(line.strip()) for line in text.splitlines() if line.strip())
+
+
+def _looks_like_toc_page(text: str) -> bool:
+    """Return True when a page's text looks like a table-of-contents page."""
+
+    lines = [line.strip() for line in (text or "").splitlines() if line.strip()]
+    if not lines:
+        return False
+
+    # Explicit TOC marker (simplified or traditional).
+    compact = re.sub(r"\s+", "", "\n".join(lines))
+    if "目录" in compact or "目錄" in compact:
+        return True
+
+    # Otherwise require at least 3 leader-dot / page-number style lines.
+    toc_like_count = sum(
+        1
+        for line in lines
+        if TOC_LINE_PATTERN.search(line) or TOC_PAGE_SUFFIX_PATTERN.search(line)
+    )
+    return toc_like_count >= 3
+
+
+def _strip_leading_page_number_for_heading(line: str) -> str:
+    """Strip a page number accidentally prefixed to a heading-like line."""
+
+    cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+    if not cleaned:
+        return ""
+
+    # Drop a 1-3 digit prefix only when a recognizable heading shape follows.
+    return re.sub(
+        r"^\d{1,3}\s+(?="
+        r"(?:第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])|"
+        r"(?:第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节)|"
+        r"(?:\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z])|"
+        r"(?:[一二三四五六七八九十百零两]+[、)\)\]]\s*[\u4e00-\u9fa5A-Za-z])"
+        r")",
+        "",
+        cleaned,
+        count=1,
+    ).strip()
+
+
+def _looks_like_body_start_page(text: str) -> bool:
+    """Return True when the page text contains a body-style heading line (non-TOC)."""
+
+    for raw_line in (text or "").splitlines():
+        line = _strip_leading_page_number_for_heading(raw_line)
+        if not line or TOC_LINE_PATTERN.search(line):
+            continue
+        if any(pattern.match(line) for pattern in BODY_HEADING_PATTERNS):
+            return True
+    return False
+
+
+def _find_count_start_page_index(page_texts: List[str]) -> int:
+    """Locate the index of the first body-like page after the TOC (0 when no TOC is found)."""
+
+    toc_start_index: int | None = None
+    for index, text in enumerate(page_texts):
+        if _looks_like_toc_page(text):
+            toc_start_index = index
+            break
+
+    # Start scanning for body pages right after the first TOC page, if any.
+    search_start = toc_start_index + 1 if toc_start_index is not None else 0
+    for index, text in enumerate(page_texts[search_start:], search_start):
+        if _looks_like_toc_page(text):
+            continue
+        if _looks_like_body_start_page(text):
+            return index
+
+    return search_start if toc_start_index is not None else 0
+
+
+def _strip_catalog_count_line(line: str) -> str:
+    """Strip leaders/page-number suffixes from a catalog line (used before counting entries)."""
+
+    cleaned = re.sub(r"\s+", " ", str(line or "").strip())
+    if not cleaned:
+        return ""
+
+    page_match = TOC_PAGE_SUFFIX_PATTERN.search(cleaned)
+    if page_match:
+        return cleaned[:page_match.start()].strip(" .\u2026\u00b7\u2022-\u2013\u2014")
+
+    # Fallback: trailing page number separated by wide whitespace.
+    return re.sub(r"\s{2,}\d{1,3}\s*$", "", cleaned).strip()
+
+
+def _iter_front_catalog_lines(pdf_path: Path, clip_top: float, clip_bottom: float) -> List[str]:
+    catalog_texts: List[str] = []
+    saw_catalog = False
+
+    with fitz.open(pdf_path) as doc:
+        for page_index in range(min(len(doc), 12)):
+            page = doc.load_page(page_index)
+            rect = page.rect
+            clip_box = fitz.Rect(0, clip_top, rect.width, rect.height - clip_bottom)
+            text = page.get_text("text", clip=clip_box)
+            if _looks_like_toc_page(text):
+                saw_catalog = True
+                catalog_texts.append(text)
+                continue
+            if saw_catalog:
+                break
+
+    lines: List[str] = []
+    for text in catalog_texts:
+        for raw_line in (text or "").splitlines():
+            line = _strip_catalog_count_line(raw_line)
+            if not line:
+                continue
+            compact = re.sub(r"\s+", "", line)
+            if compact in {"目录", "目", "录"}:
+                continue
+            lines.append(line)
+    return _merge_split_catalog_heading_lines(lines)
+
+
+def _merge_split_catalog_heading_lines(lines: List[str]) -> List[str]:
+    """Merge catalog lines where a bare "第X章" prefix was split from its title text."""
+
+    merged: List[str] = []
+    index = 0
+    split_chapter_pattern = re.compile(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部部分篇]\s*$")
+
+    while index < len(lines):
+        line = lines[index]
+        if split_chapter_pattern.match(line) and index + 1 < len(lines):
+            next_line = lines[index + 1].strip()
+            # Only merge when the next line is not itself a level-2 entry.
+            if next_line and not any(pattern.match(next_line) for pattern in CATALOG_L2_PATTERNS):
+                merged.append(f"{line} {next_line}")
+                index += 2
+                continue
+        merged.append(line)
+        index += 1
+
+    return merged
+
+
def _classify_catalog_line_level(
    line: str,
    next_line: str,
    saw_explicit_l1: bool,
) -> int | None:
    """Return 1 or 2 for a chapter/section catalog line, or None if unmatched.

    A Chinese enumerator line ("一、…") is promoted to level 1 only while no
    explicit chapter heading has been seen yet and the next line is a numeric
    section ("1.1 …"); otherwise it is treated as level 2.
    """
    if any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
        return 1

    if CATALOG_CN_LIST_PATTERN.match(line):
        promote = not saw_explicit_l1 and CATALOG_NUMERIC_SECTION_PATTERN.match(next_line)
        return 1 if promote else 2

    is_section = any(pattern.match(line) for pattern in CATALOG_L2_PATTERNS)
    return 2 if is_section else None
+
+
def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom: float) -> List[Dict[str, Any]]:
    """Parse the PDF's printed table of contents into {"level", "title"} items."""
    catalog_lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
    entries: List[Dict[str, Any]] = []
    has_explicit_l1 = False

    for position, current_line in enumerate(catalog_lines):
        following = catalog_lines[position + 1] if position + 1 < len(catalog_lines) else ""
        level = _classify_catalog_line_level(current_line, following, has_explicit_l1)
        if level is None:
            continue

        entries.append({"level": level, "title": current_line})
        # Only an explicit chapter pattern (not a promoted enumerator) flips the flag.
        if level == 1 and any(pattern.match(current_line) for pattern in CATALOG_L1_PATTERNS):
            has_explicit_l1 = True

    return entries
+
+
def count_catalog_item_levels(items: List[Dict[str, Any]]) -> Tuple[int, int]:
    """Return a (chapter_count, section_count) pair for catalog entries."""
    chapter_total = 0
    section_total = 0
    for entry in items:
        level = entry.get("level")
        if level == 1:
            chapter_total += 1
        elif level == 2:
            section_total += 1
    return chapter_total, section_total
+
+
def compute_original_catalog_counts(pdf_path: Path, clip_top: float, clip_bottom: float) -> Tuple[int, int]:
    """Count (chapter, section) entries in the PDF's own printed catalog."""
    items = extract_original_catalog_items(pdf_path, clip_top, clip_bottom)
    return count_catalog_item_levels(items)
+
+
def compute_raw_char_count_with_scope(pdf_path: Path, clip_top: float, clip_bottom: float) -> Tuple[int, int]:
    """Count body characters in the PDF, skipping leading cover/TOC pages.

    Returns (character_count, 1-based page number where counting started).
    """
    page_texts: List[str] = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            bounds = page.rect
            clip = fitz.Rect(0, clip_top, bounds.width, bounds.height - clip_bottom)
            page_texts.append(page.get_text("text", clip=clip))

    first_index = _find_count_start_page_index(page_texts)
    char_total = sum(_count_text_chars(text) for text in page_texts[first_index:])
    return char_total, first_index + 1
+
+
def compute_raw_char_count(pdf_path: Path, clip_top: float, clip_bottom: float) -> int:
    """Body character count only; the counting-scope start page is discarded."""
    char_total, _start_page = compute_raw_char_count_with_scope(pdf_path, clip_top, clip_bottom)
    return char_total
+
+
def compute_extracted_char_count(result: Dict[str, Any]) -> int:
    """Sum the characters captured by extraction: titles plus section content.

    Special bookkeeping keys (``SPECIAL_SECTION_KEYS``) do not count as
    section titles; section payloads may be dicts with a "content" field or
    plain strings.
    """
    total_chars = 0
    for raw_chapter_title, sections in (result.get("chapters", {}) or {}).items():
        # len("") == 0, so adding unconditionally matches the guarded original.
        total_chars += len(str(raw_chapter_title or "").strip())

        if not isinstance(sections, dict):
            continue

        for raw_section_title, payload in sections.items():
            title = str(raw_section_title or "").strip()
            if title and title not in SPECIAL_SECTION_KEYS:
                total_chars += len(title)

            if isinstance(payload, dict):
                body = str(payload.get("content", "") or "").strip()
            else:
                body = str(payload or "").strip()
            total_chars += _count_text_chars(body)
    return total_chars
+
+
def compute_quality_rate(raw_char_count: int, extracted_char_count: int) -> Tuple[float, str]:
    """Return the extraction ratio clamped to [0, 1] plus its percent label."""
    if raw_char_count <= 0:
        return 0.0, "0.0%"

    ratio = min(1.0, max(0.0, extracted_char_count / raw_char_count))
    return ratio, f"{ratio * 100:.1f}%"
+
+
def count_sections(result: Dict[str, Any]) -> int:
    """Count real (non-special) section entries across all chapters."""
    section_total = 0
    for sections in (result.get("chapters", {}) or {}).values():
        if isinstance(sections, dict):
            section_total += sum(1 for name in sections if name not in SPECIAL_SECTION_KEYS)
    return section_total
+
+
+def _catalog_title_from_entry(entry: Any) -> str:
+    if isinstance(entry, str):
+        return entry.strip()
+    if not isinstance(entry, dict):
+        return ""
+
+    for key in ("title", "name", "text", "chapter_title", "section_title", "heading", "original"):
+        value = str(entry.get(key, "") or "").strip()
+        if value:
+            return value
+    return ""
+
+
def _iter_catalog_subsections(chapter: Dict[str, Any]) -> Iterable[Any]:
    """Yield a chapter's subsection entries from either list or dict shape.

    Dict-shaped subsections are normalized: titled dict values pass through,
    everything else becomes ``{"title": key}``.
    """
    raw = chapter.get("subsections")
    if raw is None:
        raw = chapter.get("sections")

    if isinstance(raw, list):
        return raw
    if isinstance(raw, dict):
        normalized: List[Any] = []
        for key, value in raw.items():
            if isinstance(value, dict) and _catalog_title_from_entry(value):
                normalized.append(value)
            else:
                normalized.append({"title": key})
        return normalized
    return []
+
+
def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten the extractor's catalog into {"level", "title"} items.

    Prefers an explicit ``catalog``/``body_catalog`` chapter list; otherwise
    derives items from the ``chapters`` mapping, skipping special keys.
    """
    flattened: List[Dict[str, Any]] = []
    catalog = result.get("catalog") or result.get("body_catalog") or {}

    if isinstance(catalog, dict) and isinstance(catalog.get("chapters"), list):
        for chapter in catalog.get("chapters") or []:
            if not isinstance(chapter, dict):
                continue

            heading = _catalog_title_from_entry(chapter)
            if heading:
                flattened.append({"level": 1, "title": heading})

            for subsection in _iter_catalog_subsections(chapter):
                sub_heading = _catalog_title_from_entry(subsection)
                if sub_heading:
                    flattened.append({"level": 2, "title": sub_heading})

        return flattened

    # Fallback: derive the catalog from the extracted chapters mapping.
    chapters = result.get("chapters", {}) or {}
    if not isinstance(chapters, dict):
        return flattened

    for raw_chapter_title, sections in chapters.items():
        heading = str(raw_chapter_title or "").strip()
        if heading:
            flattened.append({"level": 1, "title": heading})

        if not isinstance(sections, dict):
            continue
        for raw_section_title in sections:
            sub_heading = str(raw_section_title or "").strip()
            if sub_heading and sub_heading not in SPECIAL_SECTION_KEYS:
                flattened.append({"level": 2, "title": sub_heading})

    return flattened
+
+
def count_extracted_catalog_items(result: Dict[str, Any]) -> Tuple[int, int]:
    """(chapter, section) counts for the extractor's reconstructed catalog."""
    items = extract_result_catalog_items(result)
    return count_catalog_item_levels(items)
+
+
def _normalize_catalog_title(text: str) -> str:
    """Canonicalize a heading for fuzzy comparison.

    Full-width punctuation is mapped to ASCII, all whitespace and
    leader/markup characters are removed, and the result is lower-cased.
    """
    fullwidth_map = str.maketrans({
        "\uff08": "(",
        "\uff09": ")",
        "\uff0c": ",",
        "\uff0e": ".",
        "\u3002": ".",
        "\u3001": ",",
        "\uff1a": ":",
        "\uff1b": ";",
        "\u3000": " ",
    })
    normalized = _strip_catalog_count_line(str(text or "")).translate(fullwidth_map)
    normalized = re.sub(r"\s+", "", normalized)
    normalized = re.sub(r"[\u00b7\u2022\u2026.,:;_\-()\[\]{}<>/\\]+", "", normalized)
    return normalized.lower()
+
+
def _strip_catalog_heading_prefix(text: str) -> str:
    """Drop a leading chapter/section numbering prefix from a heading.

    Tries each known prefix pattern once, in order, and returns as soon as
    one of them matched; otherwise returns the cleaned heading unchanged.
    """
    base = _strip_catalog_count_line(str(text or "")).strip()
    cn_num = r"\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u96f6\u4e24"
    prefix_patterns = (
        rf"^第\s*(?:\d+|[{cn_num}]+)\s*[章部部分篇]\s*[、,,..。::\-\s]*",
        rf"^第\s*(?:\d+|[{cn_num}]+)\s*节\s*[、,,..。::\-\s]*",
        r"^\d+(?:\.\d+){0,3}\.?\s*",
        rf"^[{cn_num}]+[、,,))\]]\s*",
        r"^[【\[]\s*\d+\s*[】\]]\s*",
    )
    for prefix_pattern in prefix_patterns:
        candidate = re.sub(prefix_pattern, "", base, count=1)
        if candidate != base:
            return candidate.strip()
    return base
+
+
def _catalog_title_similarity(left: str, right: str) -> float:
    """Fuzzy similarity in [0, 1] between two catalog headings.

    Both the fully normalized titles and their prefix-stripped bodies are
    compared; exact matches score 1.0, containment of reasonably long
    strings 0.95, and otherwise the best SequenceMatcher ratio wins.
    """
    norm_left = _normalize_catalog_title(left)
    norm_right = _normalize_catalog_title(right)
    if not norm_left or not norm_right:
        return 0.0
    if norm_left == norm_right:
        return 1.0

    candidates = [SequenceMatcher(None, norm_left, norm_right).ratio()]

    body_left = _normalize_catalog_title(_strip_catalog_heading_prefix(left))
    body_right = _normalize_catalog_title(_strip_catalog_heading_prefix(right))
    if body_left and body_right:
        if body_left == body_right:
            candidates.append(1.0)
        elif min(len(body_left), len(body_right)) >= 4 and (
            body_left in body_right or body_right in body_left
        ):
            candidates.append(0.95)
        else:
            candidates.append(SequenceMatcher(None, body_left, body_right).ratio())

    if min(len(norm_left), len(norm_right)) >= 4 and (
        norm_left in norm_right or norm_right in norm_left
    ):
        candidates.append(0.95)

    return max(candidates)
+
+
+def _longest_increasing_subsequence_length(values: List[int]) -> int:
+    if not values:
+        return 0
+
+    lengths = [1] * len(values)
+    for index, value in enumerate(values):
+        for prev_index in range(index):
+            if values[prev_index] < value:
+                lengths[index] = max(lengths[index], lengths[prev_index] + 1)
+    return max(lengths)
+
+
+def _catalog_count_score(original_count: int, extracted_count: int) -> float:
+    max_count = max(original_count, extracted_count)
+    if max_count <= 0:
+        return 1.0
+    return min(original_count, extracted_count) / max_count
+
+
def _match_catalog_level(
    original_items: List[Dict[str, Any]],
    extracted_items: List[Dict[str, Any]],
    level: int,
) -> Dict[str, Any]:
    """Greedily match original vs. extracted catalog titles at one level.

    Each original title claims its best-scoring unused extracted title when
    similarity clears the level threshold (0.82 for chapters, 0.78 for
    sections).  Returns counts, precision/recall/F1, a count-agreement score
    and an order score (how well matched extracted indexes preserve original
    order, measured via longest increasing subsequence).
    """
    level_originals = [entry for entry in original_items if entry.get("level") == level]
    level_extracted = [entry for entry in extracted_items if entry.get("level") == level]
    min_score = 0.82 if level == 1 else 0.78

    claimed: set[int] = set()
    pairs: List[Dict[str, Any]] = []
    for source_pos, source in enumerate(level_originals):
        source_title = str(source.get("title", "") or "")
        top_score = 0.0
        top_pos = -1
        for candidate_pos, candidate in enumerate(level_extracted):
            if candidate_pos in claimed:
                continue
            similarity = _catalog_title_similarity(source_title, str(candidate.get("title", "") or ""))
            if similarity > top_score:
                top_score = similarity
                top_pos = candidate_pos

        if top_pos >= 0 and top_score >= min_score:
            claimed.add(top_pos)
            pairs.append({
                "original_index": source_pos,
                "extracted_index": top_pos,
                "score": top_score,
            })

    n_original = len(level_originals)
    n_extracted = len(level_extracted)
    n_matched = len(pairs)
    precision = n_matched / n_extracted if n_extracted else (1.0 if n_original == 0 else 0.0)
    recall = n_matched / n_original if n_original else (1.0 if n_extracted == 0 else 0.0)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    matched_sequence = [
        pair["extracted_index"]
        for pair in sorted(pairs, key=lambda pair: pair["original_index"])
    ]
    if n_matched:
        order_score = _longest_increasing_subsequence_length(matched_sequence) / n_matched
    else:
        order_score = 1.0 if n_original == 0 and n_extracted == 0 else 0.0

    return {
        "original": n_original,
        "extracted": n_extracted,
        "matched": n_matched,
        "precision": precision,
        "recall": recall,
        "title_f1": f1,
        "count_score": _catalog_count_score(n_original, n_extracted),
        "order_score": order_score,
    }
+
+
+def _weighted_catalog_score(level_details: Dict[str, Dict[str, Any]], metric: str) -> float:
+    weighted_scores: List[Tuple[float, float]] = []
+    if max(level_details["chapter"]["original"], level_details["chapter"]["extracted"]) > 0:
+        weighted_scores.append((0.35, float(level_details["chapter"][metric])))
+    if max(level_details["section"]["original"], level_details["section"]["extracted"]) > 0:
+        weighted_scores.append((0.65, float(level_details["section"][metric])))
+
+    if not weighted_scores:
+        return 0.0
+
+    total_weight = sum(weight for weight, _ in weighted_scores)
+    return sum(weight * score for weight, score in weighted_scores) / total_weight
+
+
+def _round_catalog_detail(value: Any) -> Any:
+    if isinstance(value, float):
+        return round(value, 4)
+    if isinstance(value, dict):
+        return {key: _round_catalog_detail(item) for key, item in value.items()}
+    return value
+
+
def compute_catalog_quality_rate_from_items(
    original_items: List[Dict[str, Any]],
    extracted_items: List[Dict[str, Any]],
) -> Tuple[float, str, Dict[str, Any]]:
    """Score catalog extraction quality: 70% title F1, 20% counts, 10% order.

    Returns (rate clamped to [0, 1], "xx.x%" label, rounded diagnostic detail).
    """
    level_details = {
        "chapter": _match_catalog_level(original_items, extracted_items, 1),
        "section": _match_catalog_level(original_items, extracted_items, 2),
    }
    component_scores = {
        metric: _weighted_catalog_score(level_details, metric)
        for metric in ("title_f1", "count_score", "order_score")
    }
    combined = (
        0.70 * component_scores["title_f1"]
        + 0.20 * component_scores["count_score"]
        + 0.10 * component_scores["order_score"]
    )
    combined = min(1.0, max(0.0, combined))

    detail = {
        "score_model": "title_f1_70_count_20_order_10",
        "title_score": component_scores["title_f1"],
        "count_score": component_scores["count_score"],
        "order_score": component_scores["order_score"],
        "level_details": level_details,
    }
    return combined, f"{combined * 100:.1f}%", _round_catalog_detail(detail)
+
+
def append_static_record(
    stat_path: Path,
    pdf_path: Path,
    original_l1_count: int,
    extracted_l1_count: int,
    original_l2_count: int,
    extracted_l2_count: int,
    catalog_quality_rate_text: str,
    content_quality_rate_text: str,
) -> None:
    """Append one TSV stats row for this PDF.

    The header row is written first when the stats file is new or empty.
    """
    stat_path.parent.mkdir(parents=True, exist_ok=True)
    write_header = not stat_path.exists() or stat_path.stat().st_size == 0
    row = "\t".join((
        pdf_path.name,
        f"{original_l1_count}/{extracted_l1_count}",
        f"{original_l2_count}/{extracted_l2_count}",
        catalog_quality_rate_text,
        content_quality_rate_text,
    ))
    with stat_path.open("a", encoding="utf-8", newline="") as file:
        if write_header:
            file.write("文件名\t一级目录(原PDF/提取)\t二级目录(原PDF/提取)\t目录合格率\t内容合格率\n")
        file.write(row + "\n")
+
+
def sanitize_filename_component(value: str) -> str:
    """Replace filesystem-unsafe characters with "_"; empty input -> "output"."""
    forbidden = '<>:"/\\|?*'
    sanitized = "".join("_" if char in forbidden else char for char in value.strip())
    return sanitized or "output"
+
+
def build_output_path(
    pdf_path: Path,
    input_dir: Path,
    output_dir: Path | None,
    quality_rate_text: str,
) -> Path:
    """Choose the JSON output path for one PDF.

    With no ``output_dir`` the JSON lands beside the PDF; otherwise the
    PDF's path relative to ``input_dir`` is flattened with "__" so nested
    inputs map to unique filenames.  The quality-rate label is prefixed.
    """
    if output_dir is None:
        destination = pdf_path.parent
        raw_stem = pdf_path.stem
    else:
        destination = output_dir
        try:
            relative = pdf_path.relative_to(input_dir).with_suffix("")
            raw_stem = "__".join(relative.parts)
        except ValueError:
            # PDF lives outside input_dir; fall back to its bare stem.
            raw_stem = pdf_path.stem

    destination.mkdir(parents=True, exist_ok=True)
    return destination / f"{quality_rate_text}_{sanitize_filename_component(raw_stem)}.json"
+
+
def build_output_payload(
    pdf_path: Path,
    extractor_result: Dict[str, Any],
    raw_char_count: int,
    raw_char_count_start_page: int,
    extracted_char_count: int,
    quality_rate_text: str,
    use_ocr: bool,
    detect_toc: bool,
    extractor_name: str,
) -> Dict[str, Any]:
    """Wrap the extractor result with a metadata header for JSON output."""
    chapters = extractor_result.get("chapters", {}) or {}
    catalog = extractor_result.get("catalog") or {}
    catalog_chapter_count = catalog.get("total_chapters", 0) if isinstance(catalog, dict) else 0
    metadata = {
        "filename": pdf_path.name,
        "source_path": str(pdf_path),
        "generated_at": datetime.now().isoformat(timespec="seconds"),
        "quality_rate": quality_rate_text,
        "raw_char_count": raw_char_count,
        "raw_char_count_scope": "body_only",
        "raw_char_count_start_page": raw_char_count_start_page,
        "extracted_char_count": extracted_char_count,
        "chapter_count": len(chapters),
        "section_count": count_sections(extractor_result),
        "total_pages": extractor_result.get("total_pages", 0),
        "catalog_chapter_count": catalog_chapter_count,
        "use_ocr": use_ocr,
        "detect_toc": detect_toc,
        "extractor": extractor_name,
    }
    return {"metadata": metadata, "extracted_result": extractor_result}
+
+
def process_pdf(
    pdf_path: Path,
    input_dir: Path,
    output_dir: Path | None,
    extractor: Any,
    clip_top: float,
    clip_bottom: float,
    use_ocr: bool,
    detect_toc: bool,
    extractor_name: str,
) -> Tuple[Path, str]:
    """Run extraction on one PDF and persist the JSON payload plus a stats row.

    Returns (output JSON path, content quality-rate label such as "95.1%").
    Filesystem and extractor errors propagate; the caller handles them.
    """
    # Ground-truth measurements taken directly from the PDF.
    raw_char_count, raw_char_count_start_page = compute_raw_char_count_with_scope(pdf_path, clip_top, clip_bottom)
    original_catalog_items = extract_original_catalog_items(pdf_path, clip_top, clip_bottom)
    original_l1_count, original_l2_count = count_catalog_item_levels(original_catalog_items)

    # Run the structural extractor and score its output against the ground truth.
    file_content = pdf_path.read_bytes()
    extractor_result = extractor.extract(file_content)
    extracted_char_count = compute_extracted_char_count(extractor_result)
    _, quality_rate_text = compute_quality_rate(raw_char_count, extracted_char_count)
    extracted_catalog_items = extract_result_catalog_items(extractor_result)
    extracted_l1_count, extracted_l2_count = count_catalog_item_levels(extracted_catalog_items)
    _, catalog_quality_rate_text, catalog_quality_detail = compute_catalog_quality_rate_from_items(
        original_items=original_catalog_items,
        extracted_items=extracted_catalog_items,
    )

    payload = build_output_payload(
        pdf_path=pdf_path,
        extractor_result=extractor_result,
        raw_char_count=raw_char_count,
        raw_char_count_start_page=raw_char_count_start_page,
        extracted_char_count=extracted_char_count,
        quality_rate_text=quality_rate_text,
        use_ocr=use_ocr,
        detect_toc=detect_toc,
        extractor_name=extractor_name,
    )
    payload["metadata"].update({
        "original_catalog_chapter_count": original_l1_count,
        "original_catalog_section_count": original_l2_count,
        "extracted_catalog_chapter_count": extracted_l1_count,
        "extracted_catalog_section_count": extracted_l2_count,
        "catalog_quality_rate": catalog_quality_rate_text,
        "catalog_quality_detail": catalog_quality_detail,
    })

    output_path = build_output_path(pdf_path, input_dir, output_dir, quality_rate_text)
    output_path.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    append_static_record(
        stat_path=output_path.parent / STAT_FILE_NAME,
        pdf_path=pdf_path,
        original_l1_count=original_l1_count,
        extracted_l1_count=extracted_l1_count,
        original_l2_count=original_l2_count,
        extracted_l2_count=extracted_l2_count,
        catalog_quality_rate_text=catalog_quality_rate_text,
        content_quality_rate_text=quality_rate_text,
    )
    return output_path, quality_rate_text
+
+
def main() -> int:
    """CLI entry point: extract every PDF found under the input directory.

    Exit codes: 0 when all files succeed, 1 for bad input, 2 when some fail.
    """
    args = parse_args()
    input_dir = Path(args.input_dir).expanduser().resolve()
    output_dir = Path(args.output_dir).expanduser().resolve() if args.output_dir else None

    if not (input_dir.exists() and input_dir.is_dir()):
        print(f"[ERROR] Input directory does not exist: {input_dir}")
        return 1

    pdf_files = iter_pdf_files(input_dir, args.recursive)
    if not pdf_files:
        print(f"[ERROR] No PDF files found in: {input_dir}")
        return 1

    extractor_cls = load_pdf_structure_extractor(args.extractor)
    extractor = extractor_cls(
        clip_top=args.clip_top,
        clip_bottom=args.clip_bottom,
        use_ocr=args.use_ocr,
        ocr_api_url=args.ocr_api_url,
        ocr_timeout=args.ocr_timeout,
        ocr_api_key=args.ocr_api_key,
        detect_toc=not args.disable_toc,
        toc_model_path=args.toc_model_path,
    )

    banner = "=" * 80
    print(banner)
    print(f"Found {len(pdf_files)} PDF file(s) in: {input_dir}")
    print(f"Extractor: {args.extractor}")
    print(banner)

    success_count = 0
    total = len(pdf_files)
    for position, pdf_path in enumerate(pdf_files, 1):
        print(f"[{position}/{total}] Processing: {pdf_path.name}")
        try:
            output_path, quality_rate_text = process_pdf(
                pdf_path=pdf_path,
                input_dir=input_dir,
                output_dir=output_dir,
                extractor=extractor,
                clip_top=args.clip_top,
                clip_bottom=args.clip_bottom,
                use_ocr=args.use_ocr,
                detect_toc=not args.disable_toc,
                extractor_name=args.extractor,
            )
        except Exception as exc:
            # Keep going on per-file failures; the exit code reflects them.
            print(f"  [FAILED] {exc}")
        else:
            success_count += 1
            print(f"  [OK] quality={quality_rate_text} -> {output_path}")

    print(banner)
    print(f"Finished. Success: {success_count}/{total}")
    print(banner)
    return 0 if success_count == total else 2
+
+
if __name__ == "__main__":
    # Script entry point: propagate main()'s integer exit code to the shell.
    raise SystemExit(main())

+ 1 - 1
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -17,7 +17,7 @@ from typing import Dict, Any, Optional, Tuple, List
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.cachefiles import cache, CacheBaseDir
 
-from .pdf_extractor import PdfStructureExtractor
+from .pdf_extractor1 import PdfStructureExtractor
 from .toc_builder import build_toc_items_from_structure
 from .chunk_assembler import assemble_chunks
 from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier