4 gün önce · 39395d5c72
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
@@ -11,7 +11,7 @@ import io
 
				 import re
			
 
				 from concurrent.futures import ThreadPoolExecutor, as_completed
			
 
				 from dataclasses import dataclass
			
 
				-from typing import Dict, Any, List, Optional, Tuple
			
 
				+from typing import Dict, Any, List, Optional, Tuple, Set
			
 
				 
			
 
				 import fitz
			
 
				 import numpy as np
			
@@ -169,6 +169,14 @@ class PdfStructureExtractor:
 
				                     structure.get("chapters", {}),
			
 
				                     result["catalog"],
			
 
				                 )
			
 
				+                rebuilt_chapters = self._rebuild_section_contents_from_catalog(
			
 
				+                    structure.get("chapters", {}),
			
 
				+                    result["catalog"],
			
 
				+                    structure.get("_body_lines", []),
			
 
				+                )
			
 
				+                if rebuilt_chapters:
			
 
				+                    structure["chapters"] = rebuilt_chapters
			
 
				+            structure.pop("_body_lines", None)
			
 
				             result["chapters"] = structure.get("chapters", {})
			
 
				             result["total_pages"] = len(doc)
			
 
				             return result
			
@@ -251,6 +259,7 @@ class PdfStructureExtractor:
 
				 
			
 
				         # === 阶段3: 提取页面文本（应用 OCR 结果）并切分章节 ===
			
 
				         structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
			
 
				+        body_lines: List[Dict[str, Any]] = []
			
 
				         current_chapter = "未分类前言"
			
 
				         current_section = "默认部分"
			
 
				         in_body = False
			
@@ -280,7 +289,14 @@ class PdfStructureExtractor:
 
				             else:
			
 
				                 text = page.get_text("text", clip=clip_box)
			
 
				 
			
 
				-            lines = text.split("\n")
			
 
				+            lines = self._prepare_page_lines(text)
			
 
				+            for line in lines:
			
 
				+                if not line or self._is_header_footer(line):
			
 
				+                    continue
			
 
				+                body_lines.append({
			
 
				+                    "page": page_num + 1,
			
 
				+                    "text": line,
			
 
				+                })
			
 
				 
			
 
				             for line in lines:
			
 
				                 line = line.strip()
			
@@ -358,7 +374,7 @@ class PdfStructureExtractor:
 
				                 structured_data[current_chapter][current_section]["page_end"] = page_num + 1
			
 
				 
			
 
				         # 将行列表拼接为文本
			
 
				-        result: Dict[str, Any] = {"chapters": {}}
			
 
				+        result: Dict[str, Any] = {"chapters": {}, "_body_lines": body_lines}
			
 
				         for chap, sections in structured_data.items():
			
 
				             result["chapters"][chap] = {}
			
 
				             for sec, data in sections.items():
			
@@ -376,12 +392,26 @@ class PdfStructureExtractor:
 
				             return {}
			
 
				 
			
 
				         normalized = dict(catalog)
			
 
				+        existing_chapters = self._sanitize_catalog_chapters(catalog.get("chapters", []))
			
 
				         raw_text = catalog.get("raw_ocr_text", "")
			
 
				         parsed_chapters = self._parse_catalog_from_raw_text(raw_text) if isinstance(raw_text, str) else []
			
 
				+        selected_chapters = existing_chapters
			
 
				+
			
 
				         if parsed_chapters:
			
 
				-            normalized["chapters"] = parsed_chapters
			
 
				-            normalized["total_chapters"] = len(parsed_chapters)
			
 
				-            normalized["formatted_text"] = self._format_catalog_chapters(parsed_chapters)
			
 
				+            if self._should_prefer_parsed_catalog(parsed_chapters, existing_chapters):
			
 
				+                selected_chapters = parsed_chapters
			
 
				+            elif existing_chapters:
			
 
				+                logger.info(
			
 
				+                    "[PDF提取] raw_ocr_text目录解析结果异常，保留原始目录骨架: "
			
 
				+                    f"parsed={len(parsed_chapters)}, original={len(existing_chapters)}"
			
 
				+                )
			
 
				+            else:
			
 
				+                selected_chapters = parsed_chapters
			
 
				+
			
 
				+        if selected_chapters:
			
 
				+            normalized["chapters"] = selected_chapters
			
 
				+            normalized["total_chapters"] = len(selected_chapters)
			
 
				+            normalized["formatted_text"] = self._format_catalog_chapters(selected_chapters)
			
 
				         return normalized
			
 
				 
			
 
				     def _parse_catalog_from_raw_text(self, text: str) -> List[Dict[str, Any]]:
			
@@ -391,6 +421,7 @@ class PdfStructureExtractor:
 
				         chapters: List[Dict[str, Any]] = []
			
 
				         current_chapter: Optional[Dict[str, Any]] = None
			
 
				         active_l2_rule: Optional[str] = None
			
 
				+        document_l1_rules: Optional[List[str]] = None
			
 
				 
			
 
				         for raw_line in text.splitlines():
			
 
				             title_text, page = self._split_catalog_entry(raw_line)
			
@@ -401,8 +432,10 @@ class PdfStructureExtractor:
 
				             if compact in {"目录", "目錄"}:
			
 
				                 continue
			
 
				 
			
 
				-            chapter_matches = self._matching_rule_names(title_text, "l1")
			
 
				+            chapter_matches = self._matching_rule_names(title_text, "l1", document_l1_rules)
			
 
				             if chapter_matches:
			
 
				+                if document_l1_rules is None:
			
 
				+                    document_l1_rules = chapter_matches
			
 
				                 current_chapter = {
			
 
				                     "index": len(chapters) + 1,
			
 
				                     "title": self._clean_chapter_title(title_text),
			
@@ -419,6 +452,24 @@ class PdfStructureExtractor:
 
				 
			
 
				             section_matches = self._matching_rule_names(title_text, "l2")
			
 
				             if not section_matches:
			
 
				+                numeric_section_title = self._coerce_numeric_catalog_section(
			
 
				+                    title_text,
			
 
				+                    document_l1_rules,
			
 
				+                    active_l2_rule,
			
 
				+                )
			
 
				+                if numeric_section_title:
			
 
				+                    section_key = self._normalize_heading_key(numeric_section_title)
			
 
				+                    existing_keys = {
			
 
				+                        self._normalize_heading_key(sub.get("title", ""))
			
 
				+                        for sub in current_chapter.get("subsections", [])
			
 
				+                    }
			
 
				+                    if section_key not in existing_keys:
			
 
				+                        current_chapter["subsections"].append({
			
 
				+                            "title": numeric_section_title,
			
 
				+                            "page": str(page or current_chapter.get("page", 1)),
			
 
				+                            "level": 2,
			
 
				+                            "original": raw_line.strip(),
			
 
				+                        })
			
 
				                 continue
			
 
				 
			
 
				             if active_l2_rule is None:
			
@@ -444,6 +495,174 @@ class PdfStructureExtractor:
 
				 
			
 
				         return chapters
			
 
				 
			
 
				+    @classmethod
			
 
				+    def _sanitize_catalog_chapters(cls, chapters: Any) -> List[Dict[str, Any]]:
			
 
				+        if not isinstance(chapters, list):
			
 
				+            return []
			
 
				+
			
 
				+        sanitized: List[Dict[str, Any]] = []
			
 
				+        seen_chapter_keys: Set[str] = set()
			
 
				+
			
 
				+        for idx, chapter in enumerate(chapters, 1):
			
 
				+            if not isinstance(chapter, dict):
			
 
				+                continue
			
 
				+
			
 
				+            chapter_title = cls._clean_chapter_title(str(chapter.get("title", "") or ""))
			
 
				+            chapter_key = cls._normalize_heading_key(chapter_title)
			
 
				+            if not chapter_key or chapter_key in seen_chapter_keys:
			
 
				+                continue
			
 
				+
			
 
				+            seen_chapter_keys.add(chapter_key)
			
 
				+            chapter_page = str(chapter.get("page") or idx)
			
 
				+            subsections: List[Dict[str, Any]] = []
			
 
				+            seen_section_keys: Set[str] = set()
			
 
				+
			
 
				+            for subsection in chapter.get("subsections", []) or []:
			
 
				+                if not isinstance(subsection, dict):
			
 
				+                    continue
			
 
				+
			
 
				+                section_title = cls._clean_section_title(str(subsection.get("title", "") or ""))
			
 
				+                section_key = cls._normalize_heading_key(section_title)
			
 
				+                if not section_key or section_key in seen_section_keys:
			
 
				+                    continue
			
 
				+
			
 
				+                seen_section_keys.add(section_key)
			
 
				+                subsections.append({
			
 
				+                    "title": section_title,
			
 
				+                    "page": str(subsection.get("page") or chapter_page),
			
 
				+                    "level": 2,
			
 
				+                    "original": subsection.get("original", "") or section_title,
			
 
				+                })
			
 
				+
			
 
				+            sanitized.append({
			
 
				+                "index": len(sanitized) + 1,
			
 
				+                "title": chapter_title,
			
 
				+                "page": chapter_page,
			
 
				+                "original": chapter.get("original", "") or chapter_title,
			
 
				+                "subsections": subsections,
			
 
				+            })
			
 
				+
			
 
				+        return sanitized
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _should_prefer_parsed_catalog(
			
 
				+        cls,
			
 
				+        parsed_chapters: List[Dict[str, Any]],
			
 
				+        existing_chapters: List[Dict[str, Any]],
			
 
				+    ) -> bool:
			
 
				+        if not parsed_chapters:
			
 
				+            return False
			
 
				+
			
 
				+        if cls._catalog_has_suspicious_structure(parsed_chapters):
			
 
				+            return False
			
 
				+
			
 
				+        if not existing_chapters:
			
 
				+            return True
			
 
				+
			
 
				+        if cls._catalog_has_suspicious_structure(existing_chapters):
			
 
				+            return True
			
 
				+
			
 
				+        parsed_score = cls._catalog_structure_score(parsed_chapters)
			
 
				+        existing_score = cls._catalog_structure_score(existing_chapters)
			
 
				+        if parsed_score <= existing_score:
			
 
				+            return False
			
 
				+
			
 
				+        if not cls._catalog_has_suspicious_structure(existing_chapters):
			
 
				+            existing_count = len(existing_chapters)
			
 
				+            parsed_count = len(parsed_chapters)
			
 
				+            if parsed_count > max(existing_count * 2, existing_count + 8):
			
 
				+                return False
			
 
				+            if existing_count >= 4 and parsed_count < max(2, existing_count // 2):
			
 
				+                return False
			
 
				+
			
 
				+        return True
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _catalog_has_suspicious_structure(cls, chapters: List[Dict[str, Any]]) -> bool:
			
 
				+        if not chapters:
			
 
				+            return False
			
 
				+
			
 
				+        titles = [(chapter.get("title", "") or "").strip() for chapter in chapters]
			
 
				+        chinese_chapter_count = sum(
			
 
				+            1 for title in titles
			
 
				+            if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", title)
			
 
				+        )
			
 
				+        numeric_heading_count = sum(
			
 
				+            1 for title in titles
			
 
				+            if re.match(r"^\d{1,2}(?:[\.．。、])?\s+\S+", title)
			
 
				+        )
			
 
				+        embedded_numeric_body_count = 0
			
 
				+        repeated_chapter_no_count = 0
			
 
				+        reversed_chapter_no_count = 0
			
 
				+        seen_chapter_numbers: Set[str] = set()
			
 
				+        previous_numeric_chapter_no: Optional[int] = None
			
 
				+
			
 
				+        for title in titles:
			
 
				+            chapter_match = re.match(
			
 
				+                r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
			
 
				+                title,
			
 
				+            )
			
 
				+            if not chapter_match:
			
 
				+                continue
			
 
				+
			
 
				+            chapter_no = re.sub(r"\s+", "", chapter_match.group(1))
			
 
				+            chapter_body = (chapter_match.group(2) or "").strip()
			
 
				+            if chapter_no in seen_chapter_numbers:
			
 
				+                repeated_chapter_no_count += 1
			
 
				+            seen_chapter_numbers.add(chapter_no)
			
 
				+
			
 
				+            if chapter_no.isdigit():
			
 
				+                current_numeric_no = int(chapter_no)
			
 
				+                if previous_numeric_chapter_no is not None and current_numeric_no < previous_numeric_chapter_no:
			
 
				+                    reversed_chapter_no_count += 1
			
 
				+                previous_numeric_chapter_no = current_numeric_no
			
 
				+
			
 
				+            if re.match(r"^\d{1,2}(?:\.\d{1,2})*\.?(?:\s+|$)", chapter_body):
			
 
				+                embedded_numeric_body_count += 1
			
 
				+
			
 
				+        if chinese_chapter_count >= 2 and numeric_heading_count >= max(3, chinese_chapter_count // 2):
			
 
				+            return True
			
 
				+
			
 
				+        if chinese_chapter_count >= max(2, len(titles) // 3) and numeric_heading_count >= max(2, len(titles) // 6):
			
 
				+            return True
			
 
				+
			
 
				+        if embedded_numeric_body_count >= max(2, len(titles) // 5):
			
 
				+            return True
			
 
				+
			
 
				+        if repeated_chapter_no_count > 0 or reversed_chapter_no_count > 0:
			
 
				+            return True
			
 
				+
			
 
				+        return False
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _catalog_structure_score(chapters: List[Dict[str, Any]]) -> int:
			
 
				+        score = 0
			
 
				+        for chapter in chapters:
			
 
				+            score += 1
			
 
				+            score += len(chapter.get("subsections", []) or [])
			
 
				+        return score
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _coerce_numeric_catalog_section(
			
 
				+        cls,
			
 
				+        title_text: str,
			
 
				+        document_l1_rules: Optional[List[str]],
			
 
				+        active_l2_rule: Optional[str],
			
 
				+    ) -> Optional[str]:
			
 
				+        if active_l2_rule is not None:
			
 
				+            return None
			
 
				+
			
 
				+        if not document_l1_rules:
			
 
				+            return None
			
 
				+
			
 
				+        if "Rule_1_纯数字派" in document_l1_rules:
			
 
				+            return None
			
 
				+
			
 
				+        if re.match(r"^\d{1,2}(?:[\.．。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*", title_text.strip()):
			
 
				+            return cls._clean_section_title(title_text)
			
 
				+
			
 
				+        return None
			
 
				+
			
 
				     @staticmethod
			
 
				     def _split_catalog_entry(line: str) -> Tuple[str, Optional[int]]:
			
 
				         cleaned = line.strip()
			
@@ -582,6 +801,220 @@ class PdfStructureExtractor:
 
				             "page_end": page_num,
			
 
				         }
			
 
				 
			
 
				+    @classmethod
			
 
				+    def _prepare_page_lines(cls, text: str) -> List[str]:
			
 
				+        raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
			
 
				+        prepared_lines: List[str] = []
			
 
				+        index = 0
			
 
				+
			
 
				+        while index < len(raw_lines):
			
 
				+            merged_line, consumed = cls._merge_heading_fragment(raw_lines, index)
			
 
				+            if merged_line:
			
 
				+                prepared_lines.append(merged_line)
			
 
				+                index += consumed
			
 
				+                continue
			
 
				+
			
 
				+            prepared_lines.append(raw_lines[index])
			
 
				+            index += 1
			
 
				+
			
 
				+        return prepared_lines
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _merge_heading_fragment(
			
 
				+        cls,
			
 
				+        lines: List[str],
			
 
				+        start_index: int,
			
 
				+    ) -> Tuple[Optional[str], int]:
			
 
				+        first_line = lines[start_index].strip()
			
 
				+        if not first_line:
			
 
				+            return None, 1
			
 
				+
			
 
				+        first_is_heading = bool(cls._matching_rule_names(first_line, "l1") or cls._matching_rule_names(first_line, "l2"))
			
 
				+        first_is_incomplete = cls._is_incomplete_heading_fragment(first_line)
			
 
				+        max_span = min(3, len(lines) - start_index)
			
 
				+
			
 
				+        for span in range(2, max_span + 1):
			
 
				+            candidate_lines = [lines[start_index + offset].strip() for offset in range(span)]
			
 
				+            candidate_text = " ".join(candidate_lines).strip()
			
 
				+            if not candidate_text or cls.TOC_PATTERN.search(candidate_text):
			
 
				+                continue
			
 
				+            if not (cls._matching_rule_names(candidate_text, "l1") or cls._matching_rule_names(candidate_text, "l2")):
			
 
				+                continue
			
 
				+            if first_is_incomplete or not first_is_heading:
			
 
				+                return candidate_text, span
			
 
				+
			
 
				+        return None, 1
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _is_incomplete_heading_fragment(line: str) -> bool:
			
 
				+        clean_line = re.sub(r"\s+", "", line.strip())
			
 
				+        if not clean_line:
			
 
				+            return False
			
 
				+
			
 
				+        fragment_patterns = (
			
 
				+            r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部分篇]$",
			
 
				+            r"^\d{1,2}(?:[\.．。、])$",
			
 
				+            r"^[【\[]\d+[\]】]$",
			
 
				+            r"^[一二三四五六七八九十百零两]+[、）\)\]]$",
			
 
				+            r"^第[一二三四五六七八九十百零两]+节$",
			
 
				+            r"^\d+\.\d+(?!\.\d)\.?$",
			
 
				+        )
			
 
				+        return any(re.match(pattern, clean_line) for pattern in fragment_patterns)
			
 
				+
			
 
				+    def _rebuild_section_contents_from_catalog(
			
 
				+        self,
			
 
				+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
			
 
				+        catalog: Dict[str, Any],
			
 
				+        body_lines: List[Dict[str, Any]],
			
 
				+    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
			
 
				+        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
			
 
				+        if not catalog_chapters or not body_lines:
			
 
				+            return chapters
			
 
				+
			
 
				+        expected_items: List[Dict[str, Any]] = []
			
 
				+        total_sections = 0
			
 
				+        for chapter in catalog_chapters:
			
 
				+            chapter_title = (chapter.get("title", "") or "").strip()
			
 
				+            if not chapter_title:
			
 
				+                continue
			
 
				+            chapter_page = self._safe_page_number(chapter.get("page"))
			
 
				+            expected_items.append({
			
 
				+                "kind": "chapter",
			
 
				+                "title": chapter_title,
			
 
				+                "chapter_title": chapter_title,
			
 
				+                "section_title": "章节标题",
			
 
				+                "page_hint": chapter_page,
			
 
				+                "line_index": None,
			
 
				+                "page": chapter_page,
			
 
				+            })
			
 
				+
			
 
				+            for subsection in chapter.get("subsections", []):
			
 
				+                section_title = (subsection.get("title", "") or "").strip()
			
 
				+                if not section_title:
			
 
				+                    continue
			
 
				+                total_sections += 1
			
 
				+                expected_items.append({
			
 
				+                    "kind": "section",
			
 
				+                    "title": section_title,
			
 
				+                    "chapter_title": chapter_title,
			
 
				+                    "section_title": section_title,
			
 
				+                    "page_hint": self._safe_page_number(subsection.get("page"), chapter_page),
			
 
				+                    "line_index": None,
			
 
				+                    "page": self._safe_page_number(subsection.get("page"), chapter_page),
			
 
				+                })
			
 
				+
			
 
				+        if not expected_items or total_sections == 0:
			
 
				+            return chapters
			
 
				+
			
 
				+        search_start = 0
			
 
				+        found_sections = 0
			
 
				+        for item in expected_items:
			
 
				+            line_index = self._find_heading_line_index(
			
 
				+                body_lines,
			
 
				+                item["title"],
			
 
				+                item["kind"],
			
 
				+                search_start,
			
 
				+            )
			
 
				+            item["line_index"] = line_index
			
 
				+            if line_index is not None:
			
 
				+                item["page"] = body_lines[line_index]["page"]
			
 
				+                search_start = line_index + 1
			
 
				+                if item["kind"] == "section":
			
 
				+                    found_sections += 1
			
 
				+
			
 
				+        if found_sections == 0:
			
 
				+            return chapters
			
 
				+
			
 
				+        rebuilt: Dict[str, Dict[str, Dict[str, Any]]] = {}
			
 
				+        section_title_key = "章节标题"
			
 
				+
			
 
				+        for chapter in catalog_chapters:
			
 
				+            chapter_title = (chapter.get("title", "") or "").strip()
			
 
				+            if not chapter_title:
			
 
				+                continue
			
 
				+
			
 
				+            chapter_page = self._safe_page_number(chapter.get("page"))
			
 
				+            existing_sections = chapters.get(chapter_title, {})
			
 
				+            rebuilt[chapter_title] = {
			
 
				+                section_title_key: existing_sections.get(section_title_key, self._empty_section_payload(chapter_page))
			
 
				+            }
			
 
				+
			
 
				+            for subsection in chapter.get("subsections", []):
			
 
				+                section_title = (subsection.get("title", "") or "").strip()
			
 
				+                if not section_title:
			
 
				+                    continue
			
 
				+                rebuilt[chapter_title][section_title] = existing_sections.get(
			
 
				+                    section_title,
			
 
				+                    self._empty_section_payload(self._safe_page_number(subsection.get("page"), chapter_page)),
			
 
				+                )
			
 
				+
			
 
				+        for idx, item in enumerate(expected_items):
			
 
				+            if item["kind"] != "section" or item["line_index"] is None:
			
 
				+                continue
			
 
				+
			
 
				+            next_heading_index = len(body_lines)
			
 
				+            for later in expected_items[idx + 1:]:
			
 
				+                if later["line_index"] is not None:
			
 
				+                    next_heading_index = later["line_index"]
			
 
				+                    break
			
 
				+
			
 
				+            content_entries = body_lines[item["line_index"] + 1:next_heading_index]
			
 
				+            content_text = "\n".join(entry["text"] for entry in content_entries).strip()
			
 
				+            existing_payload = rebuilt[item["chapter_title"]].get(item["section_title"], {})
			
 
				+
			
 
				+            if not content_text and (existing_payload.get("content") or "").strip():
			
 
				+                continue
			
 
				+
			
 
				+            if content_entries:
			
 
				+                page_start = content_entries[0]["page"]
			
 
				+                page_end = content_entries[-1]["page"]
			
 
				+            else:
			
 
				+                page_start = item["page"]
			
 
				+                page_end = item["page"]
			
 
				+
			
 
				+            rebuilt[item["chapter_title"]][item["section_title"]] = {
			
 
				+                "content": content_text,
			
 
				+                "page_start": page_start,
			
 
				+                "page_end": page_end,
			
 
				+            }
			
 
				+
			
 
				+        return rebuilt or chapters
			
 
				+
			
 
    def _find_heading_line_index(
        self,
        body_lines: List[Dict[str, Any]],
        target_title: str,
        heading_kind: str,
        start_index: int,
    ) -> Optional[int]:
        """Find the first body line at or after ``start_index`` that is the heading.

        A line matches when its cleaned, normalized key equals the target key,
        or when its raw normalized key ends with the target key and whatever
        precedes it is empty or consists only of digits, Latin letters and
        punctuation.

        Args:
            body_lines: Body lines as dicts with a ``text`` entry.
            target_title: Heading title taken from the catalog.
            heading_kind: ``"chapter"`` selects chapter-title cleaning; any
                other value selects section-title cleaning.
            start_index: First index to inspect.

        Returns:
            Index of the matching line, or ``None`` if there is none or the
            target normalizes to an empty key.
        """
        wanted = self._normalize_heading_key(target_title)
        if not wanted:
            return None

        if heading_kind == "chapter":
            clean_title = self._clean_chapter_title
        else:
            clean_title = self._clean_section_title

        numbering_prefix = re.compile(
            r"[\dA-Za-z\.\-_/|,:;()\[\]\u3001\u3002\uff0c\uff1a\uff1b\uff08\uff09\u3010\u3011]+"
        )
        wanted_len = len(wanted)

        for position in range(start_index, len(body_lines)):
            text = (body_lines[position].get("text") or "").strip()
            # Table-of-contents style lines are never accepted as headings.
            if not text or self.TOC_PATTERN.search(text):
                continue

            if self._normalize_heading_key(clean_title(text)) == wanted:
                return position

            raw_key = self._normalize_heading_key(text)
            if not raw_key.endswith(wanted):
                continue

            leading = raw_key[:-wanted_len]
            if not leading or numbering_prefix.fullmatch(leading):
                return position

        return None
			
 
				+
			
 
				     def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
			
 
				         """同步并发处理 OCR（使用 ThreadPoolExecutor）"""
			
 
				         results: List[OcrResult] = []
			
@@ -751,78 +1184,96 @@ class PdfStructureExtractor:
 
				         if not ocr_results:
			
 
				             return original_text
			
 
				 
			
 
				-        # 获取页面上的文本块及其坐标
			
 
				         text_blocks = []
			
 
				         for block in page.get_text("blocks"):
			
 
				             x0, y0, x1, y1, text, _, _ = block
			
 
				-            # 只考虑裁剪区域内的文本
			
 
				             if y0 >= clip_box.y0 and y1 <= clip_box.y1:
			
 
				                 text_blocks.append({
			
 
				                     "bbox": (x0, y0, x1, y1),
			
 
				                     "text": text.strip(),
			
 
				                 })
			
 
				 
			
 
				-        # 按 Y 坐标排序
			
 
				         text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
			
 
				 
			
 
				-        # 找出属于表格区域的文本块
			
 
				+        if not text_blocks:
			
 
				+            return original_text
			
 
				+
			
 
				+        region_entries: List[Dict[str, Any]] = []
			
 
				         replaced_indices: Set[int] = set()
			
 
				-        for ocr_result in ocr_results:
			
 
				-            bbox = ocr_result["bbox"]
			
 
				-            rx0, ry0, rx1, ry1 = bbox
			
 
				+
			
 
				+        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
			
 
				+            rx0, ry0, rx1, ry1 = ocr_result["bbox"]
			
 
				+            current_indices: List[int] = []
			
 
				 
			
 
				             for idx, block in enumerate(text_blocks):
			
 
				                 if idx in replaced_indices:
			
 
				                     continue
			
 
				-                bx0, by0, bx1, by1 = block["bbox"]
			
 
				+                if self._block_contains_heading(block["text"]):
			
 
				+                    continue
			
 
				 
			
 
				-                # 检查重叠
			
 
				+                bx0, by0, bx1, by1 = block["bbox"]
			
 
				                 overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
			
 
				                 overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
			
 
				                 overlap_area = overlap_x * overlap_y
			
 
				-                block_area = (bx1 - bx0) * (by1 - by0)
			
 
				+                block_area = max((bx1 - bx0) * (by1 - by0), 1)
			
 
				 
			
 
				-                if block_area > 0 and overlap_area / block_area > 0.5:
			
 
				-                    replaced_indices.add(idx)
			
 
				+                if overlap_area / block_area > 0.5:
			
 
				+                    current_indices.append(idx)
			
 
				 
			
 
				-        # 构建新文本
			
 
				-        result_parts: List[str] = []
			
 
				-        last_idx = 0
			
 
				+            if not current_indices:
			
 
				+                continue
			
 
				 
			
 
				-        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
			
 
				-            bbox = ocr_result["bbox"]
			
 
				-            rx0, ry0, rx1, ry1 = bbox
			
 
				+            replaced_indices.update(current_indices)
			
 
				+            region_entries.append({
			
 
				+                "start": min(current_indices),
			
 
				+                "end": max(current_indices),
			
 
				+                "ocr_text": (ocr_result.get("ocr_text") or "").strip(),
			
 
				+            })
			
 
				+
			
 
				+        if not region_entries:
			
 
				+            return original_text
			
 
				+
			
 
				+        region_by_start = {entry["start"]: entry for entry in region_entries}
			
 
				+        result_parts: List[str] = []
			
 
				+        idx = 0
			
 
				+
			
 
				+        while idx < len(text_blocks):
			
 
				+            region = region_by_start.get(idx)
			
 
				+            if region is not None:
			
 
				+                if region["ocr_text"]:
			
 
				+                    result_parts.append(region["ocr_text"])
			
 
				+                    result_parts.append("\n")
			
 
				+                else:
			
 
				+                    for block_idx in range(region["start"], region["end"] + 1):
			
 
				+                        block_text = text_blocks[block_idx]["text"]
			
 
				+                        if block_text:
			
 
				+                            result_parts.append(block_text)
			
 
				+                            result_parts.append("\n")
			
 
				+                idx = region["end"] + 1
			
 
				+                continue
			
 
				 
			
 
				-            # 找到该表格区域之前的文本
			
 
				-            region_start_idx = None
			
 
				-            for idx, block in enumerate(text_blocks):
			
 
				-                if idx in replaced_indices:
			
 
				-                    bx0, by0, bx1, by1 = block["bbox"]
			
 
				-                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
			
 
				-                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
			
 
				-                        if region_start_idx is None:
			
 
				-                            region_start_idx = idx
			
 
				-                        last_idx = idx + 1
			
 
				-
			
 
				-            if region_start_idx is not None:
			
 
				-                # 添加表格前的非表格文本
			
 
				-                for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
			
 
				-                    if idx not in replaced_indices and idx < len(text_blocks):
			
 
				-                        result_parts.append(text_blocks[idx]["text"])
			
 
				-                        result_parts.append("\n")
			
 
				-
			
 
				-                # 添加 OCR 结果
			
 
				-                result_parts.append(ocr_result["ocr_text"])
			
 
				-                result_parts.append("\n")
			
 
				-
			
 
				-        # 添加剩余文本
			
 
				-        for idx in range(last_idx, len(text_blocks)):
			
 
				             if idx not in replaced_indices:
			
 
				-                result_parts.append(text_blocks[idx]["text"])
			
 
				-                result_parts.append("\n")
			
 
				+                block_text = text_blocks[idx]["text"]
			
 
				+                if block_text:
			
 
				+                    result_parts.append(block_text)
			
 
				+                    result_parts.append("\n")
			
 
				+            idx += 1
			
 
				 
			
 
				         return "".join(result_parts).strip() or original_text
			
 
				 
			
 
				+    @classmethod
			
 
				+    def _block_contains_heading(cls, text: str) -> bool:
			
 
				+        if not text or not text.strip():
			
 
				+            return False
			
 
				+
			
 
				+        for line in cls._prepare_page_lines(text):
			
 
				+            stripped = line.strip()
			
 
				+            if not stripped:
			
 
				+                continue
			
 
				+            if cls._matching_rule_names(stripped, "l1") or cls._matching_rule_names(stripped, "l2"):
			
 
				+                return True
			
 
				+        return False
			
 
				+
			
 
				     def _compress_image(self, img_bytes: bytes) -> bytes:
			
 
				         """压缩图片"""
			
 
				         try:
			
@@ -873,10 +1324,36 @@ class PdfStructureExtractor:
 
				 
			
 
				     @staticmethod
			
 
				     def _is_header_footer(line: str) -> bool:
			
 
				+        compact_line = re.sub(r"\s+", "", line.strip())
			
 
				+        if not compact_line:
			
 
				+            return False
			
 
				+
			
 
				+        heading_prefix = re.match(
			
 
				+            r"^(第[\d一二三四五六七八九十百零两]+[章节部分篇]|[\d]+\.\d+|[\d]+[\.．。、]?|[一二三四五六七八九十百零两]+[、）\)\]]|第[一二三四五六七八九十百零两]+节|【\d+】)",
			
 
				+            compact_line,
			
 
				+        )
			
 
				+
			
 
				+        if compact_line.isdigit():
			
 
				+            return True
			
 
				+
			
 
				+        if (
			
 
				+            compact_line.endswith("有限责任公司")
			
 
				+            or compact_line.endswith("有限公司")
			
 
				+            or compact_line.endswith("股份有限公司")
			
 
				+        ) and not heading_prefix:
			
 
				+            return True
			
 
				+
			
 
				+        if compact_line.endswith("专项施工方案") and not heading_prefix:
			
 
				+            return True
			
 
				+
			
 
				         return (
			
 
				             "四川路桥建设集团股份有限公司" in line
			
 
				             or "T梁运输及安装专项施工方案" in line
			
 
				-            or line.isdigit()
			
 
				+            or (
			
 
				+                compact_line.endswith("工程项目")
			
 
				+                and len(compact_line) >= 8
			
 
				+                and not compact_line.startswith("第")
			
 
				+            )
			
 
				         )
			
 
				 
			
 
				     @classmethod
			
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -122,9 +122,9 @@ class SimpleDocumentProcessor:
 
				                     pass
			
 
				 
			
 
				         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
			
 
				-        print("-"*50)
			
 
				-        print(f'{json.dumps(structure, ensure_ascii=False, indent=2)}')
			
 
				-        print("-"*50)
			
 
				+        logger.info("-"*50)
			
 
				+        logger.info(f'{json.dumps(structure, ensure_ascii=False, indent=2)}')
			
 
				+        logger.info("-"*50)
			
 
				         catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
			
 
				 
			
 
				         # 对 catalog 进行分类（如果存在）