tangle 4 өдөр өмнө
parent
commit
2ca8bdca4b

+ 69 - 3
core/construction_review/component/minimal_pipeline/pdf_extractor2.py

@@ -433,7 +433,7 @@ class PdfStructureExtractor:
         active_l2_rule: Optional[str] = None
         document_l1_rules: Optional[List[str]] = None
 
-        for raw_line in text.splitlines():
+        for raw_line in self._prepare_catalog_raw_lines(text):
             title_text, page = self._split_catalog_entry(raw_line)
             if not title_text:
                 continue
@@ -554,6 +554,41 @@ class PdfStructureExtractor:
 
         return sanitized
 
+    @classmethod
+    def _prepare_catalog_raw_lines(cls, text: str) -> List[str]:
+        """Normalize raw table-of-contents text into logical catalog lines.
+
+        Repairs two PDF-extraction artifacts before per-line parsing:
+        (1) a "目录"/"目錄" (TOC) header whose two characters were split
+        onto separate lines, and (2) a heading fragment split from its
+        continuation, which is rejoined when the merged candidate matches
+        an L1/L2 heading rule or gains a trailing page number.
+        """
+        # Pre-strip and drop blank lines; every element is already trimmed,
+        # so the .strip() calls below are defensive no-ops.
+        raw_lines = [line.strip() for line in text.splitlines() if line.strip()]
+        prepared: List[str] = []
+        index = 0
+
+        while index < len(raw_lines):
+            current = raw_lines[index].strip()
+            # Remove ALL internal whitespace for the single-character check.
+            compact_current = re.sub(r"\s+", "", current)
+
+            # Case 1: rejoin a split TOC header ("目" + "录"/"錄").
+            # NOTE(review): only "目" can actually start a valid pair — the
+            # "錄"/"录" members of this set can never satisfy the combined
+            # membership test below, so those branches are dead.
+            if compact_current in {"目", "錄", "录"} and index + 1 < len(raw_lines):
+                next_compact = re.sub(r"\s+", "", raw_lines[index + 1].strip())
+                if compact_current + next_compact in {"目录", "目錄"}:
+                    prepared.append(compact_current + next_compact)
+                    # Consume both fragments as one logical line.
+                    index += 2
+                    continue
+
+            # Case 2: a heading fragment may continue on the next line; merge
+            # only when the merged candidate is recognizable as a heading
+            # (rule match) or as a catalog entry (parsable page number).
+            if cls._is_incomplete_heading_fragment(current) and index + 1 < len(raw_lines):
+                next_line = raw_lines[index + 1].strip()
+                candidate = f"{current} {next_line}".strip()
+                _, candidate_page = cls._split_catalog_entry(candidate)
+                if (
+                    cls._matching_rule_names(candidate, "l1")
+                    or cls._matching_rule_names(candidate, "l2")
+                    or candidate_page is not None
+                ):
+                    prepared.append(candidate)
+                    index += 2
+                    continue
+
+            # Default: keep the line as-is.
+            prepared.append(current)
+            index += 1
+
+        return prepared
+
     @classmethod
     def _should_prefer_parsed_catalog(
         cls,
@@ -821,7 +856,10 @@ class PdfStructureExtractor:
             return "", None
 
         cleaned = re.sub(r"\s+", " ", cleaned).strip()
-        page_match = re.search(r"(?:[.\u2026\u00b7\u2022 ]{2,})(\d+)\s*$", cleaned)
+        page_match = re.search(
+            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*(\d+)\s*[-\u2013\u2014 ]*$",
+            cleaned,
+        )
         if page_match:
             title_text = cleaned[:page_match.start()].strip()
             title_text = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", title_text).strip()
@@ -960,6 +998,9 @@ class PdfStructureExtractor:
         section_title_key = "章节标题"
         chapter_title_payloads: Dict[str, List[Dict[str, Any]]] = {}
         flat_sections: List[Tuple[str, Dict[str, Any]]] = []
+        matched_chapter_count = 0
+        matched_section_count = 0
+        total_catalog_sections = 0
 
         for chapter_title, sections in chapters.items():
             title_key = self._normalize_heading_key(chapter_title)
@@ -995,7 +1036,10 @@ class PdfStructureExtractor:
             chapter_page = self._safe_page_number(chapter.get("page"))
             chapter_key = self._normalize_heading_key(chapter_title)
             title_candidates = chapter_title_payloads.get(chapter_key, [])
+            has_title_match = bool(title_candidates)
             title_payload = title_candidates.pop(0) if title_candidates else self._empty_section_payload(chapter_page)
+            if has_title_match:
+                matched_chapter_count += 1
 
             rebuilt[chapter_title] = {
                 section_title_key: title_payload,
@@ -1005,6 +1049,7 @@ class PdfStructureExtractor:
                 section_title = (subsection.get("title", "") or "").strip()
                 if not section_title:
                     continue
+                total_catalog_sections += 1
 
                 target_key = self._normalize_heading_key(section_title)
                 match_index = None
@@ -1026,16 +1071,23 @@ class PdfStructureExtractor:
                     used_indices.add(match_index)
                     search_start = max(search_start, match_index + 1)
                     rebuilt[chapter_title][section_title] = flat_sections[match_index][1]
+                    matched_section_count += 1
                 else:
                     rebuilt[chapter_title][section_title] = self._empty_section_payload(
                         self._safe_page_number(subsection.get("page"), chapter_page)
                     )
 
+        if total_catalog_sections > 0 and matched_section_count == 0:
+            return chapters
+
+        if matched_chapter_count == 0 and matched_section_count == 0:
+            return chapters
+
         return rebuilt or chapters
 
     @staticmethod
     def _normalize_heading_key(text: str) -> str:
-        normalized = (text or "").strip()
+        normalized = PdfStructureExtractor._strip_catalog_page_suffix((text or "").strip())
         normalized = normalized.replace("【", "[").replace("】", "]")
         normalized = normalized.replace("(", "(").replace(")", ")")
         normalized = normalized.replace(".", ".").replace("。", ".")
@@ -1653,6 +1705,7 @@ class PdfStructureExtractor:
     @staticmethod
     def _clean_chapter_title(line: str) -> str:
         cleaned = PdfStructureExtractor._strip_leading_page_number_from_cn_chapter(line)
+        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
         cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
         cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
         cleaned = re.sub(r"\s+", " ", cleaned).strip()
@@ -1677,6 +1730,7 @@ class PdfStructureExtractor:
     @staticmethod
     def _clean_section_title(line: str) -> str:
         cleaned = line.strip()
+        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
         cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
         cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
         cleaned = re.sub(r"\s+", " ", cleaned).strip()
@@ -1706,3 +1760,15 @@ class PdfStructureExtractor:
             return f"{prefix} {title}".strip()
 
         return cleaned
+
+    @staticmethod
+    def _strip_catalog_page_suffix(text: str) -> str:
+        """Remove a trailing dot-leader + page number (e.g. "……12", "--- 12 -").
+
+        Whitespace is collapsed first so the suffix pattern only has to
+        handle single spaces; returns "" for empty/None-ish input.
+        """
+        cleaned = re.sub(r"\s+", " ", (text or "").strip())
+        if not cleaned:
+            return ""
+
+        # Pattern: a leader run of 2+ dots/ellipsis/middots/bullets/spaces,
+        # then a page number optionally wrapped in hyphens/en-/em-dashes,
+        # anchored at end of line.
+        # NOTE(review): the literal "·" and "•" in the class duplicate
+        # \u00b7 and \u2022 — redundant but harmless.
+        return re.sub(
+            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$",
+            "",
+            cleaned,
+        ).strip()