1 сар өмнө · 2ca8bdca4b
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
@@ -433,7 +433,7 @@ class PdfStructureExtractor:
 
				         active_l2_rule: Optional[str] = None
			
 
				         document_l1_rules: Optional[List[str]] = None
			
 
				 
			
 
				-        for raw_line in text.splitlines():
			
 
				+        for raw_line in self._prepare_catalog_raw_lines(text):
			
 
				             title_text, page = self._split_catalog_entry(raw_line)
			
 
				             if not title_text:
			
 
				                 continue
			
@@ -554,6 +554,41 @@ class PdfStructureExtractor:
 
				 
			
 
				         return sanitized
			
 
				 
			
 
    @classmethod
    def _prepare_catalog_raw_lines(cls, text: str) -> List[str]:
        """Split catalog text into lines, re-joining entries broken across two lines.

        Blank lines are dropped and every remaining line is stripped. Two kinds
        of adjacent-line pairs are then merged into a single output line:

        * a line that is a single catalog-heading character followed by a line
          that completes the two-character heading (simplified or traditional
          form); the pair is emitted with all whitespace removed;
        * a line that ``_is_incomplete_heading_fragment`` accepts, followed by
          any line, provided the space-joined text matches an "l1" or "l2"
          rule (``_matching_rule_names``) or ``_split_catalog_entry`` reports
          a non-``None`` page for it.

        Args:
            text: Raw catalog text, possibly spanning many lines.

        Returns:
            The prepared lines, in their original order.
        """
        stripped_lines = [
            piece for piece in (segment.strip() for segment in text.splitlines()) if piece
        ]
        total = len(stripped_lines)
        merged: List[str] = []
        pos = 0

        while pos < total:
            line = stripped_lines[pos]
            # None marks "no following line"; real entries are never empty.
            following = stripped_lines[pos + 1] if pos + 1 < total else None
            squeezed = re.sub(r"\s+", "", line)

            # Heading split one character per line.
            if following is not None and squeezed in {"目", "錄", "录"}:
                joined_heading = squeezed + re.sub(r"\s+", "", following)
                if joined_heading in {"目录", "目錄"}:
                    merged.append(joined_heading)
                    pos += 2
                    continue

            # The fragment check runs for every line, including the last one.
            if cls._is_incomplete_heading_fragment(line) and following is not None:
                glued = f"{line} {following}".strip()
                _, glued_page = cls._split_catalog_entry(glued)
                if (
                    cls._matching_rule_names(glued, "l1")
                    or cls._matching_rule_names(glued, "l2")
                    or glued_page is not None
                ):
                    merged.append(glued)
                    pos += 2
                    continue

            merged.append(line)
            pos += 1

        return merged
			
 
				+
			
 
				     @classmethod
			
 
				     def _should_prefer_parsed_catalog(
			
 
				         cls,
			
@@ -821,7 +856,10 @@ class PdfStructureExtractor:
 
				             return "", None
			
 
				 
			
 
				         cleaned = re.sub(r"\s+", " ", cleaned).strip()
			
 
				-        page_match = re.search(r"(?:[.\u2026\u00b7\u2022 ]{2,})(\d+)\s*$", cleaned)
			
 
				+        page_match = re.search(
			
 
				+            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*(\d+)\s*[-\u2013\u2014 ]*$",
			
 
				+            cleaned,
			
 
				+        )
			
 
				         if page_match:
			
 
				             title_text = cleaned[:page_match.start()].strip()
			
 
				             title_text = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", title_text).strip()
			
@@ -960,6 +998,9 @@ class PdfStructureExtractor:
 
				         section_title_key = "章节标题"
			
 
				         chapter_title_payloads: Dict[str, List[Dict[str, Any]]] = {}
			
 
				         flat_sections: List[Tuple[str, Dict[str, Any]]] = []
			
 
				+        matched_chapter_count = 0
			
 
				+        matched_section_count = 0
			
 
				+        total_catalog_sections = 0
			
 
				 
			
 
				         for chapter_title, sections in chapters.items():
			
 
				             title_key = self._normalize_heading_key(chapter_title)
			
@@ -995,7 +1036,10 @@ class PdfStructureExtractor:
 
				             chapter_page = self._safe_page_number(chapter.get("page"))
			
 
				             chapter_key = self._normalize_heading_key(chapter_title)
			
 
				             title_candidates = chapter_title_payloads.get(chapter_key, [])
			
 
				+            has_title_match = bool(title_candidates)
			
 
				             title_payload = title_candidates.pop(0) if title_candidates else self._empty_section_payload(chapter_page)
			
 
				+            if has_title_match:
			
 
				+                matched_chapter_count += 1
			
 
				 
			
 
				             rebuilt[chapter_title] = {
			
 
				                 section_title_key: title_payload,
			
@@ -1005,6 +1049,7 @@ class PdfStructureExtractor:
 
				                 section_title = (subsection.get("title", "") or "").strip()
			
 
				                 if not section_title:
			
 
				                     continue
			
 
				+                total_catalog_sections += 1
			
 
				 
			
 
				                 target_key = self._normalize_heading_key(section_title)
			
 
				                 match_index = None
			
@@ -1026,16 +1071,23 @@ class PdfStructureExtractor:
 
				                     used_indices.add(match_index)
			
 
				                     search_start = max(search_start, match_index + 1)
			
 
				                     rebuilt[chapter_title][section_title] = flat_sections[match_index][1]
			
 
				+                    matched_section_count += 1
			
 
				                 else:
			
 
				                     rebuilt[chapter_title][section_title] = self._empty_section_payload(
			
 
				                         self._safe_page_number(subsection.get("page"), chapter_page)
			
 
				                     )
			
 
				 
			
 
				+        if total_catalog_sections > 0 and matched_section_count == 0:
			
 
				+            return chapters
			
 
				+
			
 
				+        if matched_chapter_count == 0 and matched_section_count == 0:
			
 
				+            return chapters
			
 
				+
			
 
				         return rebuilt or chapters
			
 
				 
			
 
				     @staticmethod
			
 
				     def _normalize_heading_key(text: str) -> str:
			
 
				-        normalized = (text or "").strip()
			
 
				+        normalized = PdfStructureExtractor._strip_catalog_page_suffix((text or "").strip())
			
 
				         normalized = normalized.replace("【", "[").replace("】", "]")
			
 
				         normalized = normalized.replace("（", "(").replace("）", ")")
			
 
				         normalized = normalized.replace("．", ".").replace("。", ".")
			
@@ -1653,6 +1705,7 @@ class PdfStructureExtractor:
 
				     @staticmethod
			
 
				     def _clean_chapter_title(line: str) -> str:
			
 
				         cleaned = PdfStructureExtractor._strip_leading_page_number_from_cn_chapter(line)
			
 
				+        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
			
 
				         cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
			
 
				         cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
			
 
				         cleaned = re.sub(r"\s+", " ", cleaned).strip()
			
@@ -1677,6 +1730,7 @@ class PdfStructureExtractor:
 
				     @staticmethod
			
 
				     def _clean_section_title(line: str) -> str:
			
 
				         cleaned = line.strip()
			
 
				+        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
			
 
				         cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
			
 
				         cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
			
 
				         cleaned = re.sub(r"\s+", " ", cleaned).strip()
			
@@ -1706,3 +1760,15 @@ class PdfStructureExtractor:
 
				             return f"{prefix} {title}".strip()
			
 
				 
			
 
				         return cleaned
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _strip_catalog_page_suffix(text: str) -> str:
			
 
				+        cleaned = re.sub(r"\s+", " ", (text or "").strip())
			
 
				+        if not cleaned:
			
 
				+            return ""
			
 
				+
			
 
				+        return re.sub(
			
 
				+            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$",
			
 
				+            "",
			
 
				+            cleaned,
			
 
				+        ).strip()