hai 1 mes · 6f04793c1a
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
@@ -176,6 +176,12 @@ class PdfStructureExtractor:
 
															                 )
														
 
															                 if rebuilt_chapters:
														
 
															                     structure["chapters"] = rebuilt_chapters
														
 
															+                enriched_catalog = self._enrich_catalog_with_structure(
														
 
															+                    result["catalog"],
														
 
															+                    structure.get("chapters", {}),
														
 
															+                )
														
 
															+                if enriched_catalog:
														
 
															+                    result["catalog"] = enriched_catalog
														
 
															             structure.pop("_body_lines", None)
														
 
															             result["chapters"] = structure.get("chapters", {})
														
 
															             result["total_pages"] = len(doc)
														
@@ -409,6 +415,10 @@ class PdfStructureExtractor:
 
															                 selected_chapters = parsed_chapters
														
 
															         if selected_chapters:
														
 
															+            selected_chapters = self._merge_catalog_chapters(
														
 
															+                selected_chapters,
														
 
															+                parsed_chapters,
														
 
															+            )
														
 
															             normalized["chapters"] = selected_chapters
														
 
															             normalized["total_chapters"] = len(selected_chapters)
														
 
															             normalized["formatted_text"] = self._format_catalog_chapters(selected_chapters)
														
@@ -553,13 +563,22 @@ class PdfStructureExtractor:
 
															         if not parsed_chapters:
														
 
															             return False
														
 
															-        if cls._catalog_has_suspicious_structure(parsed_chapters):
														
 
															-            return False
														
 
															+        parsed_is_suspicious = cls._catalog_has_suspicious_structure(parsed_chapters)
														
 
															+        existing_is_suspicious = cls._catalog_has_suspicious_structure(existing_chapters)
														
 
															+
														
 
															+        if parsed_is_suspicious:
														
 
															+            if not existing_chapters or not existing_is_suspicious:
														
 
															+                return False
														
 
															+
														
 
															+            parsed_score = cls._catalog_structure_score(parsed_chapters)
														
 
															+            existing_score = cls._catalog_structure_score(existing_chapters)
														
 
															+            overlap_ratio = cls._catalog_chapter_overlap_ratio(parsed_chapters, existing_chapters)
														
 
															+            return overlap_ratio >= 0.6 and parsed_score > existing_score
														
 
															         if not existing_chapters:
														
 
															             return True
														
 
															-        if cls._catalog_has_suspicious_structure(existing_chapters):
														
 
															+        if existing_is_suspicious:
														
 
															             return True
														
 
															         parsed_score = cls._catalog_structure_score(parsed_chapters)
														
@@ -642,6 +661,138 @@ class PdfStructureExtractor:
 
															             score += len(chapter.get("subsections", []) or [])
														
 
															         return score
														
 
															+    @classmethod
														
 
															+    def _catalog_chapter_overlap_ratio(
														
 
															+        cls,
														
 
															+        chapters_a: List[Dict[str, Any]],
														
 
															+        chapters_b: List[Dict[str, Any]],
														
 
															+    ) -> float:
														
 
															+        if not chapters_a or not chapters_b:
														
 
															+            return 0.0
														
 
															+
														
 
															+        keys_a = {
														
 
															+            cls._catalog_chapter_identity_key(chapter.get("title", ""))
														
 
															+            for chapter in chapters_a
														
 
															+            if chapter.get("title")
														
 
															+        }
														
 
															+        keys_b = {
														
 
															+            cls._catalog_chapter_identity_key(chapter.get("title", ""))
														
 
															+            for chapter in chapters_b
														
 
															+            if chapter.get("title")
														
 
															+        }
														
 
															+        if not keys_a or not keys_b:
														
 
															+            return 0.0
														
 
															+
														
 
															+        return len(keys_a & keys_b) / max(1, min(len(keys_a), len(keys_b)))
														
 
															+
														
 
															+    @classmethod
														
 
															+    def _catalog_chapter_identity_key(cls, title: str) -> str:
														
 
															+        cleaned = cls._clean_chapter_title(title)
														
 
															+        if not cleaned:
														
 
															+            return ""
														
 
															+
														
 
															+        chapter_match = re.match(
														
 
															+            r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
														
 
															+            cleaned,
														
 
															+        )
														
 
															+        if chapter_match:
														
 
															+            chapter_body = cls._normalize_heading_key(chapter_match.group(1))
														
 
															+            if chapter_body:
														
 
															+                return chapter_body
														
 
															+
														
 
															+        numeric_match = re.match(r"^\d{1,2}(?:[\.．。、])?\s*(.*)$", cleaned)
														
 
															+        if numeric_match:
														
 
															+            numeric_body = cls._normalize_heading_key(numeric_match.group(1))
														
 
															+            if numeric_body:
														
 
															+                return numeric_body
														
 
															+
														
 
															+        return cls._normalize_heading_key(cleaned)
														
 
															+
														
 
															+    @classmethod
														
 
															+    def _merge_catalog_chapters(
														
 
															+        cls,
														
 
															+        base_chapters: List[Dict[str, Any]],
														
 
															+        supplemental_chapters: List[Dict[str, Any]],
														
 
															+    ) -> List[Dict[str, Any]]:
														
 
															+        if not base_chapters:
														
 
															+            return supplemental_chapters or []
														
 
															+        if not supplemental_chapters:
														
 
															+            return base_chapters
														
 
															+
														
 
															+        merged: List[Dict[str, Any]] = []
														
 
															+        supplemental_by_key = {
														
 
															+            cls._catalog_chapter_identity_key(chapter.get("title", "")): chapter
														
 
															+            for chapter in supplemental_chapters
														
 
															+            if chapter.get("title")
														
 
															+        }
														
 
															+
														
 
															+        for index, chapter in enumerate(base_chapters, 1):
														
 
															+            chapter_copy = {
														
 
															+                **chapter,
														
 
															+                "subsections": [dict(sub) for sub in chapter.get("subsections", []) or []],
														
 
															+            }
														
 
															+            chapter_key = cls._catalog_chapter_identity_key(chapter_copy.get("title", ""))
														
 
															+            supplemental = supplemental_by_key.get(chapter_key)
														
 
															+            if supplemental:
														
 
															+                merged_subsections = cls._merge_catalog_subsections(
														
 
															+                    chapter_copy.get("subsections", []),
														
 
															+                    supplemental.get("subsections", []) or [],
														
 
															+                )
														
 
															+                chapter_copy["subsections"] = merged_subsections
														
 
															+            chapter_copy["index"] = index
														
 
															+            merged.append(chapter_copy)
														
 
															+
														
 
															+        return merged
														
 
															+
														
 
															+    @classmethod
														
 
															+    def _merge_catalog_subsections(
														
 
															+        cls,
														
 
															+        base_subsections: List[Dict[str, Any]],
														
 
															+        supplemental_subsections: List[Dict[str, Any]],
														
 
															+    ) -> List[Dict[str, Any]]:
														
 
															+        if not base_subsections:
														
 
															+            return [dict(sub) for sub in supplemental_subsections]
														
 
															+        if not supplemental_subsections:
														
 
															+            return [dict(sub) for sub in base_subsections]
														
 
															+
														
 
															+        def _subsection_score(items: List[Dict[str, Any]]) -> int:
														
 
															+            score = 0
														
 
															+            for item in items:
														
 
															+                title = (item.get("title", "") or "").strip()
														
 
															+                if not title:
														
 
															+                    continue
														
 
															+                score += 1
														
 
															+                if re.match(r"^\d+\.\d+(?!\.\d)\.?\s*", title):
														
 
															+                    score += 3
														
 
															+                elif re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)", title):
														
 
															+                    score += 3
														
 
															+                elif re.match(r"^([一二三四五六七八九十百零两]+[、）\)\]])", title):
														
 
															+                    score += 3
														
 
															+                elif re.match(r"^[【\[]\s*\d+\s*[\]】]", title):
														
 
															+                    score += 3
														
 
															+                elif re.match(r"^\d{1,2}[\.．。、]\s*", title):
														
 
															+                    score += 1
														
 
															+            return score
														
 
															+
														
 
															+        base_score = _subsection_score(base_subsections)
														
 
															+        supplemental_score = _subsection_score(supplemental_subsections)
														
 
															+        if supplemental_score > base_score:
														
 
															+            return [dict(sub) for sub in supplemental_subsections]
														
 
															+
														
 
															+        merged = [dict(sub) for sub in base_subsections]
														
 
															+        seen_keys = {
														
 
															+            cls._normalize_heading_key(sub.get("title", ""))
														
 
															+            for sub in merged
														
 
															+            if sub.get("title")
														
 
															+        }
														
 
															+        for subsection in supplemental_subsections:
														
 
															+            subsection_key = cls._normalize_heading_key(subsection.get("title", ""))
														
 
															+            if not subsection_key or subsection_key in seen_keys:
														
 
															+                continue
														
 
															+            merged.append(dict(subsection))
														
 
															+            seen_keys.add(subsection_key)
														
 
															+        return merged
														
 
															+
														
 
															     @classmethod
														
 
															     def _coerce_numeric_catalog_section(
														
 
															         cls,
														
@@ -692,6 +843,111 @@ class PdfStructureExtractor:
 
															                     lines.append(f"  {sub_title}")
														
 
															         return "\n".join(lines)
														
 
															+    def _enrich_catalog_with_structure(
														
 
															+        self,
														
 
															+        catalog: Dict[str, Any],
														
 
															+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
														
 
															+    ) -> Dict[str, Any]:
														
 
															+        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
														
 
															+        if not catalog_chapters or not chapters:
														
 
															+            return catalog
														
 
															+
														
 
															+        enriched = dict(catalog)
														
 
															+        structure_items = list(chapters.items())
														
 
															+        structure_by_key = {
														
 
															+            self._catalog_chapter_identity_key(chapter_title): (chapter_title, sections)
														
 
															+            for chapter_title, sections in structure_items
														
 
															+        }
														
 
															+        used_structure_keys: Set[str] = set()
														
 
															+
														
 
															+        enriched_chapters: List[Dict[str, Any]] = []
														
 
															+        for catalog_chapter in catalog_chapters:
														
 
															+            chapter_copy = dict(catalog_chapter)
														
 
															+            chapter_key = self._catalog_chapter_identity_key(chapter_copy.get("title", ""))
														
 
															+            structure_match = structure_by_key.get(chapter_key)
														
 
															+            if structure_match is None:
														
 
															+                enriched_chapters.append(chapter_copy)
														
 
															+                continue
														
 
															+
														
 
															+            structure_title, structure_sections = structure_match
														
 
															+            used_structure_keys.add(chapter_key)
														
 
															+            title_payload = structure_sections.get("章节标题", {})
														
 
															+            chapter_copy["title"] = structure_title
														
 
															+            chapter_copy["content"] = title_payload.get("content", "")
														
 
															+            chapter_copy["page_start"] = title_payload.get("page_start", self._safe_page_number(chapter_copy.get("page")))
														
 
															+            chapter_copy["page_end"] = title_payload.get("page_end", chapter_copy["page_start"])
														
 
															+
														
 
															+            structure_subsections = [
														
 
															+                (section_title, payload)
														
 
															+                for section_title, payload in structure_sections.items()
														
 
															+                if section_title != "章节标题"
														
 
															+            ]
														
 
															+            catalog_subsections = chapter_copy.get("subsections", []) or []
														
 
															+            subsection_by_key = {
														
 
															+                self._normalize_heading_key(subsection.get("title", "")): subsection
														
 
															+                for subsection in catalog_subsections
														
 
															+                if subsection.get("title")
														
 
															+            }
														
 
															+
														
 
															+            enriched_subsections: List[Dict[str, Any]] = []
														
 
															+            for section_title, payload in structure_subsections:
														
 
															+                section_key = self._normalize_heading_key(section_title)
														
 
															+                subsection = dict(subsection_by_key.get(section_key, {}))
														
 
															+                subsection.setdefault("title", section_title)
														
 
															+                subsection.setdefault("page", str(payload.get("page_start", chapter_copy["page_start"])))
														
 
															+                subsection.setdefault("level", 2)
														
 
															+                subsection.setdefault("original", section_title)
														
 
															+                subsection["content"] = payload.get("content", "")
														
 
															+                subsection["page_start"] = payload.get("page_start", chapter_copy["page_start"])
														
 
															+                subsection["page_end"] = payload.get("page_end", subsection["page_start"])
														
 
															+                enriched_subsections.append(subsection)
														
 
															+
														
 
															+            chapter_copy["subsections"] = enriched_subsections
														
 
															+            enriched_chapters.append(chapter_copy)
														
 
															+
														
 
															+        existing_catalog_keys = {
														
 
															+            self._catalog_chapter_identity_key(chapter.get("title", ""))
														
 
															+            for chapter in enriched_chapters
														
 
															+            if chapter.get("title")
														
 
															+        }
														
 
															+        for chapter_title, structure_sections in structure_items:
														
 
															+            chapter_key = self._catalog_chapter_identity_key(chapter_title)
														
 
															+            if chapter_key in existing_catalog_keys or chapter_key in used_structure_keys:
														
 
															+                continue
														
 
															+
														
 
															+            title_payload = structure_sections.get("章节标题", {})
														
 
															+            new_chapter = {
														
 
															+                "index": len(enriched_chapters) + 1,
														
 
															+                "title": chapter_title,
														
 
															+                "page": str(title_payload.get("page_start", 1)),
														
 
															+                "original": chapter_title,
														
 
															+                "content": title_payload.get("content", ""),
														
 
															+                "page_start": title_payload.get("page_start", 1),
														
 
															+                "page_end": title_payload.get("page_end", title_payload.get("page_start", 1)),
														
 
															+                "subsections": [],
														
 
															+            }
														
 
															+            for section_title, payload in structure_sections.items():
														
 
															+                if section_title == "章节标题":
														
 
															+                    continue
														
 
															+                new_chapter["subsections"].append({
														
 
															+                    "title": section_title,
														
 
															+                    "page": str(payload.get("page_start", new_chapter["page_start"])),
														
 
															+                    "level": 2,
														
 
															+                    "original": section_title,
														
 
															+                    "content": payload.get("content", ""),
														
 
															+                    "page_start": payload.get("page_start", new_chapter["page_start"]),
														
 
															+                    "page_end": payload.get("page_end", payload.get("page_start", new_chapter["page_start"])),
														
 
															+                })
														
 
															+            enriched_chapters.append(new_chapter)
														
 
															+
														
 
															+        for index, chapter in enumerate(enriched_chapters, 1):
														
 
															+            chapter["index"] = index
														
 
															+
														
 
															+        enriched["chapters"] = enriched_chapters
														
 
															+        enriched["total_chapters"] = len(enriched_chapters)
														
 
															+        enriched["formatted_text"] = self._format_catalog_chapters(enriched_chapters)
														
 
															+        return enriched
														
 
															+
														
 
															     def _reconcile_structure_with_catalog(
														
 
															         self,
														
 
															         chapters: Dict[str, Dict[str, Dict[str, Any]]],
														
@@ -1364,6 +1620,8 @@ class PdfStructureExtractor:
 
															         rule_names: Optional[List[str]] = None,
														
 
															     ) -> List[str]:
														
 
															         clean_line = line.strip()
														
 
															+        if level == "l1":
														
 
															+            clean_line = cls._strip_leading_page_number_from_cn_chapter(clean_line)
														
 
															         names = rule_names or list(cls.RULE_LIB.keys())
														
 
															         return [
														
 
															             rule_name
														
@@ -1379,9 +1637,22 @@ class PdfStructureExtractor:
 
															     def _matches_section_heading(cls, line: str, rule_names: Optional[List[str]] = None) -> bool:
														
 
															         return bool(cls._matching_rule_names(line, "l2", rule_names))
														
 
															+    @staticmethod
														
 
															+    def _strip_leading_page_number_from_cn_chapter(line: str) -> str:
														
 
															+        cleaned = re.sub(r"\s+", " ", line.strip())
														
 
															+        if not cleaned:
														
 
															+            return ""
														
 
															+
														
 
															+        return re.sub(
														
 
															+            r"^\d{1,3}\s+(?=第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])",
														
 
															+            "",
														
 
															+            cleaned,
														
 
															+            count=1,
														
 
															+        ).strip()
														
 
															+
														
 
															     @staticmethod
														
 
															     def _clean_chapter_title(line: str) -> str:
														
 
															-        cleaned = line.strip()
														
 
															+        cleaned = PdfStructureExtractor._strip_leading_page_number_from_cn_chapter(line)
														
 
															         cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
														
 
															         cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
														
 
															         cleaned = re.sub(r"\s+", " ", cleaned).strip()