hace 3 días · 9fb856f621
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor1.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor1.py
@@ -564,6 +564,8 @@ class PdfStructureExtractor:
 
															                 if not is_viable:
														
 
															                     score -= 1500
														
 
															             rule_style_preference = self._score_rule_cn_l2_style_preference(rule_name, preferred_cn_l2_style)
														
 
															+            if rule_style_preference > 0:
														
 
															+                score += rule_style_preference
														
 
															             rule_performance[rule_name] = {
														
 
															                 "score": score,
														
 
															                 "coverage_rate": f"{coverage_rate * 100:.1f}%",
														
@@ -946,8 +948,9 @@ class PdfStructureExtractor:
 
															         return None
														
 
															     def _detect_document_cn_order_l2_style(self, body_lines: List[BodyLine]) -> Optional[str]:
														
 
															-        """按章节扫描正文早期小节样式，为 Rule_4/5 平分时提供稳定偏好。"""
														
 
															+        """按章节扫描正文早期小节样式，为候选规则选择提供稳定偏好。"""
														
 
															+        section_count = 0
														
 
															         plain_count = 0
														
 
															         bracket_count = 0
														
 
															         lines_since_chapter = -1
														
@@ -969,6 +972,17 @@ class PdfStructureExtractor:
 
															                 lines_since_chapter = -1
														
 
															                 continue
														
 
															+            if (
														
 
															+                re.match(
														
 
															+                    r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、：:.-]*[\u4e00-\u9fa5A-Za-z].*",
														
 
															+                    line,
														
 
															+                )
														
 
															+                and self._is_valid_heading_strict(line, is_l1=False)
														
 
															+            ):
														
 
															+                section_count += 1
														
 
															+                lines_since_chapter = -1
														
 
															+                continue
														
 
															+
														
 
															             style = self._detect_cn_order_l2_style(line)
														
 
															             if style is None or not self._is_valid_heading_strict(line, is_l1=False):
														
 
															                 continue
														
@@ -979,6 +993,8 @@ class PdfStructureExtractor:
 
															                 bracket_count += 1
														
 
															             lines_since_chapter = -1
														
 
															+        if section_count >= 2 and section_count >= max(plain_count, bracket_count):
														
 
															+            return "section"
														
 
															         if plain_count == bracket_count:
														
 
															             return None
														
 
															         return "plain" if plain_count > bracket_count else "bracket"
														
@@ -991,6 +1007,8 @@ class PdfStructureExtractor:
 
															             return 1
														
 
															         if preferred_style == "bracket" and rule_name == "Rule_5_单边括号派":
														
 
															             return 1
														
 
															+        if preferred_style == "section" and rule_name == "Rule_6_小节派":
														
 
															+            return 80
														
 
															         return 0
														
 
															     def _convert_rule_output_to_chapters(
														
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py
@@ -1,7 +1,7 @@
 
															 from __future__ import annotations
														
 
															 r"""
														
 
															-Batch runner for PDF structure extraction.
														
 
															+批量抽取 PDF 目录和正文，并评估抽取质量。
														
 
															 Example commands:
														
@@ -421,6 +421,7 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
 
															     items: List[Dict[str, Any]] = []
														
 
															     lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
														
 
															     saw_explicit_l1 = False
														
 
															+    current_chapter_title = ""
														
 
															     for index, line in enumerate(lines):
														
 
															         next_line = lines[index + 1] if index + 1 < len(lines) else ""
														
@@ -428,7 +429,12 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
 
															         if level is None:
														
 
															             continue
														
 
															-        items.append({"level": level, "title": line})
														
 
															+        item = {"level": level, "title": line}
														
 
															+        if level == 2 and current_chapter_title:
														
 
															+            item["parent_title"] = current_chapter_title
														
 
															+        items.append(item)
														
 
															+        if level == 1:
														
 
															+            current_chapter_title = line
														
 
															         if level == 1 and any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
														
 
															             saw_explicit_l1 = True
														
@@ -554,7 +560,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
 
															             for subsection in _iter_catalog_subsections(chapter):
														
 
															                 section_title = _catalog_title_from_entry(subsection)
														
 
															                 if section_title:
														
 
															-                    items.append({"level": 2, "title": section_title})
														
 
															+                    item = {"level": 2, "title": section_title}
														
 
															+                    if chapter_title:
														
 
															+                        item["parent_title"] = chapter_title
														
 
															+                    items.append(item)
														
 
															         return items
														
@@ -572,7 +581,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
 
															         for section_title in sections.keys():
														
 
															             section_title = str(section_title or "").strip()
														
 
															             if section_title and section_title not in SPECIAL_SECTION_KEYS:
														
 
															-                items.append({"level": 2, "title": section_title})
														
 
															+                item = {"level": 2, "title": section_title}
														
 
															+                if chapter_title:
														
 
															+                    item["parent_title"] = chapter_title
														
 
															+                items.append(item)
														
 
															     return items
														
@@ -653,6 +665,25 @@ def _catalog_title_similarity(left: str, right: str) -> float:
 
															     return max(scores)
														
 
															+def _catalog_item_similarity(original: Dict[str, Any], candidate: Dict[str, Any], level: int) -> float:
														
 
															+    title_score = _catalog_title_similarity(
														
 
															+        str(original.get("title", "") or ""),
														
 
															+        str(candidate.get("title", "") or ""),
														
 
															+    )
														
 
															+    if level != 2:
														
 
															+        return title_score
														
 
															+
														
 
															+    original_parent = str(original.get("parent_title", "") or "")
														
 
															+    candidate_parent = str(candidate.get("parent_title", "") or "")
														
 
															+    if not original_parent or not candidate_parent:
														
 
															+        return title_score
														
 
															+
														
 
															+    parent_score = _catalog_title_similarity(original_parent, candidate_parent)
														
 
															+    if parent_score < 0.82:
														
 
															+        return min(title_score, parent_score)
														
 
															+    return min(1.0, 0.85 * title_score + 0.15 * parent_score)
														
 
															+
														
 
															+
														
 
															 def _longest_increasing_subsequence_length(values: List[int]) -> int:
														
 
															     if not values:
														
 
															         return 0
														
@@ -693,12 +724,11 @@ def _match_catalog_level(
 
															     for original_index, original in enumerate(originals):
														
 
															         best_index = -1
														
 
															         best_score = 0.0
														
 
															-        original_title = str(original.get("title", "") or "")
														
 
															         for extracted_index, candidate in enumerate(extracted):
														
 
															             if extracted_index in used_extracted_indexes:
														
 
															                 continue
														
 
															-            score = _catalog_title_similarity(original_title, str(candidate.get("title", "") or ""))
														
 
															+            score = _catalog_item_similarity(original, candidate, level)
														
 
															             if score > best_score:
														
 
															                 best_score = score
														
 
															                 best_index = extracted_index
														
@@ -785,7 +815,7 @@ def compute_catalog_quality_rate_from_items(
 
															     rate = max(0.0, min(rate, 1.0))
														
 
															     detail = {
														
 
															-        "score_model": "title_f1_70_count_20_order_10",
														
 
															+        "score_model": "parent_aware_title_f1_70_count_20_order_10",
														
 
															         "title_score": title_score,
														
 
															         "count_score": count_score,
														
 
															         "order_score": order_score,
														
@@ -1006,7 +1036,7 @@ def main() -> int:
 
															         return 1
														
 
															     PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
														
 
															-    effective_detect_toc = (not args.disable_toc) and args.extractor != "pdf_extractor1"
														
 
															+    effective_detect_toc = not args.disable_toc
														
 
															     extractor = PdfStructureExtractor(
														
 
															         clip_top=args.clip_top,
														
 
															         clip_bottom=args.clip_bottom,