ソースを参照

fix(提取规则)

tangle 3 日 前
コミット
9fb856f621

+ 19 - 1
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -564,6 +564,8 @@ class PdfStructureExtractor:
                 if not is_viable:
                     score -= 1500
             rule_style_preference = self._score_rule_cn_l2_style_preference(rule_name, preferred_cn_l2_style)
+            if rule_style_preference > 0:
+                score += rule_style_preference
             rule_performance[rule_name] = {
                 "score": score,
                 "coverage_rate": f"{coverage_rate * 100:.1f}%",
@@ -946,8 +948,9 @@ class PdfStructureExtractor:
         return None
 
     def _detect_document_cn_order_l2_style(self, body_lines: List[BodyLine]) -> Optional[str]:
-        """按章节扫描正文早期小节样式,为 Rule_4/5 平分时提供稳定偏好。"""
+        """按章节扫描正文早期小节样式,为候选规则选择提供稳定偏好。"""
 
+        section_count = 0
         plain_count = 0
         bracket_count = 0
         lines_since_chapter = -1
@@ -969,6 +972,17 @@ class PdfStructureExtractor:
                 lines_since_chapter = -1
                 continue
 
+            if (
+                re.match(
+                    r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、::.-]*[\u4e00-\u9fa5A-Za-z].*",
+                    line,
+                )
+                and self._is_valid_heading_strict(line, is_l1=False)
+            ):
+                section_count += 1
+                lines_since_chapter = -1
+                continue
+
             style = self._detect_cn_order_l2_style(line)
             if style is None or not self._is_valid_heading_strict(line, is_l1=False):
                 continue
@@ -979,6 +993,8 @@ class PdfStructureExtractor:
                 bracket_count += 1
             lines_since_chapter = -1
 
+        if section_count >= 2 and section_count >= max(plain_count, bracket_count):
+            return "section"
         if plain_count == bracket_count:
             return None
         return "plain" if plain_count > bracket_count else "bracket"
@@ -991,6 +1007,8 @@ class PdfStructureExtractor:
             return 1
         if preferred_style == "bracket" and rule_name == "Rule_5_单边括号派":
             return 1
+        if preferred_style == "section" and rule_name == "Rule_6_小节派":
+            return 80
         return 0
 
     def _convert_rule_output_to_chapters(

+ 38 - 8
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 r"""
-Batch runner for PDF structure extraction.
+批量抽取 PDF 目录和正文,并评估抽取质量。
 
 Example commands:
 
@@ -421,6 +421,7 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
     items: List[Dict[str, Any]] = []
     lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
     saw_explicit_l1 = False
+    current_chapter_title = ""
 
     for index, line in enumerate(lines):
         next_line = lines[index + 1] if index + 1 < len(lines) else ""
@@ -428,7 +429,12 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
         if level is None:
             continue
 
-        items.append({"level": level, "title": line})
+        item = {"level": level, "title": line}
+        if level == 2 and current_chapter_title:
+            item["parent_title"] = current_chapter_title
+        items.append(item)
+        if level == 1:
+            current_chapter_title = line
         if level == 1 and any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
             saw_explicit_l1 = True
 
@@ -554,7 +560,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
             for subsection in _iter_catalog_subsections(chapter):
                 section_title = _catalog_title_from_entry(subsection)
                 if section_title:
-                    items.append({"level": 2, "title": section_title})
+                    item = {"level": 2, "title": section_title}
+                    if chapter_title:
+                        item["parent_title"] = chapter_title
+                    items.append(item)
 
         return items
 
@@ -572,7 +581,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
         for section_title in sections.keys():
             section_title = str(section_title or "").strip()
             if section_title and section_title not in SPECIAL_SECTION_KEYS:
-                items.append({"level": 2, "title": section_title})
+                item = {"level": 2, "title": section_title}
+                if chapter_title:
+                    item["parent_title"] = chapter_title
+                items.append(item)
 
     return items
 
@@ -653,6 +665,25 @@ def _catalog_title_similarity(left: str, right: str) -> float:
     return max(scores)
 
 
+def _catalog_item_similarity(original: Dict[str, Any], candidate: Dict[str, Any], level: int) -> float:
+    title_score = _catalog_title_similarity(
+        str(original.get("title", "") or ""),
+        str(candidate.get("title", "") or ""),
+    )
+    if level != 2:
+        return title_score
+
+    original_parent = str(original.get("parent_title", "") or "")
+    candidate_parent = str(candidate.get("parent_title", "") or "")
+    if not original_parent or not candidate_parent:
+        return title_score
+
+    parent_score = _catalog_title_similarity(original_parent, candidate_parent)
+    if parent_score < 0.82:
+        return min(title_score, parent_score)
+    return min(1.0, 0.85 * title_score + 0.15 * parent_score)
+
+
 def _longest_increasing_subsequence_length(values: List[int]) -> int:
     if not values:
         return 0
@@ -693,12 +724,11 @@ def _match_catalog_level(
     for original_index, original in enumerate(originals):
         best_index = -1
         best_score = 0.0
-        original_title = str(original.get("title", "") or "")
 
         for extracted_index, candidate in enumerate(extracted):
             if extracted_index in used_extracted_indexes:
                 continue
-            score = _catalog_title_similarity(original_title, str(candidate.get("title", "") or ""))
+            score = _catalog_item_similarity(original, candidate, level)
             if score > best_score:
                 best_score = score
                 best_index = extracted_index
@@ -785,7 +815,7 @@ def compute_catalog_quality_rate_from_items(
     rate = max(0.0, min(rate, 1.0))
 
     detail = {
-        "score_model": "title_f1_70_count_20_order_10",
+        "score_model": "parent_aware_title_f1_70_count_20_order_10",
         "title_score": title_score,
         "count_score": count_score,
         "order_score": order_score,
@@ -1006,7 +1036,7 @@ def main() -> int:
         return 1
 
     PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
-    effective_detect_toc = (not args.disable_toc) and args.extractor != "pdf_extractor1"
+    effective_detect_toc = not args.disable_toc
     extractor = PdfStructureExtractor(
         clip_top=args.clip_top,
         clip_bottom=args.clip_bottom,