3 日前 · 9fb856f621
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor1.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor1.py
@@ -564,6 +564,8 @@ class PdfStructureExtractor:
 
				                 if not is_viable:
			
 
				                     score -= 1500
			
 
				             rule_style_preference = self._score_rule_cn_l2_style_preference(rule_name, preferred_cn_l2_style)
			
 
				+            if rule_style_preference > 0:
			
 
				+                score += rule_style_preference
			
 
				             rule_performance[rule_name] = {
			
 
				                 "score": score,
			
 
				                 "coverage_rate": f"{coverage_rate * 100:.1f}%",
			
@@ -946,8 +948,9 @@ class PdfStructureExtractor:
 
				         return None
			
 
				 
			
 
				     def _detect_document_cn_order_l2_style(self, body_lines: List[BodyLine]) -> Optional[str]:
			
 
				-        """按章节扫描正文早期小节样式，为 Rule_4/5 平分时提供稳定偏好。"""
			
 
				+        """按章节扫描正文早期小节样式，为候选规则选择提供稳定偏好。"""
			
 
				 
			
 
				+        section_count = 0
			
 
				         plain_count = 0
			
 
				         bracket_count = 0
			
 
				         lines_since_chapter = -1
			
@@ -969,6 +972,17 @@ class PdfStructureExtractor:
 
				                 lines_since_chapter = -1
			
 
				                 continue
			
 
				 
			
 
				+            if (
			
 
				+                re.match(
			
 
				+                    r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*节[\s、：:.-]*[\u4e00-\u9fa5A-Za-z].*",
			
 
				+                    line,
			
 
				+                )
			
 
				+                and self._is_valid_heading_strict(line, is_l1=False)
			
 
				+            ):
			
 
				+                section_count += 1
			
 
				+                lines_since_chapter = -1
			
 
				+                continue
			
 
				+
			
 
				             style = self._detect_cn_order_l2_style(line)
			
 
				             if style is None or not self._is_valid_heading_strict(line, is_l1=False):
			
 
				                 continue
			
@@ -979,6 +993,8 @@ class PdfStructureExtractor:
 
				                 bracket_count += 1
			
 
				             lines_since_chapter = -1
			
 
				 
			
 
				+        if section_count >= 2 and section_count >= max(plain_count, bracket_count):
			
 
				+            return "section"
			
 
				         if plain_count == bracket_count:
			
 
				             return None
			
 
				         return "plain" if plain_count > bracket_count else "bracket"
			
@@ -991,6 +1007,8 @@ class PdfStructureExtractor:
 
				             return 1
			
 
				         if preferred_style == "bracket" and rule_name == "Rule_5_单边括号派":
			
 
				             return 1
			
 
				+        if preferred_style == "section" and rule_name == "Rule_6_小节派":
			
 
				+            return 80
			
 
				         return 0
			
 
				 
			
 
				     def _convert_rule_output_to_chapters(
			
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py
@@ -1,7 +1,7 @@
 
				 from __future__ import annotations
			
 
				 
			
 
				 r"""
			
 
				-Batch runner for PDF structure extraction.
			
 
				+批量抽取 PDF 目录和正文，并评估抽取质量。
			
 
				 
			
 
				 Example commands:
			
 
				 
			
@@ -421,6 +421,7 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
 
				     items: List[Dict[str, Any]] = []
			
 
				     lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
			
 
				     saw_explicit_l1 = False
			
 
				+    current_chapter_title = ""
			
 
				 
			
 
				     for index, line in enumerate(lines):
			
 
				         next_line = lines[index + 1] if index + 1 < len(lines) else ""
			
@@ -428,7 +429,12 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
 
				         if level is None:
			
 
				             continue
			
 
				 
			
 
				-        items.append({"level": level, "title": line})
			
 
				+        item = {"level": level, "title": line}
			
 
				+        if level == 2 and current_chapter_title:
			
 
				+            item["parent_title"] = current_chapter_title
			
 
				+        items.append(item)
			
 
				+        if level == 1:
			
 
				+            current_chapter_title = line
			
 
				         if level == 1 and any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
			
 
				             saw_explicit_l1 = True
			
 
				 
			
@@ -554,7 +560,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
 
				             for subsection in _iter_catalog_subsections(chapter):
			
 
				                 section_title = _catalog_title_from_entry(subsection)
			
 
				                 if section_title:
			
 
				-                    items.append({"level": 2, "title": section_title})
			
 
				+                    item = {"level": 2, "title": section_title}
			
 
				+                    if chapter_title:
			
 
				+                        item["parent_title"] = chapter_title
			
 
				+                    items.append(item)
			
 
				 
			
 
				         return items
			
 
				 
			
@@ -572,7 +581,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
 
				         for section_title in sections.keys():
			
 
				             section_title = str(section_title or "").strip()
			
 
				             if section_title and section_title not in SPECIAL_SECTION_KEYS:
			
 
				-                items.append({"level": 2, "title": section_title})
			
 
				+                item = {"level": 2, "title": section_title}
			
 
				+                if chapter_title:
			
 
				+                    item["parent_title"] = chapter_title
			
 
				+                items.append(item)
			
 
				 
			
 
				     return items
			
 
				 
			
@@ -653,6 +665,25 @@ def _catalog_title_similarity(left: str, right: str) -> float:
 
				     return max(scores)
			
 
				 
			
 
				 
			
 
				+def _catalog_item_similarity(original: Dict[str, Any], candidate: Dict[str, Any], level: int) -> float:
			
 
				+    """Score how well an extracted catalog item matches an original one.
			
 
				+
			
 
				+    The score starts from `_catalog_title_similarity` applied to the two
			
 
				+    items' "title" values (a missing or falsy title is treated as "").
			
 
				+    For level-2 items that both carry a non-empty "parent_title", the
			
 
				+    similarity of the parent titles is folded into the result.
			
 
				+
			
 
				+    Args:
			
 
				+        original: Catalog item taken from the original catalog.
			
 
				+        candidate: Catalog item taken from the extraction result.
			
 
				+        level: Catalog level being matched; only level 2 uses parents.
			
 
				+
			
 
				+    Returns:
			
 
				+        The title score alone when `level` is not 2 or either parent title
			
 
				+        is missing; the smaller of the title and parent scores when the
			
 
				+        parent score is below 0.82; otherwise 0.85 * title score plus
			
 
				+        0.15 * parent score, capped at 1.0.
			
 
				+    """
			
 
				+    title_score = _catalog_title_similarity(
			
 
				+        str(original.get("title", "") or ""),
			
 
				+        str(candidate.get("title", "") or ""),
			
 
				+    )
			
 
				+    # Parent titles are only consulted for level-2 items.
			
 
				+    if level != 2:
			
 
				+        return title_score
			
 
				+
			
 
				+    original_parent = str(original.get("parent_title", "") or "")
			
 
				+    candidate_parent = str(candidate.get("parent_title", "") or "")
			
 
				+    # Without a parent title on both sides there is nothing to compare,
			
 
				+    # so fall back to the plain title score.
			
 
				+    if not original_parent or not candidate_parent:
			
 
				+        return title_score
			
 
				+
			
 
				+    parent_score = _catalog_title_similarity(original_parent, candidate_parent)
			
 
				+    # A parent score under 0.82 limits the result to the weaker of the two
			
 
				+    # scores, so a matching title under a different parent cannot score high.
			
 
				+    # NOTE(review): presumably this keeps same-named sections of different
			
 
				+    # chapters from matching each other - confirm against the caller.
			
 
				+    if parent_score < 0.82:
			
 
				+        return min(title_score, parent_score)
			
 
				+    # Otherwise blend the two scores (85% title, 15% parent), capped at 1.0.
			
 
				+    return min(1.0, 0.85 * title_score + 0.15 * parent_score)
			
 
				+
			
 
				+
			
 
				 def _longest_increasing_subsequence_length(values: List[int]) -> int:
			
 
				     if not values:
			
 
				         return 0
			
@@ -693,12 +724,11 @@ def _match_catalog_level(
 
				     for original_index, original in enumerate(originals):
			
 
				         best_index = -1
			
 
				         best_score = 0.0
			
 
				-        original_title = str(original.get("title", "") or "")
			
 
				 
			
 
				         for extracted_index, candidate in enumerate(extracted):
			
 
				             if extracted_index in used_extracted_indexes:
			
 
				                 continue
			
 
				-            score = _catalog_title_similarity(original_title, str(candidate.get("title", "") or ""))
			
 
				+            score = _catalog_item_similarity(original, candidate, level)
			
 
				             if score > best_score:
			
 
				                 best_score = score
			
 
				                 best_index = extracted_index
			
@@ -785,7 +815,7 @@ def compute_catalog_quality_rate_from_items(
 
				     rate = max(0.0, min(rate, 1.0))
			
 
				 
			
 
				     detail = {
			
 
				-        "score_model": "title_f1_70_count_20_order_10",
			
 
				+        "score_model": "parent_aware_title_f1_70_count_20_order_10",
			
 
				         "title_score": title_score,
			
 
				         "count_score": count_score,
			
 
				         "order_score": order_score,
			
@@ -1006,7 +1036,7 @@ def main() -> int:
 
				         return 1
			
 
				 
			
 
				     PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
			
 
				-    effective_detect_toc = (not args.disable_toc) and args.extractor != "pdf_extractor1"
			
 
				+    effective_detect_toc = not args.disable_toc
			
 
				     extractor = PdfStructureExtractor(
			
 
				         clip_top=args.clip_top,
			
 
				         clip_bottom=args.clip_bottom,