|
@@ -1,7 +1,7 @@
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
r"""
|
|
r"""
|
|
|
-Batch runner for PDF structure extraction.
|
|
|
|
|
|
|
+批量抽取 PDF 目录和正文,并评估抽取质量。
|
|
|
|
|
|
|
|
Example commands:
|
|
Example commands:
|
|
|
|
|
|
|
@@ -421,6 +421,7 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
|
|
|
items: List[Dict[str, Any]] = []
|
|
items: List[Dict[str, Any]] = []
|
|
|
lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
|
|
lines = _iter_front_catalog_lines(pdf_path, clip_top, clip_bottom)
|
|
|
saw_explicit_l1 = False
|
|
saw_explicit_l1 = False
|
|
|
|
|
+ current_chapter_title = ""
|
|
|
|
|
|
|
|
for index, line in enumerate(lines):
|
|
for index, line in enumerate(lines):
|
|
|
next_line = lines[index + 1] if index + 1 < len(lines) else ""
|
|
next_line = lines[index + 1] if index + 1 < len(lines) else ""
|
|
@@ -428,7 +429,12 @@ def extract_original_catalog_items(pdf_path: Path, clip_top: float, clip_bottom:
|
|
|
if level is None:
|
|
if level is None:
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- items.append({"level": level, "title": line})
|
|
|
|
|
|
|
+ item = {"level": level, "title": line}
|
|
|
|
|
+ if level == 2 and current_chapter_title:
|
|
|
|
|
+ item["parent_title"] = current_chapter_title
|
|
|
|
|
+ items.append(item)
|
|
|
|
|
+ if level == 1:
|
|
|
|
|
+ current_chapter_title = line
|
|
|
if level == 1 and any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
|
|
if level == 1 and any(pattern.match(line) for pattern in CATALOG_L1_PATTERNS):
|
|
|
saw_explicit_l1 = True
|
|
saw_explicit_l1 = True
|
|
|
|
|
|
|
@@ -554,7 +560,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
|
|
|
for subsection in _iter_catalog_subsections(chapter):
|
|
for subsection in _iter_catalog_subsections(chapter):
|
|
|
section_title = _catalog_title_from_entry(subsection)
|
|
section_title = _catalog_title_from_entry(subsection)
|
|
|
if section_title:
|
|
if section_title:
|
|
|
- items.append({"level": 2, "title": section_title})
|
|
|
|
|
|
|
+ item = {"level": 2, "title": section_title}
|
|
|
|
|
+ if chapter_title:
|
|
|
|
|
+ item["parent_title"] = chapter_title
|
|
|
|
|
+ items.append(item)
|
|
|
|
|
|
|
|
return items
|
|
return items
|
|
|
|
|
|
|
@@ -572,7 +581,10 @@ def extract_result_catalog_items(result: Dict[str, Any]) -> List[Dict[str, Any]]
|
|
|
for section_title in sections.keys():
|
|
for section_title in sections.keys():
|
|
|
section_title = str(section_title or "").strip()
|
|
section_title = str(section_title or "").strip()
|
|
|
if section_title and section_title not in SPECIAL_SECTION_KEYS:
|
|
if section_title and section_title not in SPECIAL_SECTION_KEYS:
|
|
|
- items.append({"level": 2, "title": section_title})
|
|
|
|
|
|
|
+ item = {"level": 2, "title": section_title}
|
|
|
|
|
+ if chapter_title:
|
|
|
|
|
+ item["parent_title"] = chapter_title
|
|
|
|
|
+ items.append(item)
|
|
|
|
|
|
|
|
return items
|
|
return items
|
|
|
|
|
|
|
@@ -653,6 +665,25 @@ def _catalog_title_similarity(left: str, right: str) -> float:
|
|
|
return max(scores)
|
|
return max(scores)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def _catalog_item_similarity(original: Dict[str, Any], candidate: Dict[str, Any], level: int) -> float:
|
|
|
|
|
+ title_score = _catalog_title_similarity(
|
|
|
|
|
+ str(original.get("title", "") or ""),
|
|
|
|
|
+ str(candidate.get("title", "") or ""),
|
|
|
|
|
+ )
|
|
|
|
|
+ if level != 2:
|
|
|
|
|
+ return title_score
|
|
|
|
|
+
|
|
|
|
|
+ original_parent = str(original.get("parent_title", "") or "")
|
|
|
|
|
+ candidate_parent = str(candidate.get("parent_title", "") or "")
|
|
|
|
|
+ if not original_parent or not candidate_parent:
|
|
|
|
|
+ return title_score
|
|
|
|
|
+
|
|
|
|
|
+ parent_score = _catalog_title_similarity(original_parent, candidate_parent)
|
|
|
|
|
+ if parent_score < 0.82:
|
|
|
|
|
+ return min(title_score, parent_score)
|
|
|
|
|
+ return min(1.0, 0.85 * title_score + 0.15 * parent_score)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def _longest_increasing_subsequence_length(values: List[int]) -> int:
|
|
def _longest_increasing_subsequence_length(values: List[int]) -> int:
|
|
|
if not values:
|
|
if not values:
|
|
|
return 0
|
|
return 0
|
|
@@ -693,12 +724,11 @@ def _match_catalog_level(
|
|
|
for original_index, original in enumerate(originals):
|
|
for original_index, original in enumerate(originals):
|
|
|
best_index = -1
|
|
best_index = -1
|
|
|
best_score = 0.0
|
|
best_score = 0.0
|
|
|
- original_title = str(original.get("title", "") or "")
|
|
|
|
|
|
|
|
|
|
for extracted_index, candidate in enumerate(extracted):
|
|
for extracted_index, candidate in enumerate(extracted):
|
|
|
if extracted_index in used_extracted_indexes:
|
|
if extracted_index in used_extracted_indexes:
|
|
|
continue
|
|
continue
|
|
|
- score = _catalog_title_similarity(original_title, str(candidate.get("title", "") or ""))
|
|
|
|
|
|
|
+ score = _catalog_item_similarity(original, candidate, level)
|
|
|
if score > best_score:
|
|
if score > best_score:
|
|
|
best_score = score
|
|
best_score = score
|
|
|
best_index = extracted_index
|
|
best_index = extracted_index
|
|
@@ -785,7 +815,7 @@ def compute_catalog_quality_rate_from_items(
|
|
|
rate = max(0.0, min(rate, 1.0))
|
|
rate = max(0.0, min(rate, 1.0))
|
|
|
|
|
|
|
|
detail = {
|
|
detail = {
|
|
|
- "score_model": "title_f1_70_count_20_order_10",
|
|
|
|
|
|
|
+ "score_model": "parent_aware_title_f1_70_count_20_order_10",
|
|
|
"title_score": title_score,
|
|
"title_score": title_score,
|
|
|
"count_score": count_score,
|
|
"count_score": count_score,
|
|
|
"order_score": order_score,
|
|
"order_score": order_score,
|
|
@@ -1006,7 +1036,7 @@ def main() -> int:
|
|
|
return 1
|
|
return 1
|
|
|
|
|
|
|
|
PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
|
|
PdfStructureExtractor = load_pdf_structure_extractor(args.extractor)
|
|
|
- effective_detect_toc = (not args.disable_toc) and args.extractor != "pdf_extractor1"
|
|
|
|
|
|
|
+ effective_detect_toc = not args.disable_toc
|
|
|
extractor = PdfStructureExtractor(
|
|
extractor = PdfStructureExtractor(
|
|
|
clip_top=args.clip_top,
|
|
clip_top=args.clip_top,
|
|
|
clip_bottom=args.clip_bottom,
|
|
clip_bottom=args.clip_bottom,
|