|
|
@@ -37,6 +37,7 @@ if str(REPO_ROOT) not in sys.path:
|
|
|
|
|
|
SPECIAL_SECTION_KEYS = {"章节标题", "默认部分"}
|
|
|
STAT_FILE_NAME = "static.text"
|
|
|
+UNMATCHED_STANDARD_CATALOG_FILE_NAME = "unmatched_standard_catalog_titles.md"
|
|
|
TOC_LINE_PATTERN = re.compile(r"(?:[.\u2026·•…]{2,}|-{3,}).{0,30}\d+\s*$")
|
|
|
TOC_PAGE_SUFFIX_PATTERN = re.compile(
|
|
|
r"(?:[.\u2026\u00b7\u2022·•…]{2,}|-{3,})[-\u2013\u2014 ]*(?:-\s*)?\d{1,3}(?:\s*-)?\s*$"
|
|
|
@@ -884,6 +885,85 @@ def append_static_record(
|
|
|
)
|
|
|
|
|
|
|
|
|
def collect_unmatched_standard_catalog_titles(
    pdf_path: Path,
    extractor: Any,
    extractor_result: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Gather heading lines that the extractor's standard catalog rejects.

    The rule set is looked up in ``extractor.RULE_LIB`` under the name stored
    in ``extractor_result["body_rule"]``. The PDF is then re-opened and its
    body lines are scanned with the rule set's ``"l1"`` and ``"l2"`` patterns.
    A line is reported at a level when it matches that pattern, is either a
    table-of-contents candidate or passes the extractor's strict heading
    check, and the extractor's standard-catalog matcher returns a falsy value
    for it.

    Args:
        pdf_path: PDF file to open with ``fitz``.
        extractor: Object providing ``RULE_LIB`` and the private helpers
            called below (their exact semantics live outside this function).
        extractor_result: Extraction result; only ``"body_rule"`` is read.

    Returns:
        A list of dicts with keys ``"normalized"``, ``"level"`` and
        ``"parent_l1"``, de-duplicated on (normalized title, level). An empty
        list is returned when no rule set is available or when opening or
        reading the PDF raises.
    """
    rule_name = extractor_result.get("body_rule")
    rule_set = getattr(extractor, "RULE_LIB", {}).get(rule_name)
    if not (rule_name and rule_set):
        return []

    # Best effort: any failure while reading the PDF yields "nothing found".
    try:
        with fitz.open(pdf_path) as doc:
            body_lines, _ = extractor._extract_body_lines(doc)
    except Exception:
        return []

    unmatched: List[Dict[str, Any]] = []
    emitted: set = set()
    active_chapter = ""

    for entry in body_lines:
        raw_text = str(getattr(entry, "text", "") or "").strip()
        if raw_text == "" or raw_text.isdigit():
            continue

        heading = extractor._strip_leading_page_number_from_heading(raw_text)
        if not heading:
            continue

        if extractor._looks_like_toc_candidate(heading):
            origin = "目录候选"
        else:
            origin = "正文标题"
        from_toc = origin == "目录候选"

        levels: List[int] = []

        # TOC candidates skip the strict heading validation entirely.
        if rule_set["l1"].match(heading) and (
            from_toc or extractor._is_valid_heading_strict(heading, is_l1=True)
        ):
            # Remember the chapter so later level-2 rows can name their parent.
            active_chapter = extractor._normalize_catalog_name(heading)
            if not extractor._match_standard_catalog_chapter(heading):
                levels.append(1)

        if rule_set["l2"].match(heading) and (
            from_toc or extractor._is_valid_heading_strict(heading, is_l1=False)
        ):
            if not extractor._match_standard_catalog_section(heading, None):
                levels.append(2)

        for level in levels:
            name = extractor._normalize_catalog_name(heading)
            dedupe_key = (name, level)
            if not name or dedupe_key in emitted:
                continue
            emitted.add(dedupe_key)

            if level == 1:
                level_label, parent = "一级", name
            else:
                level_label, parent = "二级", active_chapter
            unmatched.append({
                "normalized": name,
                "level": level_label,
                "parent_l1": parent,
            })

    return unmatched
|
|
|
+
|
|
|
+
|
|
|
def append_unmatched_standard_catalog_titles(
    md_path: Path,
    records: List[Dict[str, Any]],
) -> None:
    """Append unmatched catalog titles to a Markdown table file.

    Each record contributes one table row built from its ``"parent_l1"``,
    ``"normalized"`` and ``"level"`` values. Records whose title
    (``"normalized"``) is blank are skipped. The table header is written
    only when the file does not exist yet or is empty, so repeated calls
    extend a single table.

    Cell text is made table-safe before writing: line breaks are collapsed
    to spaces and ``|`` is escaped as ``\\|``, because either one would
    otherwise split the row or shift its columns.

    Args:
        md_path: Destination Markdown file; parent directories are created
            when needed.
        records: Rows to append. Nothing is written (and no file is
            created) when there is no record with a non-blank title.
    """

    def _cell(value: Any) -> str:
        """Render one value as a single-line, pipe-safe table cell."""
        text = str(value or "").strip()
        text = " ".join(text.splitlines())
        return text.replace("|", "\\|")

    rows: List[str] = []
    for record in records or []:
        title = _cell(record.get("normalized", ""))
        if not title:
            continue
        parent_l1 = _cell(record.get("parent_l1", ""))
        level = _cell(record.get("level", ""))
        rows.append(f"| {parent_l1} | {title} | {level} |\n")

    # Avoid leaving behind a header-only file when there is nothing to report.
    if not rows:
        return

    md_path.parent.mkdir(parents=True, exist_ok=True)
    needs_header = not md_path.exists() or md_path.stat().st_size == 0
    # newline="" keeps the "\n" terminators exactly as written on every platform.
    with md_path.open("a", encoding="utf-8", newline="") as file:
        if needs_header:
            file.write("| 一级标题 | 标题名称 | 级别 |\n")
            file.write("|---|---|---|\n")
        file.writelines(rows)
|
|
|
+
|
|
|
+
|
|
|
def sanitize_filename_component(value: str) -> str:
|
|
|
sanitized = value.strip()
|
|
|
for char in '<>:"/\\|?*':
|
|
|
@@ -1018,6 +1098,15 @@ def process_pdf(
|
|
|
catalog_quality_rate_text=catalog_quality_rate_text,
|
|
|
content_quality_rate_text=content_quality_rate_text,
|
|
|
)
|
|
|
+ unmatched_records = collect_unmatched_standard_catalog_titles(
|
|
|
+ pdf_path=pdf_path,
|
|
|
+ extractor=extractor,
|
|
|
+ extractor_result=extractor_result,
|
|
|
+ )
|
|
|
+ append_unmatched_standard_catalog_titles(
|
|
|
+ md_path=output_path.parent / UNMATCHED_STANDARD_CATALOG_FILE_NAME,
|
|
|
+ records=unmatched_records,
|
|
|
+ )
|
|
|
return output_path, quality_rate_text
|
|
|
|
|
|
|
|
|
@@ -1053,6 +1142,11 @@ def main() -> int:
|
|
|
print(f"Extractor: {args.extractor}")
|
|
|
print("=" * 80)
|
|
|
|
|
|
+ if output_dir is not None:
|
|
|
+ unmatched_path = output_dir / UNMATCHED_STANDARD_CATALOG_FILE_NAME
|
|
|
+ if unmatched_path.exists():
|
|
|
+ unmatched_path.unlink()
|
|
|
+
|
|
|
success_count = 0
|
|
|
for index, pdf_path in enumerate(pdf_files, 1):
|
|
|
print(f"[{index}/{len(pdf_files)}] Processing: {pdf_path.name}")
|