|
|
@@ -433,7 +433,7 @@ class PdfStructureExtractor:
|
|
|
active_l2_rule: Optional[str] = None
|
|
|
document_l1_rules: Optional[List[str]] = None
|
|
|
|
|
|
- for raw_line in text.splitlines():
|
|
|
+ for raw_line in self._prepare_catalog_raw_lines(text):
|
|
|
title_text, page = self._split_catalog_entry(raw_line)
|
|
|
if not title_text:
|
|
|
continue
|
|
|
@@ -554,6 +554,41 @@ class PdfStructureExtractor:
|
|
|
|
|
|
return sanitized
|
|
|
|
|
|
+ @classmethod
|
|
|
+ def _prepare_catalog_raw_lines(cls, text: str) -> List[str]:
|
|
|
+ raw_lines = [line.strip() for line in text.splitlines() if line.strip()]
|
|
|
+ prepared: List[str] = []
|
|
|
+ index = 0
|
|
|
+
|
|
|
+ while index < len(raw_lines):
|
|
|
+ current = raw_lines[index].strip()
|
|
|
+ compact_current = re.sub(r"\s+", "", current)
|
|
|
+
|
|
|
+ if compact_current in {"目", "錄", "录"} and index + 1 < len(raw_lines):
|
|
|
+ next_compact = re.sub(r"\s+", "", raw_lines[index + 1].strip())
|
|
|
+ if compact_current + next_compact in {"目录", "目錄"}:
|
|
|
+ prepared.append(compact_current + next_compact)
|
|
|
+ index += 2
|
|
|
+ continue
|
|
|
+
|
|
|
+ if cls._is_incomplete_heading_fragment(current) and index + 1 < len(raw_lines):
|
|
|
+ next_line = raw_lines[index + 1].strip()
|
|
|
+ candidate = f"{current} {next_line}".strip()
|
|
|
+ _, candidate_page = cls._split_catalog_entry(candidate)
|
|
|
+ if (
|
|
|
+ cls._matching_rule_names(candidate, "l1")
|
|
|
+ or cls._matching_rule_names(candidate, "l2")
|
|
|
+ or candidate_page is not None
|
|
|
+ ):
|
|
|
+ prepared.append(candidate)
|
|
|
+ index += 2
|
|
|
+ continue
|
|
|
+
|
|
|
+ prepared.append(current)
|
|
|
+ index += 1
|
|
|
+
|
|
|
+ return prepared
|
|
|
+
|
|
|
@classmethod
|
|
|
def _should_prefer_parsed_catalog(
|
|
|
cls,
|
|
|
@@ -821,7 +856,10 @@ class PdfStructureExtractor:
|
|
|
return "", None
|
|
|
|
|
|
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
|
|
- page_match = re.search(r"(?:[.\u2026\u00b7\u2022 ]{2,})(\d+)\s*$", cleaned)
|
|
|
+ page_match = re.search(
|
|
|
+ r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*(\d+)\s*[-\u2013\u2014 ]*$",
|
|
|
+ cleaned,
|
|
|
+ )
|
|
|
if page_match:
|
|
|
title_text = cleaned[:page_match.start()].strip()
|
|
|
title_text = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", title_text).strip()
|
|
|
@@ -960,6 +998,9 @@ class PdfStructureExtractor:
|
|
|
section_title_key = "章节标题"
|
|
|
chapter_title_payloads: Dict[str, List[Dict[str, Any]]] = {}
|
|
|
flat_sections: List[Tuple[str, Dict[str, Any]]] = []
|
|
|
+ matched_chapter_count = 0
|
|
|
+ matched_section_count = 0
|
|
|
+ total_catalog_sections = 0
|
|
|
|
|
|
for chapter_title, sections in chapters.items():
|
|
|
title_key = self._normalize_heading_key(chapter_title)
|
|
|
@@ -995,7 +1036,10 @@ class PdfStructureExtractor:
|
|
|
chapter_page = self._safe_page_number(chapter.get("page"))
|
|
|
chapter_key = self._normalize_heading_key(chapter_title)
|
|
|
title_candidates = chapter_title_payloads.get(chapter_key, [])
|
|
|
+ has_title_match = bool(title_candidates)
|
|
|
title_payload = title_candidates.pop(0) if title_candidates else self._empty_section_payload(chapter_page)
|
|
|
+ if has_title_match:
|
|
|
+ matched_chapter_count += 1
|
|
|
|
|
|
rebuilt[chapter_title] = {
|
|
|
section_title_key: title_payload,
|
|
|
@@ -1005,6 +1049,7 @@ class PdfStructureExtractor:
|
|
|
section_title = (subsection.get("title", "") or "").strip()
|
|
|
if not section_title:
|
|
|
continue
|
|
|
+ total_catalog_sections += 1
|
|
|
|
|
|
target_key = self._normalize_heading_key(section_title)
|
|
|
match_index = None
|
|
|
@@ -1026,16 +1071,23 @@ class PdfStructureExtractor:
|
|
|
used_indices.add(match_index)
|
|
|
search_start = max(search_start, match_index + 1)
|
|
|
rebuilt[chapter_title][section_title] = flat_sections[match_index][1]
|
|
|
+ matched_section_count += 1
|
|
|
else:
|
|
|
rebuilt[chapter_title][section_title] = self._empty_section_payload(
|
|
|
self._safe_page_number(subsection.get("page"), chapter_page)
|
|
|
)
|
|
|
|
|
|
+ if total_catalog_sections > 0 and matched_section_count == 0:
|
|
|
+ return chapters
|
|
|
+
|
|
|
+ if matched_chapter_count == 0 and matched_section_count == 0:
|
|
|
+ return chapters
|
|
|
+
|
|
|
return rebuilt or chapters
|
|
|
|
|
|
@staticmethod
|
|
|
def _normalize_heading_key(text: str) -> str:
|
|
|
- normalized = (text or "").strip()
|
|
|
+ normalized = PdfStructureExtractor._strip_catalog_page_suffix((text or "").strip())
|
|
|
normalized = normalized.replace("【", "[").replace("】", "]")
|
|
|
normalized = normalized.replace("(", "(").replace(")", ")")
|
|
|
normalized = normalized.replace(".", ".").replace("。", ".")
|
|
|
@@ -1653,6 +1705,7 @@ class PdfStructureExtractor:
|
|
|
@staticmethod
|
|
|
def _clean_chapter_title(line: str) -> str:
|
|
|
cleaned = PdfStructureExtractor._strip_leading_page_number_from_cn_chapter(line)
|
|
|
+ cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
|
|
|
cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
|
|
|
cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
|
|
|
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
|
|
@@ -1677,6 +1730,7 @@ class PdfStructureExtractor:
|
|
|
@staticmethod
|
|
|
def _clean_section_title(line: str) -> str:
|
|
|
cleaned = line.strip()
|
|
|
+ cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
|
|
|
cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
|
|
|
cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
|
|
|
cleaned = re.sub(r"\s+", " ", cleaned).strip()
|
|
|
@@ -1706,3 +1760,15 @@ class PdfStructureExtractor:
|
|
|
return f"{prefix} {title}".strip()
|
|
|
|
|
|
return cleaned
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def _strip_catalog_page_suffix(text: str) -> str:
|
|
|
+ cleaned = re.sub(r"\s+", " ", (text or "").strip())
|
|
|
+ if not cleaned:
|
|
|
+ return ""
|
|
|
+
|
|
|
+ return re.sub(
|
|
|
+ r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$",
|
|
|
+ "",
|
|
|
+ cleaned,
|
|
|
+ ).strip()
|