|
@@ -11,7 +11,7 @@ import io
|
|
|
import re
|
|
import re
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
from dataclasses import dataclass
|
|
from dataclasses import dataclass
|
|
|
-from typing import Dict, Any, List, Optional, Tuple
|
|
|
|
|
|
|
+from typing import Dict, Any, List, Optional, Tuple, Set
|
|
|
|
|
|
|
|
import fitz
|
|
import fitz
|
|
|
import numpy as np
|
|
import numpy as np
|
|
@@ -169,6 +169,14 @@ class PdfStructureExtractor:
|
|
|
structure.get("chapters", {}),
|
|
structure.get("chapters", {}),
|
|
|
result["catalog"],
|
|
result["catalog"],
|
|
|
)
|
|
)
|
|
|
|
|
+ rebuilt_chapters = self._rebuild_section_contents_from_catalog(
|
|
|
|
|
+ structure.get("chapters", {}),
|
|
|
|
|
+ result["catalog"],
|
|
|
|
|
+ structure.get("_body_lines", []),
|
|
|
|
|
+ )
|
|
|
|
|
+ if rebuilt_chapters:
|
|
|
|
|
+ structure["chapters"] = rebuilt_chapters
|
|
|
|
|
+ structure.pop("_body_lines", None)
|
|
|
result["chapters"] = structure.get("chapters", {})
|
|
result["chapters"] = structure.get("chapters", {})
|
|
|
result["total_pages"] = len(doc)
|
|
result["total_pages"] = len(doc)
|
|
|
return result
|
|
return result
|
|
@@ -251,6 +259,7 @@ class PdfStructureExtractor:
|
|
|
|
|
|
|
|
# === 阶段3: 提取页面文本(应用 OCR 结果)并切分章节 ===
|
|
# === 阶段3: 提取页面文本(应用 OCR 结果)并切分章节 ===
|
|
|
structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
|
|
structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
|
|
|
|
|
+ body_lines: List[Dict[str, Any]] = []
|
|
|
current_chapter = "未分类前言"
|
|
current_chapter = "未分类前言"
|
|
|
current_section = "默认部分"
|
|
current_section = "默认部分"
|
|
|
in_body = False
|
|
in_body = False
|
|
@@ -280,7 +289,14 @@ class PdfStructureExtractor:
|
|
|
else:
|
|
else:
|
|
|
text = page.get_text("text", clip=clip_box)
|
|
text = page.get_text("text", clip=clip_box)
|
|
|
|
|
|
|
|
- lines = text.split("\n")
|
|
|
|
|
|
|
+ lines = self._prepare_page_lines(text)
|
|
|
|
|
+ for line in lines:
|
|
|
|
|
+ if not line or self._is_header_footer(line):
|
|
|
|
|
+ continue
|
|
|
|
|
+ body_lines.append({
|
|
|
|
|
+ "page": page_num + 1,
|
|
|
|
|
+ "text": line,
|
|
|
|
|
+ })
|
|
|
|
|
|
|
|
for line in lines:
|
|
for line in lines:
|
|
|
line = line.strip()
|
|
line = line.strip()
|
|
@@ -358,7 +374,7 @@ class PdfStructureExtractor:
|
|
|
structured_data[current_chapter][current_section]["page_end"] = page_num + 1
|
|
structured_data[current_chapter][current_section]["page_end"] = page_num + 1
|
|
|
|
|
|
|
|
# 将行列表拼接为文本
|
|
# 将行列表拼接为文本
|
|
|
- result: Dict[str, Any] = {"chapters": {}}
|
|
|
|
|
|
|
+ result: Dict[str, Any] = {"chapters": {}, "_body_lines": body_lines}
|
|
|
for chap, sections in structured_data.items():
|
|
for chap, sections in structured_data.items():
|
|
|
result["chapters"][chap] = {}
|
|
result["chapters"][chap] = {}
|
|
|
for sec, data in sections.items():
|
|
for sec, data in sections.items():
|
|
@@ -376,12 +392,26 @@ class PdfStructureExtractor:
|
|
|
return {}
|
|
return {}
|
|
|
|
|
|
|
|
normalized = dict(catalog)
|
|
normalized = dict(catalog)
|
|
|
|
|
+ existing_chapters = self._sanitize_catalog_chapters(catalog.get("chapters", []))
|
|
|
raw_text = catalog.get("raw_ocr_text", "")
|
|
raw_text = catalog.get("raw_ocr_text", "")
|
|
|
parsed_chapters = self._parse_catalog_from_raw_text(raw_text) if isinstance(raw_text, str) else []
|
|
parsed_chapters = self._parse_catalog_from_raw_text(raw_text) if isinstance(raw_text, str) else []
|
|
|
|
|
+ selected_chapters = existing_chapters
|
|
|
|
|
+
|
|
|
if parsed_chapters:
|
|
if parsed_chapters:
|
|
|
- normalized["chapters"] = parsed_chapters
|
|
|
|
|
- normalized["total_chapters"] = len(parsed_chapters)
|
|
|
|
|
- normalized["formatted_text"] = self._format_catalog_chapters(parsed_chapters)
|
|
|
|
|
|
|
+ if self._should_prefer_parsed_catalog(parsed_chapters, existing_chapters):
|
|
|
|
|
+ selected_chapters = parsed_chapters
|
|
|
|
|
+ elif existing_chapters:
|
|
|
|
|
+ logger.info(
|
|
|
|
|
+ "[PDF提取] raw_ocr_text目录解析结果异常,保留原始目录骨架: "
|
|
|
|
|
+ f"parsed={len(parsed_chapters)}, original={len(existing_chapters)}"
|
|
|
|
|
+ )
|
|
|
|
|
+ else:
|
|
|
|
|
+ selected_chapters = parsed_chapters
|
|
|
|
|
+
|
|
|
|
|
+ if selected_chapters:
|
|
|
|
|
+ normalized["chapters"] = selected_chapters
|
|
|
|
|
+ normalized["total_chapters"] = len(selected_chapters)
|
|
|
|
|
+ normalized["formatted_text"] = self._format_catalog_chapters(selected_chapters)
|
|
|
return normalized
|
|
return normalized
|
|
|
|
|
|
|
|
def _parse_catalog_from_raw_text(self, text: str) -> List[Dict[str, Any]]:
|
|
def _parse_catalog_from_raw_text(self, text: str) -> List[Dict[str, Any]]:
|
|
@@ -391,6 +421,7 @@ class PdfStructureExtractor:
|
|
|
chapters: List[Dict[str, Any]] = []
|
|
chapters: List[Dict[str, Any]] = []
|
|
|
current_chapter: Optional[Dict[str, Any]] = None
|
|
current_chapter: Optional[Dict[str, Any]] = None
|
|
|
active_l2_rule: Optional[str] = None
|
|
active_l2_rule: Optional[str] = None
|
|
|
|
|
+ document_l1_rules: Optional[List[str]] = None
|
|
|
|
|
|
|
|
for raw_line in text.splitlines():
|
|
for raw_line in text.splitlines():
|
|
|
title_text, page = self._split_catalog_entry(raw_line)
|
|
title_text, page = self._split_catalog_entry(raw_line)
|
|
@@ -401,8 +432,10 @@ class PdfStructureExtractor:
|
|
|
if compact in {"目录", "目錄"}:
|
|
if compact in {"目录", "目錄"}:
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- chapter_matches = self._matching_rule_names(title_text, "l1")
|
|
|
|
|
|
|
+ chapter_matches = self._matching_rule_names(title_text, "l1", document_l1_rules)
|
|
|
if chapter_matches:
|
|
if chapter_matches:
|
|
|
|
|
+ if document_l1_rules is None:
|
|
|
|
|
+ document_l1_rules = chapter_matches
|
|
|
current_chapter = {
|
|
current_chapter = {
|
|
|
"index": len(chapters) + 1,
|
|
"index": len(chapters) + 1,
|
|
|
"title": self._clean_chapter_title(title_text),
|
|
"title": self._clean_chapter_title(title_text),
|
|
@@ -419,6 +452,24 @@ class PdfStructureExtractor:
|
|
|
|
|
|
|
|
section_matches = self._matching_rule_names(title_text, "l2")
|
|
section_matches = self._matching_rule_names(title_text, "l2")
|
|
|
if not section_matches:
|
|
if not section_matches:
|
|
|
|
|
+ numeric_section_title = self._coerce_numeric_catalog_section(
|
|
|
|
|
+ title_text,
|
|
|
|
|
+ document_l1_rules,
|
|
|
|
|
+ active_l2_rule,
|
|
|
|
|
+ )
|
|
|
|
|
+ if numeric_section_title:
|
|
|
|
|
+ section_key = self._normalize_heading_key(numeric_section_title)
|
|
|
|
|
+ existing_keys = {
|
|
|
|
|
+ self._normalize_heading_key(sub.get("title", ""))
|
|
|
|
|
+ for sub in current_chapter.get("subsections", [])
|
|
|
|
|
+ }
|
|
|
|
|
+ if section_key not in existing_keys:
|
|
|
|
|
+ current_chapter["subsections"].append({
|
|
|
|
|
+ "title": numeric_section_title,
|
|
|
|
|
+ "page": str(page or current_chapter.get("page", 1)),
|
|
|
|
|
+ "level": 2,
|
|
|
|
|
+ "original": raw_line.strip(),
|
|
|
|
|
+ })
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
if active_l2_rule is None:
|
|
if active_l2_rule is None:
|
|
@@ -444,6 +495,174 @@ class PdfStructureExtractor:
|
|
|
|
|
|
|
|
return chapters
|
|
return chapters
|
|
|
|
|
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _sanitize_catalog_chapters(cls, chapters: Any) -> List[Dict[str, Any]]:
|
|
|
|
|
+ if not isinstance(chapters, list):
|
|
|
|
|
+ return []
|
|
|
|
|
+
|
|
|
|
|
+ sanitized: List[Dict[str, Any]] = []
|
|
|
|
|
+ seen_chapter_keys: Set[str] = set()
|
|
|
|
|
+
|
|
|
|
|
+ for idx, chapter in enumerate(chapters, 1):
|
|
|
|
|
+ if not isinstance(chapter, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ chapter_title = cls._clean_chapter_title(str(chapter.get("title", "") or ""))
|
|
|
|
|
+ chapter_key = cls._normalize_heading_key(chapter_title)
|
|
|
|
|
+ if not chapter_key or chapter_key in seen_chapter_keys:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ seen_chapter_keys.add(chapter_key)
|
|
|
|
|
+ chapter_page = str(chapter.get("page") or idx)
|
|
|
|
|
+ subsections: List[Dict[str, Any]] = []
|
|
|
|
|
+ seen_section_keys: Set[str] = set()
|
|
|
|
|
+
|
|
|
|
|
+ for subsection in chapter.get("subsections", []) or []:
|
|
|
|
|
+ if not isinstance(subsection, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ section_title = cls._clean_section_title(str(subsection.get("title", "") or ""))
|
|
|
|
|
+ section_key = cls._normalize_heading_key(section_title)
|
|
|
|
|
+ if not section_key or section_key in seen_section_keys:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ seen_section_keys.add(section_key)
|
|
|
|
|
+ subsections.append({
|
|
|
|
|
+ "title": section_title,
|
|
|
|
|
+ "page": str(subsection.get("page") or chapter_page),
|
|
|
|
|
+ "level": 2,
|
|
|
|
|
+ "original": subsection.get("original", "") or section_title,
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ sanitized.append({
|
|
|
|
|
+ "index": len(sanitized) + 1,
|
|
|
|
|
+ "title": chapter_title,
|
|
|
|
|
+ "page": chapter_page,
|
|
|
|
|
+ "original": chapter.get("original", "") or chapter_title,
|
|
|
|
|
+ "subsections": subsections,
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ return sanitized
|
|
|
|
|
+
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _should_prefer_parsed_catalog(
|
|
|
|
|
+ cls,
|
|
|
|
|
+ parsed_chapters: List[Dict[str, Any]],
|
|
|
|
|
+ existing_chapters: List[Dict[str, Any]],
|
|
|
|
|
+ ) -> bool:
|
|
|
|
|
+ if not parsed_chapters:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if cls._catalog_has_suspicious_structure(parsed_chapters):
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if not existing_chapters:
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ if cls._catalog_has_suspicious_structure(existing_chapters):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ parsed_score = cls._catalog_structure_score(parsed_chapters)
|
|
|
|
|
+ existing_score = cls._catalog_structure_score(existing_chapters)
|
|
|
|
|
+ if parsed_score <= existing_score:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if not cls._catalog_has_suspicious_structure(existing_chapters):
|
|
|
|
|
+ existing_count = len(existing_chapters)
|
|
|
|
|
+ parsed_count = len(parsed_chapters)
|
|
|
|
|
+ if parsed_count > max(existing_count * 2, existing_count + 8):
|
|
|
|
|
+ return False
|
|
|
|
|
+ if existing_count >= 4 and parsed_count < max(2, existing_count // 2):
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _catalog_has_suspicious_structure(cls, chapters: List[Dict[str, Any]]) -> bool:
|
|
|
|
|
+ if not chapters:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ titles = [(chapter.get("title", "") or "").strip() for chapter in chapters]
|
|
|
|
|
+ chinese_chapter_count = sum(
|
|
|
|
|
+ 1 for title in titles
|
|
|
|
|
+ if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", title)
|
|
|
|
|
+ )
|
|
|
|
|
+ numeric_heading_count = sum(
|
|
|
|
|
+ 1 for title in titles
|
|
|
|
|
+ if re.match(r"^\d{1,2}(?:[\..。、])?\s+\S+", title)
|
|
|
|
|
+ )
|
|
|
|
|
+ embedded_numeric_body_count = 0
|
|
|
|
|
+ repeated_chapter_no_count = 0
|
|
|
|
|
+ reversed_chapter_no_count = 0
|
|
|
|
|
+ seen_chapter_numbers: Set[str] = set()
|
|
|
|
|
+ previous_numeric_chapter_no: Optional[int] = None
|
|
|
|
|
+
|
|
|
|
|
+ for title in titles:
|
|
|
|
|
+ chapter_match = re.match(
|
|
|
|
|
+ r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
|
|
|
|
|
+ title,
|
|
|
|
|
+ )
|
|
|
|
|
+ if not chapter_match:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ chapter_no = re.sub(r"\s+", "", chapter_match.group(1))
|
|
|
|
|
+ chapter_body = (chapter_match.group(2) or "").strip()
|
|
|
|
|
+ if chapter_no in seen_chapter_numbers:
|
|
|
|
|
+ repeated_chapter_no_count += 1
|
|
|
|
|
+ seen_chapter_numbers.add(chapter_no)
|
|
|
|
|
+
|
|
|
|
|
+ if chapter_no.isdigit():
|
|
|
|
|
+ current_numeric_no = int(chapter_no)
|
|
|
|
|
+ if previous_numeric_chapter_no is not None and current_numeric_no < previous_numeric_chapter_no:
|
|
|
|
|
+ reversed_chapter_no_count += 1
|
|
|
|
|
+ previous_numeric_chapter_no = current_numeric_no
|
|
|
|
|
+
|
|
|
|
|
+ if re.match(r"^\d{1,2}(?:\.\d{1,2})*\.?(?:\s+|$)", chapter_body):
|
|
|
|
|
+ embedded_numeric_body_count += 1
|
|
|
|
|
+
|
|
|
|
|
+ if chinese_chapter_count >= 2 and numeric_heading_count >= max(3, chinese_chapter_count // 2):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ if chinese_chapter_count >= max(2, len(titles) // 3) and numeric_heading_count >= max(2, len(titles) // 6):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ if embedded_numeric_body_count >= max(2, len(titles) // 5):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ if repeated_chapter_no_count > 0 or reversed_chapter_no_count > 0:
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ @staticmethod
|
|
|
|
|
+ def _catalog_structure_score(chapters: List[Dict[str, Any]]) -> int:
|
|
|
|
|
+ score = 0
|
|
|
|
|
+ for chapter in chapters:
|
|
|
|
|
+ score += 1
|
|
|
|
|
+ score += len(chapter.get("subsections", []) or [])
|
|
|
|
|
+ return score
|
|
|
|
|
+
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _coerce_numeric_catalog_section(
|
|
|
|
|
+ cls,
|
|
|
|
|
+ title_text: str,
|
|
|
|
|
+ document_l1_rules: Optional[List[str]],
|
|
|
|
|
+ active_l2_rule: Optional[str],
|
|
|
|
|
+ ) -> Optional[str]:
|
|
|
|
|
+ if active_l2_rule is not None:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ if not document_l1_rules:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ if "Rule_1_纯数字派" in document_l1_rules:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ if re.match(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*", title_text.strip()):
|
|
|
|
|
+ return cls._clean_section_title(title_text)
|
|
|
|
|
+
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def _split_catalog_entry(line: str) -> Tuple[str, Optional[int]]:
|
|
def _split_catalog_entry(line: str) -> Tuple[str, Optional[int]]:
|
|
|
cleaned = line.strip()
|
|
cleaned = line.strip()
|
|
@@ -582,6 +801,220 @@ class PdfStructureExtractor:
|
|
|
"page_end": page_num,
|
|
"page_end": page_num,
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _prepare_page_lines(cls, text: str) -> List[str]:
|
|
|
|
|
+ raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
|
|
|
|
|
+ prepared_lines: List[str] = []
|
|
|
|
|
+ index = 0
|
|
|
|
|
+
|
|
|
|
|
+ while index < len(raw_lines):
|
|
|
|
|
+ merged_line, consumed = cls._merge_heading_fragment(raw_lines, index)
|
|
|
|
|
+ if merged_line:
|
|
|
|
|
+ prepared_lines.append(merged_line)
|
|
|
|
|
+ index += consumed
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ prepared_lines.append(raw_lines[index])
|
|
|
|
|
+ index += 1
|
|
|
|
|
+
|
|
|
|
|
+ return prepared_lines
|
|
|
|
|
+
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _merge_heading_fragment(
|
|
|
|
|
+ cls,
|
|
|
|
|
+ lines: List[str],
|
|
|
|
|
+ start_index: int,
|
|
|
|
|
+ ) -> Tuple[Optional[str], int]:
|
|
|
|
|
+ first_line = lines[start_index].strip()
|
|
|
|
|
+ if not first_line:
|
|
|
|
|
+ return None, 1
|
|
|
|
|
+
|
|
|
|
|
+ first_is_heading = bool(cls._matching_rule_names(first_line, "l1") or cls._matching_rule_names(first_line, "l2"))
|
|
|
|
|
+ first_is_incomplete = cls._is_incomplete_heading_fragment(first_line)
|
|
|
|
|
+ max_span = min(3, len(lines) - start_index)
|
|
|
|
|
+
|
|
|
|
|
+ for span in range(2, max_span + 1):
|
|
|
|
|
+ candidate_lines = [lines[start_index + offset].strip() for offset in range(span)]
|
|
|
|
|
+ candidate_text = " ".join(candidate_lines).strip()
|
|
|
|
|
+ if not candidate_text or cls.TOC_PATTERN.search(candidate_text):
|
|
|
|
|
+ continue
|
|
|
|
|
+ if not (cls._matching_rule_names(candidate_text, "l1") or cls._matching_rule_names(candidate_text, "l2")):
|
|
|
|
|
+ continue
|
|
|
|
|
+ if first_is_incomplete or not first_is_heading:
|
|
|
|
|
+ return candidate_text, span
|
|
|
|
|
+
|
|
|
|
|
+ return None, 1
|
|
|
|
|
+
|
|
|
|
|
+ @staticmethod
|
|
|
|
|
+ def _is_incomplete_heading_fragment(line: str) -> bool:
|
|
|
|
|
+ clean_line = re.sub(r"\s+", "", line.strip())
|
|
|
|
|
+ if not clean_line:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ fragment_patterns = (
|
|
|
|
|
+ r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部分篇]$",
|
|
|
|
|
+ r"^\d{1,2}(?:[\..。、])$",
|
|
|
|
|
+ r"^[【\[]\d+[\]】]$",
|
|
|
|
|
+ r"^[一二三四五六七八九十百零两]+[、)\)\]]$",
|
|
|
|
|
+ r"^第[一二三四五六七八九十百零两]+节$",
|
|
|
|
|
+ r"^\d+\.\d+(?!\.\d)\.?$",
|
|
|
|
|
+ )
|
|
|
|
|
+ return any(re.match(pattern, clean_line) for pattern in fragment_patterns)
|
|
|
|
|
+
|
|
|
|
|
+ def _rebuild_section_contents_from_catalog(
|
|
|
|
|
+ self,
|
|
|
|
|
+ chapters: Dict[str, Dict[str, Dict[str, Any]]],
|
|
|
|
|
+ catalog: Dict[str, Any],
|
|
|
|
|
+ body_lines: List[Dict[str, Any]],
|
|
|
|
|
+ ) -> Dict[str, Dict[str, Dict[str, Any]]]:
|
|
|
|
|
+ catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
|
|
|
|
|
+ if not catalog_chapters or not body_lines:
|
|
|
|
|
+ return chapters
|
|
|
|
|
+
|
|
|
|
|
+ expected_items: List[Dict[str, Any]] = []
|
|
|
|
|
+ total_sections = 0
|
|
|
|
|
+ for chapter in catalog_chapters:
|
|
|
|
|
+ chapter_title = (chapter.get("title", "") or "").strip()
|
|
|
|
|
+ if not chapter_title:
|
|
|
|
|
+ continue
|
|
|
|
|
+ chapter_page = self._safe_page_number(chapter.get("page"))
|
|
|
|
|
+ expected_items.append({
|
|
|
|
|
+ "kind": "chapter",
|
|
|
|
|
+ "title": chapter_title,
|
|
|
|
|
+ "chapter_title": chapter_title,
|
|
|
|
|
+ "section_title": "章节标题",
|
|
|
|
|
+ "page_hint": chapter_page,
|
|
|
|
|
+ "line_index": None,
|
|
|
|
|
+ "page": chapter_page,
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ for subsection in chapter.get("subsections", []):
|
|
|
|
|
+ section_title = (subsection.get("title", "") or "").strip()
|
|
|
|
|
+ if not section_title:
|
|
|
|
|
+ continue
|
|
|
|
|
+ total_sections += 1
|
|
|
|
|
+ expected_items.append({
|
|
|
|
|
+ "kind": "section",
|
|
|
|
|
+ "title": section_title,
|
|
|
|
|
+ "chapter_title": chapter_title,
|
|
|
|
|
+ "section_title": section_title,
|
|
|
|
|
+ "page_hint": self._safe_page_number(subsection.get("page"), chapter_page),
|
|
|
|
|
+ "line_index": None,
|
|
|
|
|
+ "page": self._safe_page_number(subsection.get("page"), chapter_page),
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ if not expected_items or total_sections == 0:
|
|
|
|
|
+ return chapters
|
|
|
|
|
+
|
|
|
|
|
+ search_start = 0
|
|
|
|
|
+ found_sections = 0
|
|
|
|
|
+ for item in expected_items:
|
|
|
|
|
+ line_index = self._find_heading_line_index(
|
|
|
|
|
+ body_lines,
|
|
|
|
|
+ item["title"],
|
|
|
|
|
+ item["kind"],
|
|
|
|
|
+ search_start,
|
|
|
|
|
+ )
|
|
|
|
|
+ item["line_index"] = line_index
|
|
|
|
|
+ if line_index is not None:
|
|
|
|
|
+ item["page"] = body_lines[line_index]["page"]
|
|
|
|
|
+ search_start = line_index + 1
|
|
|
|
|
+ if item["kind"] == "section":
|
|
|
|
|
+ found_sections += 1
|
|
|
|
|
+
|
|
|
|
|
+ if found_sections == 0:
|
|
|
|
|
+ return chapters
|
|
|
|
|
+
|
|
|
|
|
+ rebuilt: Dict[str, Dict[str, Dict[str, Any]]] = {}
|
|
|
|
|
+ section_title_key = "章节标题"
|
|
|
|
|
+
|
|
|
|
|
+ for chapter in catalog_chapters:
|
|
|
|
|
+ chapter_title = (chapter.get("title", "") or "").strip()
|
|
|
|
|
+ if not chapter_title:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ chapter_page = self._safe_page_number(chapter.get("page"))
|
|
|
|
|
+ existing_sections = chapters.get(chapter_title, {})
|
|
|
|
|
+ rebuilt[chapter_title] = {
|
|
|
|
|
+ section_title_key: existing_sections.get(section_title_key, self._empty_section_payload(chapter_page))
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ for subsection in chapter.get("subsections", []):
|
|
|
|
|
+ section_title = (subsection.get("title", "") or "").strip()
|
|
|
|
|
+ if not section_title:
|
|
|
|
|
+ continue
|
|
|
|
|
+ rebuilt[chapter_title][section_title] = existing_sections.get(
|
|
|
|
|
+ section_title,
|
|
|
|
|
+ self._empty_section_payload(self._safe_page_number(subsection.get("page"), chapter_page)),
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ for idx, item in enumerate(expected_items):
|
|
|
|
|
+ if item["kind"] != "section" or item["line_index"] is None:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ next_heading_index = len(body_lines)
|
|
|
|
|
+ for later in expected_items[idx + 1:]:
|
|
|
|
|
+ if later["line_index"] is not None:
|
|
|
|
|
+ next_heading_index = later["line_index"]
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ content_entries = body_lines[item["line_index"] + 1:next_heading_index]
|
|
|
|
|
+ content_text = "\n".join(entry["text"] for entry in content_entries).strip()
|
|
|
|
|
+ existing_payload = rebuilt[item["chapter_title"]].get(item["section_title"], {})
|
|
|
|
|
+
|
|
|
|
|
+ if not content_text and (existing_payload.get("content") or "").strip():
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ if content_entries:
|
|
|
|
|
+ page_start = content_entries[0]["page"]
|
|
|
|
|
+ page_end = content_entries[-1]["page"]
|
|
|
|
|
+ else:
|
|
|
|
|
+ page_start = item["page"]
|
|
|
|
|
+ page_end = item["page"]
|
|
|
|
|
+
|
|
|
|
|
+ rebuilt[item["chapter_title"]][item["section_title"]] = {
|
|
|
|
|
+ "content": content_text,
|
|
|
|
|
+ "page_start": page_start,
|
|
|
|
|
+ "page_end": page_end,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ return rebuilt or chapters
|
|
|
|
|
+
|
|
|
|
|
+ def _find_heading_line_index(
|
|
|
|
|
+ self,
|
|
|
|
|
+ body_lines: List[Dict[str, Any]],
|
|
|
|
|
+ target_title: str,
|
|
|
|
|
+ heading_kind: str,
|
|
|
|
|
+ start_index: int,
|
|
|
|
|
+ ) -> Optional[int]:
|
|
|
|
|
+ target_key = self._normalize_heading_key(target_title)
|
|
|
|
|
+ if not target_key:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ for index in range(start_index, len(body_lines)):
|
|
|
|
|
+ candidate_text = (body_lines[index].get("text") or "").strip()
|
|
|
|
|
+ if not candidate_text or self.TOC_PATTERN.search(candidate_text):
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ if heading_kind == "chapter":
|
|
|
|
|
+ candidate_key = self._normalize_heading_key(self._clean_chapter_title(candidate_text))
|
|
|
|
|
+ else:
|
|
|
|
|
+ candidate_key = self._normalize_heading_key(self._clean_section_title(candidate_text))
|
|
|
|
|
+
|
|
|
|
|
+ if candidate_key == target_key:
|
|
|
|
|
+ return index
|
|
|
|
|
+
|
|
|
|
|
+ raw_candidate_key = self._normalize_heading_key(candidate_text)
|
|
|
|
|
+ if raw_candidate_key.endswith(target_key):
|
|
|
|
|
+ prefix = raw_candidate_key[:-len(target_key)]
|
|
|
|
|
+ if not prefix or re.fullmatch(
|
|
|
|
|
+ r"[\dA-Za-z\.\-_/|,:;()\[\]\u3001\u3002\uff0c\uff1a\uff1b\uff08\uff09\u3010\u3011]+",
|
|
|
|
|
+ prefix,
|
|
|
|
|
+ ):
|
|
|
|
|
+ return index
|
|
|
|
|
+
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
|
|
def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
|
|
|
"""同步并发处理 OCR(使用 ThreadPoolExecutor)"""
|
|
"""同步并发处理 OCR(使用 ThreadPoolExecutor)"""
|
|
|
results: List[OcrResult] = []
|
|
results: List[OcrResult] = []
|
|
@@ -751,78 +1184,96 @@ class PdfStructureExtractor:
|
|
|
if not ocr_results:
|
|
if not ocr_results:
|
|
|
return original_text
|
|
return original_text
|
|
|
|
|
|
|
|
- # 获取页面上的文本块及其坐标
|
|
|
|
|
text_blocks = []
|
|
text_blocks = []
|
|
|
for block in page.get_text("blocks"):
|
|
for block in page.get_text("blocks"):
|
|
|
x0, y0, x1, y1, text, _, _ = block
|
|
x0, y0, x1, y1, text, _, _ = block
|
|
|
- # 只考虑裁剪区域内的文本
|
|
|
|
|
if y0 >= clip_box.y0 and y1 <= clip_box.y1:
|
|
if y0 >= clip_box.y0 and y1 <= clip_box.y1:
|
|
|
text_blocks.append({
|
|
text_blocks.append({
|
|
|
"bbox": (x0, y0, x1, y1),
|
|
"bbox": (x0, y0, x1, y1),
|
|
|
"text": text.strip(),
|
|
"text": text.strip(),
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
- # 按 Y 坐标排序
|
|
|
|
|
text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
|
|
text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
|
|
|
|
|
|
|
|
- # 找出属于表格区域的文本块
|
|
|
|
|
|
|
+ if not text_blocks:
|
|
|
|
|
+ return original_text
|
|
|
|
|
+
|
|
|
|
|
+ region_entries: List[Dict[str, Any]] = []
|
|
|
replaced_indices: Set[int] = set()
|
|
replaced_indices: Set[int] = set()
|
|
|
- for ocr_result in ocr_results:
|
|
|
|
|
- bbox = ocr_result["bbox"]
|
|
|
|
|
- rx0, ry0, rx1, ry1 = bbox
|
|
|
|
|
|
|
+
|
|
|
|
|
+ for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
|
|
|
|
|
+ rx0, ry0, rx1, ry1 = ocr_result["bbox"]
|
|
|
|
|
+ current_indices: List[int] = []
|
|
|
|
|
|
|
|
for idx, block in enumerate(text_blocks):
|
|
for idx, block in enumerate(text_blocks):
|
|
|
if idx in replaced_indices:
|
|
if idx in replaced_indices:
|
|
|
continue
|
|
continue
|
|
|
- bx0, by0, bx1, by1 = block["bbox"]
|
|
|
|
|
|
|
+ if self._block_contains_heading(block["text"]):
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
- # 检查重叠
|
|
|
|
|
|
|
+ bx0, by0, bx1, by1 = block["bbox"]
|
|
|
overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
|
|
overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
|
|
|
overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
|
|
overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
|
|
|
overlap_area = overlap_x * overlap_y
|
|
overlap_area = overlap_x * overlap_y
|
|
|
- block_area = (bx1 - bx0) * (by1 - by0)
|
|
|
|
|
|
|
+ block_area = max((bx1 - bx0) * (by1 - by0), 1)
|
|
|
|
|
|
|
|
- if block_area > 0 and overlap_area / block_area > 0.5:
|
|
|
|
|
- replaced_indices.add(idx)
|
|
|
|
|
|
|
+ if overlap_area / block_area > 0.5:
|
|
|
|
|
+ current_indices.append(idx)
|
|
|
|
|
|
|
|
- # 构建新文本
|
|
|
|
|
- result_parts: List[str] = []
|
|
|
|
|
- last_idx = 0
|
|
|
|
|
|
|
+ if not current_indices:
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
- for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
|
|
|
|
|
- bbox = ocr_result["bbox"]
|
|
|
|
|
- rx0, ry0, rx1, ry1 = bbox
|
|
|
|
|
|
|
+ replaced_indices.update(current_indices)
|
|
|
|
|
+ region_entries.append({
|
|
|
|
|
+ "start": min(current_indices),
|
|
|
|
|
+ "end": max(current_indices),
|
|
|
|
|
+ "ocr_text": (ocr_result.get("ocr_text") or "").strip(),
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ if not region_entries:
|
|
|
|
|
+ return original_text
|
|
|
|
|
+
|
|
|
|
|
+ region_by_start = {entry["start"]: entry for entry in region_entries}
|
|
|
|
|
+ result_parts: List[str] = []
|
|
|
|
|
+ idx = 0
|
|
|
|
|
+
|
|
|
|
|
+ while idx < len(text_blocks):
|
|
|
|
|
+ region = region_by_start.get(idx)
|
|
|
|
|
+ if region is not None:
|
|
|
|
|
+ if region["ocr_text"]:
|
|
|
|
|
+ result_parts.append(region["ocr_text"])
|
|
|
|
|
+ result_parts.append("\n")
|
|
|
|
|
+ else:
|
|
|
|
|
+ for block_idx in range(region["start"], region["end"] + 1):
|
|
|
|
|
+ block_text = text_blocks[block_idx]["text"]
|
|
|
|
|
+ if block_text:
|
|
|
|
|
+ result_parts.append(block_text)
|
|
|
|
|
+ result_parts.append("\n")
|
|
|
|
|
+ idx = region["end"] + 1
|
|
|
|
|
+ continue
|
|
|
|
|
|
|
|
- # 找到该表格区域之前的文本
|
|
|
|
|
- region_start_idx = None
|
|
|
|
|
- for idx, block in enumerate(text_blocks):
|
|
|
|
|
- if idx in replaced_indices:
|
|
|
|
|
- bx0, by0, bx1, by1 = block["bbox"]
|
|
|
|
|
- if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
|
|
|
|
|
- by0 >= ry0 - 5 and by1 <= ry1 + 5):
|
|
|
|
|
- if region_start_idx is None:
|
|
|
|
|
- region_start_idx = idx
|
|
|
|
|
- last_idx = idx + 1
|
|
|
|
|
-
|
|
|
|
|
- if region_start_idx is not None:
|
|
|
|
|
- # 添加表格前的非表格文本
|
|
|
|
|
- for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
|
|
|
|
|
- if idx not in replaced_indices and idx < len(text_blocks):
|
|
|
|
|
- result_parts.append(text_blocks[idx]["text"])
|
|
|
|
|
- result_parts.append("\n")
|
|
|
|
|
-
|
|
|
|
|
- # 添加 OCR 结果
|
|
|
|
|
- result_parts.append(ocr_result["ocr_text"])
|
|
|
|
|
- result_parts.append("\n")
|
|
|
|
|
-
|
|
|
|
|
- # 添加剩余文本
|
|
|
|
|
- for idx in range(last_idx, len(text_blocks)):
|
|
|
|
|
if idx not in replaced_indices:
|
|
if idx not in replaced_indices:
|
|
|
- result_parts.append(text_blocks[idx]["text"])
|
|
|
|
|
- result_parts.append("\n")
|
|
|
|
|
|
|
+ block_text = text_blocks[idx]["text"]
|
|
|
|
|
+ if block_text:
|
|
|
|
|
+ result_parts.append(block_text)
|
|
|
|
|
+ result_parts.append("\n")
|
|
|
|
|
+ idx += 1
|
|
|
|
|
|
|
|
return "".join(result_parts).strip() or original_text
|
|
return "".join(result_parts).strip() or original_text
|
|
|
|
|
|
|
|
|
|
+ @classmethod
|
|
|
|
|
+ def _block_contains_heading(cls, text: str) -> bool:
|
|
|
|
|
+ if not text or not text.strip():
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ for line in cls._prepare_page_lines(text):
|
|
|
|
|
+ stripped = line.strip()
|
|
|
|
|
+ if not stripped:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if cls._matching_rule_names(stripped, "l1") or cls._matching_rule_names(stripped, "l2"):
|
|
|
|
|
+ return True
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
def _compress_image(self, img_bytes: bytes) -> bytes:
|
|
def _compress_image(self, img_bytes: bytes) -> bytes:
|
|
|
"""压缩图片"""
|
|
"""压缩图片"""
|
|
|
try:
|
|
try:
|
|
@@ -873,10 +1324,36 @@ class PdfStructureExtractor:
|
|
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
|
def _is_header_footer(line: str) -> bool:
|
|
def _is_header_footer(line: str) -> bool:
|
|
|
|
|
+ compact_line = re.sub(r"\s+", "", line.strip())
|
|
|
|
|
+ if not compact_line:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ heading_prefix = re.match(
|
|
|
|
|
+ r"^(第[\d一二三四五六七八九十百零两]+[章节部分篇]|[\d]+\.\d+|[\d]+[\..。、]?|[一二三四五六七八九十百零两]+[、)\)\]]|第[一二三四五六七八九十百零两]+节|【\d+】)",
|
|
|
|
|
+ compact_line,
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if compact_line.isdigit():
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ if (
|
|
|
|
|
+ compact_line.endswith("有限责任公司")
|
|
|
|
|
+ or compact_line.endswith("有限公司")
|
|
|
|
|
+ or compact_line.endswith("股份有限公司")
|
|
|
|
|
+ ) and not heading_prefix:
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ if compact_line.endswith("专项施工方案") and not heading_prefix:
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
return (
|
|
return (
|
|
|
"四川路桥建设集团股份有限公司" in line
|
|
"四川路桥建设集团股份有限公司" in line
|
|
|
or "T梁运输及安装专项施工方案" in line
|
|
or "T梁运输及安装专项施工方案" in line
|
|
|
- or line.isdigit()
|
|
|
|
|
|
|
+ or (
|
|
|
|
|
+ compact_line.endswith("工程项目")
|
|
|
|
|
+ and len(compact_line) >= 8
|
|
|
|
|
+ and not compact_line.startswith("第")
|
|
|
|
|
+ )
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|