|
@@ -357,23 +357,15 @@ class PdfStructureExtractor:
|
|
|
return fallback
|
|
return fallback
|
|
|
|
|
|
|
|
def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
|
|
def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
|
|
|
- """读取裁剪后的页面文本,规范化正文行,并移除重复的非标题噪声。"""
|
|
|
|
|
|
|
+ """读取页面正文文本,规范化正文行,并移除重复的非标题噪声。"""
|
|
|
|
|
|
|
|
page_lines_by_page: List[Tuple[int, List[str]]] = []
|
|
page_lines_by_page: List[Tuple[int, List[str]]] = []
|
|
|
total_pages = len(doc)
|
|
total_pages = len(doc)
|
|
|
|
|
+ repeated_margin_keys = self._find_repeated_margin_block_lines(doc)
|
|
|
|
|
|
|
|
for page_index in range(total_pages):
|
|
for page_index in range(total_pages):
|
|
|
page = doc.load_page(page_index)
|
|
page = doc.load_page(page_index)
|
|
|
- rect = page.rect
|
|
|
|
|
- clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
|
|
|
|
|
- text = page.get_text("text", clip=clip_box)
|
|
|
|
|
-
|
|
|
|
|
- page_lines: List[str] = []
|
|
|
|
|
- for line in self._prepare_page_lines(text):
|
|
|
|
|
- stripped = line.strip()
|
|
|
|
|
- if not stripped or self._is_header_footer(stripped):
|
|
|
|
|
- continue
|
|
|
|
|
- page_lines.append(stripped)
|
|
|
|
|
|
|
+ page_lines = self._extract_page_lines_with_margin_filter(page, repeated_margin_keys)
|
|
|
|
|
|
|
|
recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
|
|
recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
|
|
|
if clipped_fragment_keys:
|
|
if clipped_fragment_keys:
|
|
@@ -407,6 +399,147 @@ class PdfStructureExtractor:
|
|
|
body_lines.append(BodyLine(page=page, text=line))
|
|
body_lines.append(BodyLine(page=page, text=line))
|
|
|
return body_lines
|
|
return body_lines
|
|
|
|
|
|
|
|
|
|
def _extract_page_lines_with_margin_filter(
    self,
    page: fitz.Page,
    repeated_margin_keys: set[str],
) -> List[str]:
    """Read a page block by block and drop repeated margin header/footer lines.

    Args:
        page: The page to read.
        repeated_margin_keys: Normalized keys of margin lines already found
            to repeat across pages; passed through to
            ``_is_repeated_margin_noise``.

    Returns:
        The stripped, non-empty lines of the page, taken from text blocks
        ordered by their top edge and then their left edge.

    A block is skipped only when it lies entirely above ``self.clip_top`` or
    entirely below ``page height - self.clip_bottom``; a block that merely
    overlaps that body area is kept in full. Blocks whose ``bbox`` is missing
    or does not have four entries are skipped as well.
    """
    rect = page.rect
    body_top = self.clip_top
    body_bottom = rect.height - self.clip_bottom

    try:
        page_dict = page.get_text("dict")
    except Exception:
        # Best-effort fallback: when the structured extraction fails, read
        # the plain text of the clipped body area instead of failing the page.
        clip_box = fitz.Rect(0, body_top, rect.width, body_bottom)
        text = page.get_text("text", clip=clip_box)
        return [
            stripped
            for stripped in (line.strip() for line in self._prepare_page_lines(text))
            if stripped and not self._is_header_footer(stripped)
        ]

    # Validate every bbox BEFORE sorting: the sort key reads bbox[1] and
    # bbox[0], so a malformed block must be dropped here rather than being
    # allowed to raise inside sorted().
    text_blocks = []
    for block in page_dict.get("blocks") or []:
        if block.get("type") != 0:
            continue
        bbox = block.get("bbox") or ()
        if len(bbox) != 4:
            continue
        x0, y0, _, y1 = bbox
        text_blocks.append((y0, x0, y1, block))
    # list.sort is stable, so blocks sharing the same (y0, x0) keep the order
    # in which the page reported them.
    text_blocks.sort(key=lambda entry: (entry[0], entry[1]))

    page_lines: List[str] = []
    for y0, _, y1, block in text_blocks:
        # Entirely outside the body area: nothing to keep from this block.
        if y1 <= body_top or y0 >= body_bottom:
            continue

        block_text = self._extract_text_block_text(block)
        if not block_text:
            continue

        # The repeated-noise filter is applied only to blocks in a margin
        # band; lines elsewhere on the page are never dropped by it.
        in_margin = self._is_margin_band(y0, y1, rect.height)
        for line in self._prepare_page_lines(block_text):
            stripped = line.strip()
            if not stripped or self._is_header_footer(stripped):
                continue
            if in_margin and self._is_repeated_margin_noise(stripped, repeated_margin_keys):
                continue
            page_lines.append(stripped)
    return page_lines
|
|
|
|
|
+
|
|
|
|
|
def _find_repeated_margin_block_lines(self, doc: fitz.Document) -> set[str]:
    """Collect normalized keys of margin lines that repeat across pages.

    Only text blocks inside a margin band (see ``_is_margin_band``) are
    scanned. Lines rejected by ``_is_header_footer`` or protected by
    ``_is_protected_margin_line`` are ignored, as are keys shorter than 2 or
    longer than 120 characters.

    Returns:
        The keys seen on at least ``max(3, ceil(page_count / 12))`` distinct
        pages. Documents with fewer than three pages yield an empty set.
    """
    page_count = len(doc)
    if page_count < 3:
        return set()

    # A key counts once per page, however often it occurs on that page.
    min_pages = max(3, (page_count + 11) // 12)
    key_to_pages: Dict[str, set[int]] = {}

    for page_number in range(1, page_count + 1):
        page = doc.load_page(page_number - 1)
        try:
            layout = page.get_text("dict")
        except Exception:
            # Pages whose structured text cannot be read are left out of the count.
            continue

        height = page.rect.height
        keys_on_page: set[str] = set()
        for block in layout.get("blocks", []) or []:
            if block.get("type") != 0:
                continue
            bbox = block.get("bbox") or ()
            if len(bbox) != 4:
                continue

            _, top, _, bottom = bbox
            if not self._is_margin_band(top, bottom, height):
                continue

            raw_text = self._extract_text_block_text(block)
            if not raw_text:
                continue

            for raw_line in self._prepare_page_lines(raw_text):
                candidate = raw_line.strip()
                if (
                    not candidate
                    or self._is_header_footer(candidate)
                    or self._is_protected_margin_line(candidate)
                ):
                    continue

                noise_key = self._normalize_margin_noise_key(candidate)
                if noise_key and 2 <= len(noise_key) <= 120:
                    keys_on_page.add(noise_key)

        for noise_key in keys_on_page:
            key_to_pages.setdefault(noise_key, set()).add(page_number)

    return {
        noise_key
        for noise_key, pages in key_to_pages.items()
        if len(pages) >= min_pages
    }
|
|
|
|
|
+
|
|
|
|
|
+ def _is_margin_band(self, y0: float, y1: float, page_height: float) -> bool:
|
|
|
|
|
+ """判断文本块是否落在页眉/页脚候选区域。"""
|
|
|
|
|
+
|
|
|
|
|
+ band_height = max(self.clip_top + 20, page_height * 0.08)
|
|
|
|
|
+ bottom_start = page_height - max(self.clip_bottom + 20, page_height * 0.08)
|
|
|
|
|
+ return y0 < band_height or y1 > bottom_start
|
|
|
|
|
+
|
|
|
|
|
@classmethod
def _is_repeated_margin_noise(cls, line: str, repeated_margin_keys: set[str]) -> bool:
    """Tell whether a margin line matches one of the precomputed repeated-noise keys.

    Returns False when no keys were collected, when the line is protected by
    ``_is_protected_margin_line``, or when its normalized key is empty or not
    in ``repeated_margin_keys``.
    """
    if not repeated_margin_keys or cls._is_protected_margin_line(line):
        return False
    noise_key = cls._normalize_margin_noise_key(line)
    return bool(noise_key) and noise_key in repeated_margin_keys
|
|
|
|
|
+
|
|
|
|
|
@classmethod
def _is_protected_margin_line(cls, line: str) -> bool:
    """Shield heading and table-of-contents lines from the repeated-margin filter.

    The line is first passed through
    ``_strip_leading_page_number_from_heading``; the result is protected when
    it matches ``_matches_any_heading`` or ``_is_toc_line``.
    """
    heading_text = cls._strip_leading_page_number_from_heading(line)
    heading_match = cls._matches_any_heading(heading_text)
    # The table-of-contents check only runs when the heading check is falsy.
    return heading_match if heading_match else cls._is_toc_line(heading_text)
|
|
|
|
|
+
|
|
|
|
|
+ @staticmethod
|
|
|
|
|
+ def _normalize_margin_noise_key(line: str) -> str:
|
|
|
|
|
+ """生成页边重复检测 key,弱化页码和数字差异。"""
|
|
|
|
|
+
|
|
|
|
|
+ compact = re.sub(r"\s+", "", str(line or "").strip())
|
|
|
|
|
+ if not compact:
|
|
|
|
|
+ return ""
|
|
|
|
|
+ compact = re.sub(r"第\d+页/共\d+页", "第#页共#页", compact)
|
|
|
|
|
+ compact = re.sub(r"第\d+页(?:共\d+页)?", "第#页", compact)
|
|
|
|
|
+ compact = re.sub(r"\d+", "#", compact)
|
|
|
|
|
+ compact = re.sub(r"[·•\-—_~~\..。,::;;/\\|]", "", compact)
|
|
|
|
|
+ return compact
|
|
|
|
|
+
|
|
|
def _recover_top_clipped_l1_headings(
|
|
def _recover_top_clipped_l1_headings(
|
|
|
self,
|
|
self,
|
|
|
page: fitz.Page,
|
|
page: fitz.Page,
|