Quellcode durchsuchen

fix(增加页眉页脚判断)

tangle vor 4 Wochen
Ursprung
Commit
a31705dedd
1 geänderte Dateien mit 144 neuen und 11 gelöschten Zeilen
  1. 144 11
      core/construction_review/component/minimal_pipeline/pdf_extractor1.py

+ 144 - 11
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -357,23 +357,15 @@ class PdfStructureExtractor:
         return fallback
         return fallback
 
 
     def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
     def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
-        """读取裁剪后的页面文本,规范化正文行,并移除重复的非标题噪声。"""
+        """读取页面正文文本,规范化正文行,并移除重复的非标题噪声。"""
 
 
         page_lines_by_page: List[Tuple[int, List[str]]] = []
         page_lines_by_page: List[Tuple[int, List[str]]] = []
         total_pages = len(doc)
         total_pages = len(doc)
+        repeated_margin_keys = self._find_repeated_margin_block_lines(doc)
 
 
         for page_index in range(total_pages):
         for page_index in range(total_pages):
             page = doc.load_page(page_index)
             page = doc.load_page(page_index)
-            rect = page.rect
-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-            text = page.get_text("text", clip=clip_box)
-
-            page_lines: List[str] = []
-            for line in self._prepare_page_lines(text):
-                stripped = line.strip()
-                if not stripped or self._is_header_footer(stripped):
-                    continue
-                page_lines.append(stripped)
+            page_lines = self._extract_page_lines_with_margin_filter(page, repeated_margin_keys)
 
 
             recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
             recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
             if clipped_fragment_keys:
             if clipped_fragment_keys:
@@ -407,6 +399,147 @@ class PdfStructureExtractor:
                 body_lines.append(BodyLine(page=page, text=line))
                 body_lines.append(BodyLine(page=page, text=line))
         return body_lines
         return body_lines
 
 
+    def _extract_page_lines_with_margin_filter(
+        self,
+        page: fitz.Page,
+        repeated_margin_keys: set[str],
+    ) -> List[str]:
+        """Return the page's stripped lines read text block by text block, dropping header/footer lines and margin lines whose key is in ``repeated_margin_keys``."""
+
+        rect = page.rect
+        body_top = self.clip_top  # upper edge of the region kept as body
+        body_bottom = rect.height - self.clip_bottom  # lower edge of the region kept as body
+
+        try:
+            page_dict = page.get_text("dict")
+        except Exception:  # dict extraction failed: fall back to plain text clipped to the body region
+            clip_box = fitz.Rect(0, body_top, rect.width, body_bottom)
+            text = page.get_text("text", clip=clip_box)
+            return [
+                stripped
+                for stripped in (line.strip() for line in self._prepare_page_lines(text))
+                if stripped and not self._is_header_footer(stripped)
+            ]
+
+        page_lines: List[str] = []
+        blocks = sorted(
+            (block for block in page_dict.get("blocks", []) if block.get("type") == 0),  # type 0 = text block in PyMuPDF dict output
+            key=lambda item: (  # order by bbox y0, then x0
+                item.get("bbox", [0, 0, 0, 0])[1],  # NOTE(review): default only covers a missing key; a None/short bbox raises here before the len check below
+                item.get("bbox", [0, 0, 0, 0])[0],
+            ),
+        )
+        for block in blocks:
+            bbox = block.get("bbox") or ()
+            if len(bbox) != 4:  # ignore blocks without a 4-value bounding box
+                continue
+
+            _, y0, _, y1 = bbox
+            if y1 <= body_top or y0 >= body_bottom:  # block lies entirely outside the body region
+                continue
+
+            block_text = self._extract_text_block_text(block)
+            if not block_text:
+                continue
+
+            in_margin = self._is_margin_band(y0, y1, rect.height)  # repeated-noise filtering applies only to margin-band blocks
+            for line in self._prepare_page_lines(block_text):
+                stripped = line.strip()
+                if not stripped or self._is_header_footer(stripped):
+                    continue
+                if in_margin and self._is_repeated_margin_noise(stripped, repeated_margin_keys):
+                    continue
+                page_lines.append(stripped)
+        return page_lines
+
+    def _find_repeated_margin_block_lines(self, doc: fitz.Document) -> set[str]:
+        """Collect normalized keys of non-heading lines in the top/bottom margin bands that recur on enough pages; empty set for documents under 3 pages."""
+
+        total_pages = len(doc)
+        if total_pages < 3:  # the threshold below is at least 3 pages, so nothing could qualify
+            return set()
+
+        pages_by_key: Dict[str, set[int]] = {}  # key -> 1-based page numbers on which it appears
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            try:
+                page_dict = page.get_text("dict")
+            except Exception:  # skip pages whose dict extraction fails
+                continue
+
+            page_height = page.rect.height
+            seen_on_page: set[str] = set()  # dedupe so a key counts once per page
+            for block in page_dict.get("blocks", []) or []:
+                if block.get("type") != 0:  # keep text blocks only (type 0 in PyMuPDF dict output)
+                    continue
+                bbox = block.get("bbox") or ()
+                if len(bbox) != 4:
+                    continue
+
+                _, y0, _, y1 = bbox
+                if not self._is_margin_band(y0, y1, page_height):  # only margin-band blocks are candidates
+                    continue
+
+                block_text = self._extract_text_block_text(block)
+                if not block_text:
+                    continue
+
+                for line in self._prepare_page_lines(block_text):
+                    stripped = line.strip()
+                    if not stripped or self._is_header_footer(stripped):
+                        continue
+                    if self._is_protected_margin_line(stripped):  # heading/TOC-like lines are never counted as noise
+                        continue
+
+                    key = self._normalize_margin_noise_key(stripped)
+                    if not key or not (2 <= len(key) <= 120):  # ignore empty keys and keys outside 2..120 characters
+                        continue
+                    seen_on_page.add(key)
+
+            for key in seen_on_page:
+                pages_by_key.setdefault(key, set()).add(page_index + 1)
+
+        threshold = max(3, (total_pages + 11) // 12)  # at least 3 pages, or ceil(total_pages / 12) if larger
+        return {key for key, pages in pages_by_key.items() if len(pages) >= threshold}
+
+    def _is_margin_band(self, y0: float, y1: float, page_height: float) -> bool:
+        """Return True if a block spanning ``y0``..``y1`` reaches into the top or bottom header/footer candidate band."""
+
+        band_height = max(self.clip_top + 20, page_height * 0.08)  # top band: larger of clip_top + 20 and 8% of page height
+        bottom_start = page_height - max(self.clip_bottom + 20, page_height * 0.08)  # bottom band sized the same way from clip_bottom
+        return y0 < band_height or y1 > bottom_start
+
+    @classmethod
+    def _is_repeated_margin_noise(cls, line: str, repeated_margin_keys: set[str]) -> bool:
+        """Return True if ``line`` is unprotected and its normalized key is among the precomputed cross-page repeated margin keys."""
+
+        if not repeated_margin_keys:  # nothing was identified as repeated
+            return False
+        if cls._is_protected_margin_line(line):  # heading/TOC-like lines are never treated as noise
+            return False
+        key = cls._normalize_margin_noise_key(line)
+        return bool(key and key in repeated_margin_keys)
+
+    @classmethod
+    def _is_protected_margin_line(cls, line: str) -> bool:
+        """Protect real chapter headings and table-of-contents lines from being removed by the repeated-margin filter."""
+
+        normalized = cls._strip_leading_page_number_from_heading(line)  # helper not visible here; by its name it drops a leading page number - confirm
+        return cls._matches_any_heading(normalized) or cls._is_toc_line(normalized)
+
+    @staticmethod
+    def _normalize_margin_noise_key(line: str) -> str:
+        """Build the key used for repeated-margin detection, collapsing page numbers and other digit differences; returns "" for blank input."""
+
+        compact = re.sub(r"\s+", "", str(line or "").strip())  # remove all whitespace
+        if not compact:
+            return ""
+        compact = re.sub(r"第\d+页/共\d+页", "第#页共#页", compact)  # "page N / of M" counter -> fixed placeholder
+        compact = re.sub(r"第\d+页(?:共\d+页)?", "第#页", compact)  # "page N", optionally followed by "of M" -> placeholder
+        compact = re.sub(r"\d+", "#", compact)  # any remaining digit run -> "#"
+        compact = re.sub(r"[·•\-—_~~\..。,::;;/\\|]", "", compact)  # strip separator and punctuation characters
+        return compact
+
     def _recover_top_clipped_l1_headings(
     def _recover_top_clipped_l1_headings(
         self,
         self,
         page: fitz.Page,
         page: fitz.Page,