Przeglądaj źródła

Merge branch 'dev-planWrite' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev

tangle 4 tygodni temu
rodzic
commit
791a08c1a8

+ 144 - 11
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -357,23 +357,15 @@ class PdfStructureExtractor:
         return fallback
 
     def _extract_body_lines(self, doc: fitz.Document, progress_callback=None) -> List[BodyLine]:
-        """读取裁剪后的页面文本,规范化正文行,并移除重复的非标题噪声。"""
+        """读取页面正文文本,规范化正文行,并移除重复的非标题噪声。"""
 
         page_lines_by_page: List[Tuple[int, List[str]]] = []
         total_pages = len(doc)
+        repeated_margin_keys = self._find_repeated_margin_block_lines(doc)
 
         for page_index in range(total_pages):
             page = doc.load_page(page_index)
-            rect = page.rect
-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-            text = page.get_text("text", clip=clip_box)
-
-            page_lines: List[str] = []
-            for line in self._prepare_page_lines(text):
-                stripped = line.strip()
-                if not stripped or self._is_header_footer(stripped):
-                    continue
-                page_lines.append(stripped)
+            page_lines = self._extract_page_lines_with_margin_filter(page, repeated_margin_keys)
 
             recovered_headings, clipped_fragment_keys = self._recover_top_clipped_l1_headings(page, page_lines)
             if clipped_fragment_keys:
@@ -407,6 +399,147 @@ class PdfStructureExtractor:
                 body_lines.append(BodyLine(page=page, text=line))
         return body_lines
 
+    def _extract_page_lines_with_margin_filter(
+        self,
+        page: fitz.Page,
+        repeated_margin_keys: set[str],
+    ) -> List[str]:
+        """按文本块读取页面,并过滤跨页重复的页边页眉/页脚行。"""
+
+        rect = page.rect
+        body_top = self.clip_top
+        body_bottom = rect.height - self.clip_bottom
+
+        try:
+            page_dict = page.get_text("dict")
+        except Exception:
+            clip_box = fitz.Rect(0, body_top, rect.width, body_bottom)
+            text = page.get_text("text", clip=clip_box)
+            return [
+                stripped
+                for stripped in (line.strip() for line in self._prepare_page_lines(text))
+                if stripped and not self._is_header_footer(stripped)
+            ]
+
+        page_lines: List[str] = []
+        blocks = sorted(
+            (block for block in page_dict.get("blocks", []) if block.get("type") == 0),
+            key=lambda item: (
+                item.get("bbox", [0, 0, 0, 0])[1],
+                item.get("bbox", [0, 0, 0, 0])[0],
+            ),
+        )
+        for block in blocks:
+            bbox = block.get("bbox") or ()
+            if len(bbox) != 4:
+                continue
+
+            _, y0, _, y1 = bbox
+            if y1 <= body_top or y0 >= body_bottom:
+                continue
+
+            block_text = self._extract_text_block_text(block)
+            if not block_text:
+                continue
+
+            in_margin = self._is_margin_band(y0, y1, rect.height)
+            for line in self._prepare_page_lines(block_text):
+                stripped = line.strip()
+                if not stripped or self._is_header_footer(stripped):
+                    continue
+                if in_margin and self._is_repeated_margin_noise(stripped, repeated_margin_keys):
+                    continue
+                page_lines.append(stripped)
+        return page_lines
+
+    def _find_repeated_margin_block_lines(self, doc: fitz.Document) -> set[str]:
+        """统计顶部/底部页边区域中跨页重复出现、且不像标题的文本行。"""
+
+        total_pages = len(doc)
+        if total_pages < 3:
+            return set()
+
+        pages_by_key: Dict[str, set[int]] = {}
+        for page_index in range(total_pages):
+            page = doc.load_page(page_index)
+            try:
+                page_dict = page.get_text("dict")
+            except Exception:
+                continue
+
+            page_height = page.rect.height
+            seen_on_page: set[str] = set()
+            for block in page_dict.get("blocks", []) or []:
+                if block.get("type") != 0:
+                    continue
+                bbox = block.get("bbox") or ()
+                if len(bbox) != 4:
+                    continue
+
+                _, y0, _, y1 = bbox
+                if not self._is_margin_band(y0, y1, page_height):
+                    continue
+
+                block_text = self._extract_text_block_text(block)
+                if not block_text:
+                    continue
+
+                for line in self._prepare_page_lines(block_text):
+                    stripped = line.strip()
+                    if not stripped or self._is_header_footer(stripped):
+                        continue
+                    if self._is_protected_margin_line(stripped):
+                        continue
+
+                    key = self._normalize_margin_noise_key(stripped)
+                    if not key or not (2 <= len(key) <= 120):
+                        continue
+                    seen_on_page.add(key)
+
+            for key in seen_on_page:
+                pages_by_key.setdefault(key, set()).add(page_index + 1)
+
+        threshold = max(3, (total_pages + 11) // 12)
+        return {key for key, pages in pages_by_key.items() if len(pages) >= threshold}
+
+    def _is_margin_band(self, y0: float, y1: float, page_height: float) -> bool:
+        """判断文本块是否落在页眉/页脚候选区域。"""
+
+        band_height = max(self.clip_top + 20, page_height * 0.08)
+        bottom_start = page_height - max(self.clip_bottom + 20, page_height * 0.08)
+        return y0 < band_height or y1 > bottom_start
+
+    @classmethod
+    def _is_repeated_margin_noise(cls, line: str, repeated_margin_keys: set[str]) -> bool:
+        """判断当前页边文本是否属于预先识别出的跨页重复噪声。"""
+
+        if not repeated_margin_keys:
+            return False
+        if cls._is_protected_margin_line(line):
+            return False
+        key = cls._normalize_margin_noise_key(line)
+        return bool(key and key in repeated_margin_keys)
+
+    @classmethod
+    def _is_protected_margin_line(cls, line: str) -> bool:
+        """保护真实章节标题和目录行,避免被页边重复过滤误删。"""
+
+        normalized = cls._strip_leading_page_number_from_heading(line)
+        return cls._matches_any_heading(normalized) or cls._is_toc_line(normalized)
+
+    @staticmethod
+    def _normalize_margin_noise_key(line: str) -> str:
+        """生成页边重复检测 key,弱化页码和数字差异。"""
+
+        compact = re.sub(r"\s+", "", str(line or "").strip())
+        if not compact:
+            return ""
+        compact = re.sub(r"第\d+页/共\d+页", "第#页共#页", compact)
+        compact = re.sub(r"第\d+页(?:共\d+页)?", "第#页", compact)
+        compact = re.sub(r"\d+", "#", compact)
+        compact = re.sub(r"[·•\-—_~~\..。,::;;/\\|]", "", compact)
+        return compact
+
     def _recover_top_clipped_l1_headings(
         self,
         page: fitz.Page,

+ 4 - 48
core/construction_write/component/outline_generator.py

@@ -638,32 +638,6 @@ class OutlineGenerator:
             return None
         return self._keyword_rules_cache.get(code)
 
-    def _resolve_llm_model_name(
-        self,
-        function_name: Optional[str] = None,
-        model_name: Optional[str] = None
-    ) -> str:
-        """解析本次 LLM 调用实际会使用的模型名,用于日志排查。"""
-        try:
-            from foundation.ai.models.model_config_loader import get_model_for_function
-
-            # 保持和 generate_model_client.get_model_generate_invoke 的解析顺序一致。
-            if function_name:
-                resolved_model = get_model_for_function(function_name)
-                if resolved_model:
-                    return resolved_model
-
-            if model_name:
-                return model_name
-
-            default_model = get_model_for_function("default")
-            if default_model:
-                return default_model
-        except Exception as e:
-            logger.warning(f"[LLM调用] 解析模型名失败: {str(e)}")
-
-        return model_name or "default"
-
     async def _call_llm(
         self,
         trace_id: str,
@@ -735,14 +709,7 @@ class OutlineGenerator:
                 "task_prompt": chat_template
             }
 
-            resolved_model_name = self._resolve_llm_model_name(
-                function_name=function_name,
-                model_name=model_name
-            )
-            logger.info(
-                f"[LLM调用] trace_id: {trace_id}, model: {resolved_model_name}, "
-                f"function_name: {function_name}, 开始生成内容"
-            )
+            logger.info(f"[LLM调用] trace_id: {trace_id}, 开始生成内容")
 
             # 调用模型生成(非流式)
             generated_content = await generate_model_client.get_model_generate_invoke(
@@ -753,26 +720,15 @@ class OutlineGenerator:
                 function_name=function_name,
             )
 
-            logger.info(
-                f"[LLM调用] trace_id: {trace_id}, model: {resolved_model_name}, "
-                f"生成完成,内容长度: {len(generated_content)}"
-            )
+            logger.info(f"[LLM调用] trace_id: {trace_id}, 生成完成,内容长度: {len(generated_content)}")
 
             return generated_content
 
         except TimeoutError as e:
-            resolved_model_name = self._resolve_llm_model_name(
-                function_name=function_name,
-                model_name=model_name
-            )
-            logger.error(f"[LLM调用] trace_id: {trace_id}, model: {resolved_model_name}, 超时: {str(e)}")
+            logger.error(f"[LLM调用] trace_id: {trace_id}, 超时: {str(e)}")
             raise
         except Exception as e:
-            resolved_model_name = self._resolve_llm_model_name(
-                function_name=function_name,
-                model_name=model_name
-            )
-            logger.error(f"[LLM调用] trace_id: {trace_id}, model: {resolved_model_name}, 异常: {str(e)}")
+            logger.error(f"[LLM调用] trace_id: {trace_id}, 异常: {str(e)}")
             raise
 
     async def _generate_single_chapter(