hai 1 mes · ed563f2d81
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor.py
@@ -50,8 +50,8 @@ class OcrResult:
 
															 class PdfStructureExtractor:
														
 
															     """PDF 章节结构提取器（支持 OCR 异步并发）"""
														
 
															-    CHAPTER_PATTERN = re.compile(r"^第[一二三四五六七八九十百]+章\s*.*")
														
 
															-    SECTION_PATTERN = re.compile(r"^[一二三四五六七八九十百]+、\s*.*")
														
 
															+    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
														
 
															+    SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
														
 
															     TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
														
 
															     # OCR 配置
														
@@ -162,7 +162,16 @@ class PdfStructureExtractor:
 
															         return self._toc_extractor.detect_and_extract(file_content, progress_callback)
														
 
															     def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
														
 
															-        """提取文档结构（支持 OCR 异步并发）"""
														
 
															+        """
														
 
															+        提取文档结构（支持 OCR 异步并发）- 带坐标的精准回填方案。
														
 
															+
														
 
															+        流程：
														
 
															+        1. 提取带坐标的文本块
														
 
															+        2. 章节标题匹配 + 块归属划分
														
 
															+        3. 扫描表格区域并 OCR
														
 
															+        4. 根据表格坐标，将其作为新的块插入到对应小节
														
 
															+        5. 将每个小节的块列表按顺序拼接成纯文本输出
														
 
															+        """
														
 
															         def _emit_progress(stage: str, current: int, message: str):
														
 
															             """发送进度回调"""
														
@@ -172,83 +181,30 @@ class PdfStructureExtractor:
 
															                 except Exception:
														
 
															                     pass
														
 
															-        # === 阶段1: 收集所有需要 OCR 的表格区域 ===
														
 
															-        table_regions: List[TableRegion] = []
														
 
															-
														
 
															-        if self.use_ocr:
														
 
															-            logger.info("[OCR预处理] 扫描所有页面的表格区域...")
														
 
															-            total_pages = len(doc)
														
 
															-            for page_num in range(total_pages):
														
 
															-                page = doc.load_page(page_num)
														
 
															-                rect = page.rect
														
 
															-                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
														
 
															-                regions = self._detect_table_regions(page, page_num + 1, clip_box)
														
 
															-                for bbox, score in regions:
														
 
															-                    table_regions.append(TableRegion(
														
 
															-                        page_num=page_num + 1,
														
 
															-                        page=page,
														
 
															-                        bbox=bbox,
														
 
															-                        score=score
														
 
															-                    ))
														
 
															-                # 每5页或最后一页推送一次进度
														
 
															-                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
														
 
															-                    progress = int((page_num + 1) / total_pages * 30)  # OCR预处理占30%进度
														
 
															-                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
														
 
															-            logger.info(f"[OCR预处理] 共发现 {len(table_regions)} 个表格区域需要 OCR")
														
 
															-
														
 
															-        # === 阶段2: 异步并发执行 OCR (5并发) ===
														
 
															-        ocr_results: List[OcrResult] = []
														
 
															-
														
 
															-        if table_regions:
														
 
															-            logger.info(f"[OCR执行] 使用 {self.OCR_CONCURRENT_WORKERS} 并发执行 OCR...")
														
 
															-            _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格，开始OCR识别...")
														
 
															-            ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
														
 
															-            success_count = sum(1 for r in ocr_results if r.success)
														
 
															-            logger.info(f"[OCR执行] 完成 {success_count}/{len(table_regions)} 个表格 OCR")
														
 
															-            _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
														
 
															+        total_pages = len(doc)
														
 
															-        # 按页码分组 OCR 结果
														
 
															-        ocr_by_page: Dict[int, List[OcrResult]] = {}
														
 
															-        for result in ocr_results:
														
 
															-            if result.success:
														
 
															-                if result.page_num not in ocr_by_page:
														
 
															-                    ocr_by_page[result.page_num] = []
														
 
															-                ocr_by_page[result.page_num].append(result)
														
 
															+        # ==================== 阶段1: 提取带坐标的文本块并归属到章节/小节====================
														
 
															+        logger.info("[阶段1] 提取带坐标的文本块并归属章节...")
														
 
															-        # === 阶段3: 提取页面文本（应用 OCR 结果）并切分章节 ===
														
 
															-        structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
														
 
															+        # 数据结构: {(chapter_name, section_name): [blocks_with_position]}
														
 
															+        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
														
 
															         current_chapter = "未分类前言"
														
 
															         current_section = "默认部分"
														
 
															         in_body = False
														
 
															-        logger.info("[文本提取] 提取页面内容并切分章节...")
														
 
															-
														
 
															-        for page_num in range(len(doc)):
														
 
															+        for page_num in range(total_pages):
														
 
															             page = doc.load_page(page_num)
														
 
															             rect = page.rect
														
 
															             clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
														
 
															-            # 获取页面文本（应用 OCR 结果）
														
 
															-            if page_num + 1 in ocr_by_page:
														
 
															-                original_text = page.get_text("text", clip=clip_box)
														
 
															-                ocr_results_list = [
														
 
															-                    {
														
 
															-                        "region_index": i,
														
 
															-                        "bbox": r.bbox,
														
 
															-                        "score": r.score,
														
 
															-                        "ocr_text": r.text,
														
 
															-                    }
														
 
															-                    for i, r in enumerate(ocr_by_page[page_num + 1])
														
 
															-                ]
														
 
															-                text = self._replace_table_regions(page, original_text, ocr_results_list, clip_box)
														
 
															-            else:
														
 
															-                text = page.get_text("text", clip=clip_box)
														
 
															+            # 获取带坐标的文本块
														
 
															+            blocks = self._extract_text_blocks_with_position(page, clip_box)
														
 
															-            lines = text.split("\n")
														
 
															+            for block in blocks:
														
 
															+                line = block["text"]
														
 
															-            for line in lines:
														
 
															-                line = line.strip()
														
 
															-                if not line:
														
 
															+                # 跳过空行和页眉页脚
														
 
															+                if not line.strip():
														
 
															                     continue
														
 
															                 if self._is_header_footer(line):
														
 
															                     continue
														
@@ -268,53 +224,94 @@ class PdfStructureExtractor:
 
															                 if self.CHAPTER_PATTERN.match(line):
														
 
															                     current_chapter = self._clean_chapter_title(line)
														
 
															                     current_section = "章节标题"
														
 
															-                    if current_chapter not in structured_data:
														
 
															-                        structured_data[current_chapter] = {}
														
 
															-                    if current_section not in structured_data[current_chapter]:
														
 
															-                        structured_data[current_chapter][current_section] = {
														
 
															-                            "lines": [],
														
 
															-                            "page_start": page_num + 1,
														
 
															-                            "page_end": page_num + 1,
														
 
															-                        }
														
 
															+                    key = (current_chapter, current_section)
														
 
															+                    if key not in chapter_blocks:
														
 
															+                        chapter_blocks[key] = []
														
 
															+                    chapter_blocks[key].append(block)
														
 
															                     continue
														
 
															                 # 匹配节标题
														
 
															                 if self.SECTION_PATTERN.match(line):
														
 
															                     current_section = line
														
 
															-                    if current_chapter not in structured_data:
														
 
															-                        structured_data[current_chapter] = {}
														
 
															-                    if current_section not in structured_data[current_chapter]:
														
 
															-                        structured_data[current_chapter][current_section] = {
														
 
															-                            "lines": [],
														
 
															-                            "page_start": page_num + 1,
														
 
															-                            "page_end": page_num + 1,
														
 
															-                        }
														
 
															+                    key = (current_chapter, current_section)
														
 
															+                    if key not in chapter_blocks:
														
 
															+                        chapter_blocks[key] = []
														
 
															+                    chapter_blocks[key].append(block)
														
 
															                     continue
														
 
															-                # 确保结构存在
														
 
															-                if current_chapter not in structured_data:
														
 
															-                    structured_data[current_chapter] = {}
														
 
															-                if current_section not in structured_data[current_chapter]:
														
 
															-                    structured_data[current_chapter][current_section] = {
														
 
															-                        "lines": [],
														
 
															-                        "page_start": page_num + 1,
														
 
															-                        "page_end": page_num + 1,
														
 
															-                    }
														
 
															+                # 普通内容块
														
 
															+                key = (current_chapter, current_section)
														
 
															+                if key not in chapter_blocks:
														
 
															+                    chapter_blocks[key] = []
														
 
															+                chapter_blocks[key].append(block)
														
 
															+
														
 
															+        logger.info(f"[阶段1] 章节结构提取完成，共 {len({k[0] for k in chapter_blocks})} 个章节")
														
 
															+
														
 
															+        # ==================== 阶段2: 收集表格区域并OCR（如果启用OCR）====================
														
 
															+        table_regions: List[TableRegion] = []
														
 
															+        ocr_results: List[OcrResult] = []
														
 
															+
														
 
															+        if self.use_ocr:
														
 
															+            logger.info("[阶段2] 扫描表格区域...")
														
 
															+            for page_num in range(total_pages):
														
 
															+                page = doc.load_page(page_num)
														
 
															+                rect = page.rect
														
 
															+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
														
 
															+                regions = self._detect_table_regions(page, page_num + 1, clip_box)
														
 
															+                for bbox, score in regions:
														
 
															+                    table_regions.append(TableRegion(
														
 
															+                        page_num=page_num + 1,
														
 
															+                        page=page,
														
 
															+                        bbox=bbox,
														
 
															+                        score=score
														
 
															+                    ))
														
 
															+                # 每5页推送进度
														
 
															+                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
														
 
															+                    progress = int((page_num + 1) / total_pages * 30)
														
 
															+                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
														
 
															+
														
 
															+            logger.info(f"[阶段2] 发现 {len(table_regions)} 个表格区域")
														
 
															+
														
 
															+            # 执行OCR
														
 
															+            if table_regions:
														
 
															+                _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格，开始OCR识别...")
														
 
															+                ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
														
 
															+                success_count = sum(1 for r in ocr_results if r.success)
														
 
															+                logger.info(f"[阶段2] OCR完成 {success_count}/{len(table_regions)}")
														
 
															+                _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
														
 
															-                # 添加内容
														
 
															-                structured_data[current_chapter][current_section]["lines"].append(line)
														
 
															-                structured_data[current_chapter][current_section]["page_end"] = page_num + 1
														
 
															+        # ==================== 阶段3: 将OCR结果作为新块插入到对应章节====================
														
 
															+        if ocr_results:
														
 
															+            logger.info("[阶段3] 将OCR结果回填到对应章节...")
														
 
															+            self._insert_ocr_blocks_into_chapters(chapter_blocks, ocr_results)
														
 
															-        # 将行列表拼接为文本
														
 
															+        # ==================== 阶段4: 生成最终输出（块列表转纯文本）====================
														
 
															+        logger.info("[阶段4] 生成最终文本输出...")
														
 
															         result: Dict[str, Any] = {"chapters": {}}
														
 
															-        for chap, sections in structured_data.items():
														
 
															-            result["chapters"][chap] = {}
														
 
															-            for sec, data in sections.items():
														
 
															-                result["chapters"][chap][sec] = {
														
 
															-                    "content": "\n".join(data["lines"]),
														
 
															-                    "page_start": data["page_start"],
														
 
															-                    "page_end": data["page_end"],
														
 
															-                }
														
 
															+
														
 
															+        for (chap_name, sec_name), blocks in chapter_blocks.items():
														
 
															+            if chap_name not in result["chapters"]:
														
 
															+                result["chapters"][chap_name] = {}
														
 
															+
														
 
															+            # 按页码和Y坐标排序块
														
 
															+            blocks.sort(key=lambda b: (b["page"], b["bbox"][1]))
														
 
															+
														
 
															+            # 拼接文本
														
 
															+            lines = []
														
 
															+            page_start = blocks[0]["page"] if blocks else 1
														
 
															+            page_end = blocks[-1]["page"] if blocks else 1
														
 
															+
														
 
															+            for block in blocks:
														
 
															+                if block.get("type") == "table":
														
 
															+                    lines.append(f"\n[表格OCR识别结果]:\n{block['text']}\n[/表格]\n")
														
 
															+                else:
														
 
															+                    lines.append(block["text"])
														
 
															+
														
 
															+            result["chapters"][chap_name][sec_name] = {
														
 
															+                "content": "\n".join(lines),
														
 
															+                "page_start": page_start,
														
 
															+                "page_end": page_end,
														
 
															+            }
														
 
															         logger.info(f"[PdfExtractor] 提取完成，共 {len(result['chapters'])} 个章节")
														
 
															         return result
														
@@ -608,6 +605,170 @@ class PdfStructureExtractor:
 
															         return content
														
 
															+    def _extract_text_blocks_with_position(
															+        self,
															+        page: fitz.Page,
															+        clip_box: fitz.Rect
															+    ) -> List[Dict[str, Any]]:
															+        """
															+        Extract the text blocks of one page together with their coordinates.
															+
															+        Reads ``page.get_text("dict", clip=clip_box)`` and keeps the blocks
															+        whose ``type`` is 0 (text blocks). The spans of every line are
															+        concatenated, the line is stripped, and empty lines are dropped;
															+        the remaining lines of a block are joined with newlines.
															+
															+        Lines are stripped because the caller matches ``^``-anchored heading
															+        patterns against the returned text; leading whitespace in the first
															+        span would otherwise prevent a heading from being recognised.
															+
															+        Args:
															+            page: Page to read. The stored page number is ``page.number + 1``.
															+            clip_box: Rectangle forwarded as the ``clip`` argument.
															+
															+        Returns:
															+            One dict per non-empty text block with the keys ``text``,
															+            ``page``, ``bbox``, ``y_center`` and ``type`` (always
															+            ``"text"``), ordered by the top Y coordinate of the bbox and
															+            then by its left X coordinate.
															+        """
															+        # NOTE(review): the unit returned here is a whole block, which may
															+        # span several lines. A heading that is not the first line of its
															+        # block is therefore not visible to a caller that only matches the
															+        # start of ``text`` - confirm this is acceptable.
															+        page_number = page.number + 1
															+        blocks: List[Dict[str, Any]] = []
															+
															+        for raw_block in page.get_text("dict", clip=clip_box).get("blocks", []):
															+            if raw_block.get("type") != 0:  # not a text block
															+                continue
															+
															+            text_lines = []
															+            for raw_line in raw_block.get("lines", []):
															+                line_text = "".join(
															+                    span.get("text", "") for span in raw_line.get("spans", [])
															+                ).strip()
															+                if line_text:
															+                    text_lines.append(line_text)
															+
															+            if not text_lines:
															+                continue
															+
															+            bbox = raw_block["bbox"]
															+            blocks.append({
															+                "text": "\n".join(text_lines),
															+                "page": page_number,
															+                "bbox": bbox,
															+                "y_center": (bbox[1] + bbox[3]) / 2,
															+                "type": "text",
															+            })
															+
															+        # Reading order: top-to-bottom first, left-to-right second. Every
															+        # entry belongs to the same page, so the page is not part of the key.
															+        blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
															+        return blocks
														
 
															+
														
 
															+    def _insert_ocr_blocks_into_chapters(
															+        self,
															+        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]],
															+        ocr_results: List[OcrResult]
															+    ) -> None:
															+        """
															+        Insert successful OCR results into ``chapter_blocks`` as table blocks.
															+
															+        For every page that has OCR output, the blocks of that page are
															+        collected across all sections and ordered by their top Y coordinate.
															+        Each table is anchored to the last block that starts at or above the
															+        table's top edge and is inserted directly after it, in the section
															+        that owns that block. A table that starts above every block of the
															+        page is inserted in front of the topmost block.
															+
															+        The anchor is searched over the whole page rather than section by
															+        section: when a page holds several sections, a table must go to the
															+        section whose blocks actually precede it, not to the first section
															+        that merely ends above it.
															+
															+        Args:
															+            chapter_blocks: Mapping ``(chapter, section) -> block list``.
															+                The lists are modified in place.
															+            ocr_results: OCR results. Entries whose ``success`` is falsy are
															+                ignored; ``bbox[1]`` / ``bbox[3]`` are used as the top /
															+                bottom Y coordinate of the table.
															+        """
															+        # NOTE(review): blocks lying inside a table's bbox are not removed
															+        # here, so text-layer content of the table may end up next to the
															+        # OCR text - confirm this is intended.
															+
															+        # Group the successful OCR results by page number.
															+        ocr_by_page: Dict[int, List[OcrResult]] = {}
															+        for result in ocr_results:
															+            if result.success:
															+                ocr_by_page.setdefault(result.page_num, []).append(result)
															+
															+        for page_num, ocr_list in ocr_by_page.items():
															+            # Flat view of the page: (block, owning block list, section key).
															+            page_entries = [
															+                (block, blocks, key)
															+                for key, blocks in chapter_blocks.items()
															+                for block in blocks
															+                if block["page"] == page_num
															+            ]
															+
															+            if not page_entries:
															+                logger.warning(f"[OCR回填] 第{page_num}页没有匹配到任何小节")
															+                continue
															+
															+            # Stable sort: blocks with the same top keep their section order.
															+            page_entries.sort(key=lambda entry: entry[0]["bbox"][1])
															+
															+            # Tables are handled top-to-bottom so that several tables behind
															+            # the same anchor keep their vertical order.
															+            for ocr_result in sorted(ocr_list, key=lambda r: r.bbox[1]):
															+                table_y_top = ocr_result.bbox[1]
															+                table_y_bottom = ocr_result.bbox[3]
															+
															+                table_block = {
															+                    "text": ocr_result.text,
															+                    "page": page_num,
															+                    "bbox": ocr_result.bbox,
															+                    "y_center": (table_y_top + table_y_bottom) / 2,
															+                    "type": "table",
															+                }
															+
															+                # Last block of the page that starts at or above the table.
															+                anchor = None
															+                for entry in page_entries:
															+                    if entry[0]["bbox"][1] > table_y_top:
															+                        break
															+                    anchor = entry
															+
															+                if anchor is None:
															+                    # Table sits above everything: go in front of the
															+                    # topmost block of the page.
															+                    anchor_block, target_blocks, (chap_name, sec_name) = page_entries[0]
															+                    offset = 0
															+                else:
															+                    anchor_block, target_blocks, (chap_name, sec_name) = anchor
															+                    offset = 1
															+
															+                # Locate the anchor by identity; an equality search could hit
															+                # a different block that happens to compare equal.
															+                insert_index = next(
															+                    i for i, b in enumerate(target_blocks) if b is anchor_block
															+                ) + offset
															+
															+                # Step over tables already placed behind the same anchor that
															+                # start higher on the page.
															+                while (
															+                    offset
															+                    and insert_index < len(target_blocks)
															+                    and target_blocks[insert_index].get("type") == "table"
															+                    and target_blocks[insert_index]["page"] == page_num
															+                    and target_blocks[insert_index]["bbox"][1] <= table_y_top
															+                ):
															+                    insert_index += 1
															+
															+                target_blocks.insert(insert_index, table_block)
															+                logger.debug(
															+                    f"[OCR回填] 第{page_num}页表格(Y={table_y_top:.0f}) -> "
															+                    f"{chap_name}/{sec_name} 位置{insert_index}"
															+                )
														
 
															+
														
 
															     @staticmethod
														
 
															     def _is_header_footer(line: str) -> bool:
														
 
															         return (
														
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -14,6 +14,7 @@ from collections import defaultdict
 
															 from typing import Dict, Any, Optional, Tuple, List
														
 
															 from foundation.observability.logger.loggering import review_logger as logger
														
 
															+from foundation.observability.cachefiles import cache, CacheBaseDir
														
 
															 from .pdf_extractor import PdfStructureExtractor
														
 
															 from .toc_builder import build_toc_items_from_structure
														
@@ -123,7 +124,7 @@ class SimpleDocumentProcessor:
 
															         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
														
 
															         # 文档提取质量检查
														
 
															-        self._check_extraction_quality(structure)
														
 
															+        self._check_extraction_quality(structure, file_name)
														
 
															         catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
														
@@ -459,6 +460,7 @@ class SimpleDocumentProcessor:
 
															     def _check_extraction_quality(
														
 
															         self,
														
 
															         structure: Dict[str, Any],
														
 
															+        file_name: str = "",
														
 
															         default_total_chapters: int = 10,
														
 
															         default_total_subsections: int = 41,
														
 
															         l1_threshold: float = 0.70,
														
@@ -538,3 +540,15 @@ class SimpleDocumentProcessor:
 
															         # 将质量检查结果添加到 chapters 中
														
 
															         chapters["quality_check"] = quality_result
														
 
															+
														
 
															+        # 保存提取结果到缓存目录，使用真实文件名
														
 
															+        if file_name:
														
 
															+            # 去掉扩展名，添加后缀
														
 
															+            base_name = file_name.rsplit(".", 1)[0] if "." in file_name else file_name
														
 
															+            cache_filename = f"{base_name}_预处理结果.json"
														
 
															+            cache.save(
														
 
															+                structure,
														
 
															+                subdir="document_temp",
														
 
															+                filename=cache_filename,
														
 
															+                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW,
														
 
															+            )