3 天之前 · b93cd363db
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor2.py
@@ -2,30 +2,18 @@
 
															 PDF 结构提取器 - 同步并发 OCR 版本
														
 
															 基于 splitter_pdf 逻辑，直接提取章节结构并记录页码。
														
 
															-支持 OCR 增强：检测表格区域并使用 ThreadPoolExecutor 5并发 OCR，其他文本保持 PyMuPDF 提取。
														
 
															+支持 OCR 增强：表格检测和识别委托给 OcrProcessor，其他文本保持 PyMuPDF 提取。
														
 
															 输出格式兼容后续分类与组装流程。
														
 
															 """
														
 
															-import base64
														
 
															-import io
														
 
															 import re
														
 
															-from concurrent.futures import ThreadPoolExecutor, as_completed
														
 
															 from dataclasses import dataclass
														
 
															 from typing import Dict, Any, List, Optional, Tuple, Set
														
 
															 import fitz
														
 
															-import numpy as np
														
 
															-import requests
														
 
															 from foundation.observability.logger.loggering import review_logger as logger
														
 
															-
														
 
															-# 尝试导入 RapidLayout
														
 
															-try:
														
 
															-    from rapid_layout import RapidLayout
														
 
															-    RAPID_LAYOUT_AVAILABLE = True
														
 
															-except ImportError:
														
 
															-    RAPID_LAYOUT_AVAILABLE = False
														
 
															-    RapidLayout = None
														
 
															+from .ocr_processor import OcrProcessor
														
 
															 @dataclass
														
@@ -110,12 +98,25 @@ class PdfStructureExtractor:
 
															     ):
														
 
															         self.clip_top = clip_top
														
 
															         self.clip_bottom = clip_bottom
														
 
															-        self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE
														
 
															         # OCR 配置
														
 
															         self.ocr_api_url = ocr_api_url
														
 
															         self.ocr_timeout = ocr_timeout
														
 
															         self.ocr_api_key = ocr_api_key
														
 
															+        self.ocr_processor: Optional[OcrProcessor] = None
														
 
															+        self.use_ocr = False
														
 
															+        if use_ocr:
														
 
															+            self.ocr_processor = OcrProcessor(
														
 
															+                ocr_api_url=ocr_api_url,
														
 
															+                ocr_timeout=ocr_timeout,
														
 
															+                ocr_api_key=ocr_api_key,
														
 
															+                max_short_edge=self.MAX_SHORT_EDGE,
														
 
															+                jpeg_quality=self.JPEG_QUALITY,
														
 
															+                ocr_dpi=self.OCR_DPI,
														
 
															+                confidence_threshold=self.OCR_CONFIDENCE_THRESHOLD,
														
 
															+                concurrent_workers=self.OCR_CONCURRENT_WORKERS,
														
 
															+            )
														
 
															+            self.use_ocr = self.ocr_processor.is_available()
														
 
															         self._layout_engine: Optional[Any] = None
														
 
															         # 目录检测配置
														
@@ -123,14 +124,11 @@ class PdfStructureExtractor:
 
															         self.toc_model_path = toc_model_path
														
 
															         self._toc_extractor = None
														
 
															-        if use_ocr and not RAPID_LAYOUT_AVAILABLE:
														
 
															-            logger.warning("RapidLayout 未安装，OCR 功能不可用")
														
 
															-
														
 
															     def _get_layout_engine(self) -> Optional[Any]:
														
 
															-        """延迟初始化 RapidLayout"""
														
 
															-        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
														
 
															-            self._layout_engine = RapidLayout()
														
 
															-        return self._layout_engine
														
 
															+        """兼容旧调用，实际由 OcrProcessor 管理版面引擎。"""
														
 
															+        if self.ocr_processor is None:
														
 
															+            return None
														
 
															+        return self.ocr_processor._get_layout_engine()
														
 
															     def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
														
 
															         """
														
@@ -1409,47 +1407,21 @@ class PdfStructureExtractor:
 
															         return None
														
 
															     def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
														
 
															-        """同步并发处理 OCR（使用 ThreadPoolExecutor）"""
														
 
															-        results: List[OcrResult] = []
														
 
															-        total = len(regions)
														
 
															-        completed = 0
														
 
															-
														
 
															-        with ThreadPoolExecutor(max_workers=self.OCR_CONCURRENT_WORKERS) as executor:
														
 
															-            # 提交所有任务
														
 
															-            future_to_region = {
														
 
															-                executor.submit(self._ocr_table_region, r.page, r.bbox): r
														
 
															-                for r in regions
														
 
															-            }
														
 
															+        """同步并发处理 OCR，具体实现委托给 OcrProcessor。"""
														
 
															+        if self.ocr_processor is None:
														
 
															+            return []
														
 
															-            # 处理完成的结果
														
 
															-            for future in as_completed(future_to_region):
														
 
															-                region = future_to_region[future]
														
 
															-                completed += 1
														
 
															-                try:
														
 
															-                    text = future.result()
														
 
															-                    results.append(OcrResult(
														
 
															-                        page_num=region.page_num,
														
 
															-                        bbox=region.bbox,
														
 
															-                        score=region.score,
														
 
															-                        text=text,
														
 
															-                        success=True,
														
 
															-                    ))
														
 
															-                except Exception as e:
														
 
															-                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
														
 
															-                    results.append(OcrResult(
														
 
															-                        page_num=region.page_num,
														
 
															-                        bbox=region.bbox,
														
 
															-                        score=region.score,
														
 
															-                        text="",
														
 
															-                        success=False,
														
 
															-                    ))
														
 
															+        if not progress_callback:
														
 
															+            return self.ocr_processor.process_ocr_concurrent(regions)
														
 
															-                # 每完成5个或最后一个时推送进度
														
 
															-                if progress_callback and (completed % 5 == 0 or completed == total):
														
 
															-                    progress = 35 + int(completed / total * 15)  # OCR执行占15%进度(35-50)
														
 
															-                    progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
														
 
															+        def _progress_adapter(completed: int, total: int):
														
 
															+            progress = 35 + int(completed / total * 15) if total else 50
														
 
															+            progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
														
 
															-        return results
														
 
															+        return self.ocr_processor.process_ocr_concurrent(
														
 
															+            regions,
														
 
															+            progress_callback=_progress_adapter,
														
 
															+        )
														
 
															     def _detect_table_regions(
														
 
															         self,
														
@@ -1457,114 +1429,16 @@ class PdfStructureExtractor:
 
															         page_num: int,
														
 
															         clip_box: fitz.Rect
														
 
															     ) -> List[Tuple[Tuple[float, float, float, float], float]]:
														
 
															-        """检测页面中的表格区域，返回坐标列表"""
														
 
															-        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
														
 
															-
														
 
															-        if not RAPID_LAYOUT_AVAILABLE:
														
 
															-            return table_regions
														
 
															-
														
 
															-        layout_engine = self._get_layout_engine()
														
 
															-        if layout_engine is None:
														
 
															-            return table_regions
														
 
															-
														
 
															-        # 渲染页面（裁剪区域）
														
 
															-        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=clip_box)
														
 
															-        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
														
 
															-
														
 
															-        try:
														
 
															-            layout_output = layout_engine(img)
														
 
															-
														
 
															-            # 解析版面结果
														
 
															-            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
														
 
															-                # 获取缩放比例
														
 
															-                scale_x = clip_box.width / img.shape[1]
														
 
															-                scale_y = clip_box.height / img.shape[0]
														
 
															-
														
 
															-                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
														
 
															-                    if label == "table" and score > self.OCR_CONFIDENCE_THRESHOLD:
														
 
															-                        # 转换为 PDF 坐标
														
 
															-                        pdf_x1 = clip_box.x0 + box[0] * scale_x
														
 
															-                        pdf_y1 = clip_box.y0 + box[1] * scale_y
														
 
															-                        pdf_x2 = clip_box.x0 + box[2] * scale_x
														
 
															-                        pdf_y2 = clip_box.y0 + box[3] * scale_y
														
 
															-
														
 
															-                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
														
 
															-
														
 
															-        except Exception as e:
														
 
															-            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
														
 
															-
														
 
															-        return table_regions
														
 
															+        """检测页面中的表格区域，具体实现委托给 OcrProcessor。"""
														
 
															+        if self.ocr_processor is None:
														
 
															+            return []
														
 
															+        return self.ocr_processor.detect_table_regions(page, page_num, clip_box)
														
 
															     def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
														
 
															-        """对指定区域进行 OCR 识别（使用 GLM-OCR），支持指数退避重试"""
														
 
															-        import time
														
 
															-
														
 
															-        # 渲染指定区域
														
 
															-        rect = fitz.Rect(bbox)
														
 
															-        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=rect)
														
 
															-        img_bytes = pix.tobytes("jpeg")
														
 
															-
														
 
															-        # 压缩图片
														
 
															-        compressed = self._compress_image(img_bytes)
														
 
															-        img_base64 = base64.b64encode(compressed).decode('utf-8')
														
 
															-
														
 
															-        # 请求 OCR
														
 
															-        payload = {
														
 
															-            "model": "GLM-OCR",
														
 
															-            "messages": [
														
 
															-                {
														
 
															-                    "role": "user",
														
 
															-                    "content": [
														
 
															-                        {
														
 
															-                            "type": "text",
														
 
															-                            "text": "识别图片中的表格内容，按原文排版输出。"
														
 
															-                                    "注意："
														
 
															-                                    "1. 表格用 Markdown 表格格式"
														
 
															-                                    "2. 保持换行和列对齐"
														
 
															-                                    "3. 只输出表格内容，不要其他说明"
														
 
															-                        },
														
 
															-                        {
														
 
															-                            "type": "image_url",
														
 
															-                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
														
 
															-                        }
														
 
															-                    ]
														
 
															-                }
														
 
															-            ],
														
 
															-            "max_tokens": 2048,
														
 
															-            "temperature": 0.1
														
 
															-        }
														
 
															-
														
 
															-        headers = {"Content-Type": "application/json"}
														
 
															-        if self.ocr_api_key:
														
 
															-            headers["Authorization"] = f"Bearer {self.ocr_api_key}"
														
 
															-
														
 
															-        # 指数退避重试
														
 
															-        last_error = None
														
 
															-        for attempt in range(max_retries):
														
 
															-            try:
														
 
															-                response = requests.post(
														
 
															-                    self.ocr_api_url,
														
 
															-                    headers=headers,
														
 
															-                    json=payload,
														
 
															-                    timeout=self.ocr_timeout
														
 
															-                )
														
 
															-                response.raise_for_status()
														
 
															-
														
 
															-                result = response.json()
														
 
															-                return self._extract_ocr_content(result)
														
 
															-
														
 
															-            except Exception as e:
														
 
															-                last_error = e
														
 
															-                if attempt < max_retries - 1:
														
 
															-                    # 指数退避: 2, 4, 8 秒
														
 
															-                    wait_time = 2 ** (attempt + 1)
														
 
															-                    logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
														
 
															-                    time.sleep(wait_time)
														
 
															-                else:
														
 
															-                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败（已重试{max_retries}次）: {e}")
														
 
															-
														
 
															-        # 所有重试都失败，抛出最后一个错误
														
 
															-        raise last_error
														
 
															+        """对指定区域进行 OCR 识别，具体实现委托给 OcrProcessor。"""
														
 
															+        if self.ocr_processor is None:
														
 
															+            raise RuntimeError("OCR processor is not initialized")
														
 
															+        return self.ocr_processor._ocr_table_region(page, bbox, max_retries=max_retries)
														
 
															     def _replace_table_regions(
														
 
															         self,
														
@@ -1573,10 +1447,16 @@ class PdfStructureExtractor:
 
															         ocr_results: List[Dict],
														
 
															         clip_box: fitz.Rect
														
 
															     ) -> str:
														
 
															-        """用 OCR 结果替换原始文本中的表格区域"""
														
 
															+        """用 OCR 结果替换原始文本中的表格区域。"""
														
 
															+        if self.ocr_processor is None:
														
 
															+            return original_text
														
 
															         if not ocr_results:
														
 
															             return original_text
														
 
															+        # 这里保留章节提取场景的兼容逻辑：
														
 
															+        # 1. 标题块不参与表格替换，避免目录/章节标题被表格框误吞；
														
 
															+        # 2. 仅替换真正落入表格区域的正文块，保留表格前后的普通文本；
														
 
															+        # 3. OCR 返回空时退回原始 PDF 文本，避免整块内容被清空。
														
 
															         text_blocks = []
														
 
															         for block in page.get_text("blocks"):
														
 
															             x0, y0, x1, y1, text, _, _ = block
														
@@ -1668,52 +1548,16 @@ class PdfStructureExtractor:
 
															         return False
														
 
															     def _compress_image(self, img_bytes: bytes) -> bytes:
														
 
															-        """压缩图片"""
														
 
															-        try:
														
 
															-            from PIL import Image
														
 
															-            img = Image.open(io.BytesIO(img_bytes))
														
 
															-
														
 
															-            if img.mode in ('RGBA', 'LA', 'P'):
														
 
															-                background = Image.new('RGB', img.size, (255, 255, 255))
														
 
															-                if img.mode == 'P':
														
 
															-                    img = img.convert('RGBA')
														
 
															-                if img.mode in ('RGBA', 'LA'):
														
 
															-                    background.paste(img, mask=img.split()[-1])
														
 
															-                img = background
														
 
															-            elif img.mode != 'RGB':
														
 
															-                img = img.convert('RGB')
														
 
															-
														
 
															-            min_edge = min(img.size)
														
 
															-            if min_edge > self.MAX_SHORT_EDGE:
														
 
															-                ratio = self.MAX_SHORT_EDGE / min_edge
														
 
															-                new_size = (int(img.width * ratio), int(img.height * ratio))
														
 
															-                img = img.resize(new_size, Image.Resampling.LANCZOS)
														
 
															-
														
 
															-            buffer = io.BytesIO()
														
 
															-            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
														
 
															-            return buffer.getvalue()
														
 
															-
														
 
															-        except Exception as e:
														
 
															-            logger.warning(f"图片压缩失败，使用原图: {e}")
														
 
															+        """压缩图片，具体实现委托给 OcrProcessor。"""
														
 
															+        if self.ocr_processor is None:
														
 
															             return img_bytes
														
 
															+        return self.ocr_processor._compress_image(img_bytes)
														
 
															     def _extract_ocr_content(self, result: Dict) -> str:
														
 
															-        """从 OCR 响应提取内容，并将 HTML 表格转换为 Markdown"""
														
 
															-        content = ""
														
 
															-        if "choices" in result and isinstance(result["choices"], list):
														
 
															-            if len(result["choices"]) > 0:
														
 
															-                message = result["choices"][0].get("message", {})
														
 
															-                content = message.get("content", "")
														
 
															-
														
 
															-        # 如果内容包含 HTML 标签，转换为 Markdown
														
 
															-        if content and "<" in content and ">" in content:
														
 
															-            try:
														
 
															-                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
														
 
															-                content = convert_html_to_markdown(content)
														
 
															-            except Exception as e:
														
 
															-                logger.debug(f"HTML 转 Markdown 失败，保留原始内容: {e}")
														
 
															-
														
 
															-        return content
														
 
															+        """从 OCR 响应提取内容，具体实现委托给 OcrProcessor。"""
														
 
															+        if self.ocr_processor is None:
														
 
															+            return ""
														
 
															+        return self.ocr_processor._extract_ocr_content(result)
														
 
															     @staticmethod
														
 
															     def _is_header_footer(line: str) -> bool: