3 روز پیش · 175446bb96
--- a/core/construction_review/component/minimal_pipeline/ocr_processor.py
+++ b/core/construction_review/component/minimal_pipeline/ocr_processor.py
@@ -0,0 +1,458 @@
 
				+"""
			
 
				+OCR 处理模块 - 表格检测与识别
			
 
				+
			
 
				+提供 PDF 表格区域检测和 OCR 识别功能，支持：
			
 
				+- RapidLayout 表格区域检测
			
 
				+- GLM-OCR 并发识别
			
 
				+- 表格文本替换回填
			
 
				+"""
			
 
				+
			
 
				+import base64
			
 
				+import io
			
 
				+import time
			
 
				+from concurrent.futures import ThreadPoolExecutor, as_completed
			
 
				+from dataclasses import dataclass
			
 
				+from typing import Dict, Any, List, Optional, Tuple, Set
			
 
				+
			
 
				+import fitz
			
 
				+import numpy as np
			
 
				+import requests
			
 
				+
			
 
				+from foundation.observability.logger.loggering import review_logger as logger
			
 
				+
			
 
# Optional dependency: try to import RapidLayout. When the package is missing,
# RAPID_LAYOUT_AVAILABLE is False and RapidLayout is bound to None so the rest
# of the module can test the flag instead of failing at import time.
try:
    from rapid_layout import RapidLayout
    RAPID_LAYOUT_AVAILABLE = True
except ImportError:
    RAPID_LAYOUT_AVAILABLE = False
    RapidLayout = None
			
 
				+
			
 
				+
			
 
@dataclass
class TableRegion:
    """A detected table region on a PDF page, queued for OCR."""
    # Page number used in log messages and copied into the matching OcrResult
    # (the caller visible in this change passes a 1-based number — confirm).
    page_num: int
    # PyMuPDF page object the region is rendered from.
    page: fitz.Page
    # Region rectangle as (x1, y1, x2, y2); passed to fitz.Rect when rendering.
    bbox: Tuple[float, float, float, float]
    # Detection score reported by the layout engine for this region.
    score: float
			
 
				+
			
 
				+
			
 
@dataclass
class OcrResult:
    """Outcome of running OCR on one table region."""
    # Page number copied from the originating TableRegion.
    page_num: int
    # Region rectangle (x1, y1, x2, y2) copied from the originating TableRegion.
    bbox: Tuple[float, float, float, float]
    # Detection score copied from the originating TableRegion.
    score: float
    # Recognized text; the empty string when OCR failed.
    text: str
    # True when the OCR call returned without raising, False otherwise.
    success: bool
			
 
				+
			
 
				+
			
 
				+class OcrProcessor:
			
 
				+    """OCR 处理器：表格检测与识别"""
			
 
				+
			
 
				+    # 默认配置
			
 
				+    MAX_SHORT_EDGE = 1024
			
 
				+    JPEG_QUALITY = 90
			
 
				+    OCR_DPI = 200
			
 
				+    OCR_CONFIDENCE_THRESHOLD = 0.5
			
 
				+    OCR_CONCURRENT_WORKERS = 5
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
			
 
				+        ocr_timeout: int = 600,
			
 
				+        ocr_api_key: str = "",
			
 
				+        max_short_edge: int = 1024,
			
 
				+        jpeg_quality: int = 90,
			
 
				+        ocr_dpi: int = 200,
			
 
				+        confidence_threshold: float = 0.5,
			
 
				+        concurrent_workers: int = 5,
			
 
				+    ):
			
 
				+        """
			
 
				+        初始化 OCR 处理器
			
 
				+
			
 
				+        Args:
			
 
				+            ocr_api_url: OCR API 地址
			
 
				+            ocr_timeout: OCR 请求超时时间（秒）
			
 
				+            ocr_api_key: OCR API 密钥
			
 
				+            max_short_edge: 图片压缩后短边最大尺寸
			
 
				+            jpeg_quality: JPEG 压缩质量
			
 
				+            ocr_dpi: OCR 渲染 DPI
			
 
				+            confidence_threshold: 表格检测置信度阈值
			
 
				+            concurrent_workers: OCR 并发工作线程数
			
 
				+        """
			
 
				+        self.ocr_api_url = ocr_api_url
			
 
				+        self.ocr_timeout = ocr_timeout
			
 
				+        self.ocr_api_key = ocr_api_key
			
 
				+        self.max_short_edge = max_short_edge
			
 
				+        self.jpeg_quality = jpeg_quality
			
 
				+        self.ocr_dpi = ocr_dpi
			
 
				+        self.confidence_threshold = confidence_threshold
			
 
				+        self.concurrent_workers = concurrent_workers
			
 
				+
			
 
				+        self._layout_engine: Optional[Any] = None
			
 
				+
			
 
				+        if not RAPID_LAYOUT_AVAILABLE:
			
 
				+            logger.warning("RapidLayout 未安装，表格检测功能不可用")
			
 
				+
			
 
				+    def is_available(self) -> bool:
			
 
				+        """检查 OCR 功能是否可用"""
			
 
				+        return RAPID_LAYOUT_AVAILABLE
			
 
				+
			
 
				+    def _get_layout_engine(self) -> Optional[Any]:
			
 
				+        """延迟初始化 RapidLayout"""
			
 
				+        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
			
 
				+            self._layout_engine = RapidLayout()
			
 
				+        return self._layout_engine
			
 
				+
			
 
				+    def detect_table_regions(
			
 
				+        self,
			
 
				+        page: fitz.Page,
			
 
				+        page_num: int,
			
 
				+        clip_box: fitz.Rect
			
 
				+    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
			
 
				+        """
			
 
				+        检测页面中的表格区域
			
 
				+
			
 
				+        Args:
			
 
				+            page: PDF 页面对象
			
 
				+            page_num: 页码（用于日志）
			
 
				+            clip_box: 裁剪区域
			
 
				+
			
 
				+        Returns:
			
 
				+            列表，元素为 ((x1, y1, x2, y2), score)
			
 
				+        """
			
 
				+        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
			
 
				+
			
 
				+        if not RAPID_LAYOUT_AVAILABLE:
			
 
				+            return table_regions
			
 
				+
			
 
				+        layout_engine = self._get_layout_engine()
			
 
				+        if layout_engine is None:
			
 
				+            return table_regions
			
 
				+
			
 
				+        # 渲染页面（裁剪区域）
			
 
				+        pix = page.get_pixmap(dpi=self.ocr_dpi, clip=clip_box)
			
 
				+        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
			
 
				+
			
 
				+        try:
			
 
				+            layout_output = layout_engine(img)
			
 
				+
			
 
				+            # 解析版面结果
			
 
				+            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
			
 
				+                # 获取缩放比例
			
 
				+                scale_x = clip_box.width / img.shape[1]
			
 
				+                scale_y = clip_box.height / img.shape[0]
			
 
				+
			
 
				+                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
			
 
				+                    if label == "table" and score > self.confidence_threshold:
			
 
				+                        # 转换为 PDF 坐标
			
 
				+                        pdf_x1 = clip_box.x0 + box[0] * scale_x
			
 
				+                        pdf_y1 = clip_box.y0 + box[1] * scale_y
			
 
				+                        pdf_x2 = clip_box.x0 + box[2] * scale_x
			
 
				+                        pdf_y2 = clip_box.y0 + box[3] * scale_y
			
 
				+
			
 
				+                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
			
 
				+
			
 
				+        return table_regions
			
 
				+
			
 
				+    def process_ocr_concurrent(
			
 
				+        self,
			
 
				+        regions: List[TableRegion],
			
 
				+        progress_callback=None
			
 
				+    ) -> List[OcrResult]:
			
 
				+        """
			
 
				+        同步并发处理 OCR
			
 
				+
			
 
				+        Args:
			
 
				+            regions: 表格区域列表
			
 
				+            progress_callback: 进度回调函数，接收 (completed, total) 参数
			
 
				+
			
 
				+        Returns:
			
 
				+            OCR 结果列表
			
 
				+        """
			
 
				+        results: List[OcrResult] = []
			
 
				+        total = len(regions)
			
 
				+        completed = 0
			
 
				+
			
 
				+        with ThreadPoolExecutor(max_workers=self.concurrent_workers) as executor:
			
 
				+            # 提交所有任务
			
 
				+            future_to_region = {
			
 
				+                executor.submit(self._ocr_table_region, r.page, r.bbox): r
			
 
				+                for r in regions
			
 
				+            }
			
 
				+
			
 
				+            # 处理完成的结果
			
 
				+            for future in as_completed(future_to_region):
			
 
				+                region = future_to_region[future]
			
 
				+                completed += 1
			
 
				+                try:
			
 
				+                    text = future.result()
			
 
				+                    results.append(OcrResult(
			
 
				+                        page_num=region.page_num,
			
 
				+                        bbox=region.bbox,
			
 
				+                        score=region.score,
			
 
				+                        text=text,
			
 
				+                        success=True,
			
 
				+                    ))
			
 
				+                except Exception as e:
			
 
				+                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
			
 
				+                    results.append(OcrResult(
			
 
				+                        page_num=region.page_num,
			
 
				+                        bbox=region.bbox,
			
 
				+                        score=region.score,
			
 
				+                        text="",
			
 
				+                        success=False,
			
 
				+                    ))
			
 
				+
			
 
				+                # 每完成5个或最后一个时推送进度
			
 
				+                if progress_callback and (completed % 5 == 0 or completed == total):
			
 
				+                    progress_callback(completed, total)
			
 
				+
			
 
				+        return results
			
 
				+
			
 
				+    def _ocr_table_region(
			
 
				+        self,
			
 
				+        page: fitz.Page,
			
 
				+        bbox: Tuple[float, float, float, float],
			
 
				+        max_retries: int = 3
			
 
				+    ) -> str:
			
 
				+        """
			
 
				+        对指定区域进行 OCR 识别（使用 GLM-OCR），支持指数退避重试
			
 
				+
			
 
				+        Args:
			
 
				+            page: PDF 页面对象
			
 
				+            bbox: 区域坐标 (x1, y1, x2, y2)
			
 
				+            max_retries: 最大重试次数
			
 
				+
			
 
				+        Returns:
			
 
				+            识别的文本内容
			
 
				+        """
			
 
				+        # 渲染指定区域
			
 
				+        rect = fitz.Rect(bbox)
			
 
				+        pix = page.get_pixmap(dpi=self.ocr_dpi, clip=rect)
			
 
				+        img_bytes = pix.tobytes("jpeg")
			
 
				+
			
 
				+        # 压缩图片
			
 
				+        compressed = self._compress_image(img_bytes)
			
 
				+        img_base64 = base64.b64encode(compressed).decode('utf-8')
			
 
				+
			
 
				+        # 请求 OCR
			
 
				+        payload = {
			
 
				+            "model": "GLM-OCR",
			
 
				+            "messages": [
			
 
				+                {
			
 
				+                    "role": "user",
			
 
				+                    "content": [
			
 
				+                        {
			
 
				+                            "type": "text",
			
 
				+                            "text": "识别图片中的表格内容，按原文排版输出。"
			
 
				+                                    "注意："
			
 
				+                                    "1. 表格用 Markdown 表格格式"
			
 
				+                                    "2. 保持换行和列对齐"
			
 
				+                                    "3. 只输出表格内容，不要其他说明"
			
 
				+                        },
			
 
				+                        {
			
 
				+                            "type": "image_url",
			
 
				+                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
			
 
				+                        }
			
 
				+                    ]
			
 
				+                }
			
 
				+            ],
			
 
				+            "max_tokens": 2048,
			
 
				+            "temperature": 0.1
			
 
				+        }
			
 
				+
			
 
				+        headers = {"Content-Type": "application/json"}
			
 
				+        if self.ocr_api_key:
			
 
				+            headers["Authorization"] = f"Bearer {self.ocr_api_key}"
			
 
				+
			
 
				+        # 指数退避重试
			
 
				+        last_error = None
			
 
				+        for attempt in range(max_retries):
			
 
				+            try:
			
 
				+                response = requests.post(
			
 
				+                    self.ocr_api_url,
			
 
				+                    headers=headers,
			
 
				+                    json=payload,
			
 
				+                    timeout=self.ocr_timeout
			
 
				+                )
			
 
				+                response.raise_for_status()
			
 
				+
			
 
				+                result = response.json()
			
 
				+                return self._extract_ocr_content(result)
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                last_error = e
			
 
				+                if attempt < max_retries - 1:
			
 
				+                    # 指数退避: 2, 4, 8 秒
			
 
				+                    wait_time = 2 ** (attempt + 1)
			
 
				+                    logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
			
 
				+                    time.sleep(wait_time)
			
 
				+                else:
			
 
				+                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败（已重试{max_retries}次）: {e}")
			
 
				+
			
 
				+        # 所有重试都失败，抛出最后一个错误
			
 
				+        raise last_error
			
 
				+
			
 
				+    def _compress_image(self, img_bytes: bytes) -> bytes:
			
 
				+        """
			
 
				+        压缩图片
			
 
				+
			
 
				+        Args:
			
 
				+            img_bytes: 原始图片字节
			
 
				+
			
 
				+        Returns:
			
 
				+            压缩后的图片字节
			
 
				+        """
			
 
				+        try:
			
 
				+            from PIL import Image
			
 
				+            img = Image.open(io.BytesIO(img_bytes))
			
 
				+
			
 
				+            if img.mode in ('RGBA', 'LA', 'P'):
			
 
				+                background = Image.new('RGB', img.size, (255, 255, 255))
			
 
				+                if img.mode == 'P':
			
 
				+                    img = img.convert('RGBA')
			
 
				+                if img.mode in ('RGBA', 'LA'):
			
 
				+                    background.paste(img, mask=img.split()[-1])
			
 
				+                img = background
			
 
				+            elif img.mode != 'RGB':
			
 
				+                img = img.convert('RGB')
			
 
				+
			
 
				+            min_edge = min(img.size)
			
 
				+            if min_edge > self.max_short_edge:
			
 
				+                ratio = self.max_short_edge / min_edge
			
 
				+                new_size = (int(img.width * ratio), int(img.height * ratio))
			
 
				+                img = img.resize(new_size, Image.Resampling.LANCZOS)
			
 
				+
			
 
				+            buffer = io.BytesIO()
			
 
				+            img.save(buffer, format='JPEG', quality=self.jpeg_quality, optimize=True)
			
 
				+            return buffer.getvalue()
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"图片压缩失败，使用原图: {e}")
			
 
				+            return img_bytes
			
 
				+
			
 
				+    def _extract_ocr_content(self, result: Dict) -> str:
			
 
				+        """
			
 
				+        从 OCR 响应提取内容，并将 HTML 表格转换为 Markdown
			
 
				+
			
 
				+        Args:
			
 
				+            result: OCR API 响应
			
 
				+
			
 
				+        Returns:
			
 
				+            提取的文本内容
			
 
				+        """
			
 
				+        content = ""
			
 
				+        if "choices" in result and isinstance(result["choices"], list):
			
 
				+            if len(result["choices"]) > 0:
			
 
				+                message = result["choices"][0].get("message", {})
			
 
				+                content = message.get("content", "")
			
 
				+
			
 
				+        # 如果内容包含 HTML 标签，转换为 Markdown
			
 
				+        if content and "<" in content and ">" in content:
			
 
				+            try:
			
 
				+                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
			
 
				+                content = convert_html_to_markdown(content)
			
 
				+            except Exception as e:
			
 
				+                logger.debug(f"HTML 转 Markdown 失败，保留原始内容: {e}")
			
 
				+
			
 
				+        return content
			
 
				+
			
 
				+    def replace_table_regions(
			
 
				+        self,
			
 
				+        page: fitz.Page,
			
 
				+        original_text: str,
			
 
				+        ocr_results: List[Dict],
			
 
				+        clip_box: fitz.Rect
			
 
				+    ) -> str:
			
 
				+        """
			
 
				+        用 OCR 结果替换原始文本中的表格区域
			
 
				+
			
 
				+        Args:
			
 
				+            page: PDF 页面对象
			
 
				+            original_text: 原始文本
			
 
				+            ocr_results: OCR 结果列表，每个元素包含 region_index, bbox, score, ocr_text
			
 
				+            clip_box: 裁剪区域
			
 
				+
			
 
				+        Returns:
			
 
				+            替换后的文本
			
 
				+        """
			
 
				+        if not ocr_results:
			
 
				+            return original_text
			
 
				+
			
 
				+        # 获取页面上的文本块及其坐标
			
 
				+        text_blocks = []
			
 
				+        for block in page.get_text("blocks"):
			
 
				+            x0, y0, x1, y1, text, _, _ = block
			
 
				+            # 只考虑裁剪区域内的文本
			
 
				+            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
			
 
				+                text_blocks.append({
			
 
				+                    "bbox": (x0, y0, x1, y1),
			
 
				+                    "text": text.strip(),
			
 
				+                })
			
 
				+
			
 
				+        # 按 Y 坐标排序
			
 
				+        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
			
 
				+
			
 
				+        # 找出属于表格区域的文本块
			
 
				+        replaced_indices: Set[int] = set()
			
 
				+        for ocr_result in ocr_results:
			
 
				+            bbox = ocr_result["bbox"]
			
 
				+            rx0, ry0, rx1, ry1 = bbox
			
 
				+
			
 
				+            for idx, block in enumerate(text_blocks):
			
 
				+                if idx in replaced_indices:
			
 
				+                    continue
			
 
				+                bx0, by0, bx1, by1 = block["bbox"]
			
 
				+
			
 
				+                # 检查重叠
			
 
				+                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
			
 
				+                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
			
 
				+                overlap_area = overlap_x * overlap_y
			
 
				+                block_area = (bx1 - bx0) * (by1 - by0)
			
 
				+
			
 
				+                if block_area > 0 and overlap_area / block_area > 0.5:
			
 
				+                    replaced_indices.add(idx)
			
 
				+
			
 
				+        # 构建新文本
			
 
				+        result_parts: List[str] = []
			
 
				+        last_idx = 0
			
 
				+
			
 
				+        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
			
 
				+            bbox = ocr_result["bbox"]
			
 
				+            rx0, ry0, rx1, ry1 = bbox
			
 
				+
			
 
				+            # 找到该表格区域之前的文本
			
 
				+            region_start_idx = None
			
 
				+            for idx, block in enumerate(text_blocks):
			
 
				+                if idx in replaced_indices:
			
 
				+                    bx0, by0, bx1, by1 = block["bbox"]
			
 
				+                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
			
 
				+                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
			
 
				+                        if region_start_idx is None:
			
 
				+                            region_start_idx = idx
			
 
				+                        last_idx = idx + 1
			
 
				+
			
 
				+            if region_start_idx is not None:
			
 
				+                # 添加表格前的非表格文本
			
 
				+                for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
			
 
				+                    if idx not in replaced_indices and idx < len(text_blocks):
			
 
				+                        result_parts.append(text_blocks[idx]["text"])
			
 
				+                        result_parts.append("\n")
			
 
				+
			
 
				+                # 添加 OCR 结果
			
 
				+                result_parts.append(ocr_result["ocr_text"])
			
 
				+                result_parts.append("\n")
			
 
				+
			
 
				+        # 添加剩余文本
			
 
				+        for idx in range(last_idx, len(text_blocks)):
			
 
				+            if idx not in replaced_indices:
			
 
				+                result_parts.append(text_blocks[idx]["text"])
			
 
				+                result_parts.append("\n")
			
 
				+
			
 
				+        return "".join(result_parts).strip() or original_text
			
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor.py
@@ -6,19 +6,15 @@ PDF 结构提取器 - 同步并发 OCR 版本
 
				 输出格式兼容后续分类与组装流程。
			
 
				 """
			
 
				 
			
 
				-import base64
			
 
				-import io
			
 
				 import re
			
 
				-from concurrent.futures import ThreadPoolExecutor, as_completed
			
 
				-from dataclasses import dataclass
			
 
				 from typing import Dict, Any, List, Optional, Tuple
			
 
				 
			
 
				 import fitz
			
 
				-import numpy as np
			
 
				-import requests
			
 
				 
			
 
				 from foundation.observability.logger.loggering import review_logger as logger
			
 
				 
			
 
				+from .ocr_processor import OcrProcessor, TableRegion, OcrResult
			
 
				+
			
 
				 # 尝试导入 RapidLayout
			
 
				 try:
			
 
				     from rapid_layout import RapidLayout
			
@@ -28,25 +24,6 @@ except ImportError:
 
				     RapidLayout = None
			
 
				 
			
 
				 
			
 
				-@dataclass
			
 
				-class TableRegion:
			
 
				-    """表格区域信息"""
			
 
				-    page_num: int
			
 
				-    page: fitz.Page
			
 
				-    bbox: Tuple[float, float, float, float]
			
 
				-    score: float
			
 
				-
			
 
				-
			
 
				-@dataclass
			
 
				-class OcrResult:
			
 
				-    """OCR 结果"""
			
 
				-    page_num: int
			
 
				-    bbox: Tuple[float, float, float, float]
			
 
				-    score: float
			
 
				-    text: str
			
 
				-    success: bool
			
 
				-
			
 
				-
			
 
				 class PdfStructureExtractor:
			
 
				     """PDF 章节结构提取器（支持 OCR 异步并发）"""
			
 
				 
			
@@ -54,13 +31,6 @@ class PdfStructureExtractor:
 
				     SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
			
 
				     TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
			
 
				 
			
 
				-    # OCR 配置
			
 
				-    MAX_SHORT_EDGE = 1024
			
 
				-    JPEG_QUALITY = 90
			
 
				-    OCR_DPI = 200
			
 
				-    OCR_CONFIDENCE_THRESHOLD = 0.5
			
 
				-    OCR_CONCURRENT_WORKERS = 5
			
 
				-
			
 
				     def __init__(
			
 
				         self,
			
 
				         clip_top: float = 60,
			
@@ -76,11 +46,12 @@ class PdfStructureExtractor:
 
				         self.clip_bottom = clip_bottom
			
 
				         self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE
			
 
				 
			
 
				-        # OCR 配置
			
 
				-        self.ocr_api_url = ocr_api_url
			
 
				-        self.ocr_timeout = ocr_timeout
			
 
				-        self.ocr_api_key = ocr_api_key
			
 
				-        self._layout_engine: Optional[Any] = None
			
 
				+        # 初始化 OCR 处理器
			
 
				+        self._ocr_processor = OcrProcessor(
			
 
				+            ocr_api_url=ocr_api_url,
			
 
				+            ocr_timeout=ocr_timeout,
			
 
				+            ocr_api_key=ocr_api_key,
			
 
				+        ) if self.use_ocr else None
			
 
				 
			
 
				         # 目录检测配置
			
 
				         self.detect_toc = detect_toc
			
@@ -90,12 +61,6 @@ class PdfStructureExtractor:
 
				         if use_ocr and not RAPID_LAYOUT_AVAILABLE:
			
 
				             logger.warning("RapidLayout 未安装，OCR 功能不可用")
			
 
				 
			
 
				-    def _get_layout_engine(self) -> Optional[Any]:
			
 
				-        """延迟初始化 RapidLayout"""
			
 
				-        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
			
 
				-            self._layout_engine = RapidLayout()
			
 
				-        return self._layout_engine
			
 
				-
			
 
				     def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
			
 
				         """
			
 
				         从 PDF 字节流提取章节结构。
			
@@ -152,11 +117,17 @@ class PdfStructureExtractor:
 
				         from .toc_detector import TOCCatalogExtractor
			
 
				 
			
 
				         if self._toc_extractor is None:
			
 
				+            # 使用 OCR 处理器的配置（如果已初始化）
			
 
				+            ocr_config = {}
			
 
				+            if self._ocr_processor:
			
 
				+                ocr_config = {
			
 
				+                    "ocr_api_url": self._ocr_processor.ocr_api_url,
			
 
				+                    "ocr_api_key": self._ocr_processor.ocr_api_key,
			
 
				+                    "ocr_timeout": self._ocr_processor.ocr_timeout,
			
 
				+                }
			
 
				             self._toc_extractor = TOCCatalogExtractor(
			
 
				                 model_path=self.toc_model_path,
			
 
				-                ocr_api_url=self.ocr_api_url,
			
 
				-                ocr_api_key=self.ocr_api_key,
			
 
				-                ocr_timeout=self.ocr_timeout,
			
 
				+                **ocr_config
			
 
				             )
			
 
				 
			
 
				         return self._toc_extractor.detect_and_extract(file_content, progress_callback)
			
@@ -251,13 +222,13 @@ class PdfStructureExtractor:
 
				         table_regions: List[TableRegion] = []
			
 
				         ocr_results: List[OcrResult] = []
			
 
				 
			
 
				-        if self.use_ocr:
			
 
				+        if self.use_ocr and self._ocr_processor:
			
 
				             logger.info("[阶段2] 扫描表格区域...")
			
 
				             for page_num in range(total_pages):
			
 
				                 page = doc.load_page(page_num)
			
 
				                 rect = page.rect
			
 
				                 clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
			
 
				-                regions = self._detect_table_regions(page, page_num + 1, clip_box)
			
 
				+                regions = self._ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
			
 
				                 for bbox, score in regions:
			
 
				                     table_regions.append(TableRegion(
			
 
				                         page_num=page_num + 1,
			
@@ -275,7 +246,12 @@ class PdfStructureExtractor:
 
				             # 执行OCR
			
 
				             if table_regions:
			
 
				                 _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格，开始OCR识别...")
			
 
				-                ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
			
 
				+                ocr_results = self._ocr_processor.process_ocr_concurrent(
			
 
				+                    table_regions,
			
 
				+                    progress_callback=lambda completed, total: _emit_progress(
			
 
				+                        "版面分析", 35 + int(completed / total * 15), f"OCR识别中 {completed}/{total}"
			
 
				+                    )
			
 
				+                )
			
 
				                 success_count = sum(1 for r in ocr_results if r.success)
			
 
				                 logger.info(f"[阶段2] OCR完成 {success_count}/{len(table_regions)}")
			
 
				                 _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
			
@@ -316,295 +292,6 @@ class PdfStructureExtractor:
 
				         logger.info(f"[PdfExtractor] 提取完成，共 {len(result['chapters'])} 个章节")
			
 
				         return result
			
 
				 
			
 
    def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
        """Run OCR for every table region on a thread pool and collect the outcomes.

        Args:
            regions: Table regions to recognise; each one supplies the page,
                bounding box, page number and detection score.
            progress_callback: Optional callable invoked as
                ``progress_callback(stage, percent, message)`` after every
                fifth finished region and after the last one.

        Returns:
            One ``OcrResult`` per region, in completion order (not input
            order). A region whose OCR raised is reported with empty text
            and ``success=False`` instead of aborting the batch.
        """
        # NOTE(review): every task receives a fitz.Page and _ocr_table_region
        # renders it inside the worker thread - confirm this is safe with the
        # PyMuPDF build in use.
        total = len(regions)
        outcomes: List[OcrResult] = []

        with ThreadPoolExecutor(max_workers=self.OCR_CONCURRENT_WORKERS) as pool:
            # Queue everything up front, remembering which region each future belongs to.
            pending = {}
            for candidate in regions:
                task = pool.submit(self._ocr_table_region, candidate.page, candidate.bbox)
                pending[task] = candidate

            for done_count, task in enumerate(as_completed(pending), start=1):
                source = pending[task]
                try:
                    recognized = task.result()
                except Exception as e:
                    logger.error(f"  第 {source.page_num} 页表格 OCR 失败: {e}")
                    recognized = ""
                    succeeded = False
                else:
                    succeeded = True

                outcomes.append(OcrResult(
                    page_num=source.page_num,
                    bbox=source.bbox,
                    score=source.score,
                    text=recognized,
                    success=succeeded,
                ))

                # Throttle progress pushes: every 5th completion plus the final one.
                if progress_callback and (done_count % 5 == 0 or done_count == total):
                    # The OCR phase owns the 35-50 percent slice of the overall progress.
                    percent = 35 + int(done_count / total * 15)
                    progress_callback("版面分析", percent, f"OCR识别中 {done_count}/{total}")

        return outcomes
			
 
				-
			
 
    def _detect_table_regions(
        self,
        page: fitz.Page,
        page_num: int,
        clip_box: fitz.Rect
    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
        """Locate table regions on a page with the layout engine.

        Args:
            page: Page to analyse.
            page_num: Page number used only in the warning log message.
            clip_box: Area of the page that is rendered and analysed.

        Returns:
            A list of ``((x1, y1, x2, y2), score)`` entries in PDF
            coordinates, one per detected box labelled ``"table"`` whose
            score exceeds ``OCR_CONFIDENCE_THRESHOLD``. The list is empty
            when RapidLayout is unavailable, no engine could be obtained,
            or the layout analysis raised.
        """
        found: List[Tuple[Tuple[float, float, float, float], float]] = []

        engine = self._get_layout_engine() if RAPID_LAYOUT_AVAILABLE else None
        if engine is None:
            return found

        # Render only the clipped area; the buffer is reshaped as 3 channels.
        pixmap = page.get_pixmap(dpi=self.OCR_DPI, clip=clip_box)
        image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3)

        try:
            analysis = engine(image)

            if not (hasattr(analysis, 'boxes') and hasattr(analysis, 'class_names')):
                return found

            # Factors mapping rendered-pixel coordinates back to PDF units.
            x_ratio = clip_box.width / image.shape[1]
            y_ratio = clip_box.height / image.shape[0]

            for box, label, score in zip(analysis.boxes, analysis.class_names, analysis.scores):
                if label != "table":
                    continue
                if not (score > self.OCR_CONFIDENCE_THRESHOLD):
                    continue

                # Pixel box -> PDF coordinates, offset by the clip origin.
                left = clip_box.x0 + box[0] * x_ratio
                top = clip_box.y0 + box[1] * y_ratio
                right = clip_box.x0 + box[2] * x_ratio
                bottom = clip_box.y0 + box[3] * y_ratio
                found.append(((left, top, right, bottom), score))

        except Exception as e:
            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")

        return found
			
 
				-
			
 
    def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
        """Recognise one page region with the GLM-OCR service, retrying with exponential backoff.

        Args:
            page: Page that contains the region.
            bbox: Region to render, as ``(x0, y0, x1, y1)``.
            max_retries: Total number of request attempts. Values below 1
                are treated as 1 so that at least one request is made.

        Returns:
            The text extracted from the OCR response.

        Raises:
            Exception: The error from the final attempt once every attempt
                has failed.
        """
        import time

        # Render the requested region and encode it for the request body.
        region = fitz.Rect(bbox)
        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=region)
        img_bytes = pix.tobytes("jpeg")

        compressed = self._compress_image(img_bytes)
        img_base64 = base64.b64encode(compressed).decode('utf-8')

        payload = {
            "model": "GLM-OCR",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "识别图片中的表格内容，按原文排版输出。"
                                    "注意："
                                    "1. 表格用 Markdown 表格格式"
                                    "2. 保持换行和列对齐"
                                    "3. 只输出表格内容，不要其他说明"
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
                        }
                    ]
                }
            ],
            "max_tokens": 2048,
            "temperature": 0.1
        }

        headers = {"Content-Type": "application/json"}
        if self.ocr_api_key:
            headers["Authorization"] = f"Bearer {self.ocr_api_key}"

        # Always try at least once. Previously max_retries <= 0 skipped the
        # loop and ended in `raise None`, i.e. a confusing TypeError.
        attempts = max(1, max_retries)

        for attempt in range(1, attempts + 1):
            try:
                response = requests.post(
                    self.ocr_api_url,
                    headers=headers,
                    json=payload,
                    timeout=self.ocr_timeout
                )
                response.raise_for_status()
                return self._extract_ocr_content(response.json())

            except Exception as e:
                if attempt == attempts:
                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败（已重试{attempts}次）: {e}")
                    # Out of attempts: surface the last error to the caller.
                    raise

                # Exponential backoff: 2s after the 1st failure, 4s after the 2nd, ...
                wait_time = 2 ** attempt
                logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt} 次失败: {e}, {wait_time}秒后重试...")
                time.sleep(wait_time)
			
 
				-
			
 
				-    def _replace_table_regions(
			
 
				-        self,
			
 
				-        page: fitz.Page,
			
 
				-        original_text: str,
			
 
				-        ocr_results: List[Dict],
			
 
				-        clip_box: fitz.Rect
			
 
				-    ) -> str:
			
 
				-        """用 OCR 结果替换原始文本中的表格区域"""
			
 
				-        if not ocr_results:
			
 
				-            return original_text
			
 
				-
			
 
				-        # 获取页面上的文本块及其坐标
			
 
				-        text_blocks = []
			
 
				-        for block in page.get_text("blocks"):
			
 
				-            x0, y0, x1, y1, text, _, _ = block
			
 
				-            # 只考虑裁剪区域内的文本
			
 
				-            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
			
 
				-                text_blocks.append({
			
 
				-                    "bbox": (x0, y0, x1, y1),
			
 
				-                    "text": text.strip(),
			
 
				-                })
			
 
				-
			
 
				-        # 按 Y 坐标排序
			
 
				-        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
			
 
				-
			
 
				-        # 找出属于表格区域的文本块
			
 
				-        replaced_indices: Set[int] = set()
			
 
				-        for ocr_result in ocr_results:
			
 
				-            bbox = ocr_result["bbox"]
			
 
				-            rx0, ry0, rx1, ry1 = bbox
			
 
				-
			
 
				-            for idx, block in enumerate(text_blocks):
			
 
				-                if idx in replaced_indices:
			
 
				-                    continue
			
 
				-                bx0, by0, bx1, by1 = block["bbox"]
			
 
				-
			
 
				-                # 检查重叠
			
 
				-                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
			
 
				-                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
			
 
				-                overlap_area = overlap_x * overlap_y
			
 
				-                block_area = (bx1 - bx0) * (by1 - by0)
			
 
				-
			
 
				-                if block_area > 0 and overlap_area / block_area > 0.5:
			
 
				-                    replaced_indices.add(idx)
			
 
				-
			
 
				-        # 构建新文本
			
 
				-        result_parts: List[str] = []
			
 
				-        last_idx = 0
			
 
				-
			
 
				-        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
			
 
				-            bbox = ocr_result["bbox"]
			
 
				-            rx0, ry0, rx1, ry1 = bbox
			
 
				-
			
 
				-            # 找到该表格区域之前的文本
			
 
				-            region_start_idx = None
			
 
				-            for idx, block in enumerate(text_blocks):
			
 
				-                if idx in replaced_indices:
			
 
				-                    bx0, by0, bx1, by1 = block["bbox"]
			
 
				-                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
			
 
				-                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
			
 
				-                        if region_start_idx is None:
			
 
				-                            region_start_idx = idx
			
 
				-                        last_idx = idx + 1
			
 
				-
			
 
				-            if region_start_idx is not None:
			
 
				-                # 添加表格前的非表格文本
			
 
				-                for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
			
 
				-                    if idx not in replaced_indices and idx < len(text_blocks):
			
 
				-                        result_parts.append(text_blocks[idx]["text"])
			
 
				-                        result_parts.append("\n")
			
 
				-
			
 
				-                # 添加 OCR 结果
			
 
				-                result_parts.append(ocr_result["ocr_text"])
			
 
				-                result_parts.append("\n")
			
 
				-
			
 
				-        # 添加剩余文本
			
 
				-        for idx in range(last_idx, len(text_blocks)):
			
 
				-            if idx not in replaced_indices:
			
 
				-                result_parts.append(text_blocks[idx]["text"])
			
 
				-                result_parts.append("\n")
			
 
				-
			
 
				-        return "".join(result_parts).strip() or original_text
			
 
				-
			
 
    def _compress_image(self, img_bytes: bytes) -> bytes:
        """Re-encode an image as an RGB JPEG, shrinking it when its short edge is too long.

        Images carrying transparency or a palette are flattened onto a white
        background first. When the shorter edge exceeds ``MAX_SHORT_EDGE``
        the image is scaled down proportionally so that edge matches it.

        Args:
            img_bytes: Encoded source image.

        Returns:
            JPEG bytes saved with ``JPEG_QUALITY``, or ``img_bytes`` unchanged
            when anything in the process fails (including Pillow being
            unavailable).
        """
        try:
            from PIL import Image

            picture = Image.open(io.BytesIO(img_bytes))
            source_mode = picture.mode

            if source_mode in ('RGBA', 'LA', 'P'):
                # JPEG has no alpha channel: composite onto white.
                canvas = Image.new('RGB', picture.size, (255, 255, 255))
                if source_mode == 'P':
                    picture = picture.convert('RGBA')
                if picture.mode in ('RGBA', 'LA'):
                    alpha = picture.split()[-1]
                    canvas.paste(picture, mask=alpha)
                picture = canvas
            elif source_mode != 'RGB':
                picture = picture.convert('RGB')

            shortest = min(picture.size)
            if shortest > self.MAX_SHORT_EDGE:
                factor = self.MAX_SHORT_EDGE / shortest
                target = (int(picture.width * factor), int(picture.height * factor))
                picture = picture.resize(target, Image.Resampling.LANCZOS)

            encoded = io.BytesIO()
            picture.save(encoded, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
            return encoded.getvalue()

        except Exception as e:
            # Best effort: fall back to the untouched input.
            logger.warning(f"图片压缩失败，使用原图: {e}")
            return img_bytes
			
 
				-
			
 
				-    def _extract_ocr_content(self, result: Dict) -> str:
			
 
				-        """从 OCR 响应提取内容，并将 HTML 表格转换为 Markdown"""
			
 
				-        content = ""
			
 
				-        if "choices" in result and isinstance(result["choices"], list):
			
 
				-            if len(result["choices"]) > 0:
			
 
				-                message = result["choices"][0].get("message", {})
			
 
				-                content = message.get("content", "")
			
 
				-
			
 
				-        # 如果内容包含 HTML 标签，转换为 Markdown
			
 
				-        if content and "<" in content and ">" in content:
			
 
				-            try:
			
 
				-                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
			
 
				-                content = convert_html_to_markdown(content)
			
 
				-            except Exception as e:
			
 
				-                logger.debug(f"HTML 转 Markdown 失败，保留原始内容: {e}")
			
 
				-
			
 
				-        return content
			
 
				-
			
 
				     def _extract_text_blocks_with_position(
			
 
				         self,
			
 
				         page: fitz.Page,
			
--- a/core/construction_review/component/minimal_pipeline/test.py
+++ b/core/construction_review/component/minimal_pipeline/test.py
@@ -0,0 +1,119 @@
 
				+import fitz  # PyMuPDF
			
 
				+import re
			
 
				+import json
			
 
				+import os
			
 
				+from datetime import datetime
			
 
				+
			
 
				+def extract_and_split_construction_plan(pdf_path):
			
 
				+    # 打开PDF文件
			
 
				+    doc = fitz.open(pdf_path)
			
 
				+    
			
 
				+    # 编译正则表达式
			
 
				+    chapter_pattern = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
			
 
				+    section_pattern = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
			
 
				+    # 用于识别目录的特征：连续的三个以上小数点或省略号
			
 
				+    toc_pattern = re.compile(r'\.{3,}|…{2,}') 
			
 
				+    
			
 
				+    structured_data = {}
			
 
				+    current_chapter = "未分类前言"
			
 
				+    current_section = "默认部分"
			
 
				+    
			
 
				+    in_body = False  # 状态机：标记是否已经跳过目录，正式进入正文
			
 
				+    
			
 
				+    for page_num in range(len(doc)):
			
 
				+        page = doc.load_page(page_num)
			
 
				+        
			
 
				+        # 1. 清理页眉页脚：利用 clip 裁剪页面提取区域
			
 
				+        # 默认A4纸高度约842磅，裁剪掉顶部和底部各60磅的区域（可根据实际PDF微调）
			
 
				+        rect = page.rect
			
 
				+        clip_box = fitz.Rect(0, 60, rect.width, rect.height - 60)
			
 
				+        
			
 
				+        # 仅提取裁剪框内的纯文本
			
 
				+        text = page.get_text("text", clip=clip_box)
			
 
				+        lines = text.split('\n')
			
 
				+        
			
 
				+        for line in lines:
			
 
				+            line = line.strip()
			
 
				+            # 跳过空行
			
 
				+            if not line:
			
 
				+                continue
			
 
				+            
			
 
				+            # 双保险：过滤掉可能因排版偏移漏掉的页眉页脚特征词或孤立的页码
			
 
				+            if "四川路桥建设集团股份有限公司" in line or "T梁运输及安装专项施工方案" in line or line.isdigit():
			
 
				+                continue
			
 
				+            
			
 
				+            # 2. 删除目录逻辑：判断是否正式进入正文
			
 
				+            if not in_body:
			
 
				+                if chapter_pattern.match(line) and not toc_pattern.search(line):
			
 
				+                    in_body = True
			
 
				+                else:
			
 
				+                    continue  # 还在目录页，直接跳过
			
 
				+            
			
 
				+            # 进入正文后的防干扰处理：跳过残余目录格式
			
 
				+            if toc_pattern.search(line):
			
 
				+                continue
			
 
				+            
			
 
				+            # 匹配到一级标题
			
 
				+            if chapter_pattern.match(line):
			
 
				+                current_chapter = line
			
 
				+                current_section = "章节前言" 
			
 
				+                if current_chapter not in structured_data:
			
 
				+                    structured_data[current_chapter] = {current_section: []}
			
 
				+                continue
			
 
				+            
			
 
				+            # 匹配到二级标题
			
 
				+            if section_pattern.match(line):
			
 
				+                current_section = line
			
 
				+                if current_chapter not in structured_data:
			
 
				+                    structured_data[current_chapter] = {}
			
 
				+                if current_section not in structured_data[current_chapter]:
			
 
				+                    structured_data[current_chapter][current_section] = []
			
 
				+                continue
			
 
				+            
			
 
				+            # 容错处理：确保基础字典结构存在
			
 
				+            if current_chapter not in structured_data:
			
 
				+                structured_data[current_chapter] = {current_section: []}
			
 
				+            if current_section not in structured_data[current_chapter]:
			
 
				+                structured_data[current_chapter][current_section] = []
			
 
				+                
			
 
				+            # 3. 将正文内容累加到对应的层级下
			
 
				+            structured_data[current_chapter][current_section].append(line)
			
 
				+    
			
 
				+    # 将列表拼接成完整的文本块
			
 
				+    for chap in structured_data:
			
 
				+        for sec in structured_data[chap]:
			
 
				+            structured_data[chap][sec] = '\n'.join(structured_data[chap][sec])
			
 
				+            
			
 
				+    return structured_data
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 获取用户输入的路径
			
 
				+    user_input = input("请输入需要提取的PDF文件路径（支持直接拖入文件或粘贴路径）：")
			
 
				+    
			
 
				+    # 清理路径两端可能存在的引号和空格（应对“复制文件地址”或拖拽文件带来的双引号）
			
 
				+    pdf_file_path = user_input.strip('\'" ')
			
 
				+    
			
 
				+    # 检查文件是否存在
			
 
				+    if not os.path.exists(pdf_file_path):
			
 
				+        print(f"\n[错误] 找不到文件，请检查路径是否正确：{pdf_file_path}")
			
 
				+    else:
			
 
				+        print("\n开始提取施工方案，请稍候...")
			
 
				+        try:
			
 
				+            result_data = extract_and_split_construction_plan(pdf_file_path)
			
 
				+            
			
 
				+            # 4. 保存为本地JSON，名称为：文件名+当前时间（到秒）
			
 
				+            base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
			
 
				+            current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
			
 
				+            
			
 
				+            # 将输出文件保存在与原PDF相同的目录下
			
 
				+            output_dir = os.path.dirname(pdf_file_path)
			
 
				+            output_filename = os.path.join(output_dir, f"{base_name}_{current_time}.json")
			
 
				+            
			
 
				+            with open(output_filename, 'w', encoding='utf-8') as json_file:
			
 
				+                json.dump(result_data, json_file, ensure_ascii=False, indent=4)
			
 
				+                
			
 
				+            print(f"\n[成功] 提取完成！")
			
 
				+            print(f"结构化数据已保存至: {output_filename}")
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"\n[失败] 提取过程中发生错误: {e}")