فهرست منبع

refactor(pdf_extractor): 将 OCR 功能解耦为独立模块

- 新增 ocr_processor.py 模块,封装表格检测与 OCR 识别功能
- 提取 TableRegion、OcrResult 数据类到独立模块
- OcrProcessor 类提供:
  - detect_table_regions(): RapidLayout 表格区域检测
  - process_ocr_concurrent(): 5 并发 OCR 识别
  - replace_table_regions(): 表格文本替换回填
  - _ocr_table_region(): GLM-OCR 调用(含指数退避重试)
  - _compress_image(): 图片压缩处理
  - _extract_ocr_content(): OCR 响应解析(支持 HTML 转 Markdown)

- pdf_extractor.py 精简 314 行,OCR 相关功能委托给 OcrProcessor
- 保持原有 API 不变,use_ocr 参数行为一致
WangXuMing 3 روز پیش
والد
کامیت
175446bb96

+ 458 - 0
core/construction_review/component/minimal_pipeline/ocr_processor.py

@@ -0,0 +1,458 @@
+"""
+OCR 处理模块 - 表格检测与识别
+
+提供 PDF 表格区域检测和 OCR 识别功能,支持:
+- RapidLayout 表格区域检测
+- GLM-OCR 并发识别
+- 表格文本替换回填
+"""
+
+import base64
+import io
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from typing import Dict, Any, List, Optional, Tuple, Set
+
+import fitz
+import numpy as np
+import requests
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+# 尝试导入 RapidLayout
+try:
+    from rapid_layout import RapidLayout
+    RAPID_LAYOUT_AVAILABLE = True
+except ImportError:
+    RAPID_LAYOUT_AVAILABLE = False
+    RapidLayout = None
+
+
+@dataclass
+class TableRegion:
+    """A table region found on a PDF page, queued for OCR."""
+    # Page number used in results and log messages.
+    page_num: int
+    # PyMuPDF page the region belongs to; it is rendered later for OCR.
+    page: fitz.Page
+    # Region rectangle as (x1, y1, x2, y2); passed to fitz.Rect when rendering.
+    bbox: Tuple[float, float, float, float]
+    # Detection score reported for this region.
+    score: float
+
+
+@dataclass
+class OcrResult:
+    """Outcome of running OCR on one table region."""
+    # Page number copied from the originating TableRegion.
+    page_num: int
+    # Region rectangle as (x1, y1, x2, y2), copied from the TableRegion.
+    bbox: Tuple[float, float, float, float]
+    # Detection score copied from the TableRegion.
+    score: float
+    # Recognized text; set to "" by process_ocr_concurrent when OCR failed.
+    text: str
+    # True when the OCR call returned without raising.
+    success: bool
+
+
+class OcrProcessor:
+    """OCR 处理器:表格检测与识别"""
+
+    # 默认配置
+    MAX_SHORT_EDGE = 1024
+    JPEG_QUALITY = 90
+    OCR_DPI = 200
+    OCR_CONFIDENCE_THRESHOLD = 0.5
+    OCR_CONCURRENT_WORKERS = 5
+
+    def __init__(
+        self,
+        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
+        ocr_timeout: int = 600,
+        ocr_api_key: str = "",
+        max_short_edge: int = 1024,
+        jpeg_quality: int = 90,
+        ocr_dpi: int = 200,
+        confidence_threshold: float = 0.5,
+        concurrent_workers: int = 5,
+    ):
+        """
+        初始化 OCR 处理器
+
+        Args:
+            ocr_api_url: OCR API 地址
+            ocr_timeout: OCR 请求超时时间(秒)
+            ocr_api_key: OCR API 密钥
+            max_short_edge: 图片压缩后短边最大尺寸
+            jpeg_quality: JPEG 压缩质量
+            ocr_dpi: OCR 渲染 DPI
+            confidence_threshold: 表格检测置信度阈值
+            concurrent_workers: OCR 并发工作线程数
+        """
+        self.ocr_api_url = ocr_api_url
+        self.ocr_timeout = ocr_timeout
+        self.ocr_api_key = ocr_api_key
+        self.max_short_edge = max_short_edge
+        self.jpeg_quality = jpeg_quality
+        self.ocr_dpi = ocr_dpi
+        self.confidence_threshold = confidence_threshold
+        self.concurrent_workers = concurrent_workers
+
+        self._layout_engine: Optional[Any] = None
+
+        if not RAPID_LAYOUT_AVAILABLE:
+            logger.warning("RapidLayout 未安装,表格检测功能不可用")
+
+    def is_available(self) -> bool:
+        """检查 OCR 功能是否可用"""
+        return RAPID_LAYOUT_AVAILABLE
+
+    def _get_layout_engine(self) -> Optional[Any]:
+        """延迟初始化 RapidLayout"""
+        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
+            self._layout_engine = RapidLayout()
+        return self._layout_engine
+
+    def detect_table_regions(
+        self,
+        page: fitz.Page,
+        page_num: int,
+        clip_box: fitz.Rect
+    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
+        """
+        检测页面中的表格区域
+
+        Args:
+            page: PDF 页面对象
+            page_num: 页码(用于日志)
+            clip_box: 裁剪区域
+
+        Returns:
+            列表,元素为 ((x1, y1, x2, y2), score)
+        """
+        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
+
+        if not RAPID_LAYOUT_AVAILABLE:
+            return table_regions
+
+        layout_engine = self._get_layout_engine()
+        if layout_engine is None:
+            return table_regions
+
+        # 渲染页面(裁剪区域)
+        pix = page.get_pixmap(dpi=self.ocr_dpi, clip=clip_box)
+        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
+
+        try:
+            layout_output = layout_engine(img)
+
+            # 解析版面结果
+            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
+                # 获取缩放比例
+                scale_x = clip_box.width / img.shape[1]
+                scale_y = clip_box.height / img.shape[0]
+
+                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
+                    if label == "table" and score > self.confidence_threshold:
+                        # 转换为 PDF 坐标
+                        pdf_x1 = clip_box.x0 + box[0] * scale_x
+                        pdf_y1 = clip_box.y0 + box[1] * scale_y
+                        pdf_x2 = clip_box.x0 + box[2] * scale_x
+                        pdf_y2 = clip_box.y0 + box[3] * scale_y
+
+                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
+
+        except Exception as e:
+            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
+
+        return table_regions
+
+    def process_ocr_concurrent(
+        self,
+        regions: List[TableRegion],
+        progress_callback=None
+    ) -> List[OcrResult]:
+        """
+        同步并发处理 OCR
+
+        Args:
+            regions: 表格区域列表
+            progress_callback: 进度回调函数,接收 (completed, total) 参数
+
+        Returns:
+            OCR 结果列表
+        """
+        results: List[OcrResult] = []
+        total = len(regions)
+        completed = 0
+
+        with ThreadPoolExecutor(max_workers=self.concurrent_workers) as executor:
+            # 提交所有任务
+            future_to_region = {
+                executor.submit(self._ocr_table_region, r.page, r.bbox): r
+                for r in regions
+            }
+
+            # 处理完成的结果
+            for future in as_completed(future_to_region):
+                region = future_to_region[future]
+                completed += 1
+                try:
+                    text = future.result()
+                    results.append(OcrResult(
+                        page_num=region.page_num,
+                        bbox=region.bbox,
+                        score=region.score,
+                        text=text,
+                        success=True,
+                    ))
+                except Exception as e:
+                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
+                    results.append(OcrResult(
+                        page_num=region.page_num,
+                        bbox=region.bbox,
+                        score=region.score,
+                        text="",
+                        success=False,
+                    ))
+
+                # 每完成5个或最后一个时推送进度
+                if progress_callback and (completed % 5 == 0 or completed == total):
+                    progress_callback(completed, total)
+
+        return results
+
+    def _ocr_table_region(
+        self,
+        page: fitz.Page,
+        bbox: Tuple[float, float, float, float],
+        max_retries: int = 3
+    ) -> str:
+        """
+        对指定区域进行 OCR 识别(使用 GLM-OCR),支持指数退避重试
+
+        Args:
+            page: PDF 页面对象
+            bbox: 区域坐标 (x1, y1, x2, y2)
+            max_retries: 最大重试次数
+
+        Returns:
+            识别的文本内容
+        """
+        # 渲染指定区域
+        rect = fitz.Rect(bbox)
+        pix = page.get_pixmap(dpi=self.ocr_dpi, clip=rect)
+        img_bytes = pix.tobytes("jpeg")
+
+        # 压缩图片
+        compressed = self._compress_image(img_bytes)
+        img_base64 = base64.b64encode(compressed).decode('utf-8')
+
+        # 请求 OCR
+        payload = {
+            "model": "GLM-OCR",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "识别图片中的表格内容,按原文排版输出。"
+                                    "注意:"
+                                    "1. 表格用 Markdown 表格格式"
+                                    "2. 保持换行和列对齐"
+                                    "3. 只输出表格内容,不要其他说明"
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
+                        }
+                    ]
+                }
+            ],
+            "max_tokens": 2048,
+            "temperature": 0.1
+        }
+
+        headers = {"Content-Type": "application/json"}
+        if self.ocr_api_key:
+            headers["Authorization"] = f"Bearer {self.ocr_api_key}"
+
+        # 指数退避重试
+        last_error = None
+        for attempt in range(max_retries):
+            try:
+                response = requests.post(
+                    self.ocr_api_url,
+                    headers=headers,
+                    json=payload,
+                    timeout=self.ocr_timeout
+                )
+                response.raise_for_status()
+
+                result = response.json()
+                return self._extract_ocr_content(result)
+
+            except Exception as e:
+                last_error = e
+                if attempt < max_retries - 1:
+                    # 指数退避: 2, 4, 8 秒
+                    wait_time = 2 ** (attempt + 1)
+                    logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
+                    time.sleep(wait_time)
+                else:
+                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败(已重试{max_retries}次): {e}")
+
+        # 所有重试都失败,抛出最后一个错误
+        raise last_error
+
+    def _compress_image(self, img_bytes: bytes) -> bytes:
+        """
+        压缩图片
+
+        Args:
+            img_bytes: 原始图片字节
+
+        Returns:
+            压缩后的图片字节
+        """
+        try:
+            from PIL import Image
+            img = Image.open(io.BytesIO(img_bytes))
+
+            if img.mode in ('RGBA', 'LA', 'P'):
+                background = Image.new('RGB', img.size, (255, 255, 255))
+                if img.mode == 'P':
+                    img = img.convert('RGBA')
+                if img.mode in ('RGBA', 'LA'):
+                    background.paste(img, mask=img.split()[-1])
+                img = background
+            elif img.mode != 'RGB':
+                img = img.convert('RGB')
+
+            min_edge = min(img.size)
+            if min_edge > self.max_short_edge:
+                ratio = self.max_short_edge / min_edge
+                new_size = (int(img.width * ratio), int(img.height * ratio))
+                img = img.resize(new_size, Image.Resampling.LANCZOS)
+
+            buffer = io.BytesIO()
+            img.save(buffer, format='JPEG', quality=self.jpeg_quality, optimize=True)
+            return buffer.getvalue()
+
+        except Exception as e:
+            logger.warning(f"图片压缩失败,使用原图: {e}")
+            return img_bytes
+
+    def _extract_ocr_content(self, result: Dict) -> str:
+        """
+        从 OCR 响应提取内容,并将 HTML 表格转换为 Markdown
+
+        Args:
+            result: OCR API 响应
+
+        Returns:
+            提取的文本内容
+        """
+        content = ""
+        if "choices" in result and isinstance(result["choices"], list):
+            if len(result["choices"]) > 0:
+                message = result["choices"][0].get("message", {})
+                content = message.get("content", "")
+
+        # 如果内容包含 HTML 标签,转换为 Markdown
+        if content and "<" in content and ">" in content:
+            try:
+                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
+                content = convert_html_to_markdown(content)
+            except Exception as e:
+                logger.debug(f"HTML 转 Markdown 失败,保留原始内容: {e}")
+
+        return content
+
+    def replace_table_regions(
+        self,
+        page: fitz.Page,
+        original_text: str,
+        ocr_results: List[Dict],
+        clip_box: fitz.Rect
+    ) -> str:
+        """
+        用 OCR 结果替换原始文本中的表格区域
+
+        Args:
+            page: PDF 页面对象
+            original_text: 原始文本
+            ocr_results: OCR 结果列表,每个元素包含 region_index, bbox, score, ocr_text
+            clip_box: 裁剪区域
+
+        Returns:
+            替换后的文本
+        """
+        if not ocr_results:
+            return original_text
+
+        # 获取页面上的文本块及其坐标
+        text_blocks = []
+        for block in page.get_text("blocks"):
+            x0, y0, x1, y1, text, _, _ = block
+            # 只考虑裁剪区域内的文本
+            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
+                text_blocks.append({
+                    "bbox": (x0, y0, x1, y1),
+                    "text": text.strip(),
+                })
+
+        # 按 Y 坐标排序
+        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
+
+        # 找出属于表格区域的文本块
+        replaced_indices: Set[int] = set()
+        for ocr_result in ocr_results:
+            bbox = ocr_result["bbox"]
+            rx0, ry0, rx1, ry1 = bbox
+
+            for idx, block in enumerate(text_blocks):
+                if idx in replaced_indices:
+                    continue
+                bx0, by0, bx1, by1 = block["bbox"]
+
+                # 检查重叠
+                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
+                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
+                overlap_area = overlap_x * overlap_y
+                block_area = (bx1 - bx0) * (by1 - by0)
+
+                if block_area > 0 and overlap_area / block_area > 0.5:
+                    replaced_indices.add(idx)
+
+        # 构建新文本
+        result_parts: List[str] = []
+        last_idx = 0
+
+        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
+            bbox = ocr_result["bbox"]
+            rx0, ry0, rx1, ry1 = bbox
+
+            # 找到该表格区域之前的文本
+            region_start_idx = None
+            for idx, block in enumerate(text_blocks):
+                if idx in replaced_indices:
+                    bx0, by0, bx1, by1 = block["bbox"]
+                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
+                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
+                        if region_start_idx is None:
+                            region_start_idx = idx
+                        last_idx = idx + 1
+
+            if region_start_idx is not None:
+                # 添加表格前的非表格文本
+                for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
+                    if idx not in replaced_indices and idx < len(text_blocks):
+                        result_parts.append(text_blocks[idx]["text"])
+                        result_parts.append("\n")
+
+                # 添加 OCR 结果
+                result_parts.append(ocr_result["ocr_text"])
+                result_parts.append("\n")
+
+        # 添加剩余文本
+        for idx in range(last_idx, len(text_blocks)):
+            if idx not in replaced_indices:
+                result_parts.append(text_blocks[idx]["text"])
+                result_parts.append("\n")
+
+        return "".join(result_parts).strip() or original_text

+ 25 - 338
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -6,19 +6,15 @@ PDF 结构提取器 - 同步并发 OCR 版本
 输出格式兼容后续分类与组装流程。
 """
 
-import base64
-import io
 import re
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import dataclass
 from typing import Dict, Any, List, Optional, Tuple
 
 import fitz
-import numpy as np
-import requests
 
 from foundation.observability.logger.loggering import review_logger as logger
 
+from .ocr_processor import OcrProcessor, TableRegion, OcrResult
+
 # 尝试导入 RapidLayout
 try:
     from rapid_layout import RapidLayout
@@ -28,25 +24,6 @@ except ImportError:
     RapidLayout = None
 
 
-@dataclass
-class TableRegion:
-    """表格区域信息"""
-    page_num: int
-    page: fitz.Page
-    bbox: Tuple[float, float, float, float]
-    score: float
-
-
-@dataclass
-class OcrResult:
-    """OCR 结果"""
-    page_num: int
-    bbox: Tuple[float, float, float, float]
-    score: float
-    text: str
-    success: bool
-
-
 class PdfStructureExtractor:
     """PDF 章节结构提取器(支持 OCR 异步并发)"""
 
@@ -54,13 +31,6 @@ class PdfStructureExtractor:
     SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
     TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
 
-    # OCR 配置
-    MAX_SHORT_EDGE = 1024
-    JPEG_QUALITY = 90
-    OCR_DPI = 200
-    OCR_CONFIDENCE_THRESHOLD = 0.5
-    OCR_CONCURRENT_WORKERS = 5
-
     def __init__(
         self,
         clip_top: float = 60,
@@ -76,11 +46,12 @@ class PdfStructureExtractor:
         self.clip_bottom = clip_bottom
         self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE
 
-        # OCR 配置
-        self.ocr_api_url = ocr_api_url
-        self.ocr_timeout = ocr_timeout
-        self.ocr_api_key = ocr_api_key
-        self._layout_engine: Optional[Any] = None
+        # 初始化 OCR 处理器
+        self._ocr_processor = OcrProcessor(
+            ocr_api_url=ocr_api_url,
+            ocr_timeout=ocr_timeout,
+            ocr_api_key=ocr_api_key,
+        ) if self.use_ocr else None
 
         # 目录检测配置
         self.detect_toc = detect_toc
@@ -90,12 +61,6 @@ class PdfStructureExtractor:
         if use_ocr and not RAPID_LAYOUT_AVAILABLE:
             logger.warning("RapidLayout 未安装,OCR 功能不可用")
 
-    def _get_layout_engine(self) -> Optional[Any]:
-        """延迟初始化 RapidLayout"""
-        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
-            self._layout_engine = RapidLayout()
-        return self._layout_engine
-
     def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
         """
         从 PDF 字节流提取章节结构。
@@ -152,11 +117,17 @@ class PdfStructureExtractor:
         from .toc_detector import TOCCatalogExtractor
 
         if self._toc_extractor is None:
+            # 使用 OCR 处理器的配置(如果已初始化)
+            ocr_config = {}
+            if self._ocr_processor:
+                ocr_config = {
+                    "ocr_api_url": self._ocr_processor.ocr_api_url,
+                    "ocr_api_key": self._ocr_processor.ocr_api_key,
+                    "ocr_timeout": self._ocr_processor.ocr_timeout,
+                }
             self._toc_extractor = TOCCatalogExtractor(
                 model_path=self.toc_model_path,
-                ocr_api_url=self.ocr_api_url,
-                ocr_api_key=self.ocr_api_key,
-                ocr_timeout=self.ocr_timeout,
+                **ocr_config
             )
 
         return self._toc_extractor.detect_and_extract(file_content, progress_callback)
@@ -251,13 +222,13 @@ class PdfStructureExtractor:
         table_regions: List[TableRegion] = []
         ocr_results: List[OcrResult] = []
 
-        if self.use_ocr:
+        if self.use_ocr and self._ocr_processor:
             logger.info("[阶段2] 扫描表格区域...")
             for page_num in range(total_pages):
                 page = doc.load_page(page_num)
                 rect = page.rect
                 clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-                regions = self._detect_table_regions(page, page_num + 1, clip_box)
+                regions = self._ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
                 for bbox, score in regions:
                     table_regions.append(TableRegion(
                         page_num=page_num + 1,
@@ -275,7 +246,12 @@ class PdfStructureExtractor:
             # 执行OCR
             if table_regions:
                 _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
-                ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
+                ocr_results = self._ocr_processor.process_ocr_concurrent(
+                    table_regions,
+                    progress_callback=lambda completed, total: _emit_progress(
+                        "版面分析", 35 + int(completed / total * 15), f"OCR识别中 {completed}/{total}"
+                    )
+                )
                 success_count = sum(1 for r in ocr_results if r.success)
                 logger.info(f"[阶段2] OCR完成 {success_count}/{len(table_regions)}")
                 _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
@@ -316,295 +292,6 @@ class PdfStructureExtractor:
         logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
         return result
 
-    def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
-        """同步并发处理 OCR(使用 ThreadPoolExecutor)"""
-        results: List[OcrResult] = []
-        total = len(regions)
-        completed = 0
-
-        with ThreadPoolExecutor(max_workers=self.OCR_CONCURRENT_WORKERS) as executor:
-            # 提交所有任务
-            future_to_region = {
-                executor.submit(self._ocr_table_region, r.page, r.bbox): r
-                for r in regions
-            }
-
-            # 处理完成的结果
-            for future in as_completed(future_to_region):
-                region = future_to_region[future]
-                completed += 1
-                try:
-                    text = future.result()
-                    results.append(OcrResult(
-                        page_num=region.page_num,
-                        bbox=region.bbox,
-                        score=region.score,
-                        text=text,
-                        success=True,
-                    ))
-                except Exception as e:
-                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
-                    results.append(OcrResult(
-                        page_num=region.page_num,
-                        bbox=region.bbox,
-                        score=region.score,
-                        text="",
-                        success=False,
-                    ))
-
-                # 每完成5个或最后一个时推送进度
-                if progress_callback and (completed % 5 == 0 or completed == total):
-                    progress = 35 + int(completed / total * 15)  # OCR执行占15%进度(35-50)
-                    progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
-
-        return results
-
-    def _detect_table_regions(
-        self,
-        page: fitz.Page,
-        page_num: int,
-        clip_box: fitz.Rect
-    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
-        """检测页面中的表格区域,返回坐标列表"""
-        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
-
-        if not RAPID_LAYOUT_AVAILABLE:
-            return table_regions
-
-        layout_engine = self._get_layout_engine()
-        if layout_engine is None:
-            return table_regions
-
-        # 渲染页面(裁剪区域)
-        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=clip_box)
-        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
-
-        try:
-            layout_output = layout_engine(img)
-
-            # 解析版面结果
-            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
-                # 获取缩放比例
-                scale_x = clip_box.width / img.shape[1]
-                scale_y = clip_box.height / img.shape[0]
-
-                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
-                    if label == "table" and score > self.OCR_CONFIDENCE_THRESHOLD:
-                        # 转换为 PDF 坐标
-                        pdf_x1 = clip_box.x0 + box[0] * scale_x
-                        pdf_y1 = clip_box.y0 + box[1] * scale_y
-                        pdf_x2 = clip_box.x0 + box[2] * scale_x
-                        pdf_y2 = clip_box.y0 + box[3] * scale_y
-
-                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
-
-        except Exception as e:
-            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
-
-        return table_regions
-
-    def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
-        """对指定区域进行 OCR 识别(使用 GLM-OCR),支持指数退避重试"""
-        import time
-
-        # 渲染指定区域
-        rect = fitz.Rect(bbox)
-        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=rect)
-        img_bytes = pix.tobytes("jpeg")
-
-        # 压缩图片
-        compressed = self._compress_image(img_bytes)
-        img_base64 = base64.b64encode(compressed).decode('utf-8')
-
-        # 请求 OCR
-        payload = {
-            "model": "GLM-OCR",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "识别图片中的表格内容,按原文排版输出。"
-                                    "注意:"
-                                    "1. 表格用 Markdown 表格格式"
-                                    "2. 保持换行和列对齐"
-                                    "3. 只输出表格内容,不要其他说明"
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 2048,
-            "temperature": 0.1
-        }
-
-        headers = {"Content-Type": "application/json"}
-        if self.ocr_api_key:
-            headers["Authorization"] = f"Bearer {self.ocr_api_key}"
-
-        # 指数退避重试
-        last_error = None
-        for attempt in range(max_retries):
-            try:
-                response = requests.post(
-                    self.ocr_api_url,
-                    headers=headers,
-                    json=payload,
-                    timeout=self.ocr_timeout
-                )
-                response.raise_for_status()
-
-                result = response.json()
-                return self._extract_ocr_content(result)
-
-            except Exception as e:
-                last_error = e
-                if attempt < max_retries - 1:
-                    # 指数退避: 2, 4, 8 秒
-                    wait_time = 2 ** (attempt + 1)
-                    logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
-                    time.sleep(wait_time)
-                else:
-                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败(已重试{max_retries}次): {e}")
-
-        # 所有重试都失败,抛出最后一个错误
-        raise last_error
-
-    def _replace_table_regions(
-        self,
-        page: fitz.Page,
-        original_text: str,
-        ocr_results: List[Dict],
-        clip_box: fitz.Rect
-    ) -> str:
-        """用 OCR 结果替换原始文本中的表格区域"""
-        if not ocr_results:
-            return original_text
-
-        # 获取页面上的文本块及其坐标
-        text_blocks = []
-        for block in page.get_text("blocks"):
-            x0, y0, x1, y1, text, _, _ = block
-            # 只考虑裁剪区域内的文本
-            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
-                text_blocks.append({
-                    "bbox": (x0, y0, x1, y1),
-                    "text": text.strip(),
-                })
-
-        # 按 Y 坐标排序
-        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
-
-        # 找出属于表格区域的文本块
-        replaced_indices: Set[int] = set()
-        for ocr_result in ocr_results:
-            bbox = ocr_result["bbox"]
-            rx0, ry0, rx1, ry1 = bbox
-
-            for idx, block in enumerate(text_blocks):
-                if idx in replaced_indices:
-                    continue
-                bx0, by0, bx1, by1 = block["bbox"]
-
-                # 检查重叠
-                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
-                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
-                overlap_area = overlap_x * overlap_y
-                block_area = (bx1 - bx0) * (by1 - by0)
-
-                if block_area > 0 and overlap_area / block_area > 0.5:
-                    replaced_indices.add(idx)
-
-        # 构建新文本
-        result_parts: List[str] = []
-        last_idx = 0
-
-        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
-            bbox = ocr_result["bbox"]
-            rx0, ry0, rx1, ry1 = bbox
-
-            # 找到该表格区域之前的文本
-            region_start_idx = None
-            for idx, block in enumerate(text_blocks):
-                if idx in replaced_indices:
-                    bx0, by0, bx1, by1 = block["bbox"]
-                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
-                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
-                        if region_start_idx is None:
-                            region_start_idx = idx
-                        last_idx = idx + 1
-
-            if region_start_idx is not None:
-                # 添加表格前的非表格文本
-                for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
-                    if idx not in replaced_indices and idx < len(text_blocks):
-                        result_parts.append(text_blocks[idx]["text"])
-                        result_parts.append("\n")
-
-                # 添加 OCR 结果
-                result_parts.append(ocr_result["ocr_text"])
-                result_parts.append("\n")
-
-        # 添加剩余文本
-        for idx in range(last_idx, len(text_blocks)):
-            if idx not in replaced_indices:
-                result_parts.append(text_blocks[idx]["text"])
-                result_parts.append("\n")
-
-        return "".join(result_parts).strip() or original_text
-
-    def _compress_image(self, img_bytes: bytes) -> bytes:
-        """压缩图片"""
-        try:
-            from PIL import Image
-            img = Image.open(io.BytesIO(img_bytes))
-
-            if img.mode in ('RGBA', 'LA', 'P'):
-                background = Image.new('RGB', img.size, (255, 255, 255))
-                if img.mode == 'P':
-                    img = img.convert('RGBA')
-                if img.mode in ('RGBA', 'LA'):
-                    background.paste(img, mask=img.split()[-1])
-                img = background
-            elif img.mode != 'RGB':
-                img = img.convert('RGB')
-
-            min_edge = min(img.size)
-            if min_edge > self.MAX_SHORT_EDGE:
-                ratio = self.MAX_SHORT_EDGE / min_edge
-                new_size = (int(img.width * ratio), int(img.height * ratio))
-                img = img.resize(new_size, Image.Resampling.LANCZOS)
-
-            buffer = io.BytesIO()
-            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
-            return buffer.getvalue()
-
-        except Exception as e:
-            logger.warning(f"图片压缩失败,使用原图: {e}")
-            return img_bytes
-
-    def _extract_ocr_content(self, result: Dict) -> str:
-        """从 OCR 响应提取内容,并将 HTML 表格转换为 Markdown"""
-        content = ""
-        if "choices" in result and isinstance(result["choices"], list):
-            if len(result["choices"]) > 0:
-                message = result["choices"][0].get("message", {})
-                content = message.get("content", "")
-
-        # 如果内容包含 HTML 标签,转换为 Markdown
-        if content and "<" in content and ">" in content:
-            try:
-                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
-                content = convert_html_to_markdown(content)
-            except Exception as e:
-                logger.debug(f"HTML 转 Markdown 失败,保留原始内容: {e}")
-
-        return content
-
     def _extract_text_blocks_with_position(
         self,
         page: fitz.Page,

+ 119 - 0
core/construction_review/component/minimal_pipeline/test.py

@@ -0,0 +1,119 @@
+import fitz  # PyMuPDF
+import re
+import json
+import os
+from datetime import datetime
+
+def extract_and_split_construction_plan(pdf_path):
+    """Extract the body text of a PDF and group it by chapter and section.
+
+    Each page is read through a clip rectangle that drops the top and bottom
+    60 points, every line is stripped, and everything before the first
+    chapter heading that is not a table-of-contents entry is discarded.
+    Remaining lines are filed under the most recent chapter heading and
+    section heading seen so far.
+
+    Args:
+        pdf_path: Path of the PDF file; passed directly to ``fitz.open``.
+
+    Returns:
+        A dict mapping each chapter heading to a dict that maps each section
+        heading to the newline-joined text collected under it.
+    """
+    chapter_pattern = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
+    section_pattern = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
+    # Table-of-contents lines carry a run of three or more dots or two or
+    # more ellipsis characters.
+    toc_pattern = re.compile(r'\.{3,}|…{2,}')
+    # Header/footer phrases that may survive the clip rectangle.
+    boilerplate = ("四川路桥建设集团股份有限公司", "T梁运输及安装专项施工方案")
+
+    structured_data = {}
+    current_chapter = "未分类前言"
+    current_section = "默认部分"
+    in_body = False  # becomes True once the first real chapter heading is seen
+
+    # The context manager closes the document even when extraction raises.
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            # Crop 60 points from the top and bottom to drop headers and footers.
+            rect = page.rect
+            clip_box = fitz.Rect(0, 60, rect.width, rect.height - 60)
+            text = page.get_text("text", clip=clip_box)
+
+            for raw_line in text.split('\n'):
+                line = raw_line.strip()
+                if not line:
+                    continue
+
+                # Second line of defence: known header/footer phrases and bare page numbers.
+                if line.isdigit() or any(phrase in line for phrase in boilerplate):
+                    continue
+
+                is_chapter = chapter_pattern.match(line)
+                is_toc = toc_pattern.search(line)
+
+                # Still inside the table of contents until a chapter heading
+                # without TOC leader characters shows up.
+                if not in_body:
+                    if is_toc or not is_chapter:
+                        continue
+                    in_body = True
+
+                # Leftover TOC-formatted lines inside the body are ignored.
+                if is_toc:
+                    continue
+
+                if is_chapter:
+                    current_chapter = line
+                    current_section = "章节前言"
+                    structured_data.setdefault(current_chapter, {current_section: []})
+                    continue
+
+                if section_pattern.match(line):
+                    current_section = line
+                    structured_data.setdefault(current_chapter, {}).setdefault(current_section, [])
+                    continue
+
+                # Ordinary body line: file it under the current chapter/section,
+                # creating either level on demand.
+                structured_data.setdefault(current_chapter, {}).setdefault(current_section, []).append(line)
+
+    # Collapse each list of lines into a single text block.
+    return {
+        chapter: {section: '\n'.join(lines) for section, lines in sections.items()}
+        for chapter, sections in structured_data.items()
+    }
+
+if __name__ == "__main__":
+    # Prompt for the PDF path and drop surrounding quotes/spaces that a
+    # drag-and-drop or "copy as path" action may have added.
+    pdf_file_path = input("请输入需要提取的PDF文件路径(支持直接拖入文件或粘贴路径):").strip('\'" ')
+
+    if os.path.exists(pdf_file_path):
+        print("\n开始提取施工方案,请稍候...")
+        try:
+            result_data = extract_and_split_construction_plan(pdf_file_path)
+
+            # Output name is "<pdf stem>_<timestamp to the second>.json",
+            # written next to the source PDF.
+            stem, _ = os.path.splitext(os.path.basename(pdf_file_path))
+            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_filename = os.path.join(
+                os.path.dirname(pdf_file_path), f"{stem}_{stamp}.json"
+            )
+
+            with open(output_filename, 'w', encoding='utf-8') as json_file:
+                json.dump(result_data, json_file, ensure_ascii=False, indent=4)
+
+            print(f"\n[成功] 提取完成!")
+            print(f"结构化数据已保存至: {output_filename}")
+
+        except Exception as e:
+            print(f"\n[失败] 提取过程中发生错误: {e}")
+    else:
+        print(f"\n[错误] 找不到文件,请检查路径是否正确:{pdf_file_path}")