1 周之前 · b412527cea
--- a/config/config.ini
+++ b/config/config.ini
@@ -58,6 +58,9 @@ REDIS_PASSWORD=123456
 
				 REDIS_MAX_CONNECTIONS=50
			
 
				 
			
 
				 [ocr]
			
 
				+# 是否启用 OCR 表格识别（true/false）
			
 
				+enable = true
			
 
				+
			
 
				 # OCR 引擎选择（以下写法都支持）：
			
 
				 # GLM-OCR: glm_ocr | glm-ocr | glmocr
			
 
				 # MinerU:  mineru | mineru-ocr | mineru_ocr
			
--- a/core/construction_review/component/doc_worker/models/document_structure.py
+++ b/core/construction_review/component/doc_worker/models/document_structure.py
@@ -377,6 +377,13 @@ class UnifiedDocumentStructure:
 
				             "processing_timestamp": self.processing_timestamp
			
 
				         }
			
 
				 
			
 
				+    @staticmethod
			
 
				+    def _number_to_chinese(num: int) -> str:
			
 
				+        """数字转中文序号"""
			
 
				+        chinese_nums = {1: "一", 2: "二", 3: "三", 4: "四", 5: "五",
			
 
				+                       6: "六", 7: "七", 8: "八", 9: "九", 10: "十"}
			
 
				+        return chinese_nums.get(num, str(num))
			
 
				+
			
 
				     def to_legacy_dict(self) -> Dict[str, Any]:
			
 
				         """
			
 
				         转换为旧版字典格式（兼容 AI 审查工作流）
			
@@ -432,14 +439,34 @@ class UnifiedDocumentStructure:
 
				             chunks.append(chunk)
			
 
				 
			
 
				         # 构建 outline 结构（兼容旧格式）
			
 
				-        outline_chapters = []
			
 
				-        for item in self.outline.items:
			
 
				-            outline_chapters.append({
			
 
				-                "original": item.raw_title or f"{item.first_name}->{item.second_name}",
			
 
				-                "chapter": item.first_name,
			
 
				-                "subsections": []
			
 
				+        # 按一级分类分组构建 chapters
			
 
				+        chapters_map: Dict[str, Dict[str, Any]] = {}
			
 
				+
			
 
				+        for sec in self.secondary_classifications:
			
 
				+            # 一级code作为key
			
 
				+            first_code = sec.first_code
			
 
				+
			
 
				+            if first_code not in chapters_map:
			
 
				+                chapters_map[first_code] = {
			
 
				+                    "index": sec.first_seq,
			
 
				+                    "title": f"第{self._number_to_chinese(sec.first_seq)}章 {sec.first_name}",
			
 
				+                    "page": str(sec.page_start or 1),
			
 
				+                    "original": sec.section_label.split("->")[0] if "->" in sec.section_label else sec.first_name,
			
 
				+                    "chapter_classification": first_code,
			
 
				+                    "subsections": []
			
 
				+                }
			
 
				+
			
 
				+            # 添加二级到 subsections
			
 
				+            chapters_map[first_code]["subsections"].append({
			
 
				+                "title": sec.section_label.split("->")[-1] if "->" in sec.section_label else sec.second_name,
			
 
				+                "page": str(sec.page_start or 1),
			
 
				+                "level": 2,
			
 
				+                "original": sec.section_label,
			
 
				+                "secondary_category_code": sec.second_code
			
 
				             })
			
 
				 
			
 
				+        outline_chapters = list(chapters_map.values())
			
 
				+
			
 
				         return {
			
 
				             "document_id": self.document_id,
			
 
				             "document_name": self.document_name,
			
--- a/core/construction_review/component/doc_worker/pdf_worker/ocr_enhanced_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/ocr_enhanced_extractor.py
@@ -1,16 +1,16 @@
 
				 """
			
 
				-OCR 增强提取器 - 稳定版
			
 
				+OCR 增强提取器 - 精准表格区域版
			
 
				 
			
 
				 流程：
			
 
				-1. PyMuPDF 提取全部文本（用于章节切分）
			
 
				-2. RapidLayout 检测表格页
			
 
				-3. 对表格页 OCR，替换该页内容
			
 
				-4. 保持章节切分逻辑不变
			
 
				+1. PyMuPDF 提取全部文本（用于章节切分，确保格式稳定）
			
 
				+2. RapidLayout 检测表格区域（返回坐标）
			
 
				+3. 只对表格区域进行 OCR，替换该区域内容
			
 
				+4. 其他文本保持 PyMuPDF 提取结果，章节标题不受影响
			
 
				 
			
 
				 特点：
			
 
				 - 章节切分基于 PyMuPDF 文本（格式稳定，正则匹配可靠）
			
 
				-- 表格页内容通过 OCR 补充（识别率高）
			
 
				-- 输出标记哪些页使用了 OCR
			
 
				+- 仅表格区域使用 OCR（精准定位，不影响其他内容）
			
 
				+- 输出标记哪些页使用了 OCR 及表格区域坐标
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
@@ -124,61 +124,69 @@ class OcrEnhancedExtractor(FullTextExtractor):
 
				             self._layout_engine = RapidLayout()
			
 
				         return self._layout_engine
			
 
				 
			
 
				-    def _detect_table_pages(self, doc: fitz.Document) -> Set[int]:
			
 
				-        """检测含表格的页码"""
			
 
				-        table_pages: Set[int] = set()
			
 
				+    def _detect_table_regions(self, page: fitz.Page, page_num: int) -> List[Tuple[Tuple[float, float, float, float], float]]:
			
 
				+        """
			
 
				+        检测页面中的表格区域
			
 
				+
			
 
				+        Args:
			
 
				+            page: PDF 页面对象
			
 
				+            page_num: 页码（用于日志）
			
 
				+
			
 
				+        Returns:
			
 
				+            表格区域列表，每个元素为 ((x1, y1, x2, y2), score)
			
 
				+        """
			
 
				+        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
			
 
				 
			
 
				         if not RAPID_LAYOUT_AVAILABLE:
			
 
				-            return table_pages
			
 
				+            return table_regions
			
 
				 
			
 
				         layout_engine = self._get_layout_engine()
			
 
				         if layout_engine is None:
			
 
				-            return table_pages
			
 
				+            return table_regions
			
 
				 
			
 
				-        logger.info(f"[版面分析] 检测表格页，共 {len(doc)} 页")
			
 
				-
			
 
				-        for page_num in range(1, len(doc) + 1):
			
 
				-            page = doc[page_num - 1]
			
 
				-
			
 
				-            # 裁剪页眉页脚
			
 
				-            rect = page.rect
			
 
				-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
			
 
				-
			
 
				-            # 渲染页面
			
 
				-            pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
			
 
				-            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
			
 
				-
			
 
				-            try:
			
 
				-                layout_output = layout_engine(img)
			
 
				+        # 裁剪页眉页脚
			
 
				+        rect = page.rect
			
 
				+        clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
			
 
				 
			
 
				-                # 解析版面结果
			
 
				-                labels = []
			
 
				-                if hasattr(layout_output, 'class_names'):
			
 
				-                    labels = list(layout_output.class_names)
			
 
				-                elif hasattr(layout_output, 'boxes'):
			
 
				-                    labels = [
			
 
				-                        label for _, label, _
			
 
				-                        in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
			
 
				-                    ]
			
 
				+        # 渲染页面
			
 
				+        pix = page.get_pixmap(dpi=self.dpi, clip=clip_box)
			
 
				+        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
			
 
				 
			
 
				-                if "table" in labels:
			
 
				-                    table_pages.add(page_num)
			
 
				-                    logger.debug(f"  第 {page_num} 页: 检测到表格")
			
 
				+        try:
			
 
				+            layout_output = layout_engine(img)
			
 
				+
			
 
				+            # 解析版面结果
			
 
				+            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
			
 
				+                # 获取缩放比例（像素坐标转 PDF 坐标）
			
 
				+                scale_x = clip_box.width / img.shape[1]
			
 
				+                scale_y = clip_box.height / img.shape[0]
			
 
				+
			
 
				+                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
			
 
				+                    if label == "table" and score > 0.5:  # 置信度阈值
			
 
				+                        # box 格式: [x1, y1, x2, y2] 像素坐标
			
 
				+                        # 转换为 PDF 坐标（加上裁剪区域的偏移）
			
 
				+                        pdf_x1 = clip_box.x0 + box[0] * scale_x
			
 
				+                        pdf_y1 = clip_box.y0 + box[1] * scale_y
			
 
				+                        pdf_x2 = clip_box.x0 + box[2] * scale_x
			
 
				+                        pdf_y2 = clip_box.y0 + box[3] * scale_y
			
 
				+
			
 
				+                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
			
 
				+                        logger.debug(f"  第 {page_num} 页: 检测到表格 ({pdf_x1:.1f}, {pdf_y1:.1f}, {pdf_x2:.1f}, {pdf_y2:.1f}), 置信度 {score:.2f}")
			
 
				 
			
 
				-            except Exception as e:
			
 
				-                logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
			
 
				 
			
 
				-        logger.info(f"[版面分析] 检测到 {len(table_pages)} 页含表格")
			
 
				-        return table_pages
			
 
				+        return table_regions
			
 
				 
			
 
				     def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
			
 
				         """
			
 
				-        执行 OCR 增强提取
			
 
				+        执行 OCR 增强提取（精准表格区域版）
			
 
				 
			
 
				         流程：
			
 
				-        1. PyMuPDF 提取全部文本
			
 
				-        2. 检测表格页
			
 
				-        3. 对表格页 OCR 替换内容
			
 
				+        1. PyMuPDF 提取全部文本（确保章节格式稳定）
			
 
				+        2. 检测每页的表格区域（返回坐标）
			
 
				+        3. 只 OCR 表格区域，替换该区域内容
			
 
				+        4. 其他文本保持 PyMuPDF 结果
			
 
				         """
			
 
				         total_start = time.time()
			
 
				 
			
@@ -211,44 +219,69 @@ class OcrEnhancedExtractor(FullTextExtractor):
 
				                     "start_pos": 0,  # 后续计算
			
 
				                     "end_pos": 0,
			
 
				                     "source_file": source_file,
			
 
				-                    "is_ocr": False,  # 标记是否 OCR
			
 
				+                    "is_ocr": False,
			
 
				+                    "ocr_regions": [],  # OCR 区域信息
			
 
				                 })
			
 
				 
			
 
				-            # 阶段 2: 检测表格页
			
 
				-            logger.info("[阶段2] 检测表格页...")
			
 
				-            table_pages = self._detect_table_pages(doc)
			
 
				+            # 阶段 2&3: 逐页检测表格区域并 OCR 替换
			
 
				+            logger.info("[阶段2] 检测表格区域并精准 OCR...")
			
 
				+            total_ocr_count = 0
			
 
				+            total_ocr_time = 0.0
			
 
				+
			
 
				+            for page_num in range(1, total_pages + 1):
			
 
				+                page = doc[page_num - 1]
			
 
				+
			
 
				+                # 检测该页的表格区域
			
 
				+                table_regions = self._detect_table_regions(page, page_num)
			
 
				 
			
 
				-            # 阶段 3: 对表格页 OCR
			
 
				-            if table_pages:
			
 
				-                logger.info(f"[阶段3] 对 {len(table_pages)} 页进行 OCR...")
			
 
				-                ocr_count = 0
			
 
				-                ocr_time = 0.0
			
 
				+                if not table_regions:
			
 
				+                    continue
			
 
				 
			
 
				-                for page_num in table_pages:
			
 
				-                    page = doc[page_num - 1]
			
 
				+                logger.info(f"  第 {page_num} 页: 检测到 {len(table_regions)} 个表格区域")
			
 
				 
			
 
				+                # 对每个表格区域进行 OCR
			
 
				+                ocr_results = []
			
 
				+                for idx, (bbox, score) in enumerate(table_regions):
			
 
				                     try:
			
 
				                         ocr_start = time.time()
			
 
				 
			
 
				-                        if self.ocr_engine_normalized == "glm_ocr":
			
 
				-                            ocr_text = self._ocr_with_glm(page, page_num)
			
 
				-                        else:
			
 
				-                            ocr_text = self._ocr_with_mineru(doc, page_num)
			
 
				+                        # 只 OCR 表格区域
			
 
				+                        ocr_text = self._ocr_table_region(page, bbox)
			
 
				 
			
 
				-                        ocr_time += time.time() - ocr_start
			
 
				-                        ocr_count += 1
			
 
				+                        ocr_time = time.time() - ocr_start
			
 
				+                        total_ocr_time += ocr_time
			
 
				 
			
 
				-                        # 替换该页内容
			
 
				-                        pages[page_num - 1]["text"] = ocr_text
			
 
				-                        pages[page_num - 1]["is_ocr"] = True
			
 
				-                        pages[page_num - 1]["original_text"] = pages[page_num - 1]["text"]  # 保留原文
			
 
				+                        ocr_results.append({
			
 
				+                            "region_index": idx,
			
 
				+                            "bbox": bbox,
			
 
				+                            "score": score,
			
 
				+                            "ocr_text": ocr_text,
			
 
				+                            "ocr_time": ocr_time,
			
 
				+                        })
			
 
				 
			
 
				-                        logger.debug(f"  第 {page_num} 页: OCR 完成 ({len(ocr_text)} 字符)")
			
 
				+                        logger.debug(f"    区域 {idx+1}: OCR 完成 ({len(ocr_text)} 字符), 耗时 {ocr_time:.2f}s")
			
 
				 
			
 
				                     except Exception as e:
			
 
				-                        logger.error(f"  第 {page_num} 页: OCR 失败 ({e})，使用原文")
			
 
				+                        logger.error(f"    区域 {idx+1}: OCR 失败 ({e})，保留原文")
			
 
				+
			
 
				+                # 替换表格区域内容
			
 
				+                if ocr_results:
			
 
				+                    original_text = pages[page_num - 1]["text"]
			
 
				+                    updated_text = self._replace_table_regions(
			
 
				+                        page, original_text, ocr_results, table_regions
			
 
				+                    )
			
 
				+
			
 
				+                    pages[page_num - 1]["text"] = updated_text
			
 
				+                    pages[page_num - 1]["is_ocr"] = True
			
 
				+                    pages[page_num - 1]["ocr_regions"] = [
			
 
				+                        {"bbox": r["bbox"], "score": r["score"], "chars": len(r["ocr_text"])}
			
 
				+                        for r in ocr_results
			
 
				+                    ]
			
 
				+
			
 
				+                    total_ocr_count += len(ocr_results)
			
 
				 
			
 
				-                logger.info(f"[OCR] 完成 {ocr_count} 页，耗时 {ocr_time:.2f}s")
			
 
				+            if total_ocr_count > 0:
			
 
				+                logger.info(f"[OCR] 完成 {total_ocr_count} 个表格区域，耗时 {total_ocr_time:.2f}s")
			
 
				 
			
 
				             # 阶段 4: 计算位置
			
 
				             current_pos = 0
			
@@ -264,19 +297,186 @@ class OcrEnhancedExtractor(FullTextExtractor):
 
				         # 统计
			
 
				         total_time = time.time() - total_start
			
 
				         ocr_pages = sum(1 for p in pages if p.get("is_ocr"))
			
 
				+        total_ocr_regions = sum(len(p.get("ocr_regions", [])) for p in pages)
			
 
				         total_chars = sum(len(p["text"]) for p in pages)
			
 
				 
			
 
				         logger.info(
			
 
				             f"[提取完成] 总页数: {total_pages} | "
			
 
				-            f"OCR: {ocr_pages} | 本地: {total_pages - ocr_pages} | "
			
 
				+            f"OCR页: {ocr_pages} | 本地页: {total_pages - ocr_pages} | "
			
 
				+            f"OCR区域: {total_ocr_regions} | "
			
 
				             f"总耗时: {total_time:.2f}s | "
			
 
				             f"总字符: {total_chars}"
			
 
				         )
			
 
				 
			
 
				         return pages
			
 
				 
			
 
    def _ocr_table_region(self, page: "fitz.Page", bbox: Tuple[float, float, float, float]) -> str:
        """Run OCR on a single rectangular region of a PDF page.

        The region is rendered to a JPEG at ``self.dpi``, passed through
        ``self._compress_image``, base64-encoded and posted to
        ``self.glm_api_url`` as a chat-completion request for the ``GLM-OCR``
        model. The parsed response is handed to ``self._extract_content``.

        Args:
            page: PDF page object to render from.
            bbox: Region to recognise as ``(x1, y1, x2, y2)``; it is passed
                straight to ``fitz.Rect`` and used as the render clip.

        Returns:
            Whatever ``self._extract_content`` returns for the service
            response (presumably the recognised table text - confirm against
            that helper).

        Raises:
            requests.HTTPError: Propagated from ``raise_for_status`` when the
                OCR service answers with an error status.
        """
        # Render only the requested region of the page.
        rect = fitz.Rect(bbox)
        pix = page.get_pixmap(dpi=self.dpi, clip=rect)
        img_bytes = pix.tobytes("jpeg")

        # Compress before encoding to keep the request payload small.
        compressed = self._compress_image(img_bytes)
        img_base64 = base64.b64encode(compressed).decode('utf-8')

        # The instructions are joined with explicit newlines. Adjacent string
        # literals used to be concatenated without separators, which fused the
        # numbered rules into one run-on sentence ("...格式2. 保持...").
        prompt = "\n".join([
            "识别图片中的表格内容，按原文排版输出。",
            "注意：",
            "1. 表格用 Markdown 表格格式",
            "2. 保持换行和列对齐",
            "3. 只输出表格内容，不要其他说明",
        ])

        payload = {
            "model": "GLM-OCR",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
                        }
                    ]
                }
            ],
            "max_tokens": 2048,
            "temperature": 0.1
        }

        response = requests.post(
            self.glm_api_url,
            headers=self.glm_headers,
            json=payload,
            timeout=self.glm_timeout
        )
        response.raise_for_status()

        return self._extract_content(response.json())
			
 
				+
			
 
    def _replace_table_regions(
        self,
        page: "fitz.Page",
        original_text: str,
        ocr_results: List[Dict[str, Any]],
        table_regions: List[Tuple[Tuple[float, float, float, float], float]]
    ) -> str:
        """Rebuild a page's text with table areas replaced by OCR output.

        The page text is reassembled from ``page.get_text("blocks")``. Blocks
        that lie mostly inside a table region for which OCR succeeded are
        dropped, and that region's OCR text is inserted at the region's
        position. All other blocks are kept in top-to-bottom, left-to-right
        order.

        Each OCR result is paired with its own ``"bbox"`` (falling back to
        ``table_regions[result["region_index"]]``), never with its position in
        ``ocr_results``. The list only holds regions whose OCR succeeded, so
        positional pairing would attach text to the wrong table after a
        failure. Regions without an OCR result keep their original blocks.

        Args:
            page: PDF page object the text was extracted from.
            original_text: Text previously extracted for the page; returned
                unchanged when there is nothing to replace.
            ocr_results: Dicts carrying ``"ocr_text"`` plus ``"bbox"`` and/or
                ``"region_index"`` for every successfully recognised region.
            table_regions: Detected regions as ``((x1, y1, x2, y2), score)``.

        Returns:
            The rebuilt text, each part followed by a newline, or
            ``original_text`` when no usable OCR result is supplied.
        """
        if not ocr_results:
            return original_text

        # Resolve the bbox each OCR result belongs to.
        regions = []
        for position, result in enumerate(ocr_results):
            bbox = result.get("bbox")
            if bbox is None:
                region_index = result.get("region_index", position)
                if not 0 <= region_index < len(table_regions):
                    continue
                bbox = table_regions[region_index][0]
            regions.append((tuple(bbox), result["ocr_text"]))

        if not regions:
            return original_text

        # Collect text blocks that sit inside the header/footer clip window.
        page_bottom = page.rect.height - self.clip_bottom
        text_blocks = []
        for block in page.get_text("blocks"):
            # block layout: (x0, y0, x1, y1, text, block_no, block_type)
            x0, y0, x1, y1, text, _, _ = block
            if y0 >= self.clip_top and y1 <= page_bottom:
                text_blocks.append(((x0, y0, x1, y1), text.strip()))

        def _covered_by_table(block_box):
            """Return True when more than half of the block lies in a region."""
            bx0, by0, bx1, by1 = block_box
            block_area = (bx1 - bx0) * (by1 - by0)
            if block_area <= 0:
                return False
            for (rx0, ry0, rx1, ry1), _ in regions:
                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
                if overlap_x * overlap_y / block_area > 0.5:
                    return True
            return False

        # Merge kept blocks and OCR texts by position. Placing OCR text by the
        # region's own coordinates also covers regions that contain no text
        # block at all (e.g. a table that is only an image).
        placed = [
            (box[1], box[0], text)
            for box, text in text_blocks
            if not _covered_by_table(box)
        ]
        placed.extend((box[1], box[0], ocr_text) for box, ocr_text in regions)
        placed.sort(key=lambda item: (item[0], item[1]))

        return "".join(f"{text}\n" for _, _, text in placed)
			
 
				+
			
 
				     def _ocr_with_glm(self, page: fitz.Page, page_num: int) -> str:
			
 
				-        """GLM-OCR 识别"""
			
 
				+        """GLM-OCR 识别（整页版本，保留用于兼容）"""
			
 
				         # 渲染页面
			
 
				         rect = page.rect
			
 
				         clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
			
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -184,7 +184,7 @@ class DocumentProcessor:
 
				                     stage=stage, current=current, message=message
			
 
				                 )
			
 
				 
			
 
				-            simple_processor = SimpleDocumentProcessor()
			
 
				+            simple_processor = SimpleDocumentProcessor(use_ocr=self.use_ocr)
			
 
				             unified_doc = await simple_processor.process_unified(
			
 
				                 file_content=file_content,
			
 
				                 file_name=f"document_{uuid.uuid4().hex[:8]}",
			
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor.py
@@ -1,33 +1,102 @@
 
				 """
			
 
				-PDF 结构提取器
			
 
				+PDF 结构提取器 - 同步并发 OCR 版本
			
 
				 
			
 
				 基于 splitter_pdf 逻辑，直接提取章节结构并记录页码。
			
 
				+支持 OCR 增强：检测表格区域并使用 ThreadPoolExecutor 5并发 OCR，其他文本保持 PyMuPDF 提取。
			
 
				 输出格式兼容后续分类与组装流程。
			
 
				 """
			
 
				 
			
 
				+import base64
			
 
				+import io
			
 
				 import re
			
 
				-from typing import Dict, Any
			
 
				+from concurrent.futures import ThreadPoolExecutor, as_completed
			
 
				+from dataclasses import dataclass
			
 
				+from typing import Dict, Any, List, Optional, Tuple
			
 
				 
			
 
				 import fitz
			
 
				+import numpy as np
			
 
				+import requests
			
 
				 
			
 
				 from foundation.observability.logger.loggering import review_logger as logger
			
 
				 
			
 
				+# 尝试导入 RapidLayout
			
 
				+try:
			
 
				+    from rapid_layout import RapidLayout
			
 
				+    RAPID_LAYOUT_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    RAPID_LAYOUT_AVAILABLE = False
			
 
				+    RapidLayout = None
			
 
				+
			
 
				+
			
 
@dataclass
class TableRegion:
    """A table area found on a PDF page and queued for OCR."""
    # Page number of the region; the visible caller passes ``page_index + 1``,
    # i.e. a 1-based number.
    page_num: int
    # Page object the region was detected on; it is handed to the OCR step.
    page: fitz.Page
    # Region rectangle as a 4-tuple of floats.
    # NOTE(review): presumably (x1, y1, x2, y2) in PDF coordinates - confirm
    # against the detector that produces it.
    bbox: Tuple[float, float, float, float]
    # Score reported by the table detector for this region.
    score: float
			
 
				+
			
 
				+
			
 
@dataclass
class OcrResult:
    """Outcome of one OCR attempt on a table region."""
    # Page number copied from the region that was processed.
    page_num: int
    # Rectangle of the processed region, copied from the region.
    bbox: Tuple[float, float, float, float]
    # Detection score copied from the region.
    score: float
    # Recognised text; the visible producer stores "" when OCR failed.
    text: str
    # True when the OCR call returned without raising, False otherwise.
    success: bool
			
 
				+
			
 
				 
			
 
				 class PdfStructureExtractor:
			
 
				-    """PDF 章节结构提取器"""
			
 
				+    """PDF 章节结构提取器（支持 OCR 异步并发）"""
			
 
				 
			
 
				     CHAPTER_PATTERN = re.compile(r"^第[一二三四五六七八九十百]+章\s*.*")
			
 
				     SECTION_PATTERN = re.compile(r"^[一二三四五六七八九十百]+、\s*.*")
			
 
				     TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
			
 
				 
			
 
				-    def __init__(self, clip_top: float = 60, clip_bottom: float = 60):
			
 
				+    # OCR 配置
			
 
				+    MAX_SHORT_EDGE = 1024
			
 
				+    JPEG_QUALITY = 90
			
 
				+    OCR_DPI = 200
			
 
				+    OCR_CONFIDENCE_THRESHOLD = 0.5
			
 
				+    OCR_CONCURRENT_WORKERS = 5
			
 
				+
			
 
    def __init__(
        self,
        clip_top: float = 60,
        clip_bottom: float = 60,
        use_ocr: bool = False,
        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
        ocr_timeout: int = 600,
        ocr_api_key: str = "",
    ):
        """Store the clipping margins and the OCR settings.

        Args:
            clip_top: Height cut from the top of every page.
            clip_bottom: Height cut from the bottom of every page.
            use_ocr: Whether table OCR is requested. It only takes effect
                when RapidLayout could be imported.
            ocr_api_url: Endpoint of the OCR chat-completion service.
            ocr_timeout: Timeout value stored for OCR requests.
            ocr_api_key: API key stored for the OCR service.
        """
        # Tell the operator when OCR was asked for but cannot be provided.
        if use_ocr and not RAPID_LAYOUT_AVAILABLE:
            logger.warning("RapidLayout 未安装，OCR 功能不可用")

        # Page clipping margins.
        self.clip_top, self.clip_bottom = clip_top, clip_bottom

        # OCR is active only when requested and the layout model is importable.
        self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE

        # OCR service settings.
        self.ocr_api_url = ocr_api_url
        self.ocr_timeout = ocr_timeout
        self.ocr_api_key = ocr_api_key

        # The layout engine is created lazily on first use.
        self._layout_engine: Optional[Any] = None
			
 
				 
			
 
				-    def extract(self, file_content: bytes) -> Dict[str, Any]:
			
 
    def _get_layout_engine(self) -> Optional[Any]:
        """Return the cached RapidLayout engine, creating it on first use.

        Returns:
            The engine instance, or ``None`` when none has been created and
            RapidLayout is not available.
        """
        engine = self._layout_engine
        if engine is not None:
            return engine
        if RAPID_LAYOUT_AVAILABLE:
            self._layout_engine = RapidLayout()
        return self._layout_engine
			
 
				+
			
 
				+    def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
			
 
				         """
			
 
				         从 PDF 字节流提取章节结构。
			
 
				 
			
 
				+        Args:
			
 
				+            file_content: PDF 文件字节流
			
 
				+            progress_callback: 进度回调函数，接收 (stage, current, message) 参数
			
 
				+
			
 
				         Returns:
			
 
				             {
			
 
				                 "chapters": {
			
@@ -41,23 +110,95 @@ class PdfStructureExtractor:
 
				         """
			
 
				         doc = fitz.open(stream=file_content)
			
 
				         try:
			
 
				-            structure = self._extract_from_doc(doc)
			
 
				+            structure = self._extract_from_doc(doc, progress_callback)
			
 
				             structure["total_pages"] = len(doc)
			
 
				             return structure
			
 
				         finally:
			
 
				             doc.close()
			
 
				 
			
 
				-    def _extract_from_doc(self, doc: fitz.Document) -> Dict[str, Any]:
			
 
				+    def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
			
 
				+        """提取文档结构（支持 OCR 异步并发）"""
			
 
				+
			
 
				+        def _emit_progress(stage: str, current: int, message: str):
			
 
				+            """发送进度回调"""
			
 
				+            if progress_callback:
			
 
				+                try:
			
 
				+                    progress_callback(stage, current, message)
			
 
				+                except Exception:
			
 
				+                    pass
			
 
				+
			
 
				+        # === 阶段1: 收集所有需要 OCR 的表格区域 ===
			
 
				+        table_regions: List[TableRegion] = []
			
 
				+
			
 
				+        if self.use_ocr:
			
 
				+            logger.info("[OCR预处理] 扫描所有页面的表格区域...")
			
 
				+            total_pages = len(doc)
			
 
				+            for page_num in range(total_pages):
			
 
				+                page = doc.load_page(page_num)
			
 
				+                rect = page.rect
			
 
				+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
			
 
				+                regions = self._detect_table_regions(page, page_num + 1, clip_box)
			
 
				+                for bbox, score in regions:
			
 
				+                    table_regions.append(TableRegion(
			
 
				+                        page_num=page_num + 1,
			
 
				+                        page=page,
			
 
				+                        bbox=bbox,
			
 
				+                        score=score
			
 
				+                    ))
			
 
				+                # 每5页或最后一页推送一次进度
			
 
				+                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
			
 
				+                    progress = int((page_num + 1) / total_pages * 30)  # OCR预处理占30%进度
			
 
				+                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
			
 
				+            logger.info(f"[OCR预处理] 共发现 {len(table_regions)} 个表格区域需要 OCR")
			
 
				+
			
 
				+        # === 阶段2: 异步并发执行 OCR (5并发) ===
			
 
				+        ocr_results: List[OcrResult] = []
			
 
				+
			
 
				+        if table_regions:
			
 
				+            logger.info(f"[OCR执行] 使用 {self.OCR_CONCURRENT_WORKERS} 并发执行 OCR...")
			
 
				+            _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格，开始OCR识别...")
			
 
				+            ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
			
 
				+            success_count = sum(1 for r in ocr_results if r.success)
			
 
				+            logger.info(f"[OCR执行] 完成 {success_count}/{len(table_regions)} 个表格 OCR")
			
 
				+            _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
			
 
				+
			
 
				+        # 按页码分组 OCR 结果
			
 
				+        ocr_by_page: Dict[int, List[OcrResult]] = {}
			
 
				+        for result in ocr_results:
			
 
				+            if result.success:
			
 
				+                if result.page_num not in ocr_by_page:
			
 
				+                    ocr_by_page[result.page_num] = []
			
 
				+                ocr_by_page[result.page_num].append(result)
			
 
				+
			
 
				+        # === 阶段3: 提取页面文本（应用 OCR 结果）并切分章节 ===
			
 
				         structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
			
 
				         current_chapter = "未分类前言"
			
 
				         current_section = "默认部分"
			
 
				         in_body = False
			
 
				 
			
 
				+        logger.info("[文本提取] 提取页面内容并切分章节...")
			
 
				+
			
 
				         for page_num in range(len(doc)):
			
 
				             page = doc.load_page(page_num)
			
 
				             rect = page.rect
			
 
				             clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
			
 
				-            text = page.get_text("text", clip=clip_box)
			
 
				+
			
 
				+            # 获取页面文本（应用 OCR 结果）
			
 
				+            if page_num + 1 in ocr_by_page:
			
 
				+                original_text = page.get_text("text", clip=clip_box)
			
 
				+                ocr_results_list = [
			
 
				+                    {
			
 
				+                        "region_index": i,
			
 
				+                        "bbox": r.bbox,
			
 
				+                        "score": r.score,
			
 
				+                        "ocr_text": r.text,
			
 
				+                    }
			
 
				+                    for i, r in enumerate(ocr_by_page[page_num + 1])
			
 
				+                ]
			
 
				+                text = self._replace_table_regions(page, original_text, ocr_results_list, clip_box)
			
 
				+            else:
			
 
				+                text = page.get_text("text", clip=clip_box)
			
 
				+
			
 
				             lines = text.split("\n")
			
 
				 
			
 
				             for line in lines:
			
@@ -133,6 +274,295 @@ class PdfStructureExtractor:
 
				         logger.info(f"[PdfExtractor] 提取完成，共 {len(result['chapters'])} 个章节")
			
 
				         return result
			
 
				 
			
 
				+    def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
			
 
				+        """同步并发处理 OCR（使用 ThreadPoolExecutor）"""
			
 
				+        results: List[OcrResult] = []
			
 
				+        total = len(regions)
			
 
				+        completed = 0
			
 
				+
			
 
				+        with ThreadPoolExecutor(max_workers=self.OCR_CONCURRENT_WORKERS) as executor:
			
 
				+            # 提交所有任务
			
 
				+            future_to_region = {
			
 
				+                executor.submit(self._ocr_table_region, r.page, r.bbox): r
			
 
				+                for r in regions
			
 
				+            }
			
 
				+
			
 
				+            # 处理完成的结果
			
 
				+            for future in as_completed(future_to_region):
			
 
				+                region = future_to_region[future]
			
 
				+                completed += 1
			
 
				+                try:
			
 
				+                    text = future.result()
			
 
				+                    results.append(OcrResult(
			
 
				+                        page_num=region.page_num,
			
 
				+                        bbox=region.bbox,
			
 
				+                        score=region.score,
			
 
				+                        text=text,
			
 
				+                        success=True,
			
 
				+                    ))
			
 
				+                except Exception as e:
			
 
				+                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
			
 
				+                    results.append(OcrResult(
			
 
				+                        page_num=region.page_num,
			
 
				+                        bbox=region.bbox,
			
 
				+                        score=region.score,
			
 
				+                        text="",
			
 
				+                        success=False,
			
 
				+                    ))
			
 
				+
			
 
				+                # 每完成5个或最后一个时推送进度
			
 
				+                if progress_callback and (completed % 5 == 0 or completed == total):
			
 
				+                    progress = 35 + int(completed / total * 15)  # OCR执行占15%进度(35-50)
			
 
				+                    progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
			
 
				+
			
 
				+        return results
			
 
				+
			
 
    def _detect_table_regions(
        self,
        page: fitz.Page,
        page_num: int,
        clip_box: fitz.Rect
    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
        """Detect table regions on a page with the layout engine.

        The clipped page area is rendered at ``self.OCR_DPI`` and passed to the
        layout engine. Every detection labelled ``"table"`` whose score exceeds
        ``self.OCR_CONFIDENCE_THRESHOLD`` is mapped from image pixels back to
        PDF coordinates.

        Args:
            page: Page to analyse.
            page_num: Page number, used only in the warning log message.
            clip_box: Area of the page to render and analyse.

        Returns:
            A list of ``((x0, y0, x1, y1), score)`` tuples in PDF coordinates.
            The list is empty when layout analysis is unavailable, when
            rendering or analysis fails, or when no table is found.
        """
        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []

        if not RAPID_LAYOUT_AVAILABLE:
            return table_regions

        layout_engine = self._get_layout_engine()
        if layout_engine is None:
            return table_regions

        try:
            # Rendering is inside the try so that a page that cannot be
            # rendered is skipped instead of failing the whole extraction.
            pix = page.get_pixmap(dpi=self.OCR_DPI, clip=clip_box)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)

            layout_output = layout_engine(img)

            boxes = getattr(layout_output, "boxes", None)
            labels = getattr(layout_output, "class_names", None)
            scores = getattr(layout_output, "scores", None)
            if boxes is None or labels is None or scores is None:
                # No usable detections on this page; this is not an error.
                return table_regions

            # Pixel -> PDF point scale factors for the rendered clip area.
            scale_x = clip_box.width / img.shape[1]
            scale_y = clip_box.height / img.shape[0]

            for box, label, score in zip(boxes, labels, scores):
                if label == "table" and score > self.OCR_CONFIDENCE_THRESHOLD:
                    pdf_bbox = (
                        clip_box.x0 + box[0] * scale_x,
                        clip_box.y0 + box[1] * scale_y,
                        clip_box.x0 + box[2] * scale_x,
                        clip_box.y0 + box[3] * scale_y,
                    )
                    table_regions.append((pdf_bbox, score))

        except Exception as e:
            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")

        return table_regions
			
 
				+
			
 
    def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
        """OCR one rectangular page region through the GLM-OCR HTTP endpoint.

        The region is rendered to JPEG at ``self.OCR_DPI``, compressed,
        base64-encoded and posted to ``self.ocr_api_url``. Failed requests are
        retried with exponential backoff.

        Args:
            page: Page that contains the region.
            bbox: Region to render, as ``(x0, y0, x1, y1)``.
            max_retries: Total number of attempts. Values below 1 are treated
                as a single attempt.

        Returns:
            The text extracted from the OCR response.

        Raises:
            Exception: The error of the last attempt when every attempt failed.
        """
        import time

        # Render the requested region.
        rect = fitz.Rect(bbox)
        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=rect)
        img_bytes = pix.tobytes("jpeg")

        # Shrink the image before embedding it in the request body.
        compressed = self._compress_image(img_bytes)
        img_base64 = base64.b64encode(compressed).decode('utf-8')

        payload = {
            "model": "GLM-OCR",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "识别图片中的表格内容，按原文排版输出。"
                                    "注意："
                                    "1. 表格用 Markdown 表格格式"
                                    "2. 保持换行和列对齐"
                                    "3. 只输出表格内容，不要其他说明"
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
                        }
                    ]
                }
            ],
            "max_tokens": 2048,
            "temperature": 0.1
        }

        headers = {"Content-Type": "application/json"}
        if self.ocr_api_key:
            headers["Authorization"] = f"Bearer {self.ocr_api_key}"

        # Always try at least once; otherwise the loop would be skipped and
        # the final ``raise`` would be handed ``None``.
        attempts = max(1, max_retries)
        last_error = None
        for attempt in range(attempts):
            try:
                response = requests.post(
                    self.ocr_api_url,
                    headers=headers,
                    json=payload,
                    timeout=self.ocr_timeout
                )
                response.raise_for_status()
                return self._extract_ocr_content(response.json())

            except Exception as e:
                last_error = e
                if attempt < attempts - 1:
                    # Exponential backoff: 2s, 4s, ... (no wait after the last attempt).
                    wait_time = 2 ** (attempt + 1)
                    logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败（已重试{attempts}次）: {e}")

        # Every attempt failed: surface the last error to the caller.
        raise last_error
			
 
				+
			
 
				+    def _replace_table_regions(
			
 
				+        self,
			
 
				+        page: fitz.Page,
			
 
				+        original_text: str,
			
 
				+        ocr_results: List[Dict],
			
 
				+        clip_box: fitz.Rect
			
 
				+    ) -> str:
			
 
				+        """用 OCR 结果替换原始文本中的表格区域"""
			
 
				+        if not ocr_results:
			
 
				+            return original_text
			
 
				+
			
 
				+        # 获取页面上的文本块及其坐标
			
 
				+        text_blocks = []
			
 
				+        for block in page.get_text("blocks"):
			
 
				+            x0, y0, x1, y1, text, _, _ = block
			
 
				+            # 只考虑裁剪区域内的文本
			
 
				+            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
			
 
				+                text_blocks.append({
			
 
				+                    "bbox": (x0, y0, x1, y1),
			
 
				+                    "text": text.strip(),
			
 
				+                })
			
 
				+
			
 
				+        # 按 Y 坐标排序
			
 
				+        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
			
 
				+
			
 
				+        # 找出属于表格区域的文本块
			
 
				+        replaced_indices: Set[int] = set()
			
 
				+        for ocr_result in ocr_results:
			
 
				+            bbox = ocr_result["bbox"]
			
 
				+            rx0, ry0, rx1, ry1 = bbox
			
 
				+
			
 
				+            for idx, block in enumerate(text_blocks):
			
 
				+                if idx in replaced_indices:
			
 
				+                    continue
			
 
				+                bx0, by0, bx1, by1 = block["bbox"]
			
 
				+
			
 
				+                # 检查重叠
			
 
				+                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
			
 
				+                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
			
 
				+                overlap_area = overlap_x * overlap_y
			
 
				+                block_area = (bx1 - bx0) * (by1 - by0)
			
 
				+
			
 
				+                if block_area > 0 and overlap_area / block_area > 0.5:
			
 
				+                    replaced_indices.add(idx)
			
 
				+
			
 
				+        # 构建新文本
			
 
				+        result_parts: List[str] = []
			
 
				+        last_idx = 0
			
 
				+
			
 
				+        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
			
 
				+            bbox = ocr_result["bbox"]
			
 
				+            rx0, ry0, rx1, ry1 = bbox
			
 
				+
			
 
				+            # 找到该表格区域之前的文本
			
 
				+            region_start_idx = None
			
 
				+            for idx, block in enumerate(text_blocks):
			
 
				+                if idx in replaced_indices:
			
 
				+                    bx0, by0, bx1, by1 = block["bbox"]
			
 
				+                    if (bx0 >= rx0 - 5 and bx1 <= rx1 + 5 and
			
 
				+                        by0 >= ry0 - 5 and by1 <= ry1 + 5):
			
 
				+                        if region_start_idx is None:
			
 
				+                            region_start_idx = idx
			
 
				+                        last_idx = idx + 1
			
 
				+
			
 
				+            if region_start_idx is not None:
			
 
				+                # 添加表格前的非表格文本
			
 
				+                for idx in range(last_idx - (last_idx - region_start_idx), region_start_idx):
			
 
				+                    if idx not in replaced_indices and idx < len(text_blocks):
			
 
				+                        result_parts.append(text_blocks[idx]["text"])
			
 
				+                        result_parts.append("\n")
			
 
				+
			
 
				+                # 添加 OCR 结果
			
 
				+                result_parts.append(ocr_result["ocr_text"])
			
 
				+                result_parts.append("\n")
			
 
				+
			
 
				+        # 添加剩余文本
			
 
				+        for idx in range(last_idx, len(text_blocks)):
			
 
				+            if idx not in replaced_indices:
			
 
				+                result_parts.append(text_blocks[idx]["text"])
			
 
				+                result_parts.append("\n")
			
 
				+
			
 
				+        return "".join(result_parts).strip() or original_text
			
 
				+
			
 
    def _compress_image(self, img_bytes: bytes) -> bytes:
        """Re-encode an image as a size-limited RGB JPEG.

        Images with transparency or a palette are flattened onto a white
        background, other non-RGB modes are converted to RGB, and the image
        is scaled down when its shorter edge exceeds ``self.MAX_SHORT_EDGE``.
        The result is saved as JPEG with ``self.JPEG_QUALITY``.

        Args:
            img_bytes: Encoded source image.

        Returns:
            The re-encoded JPEG bytes, or ``img_bytes`` unchanged when any
            step fails (a warning is logged).
        """
        try:
            from PIL import Image

            picture = Image.open(io.BytesIO(img_bytes))
            source_mode = picture.mode

            if source_mode in ('RGBA', 'LA', 'P'):
                # Flatten onto white, using the alpha band as the paste mask.
                canvas = Image.new('RGB', picture.size, (255, 255, 255))
                if source_mode == 'P':
                    picture = picture.convert('RGBA')
                if picture.mode in ('RGBA', 'LA'):
                    canvas.paste(picture, mask=picture.split()[-1])
                picture = canvas
            elif source_mode != 'RGB':
                picture = picture.convert('RGB')

            shortest = min(picture.size)
            if shortest > self.MAX_SHORT_EDGE:
                factor = self.MAX_SHORT_EDGE / shortest
                target = (int(picture.width * factor), int(picture.height * factor))
                picture = picture.resize(target, Image.Resampling.LANCZOS)

            encoded = io.BytesIO()
            picture.save(encoded, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
            return encoded.getvalue()

        except Exception as e:
            logger.warning(f"图片压缩失败，使用原图: {e}")
            return img_bytes
			
 
				+
			
 
				+    def _extract_ocr_content(self, result: Dict) -> str:
			
 
				+        """从 OCR 响应提取内容，并将 HTML 表格转换为 Markdown"""
			
 
				+        content = ""
			
 
				+        if "choices" in result and isinstance(result["choices"], list):
			
 
				+            if len(result["choices"]) > 0:
			
 
				+                message = result["choices"][0].get("message", {})
			
 
				+                content = message.get("content", "")
			
 
				+
			
 
				+        # 如果内容包含 HTML 标签，转换为 Markdown
			
 
				+        if content and "<" in content and ">" in content:
			
 
				+            try:
			
 
				+                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
			
 
				+                content = convert_html_to_markdown(content)
			
 
				+            except Exception as e:
			
 
				+                logger.debug(f"HTML 转 Markdown 失败，保留原始内容: {e}")
			
 
				+
			
 
				+        return content
			
 
				+
			
 
				     @staticmethod
			
 
				     def _is_header_footer(line: str) -> bool:
			
 
				         return (
			
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -35,8 +35,26 @@ from ..doc_worker.models import (
 
				 class SimpleDocumentProcessor:
			
 
				     """最简文档处理器"""
			
 
				 
			
 
				-    def __init__(self):
			
 
				-        self.pdf_extractor = PdfStructureExtractor()
			
 
				+    def __init__(self, use_ocr: bool = False):
			
 
				+        # 从配置读取 OCR 配置
			
 
				+        ocr_api_url = "http://183.220.37.46:25429/v1/chat/completions"
			
 
				+        ocr_api_key = ""
			
 
				+        ocr_timeout = 600
			
 
				+
			
 
				+        try:
			
 
				+            from foundation.infrastructure.config.config import config_handler
			
 
				+            ocr_api_url = config_handler.get("ocr", "GLM_OCR_API_URL", ocr_api_url)
			
 
				+            ocr_api_key = config_handler.get("ocr", "GLM_OCR_API_KEY", "")
			
 
				+            ocr_timeout = int(config_handler.get("ocr", "GLM_OCR_TIMEOUT", str(ocr_timeout)))
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+
			
 
				+        self.pdf_extractor = PdfStructureExtractor(
			
 
				+            use_ocr=use_ocr,
			
 
				+            ocr_api_url=ocr_api_url,
			
 
				+            ocr_api_key=ocr_api_key,
			
 
				+            ocr_timeout=ocr_timeout,
			
 
				+        )
			
 
				         self.hierarchy_classifier = HierarchyClassifier()
			
 
				         self.chunk_classifier = ChunkClassifier()
			
 
				 
			
@@ -86,8 +104,20 @@ class SimpleDocumentProcessor:
 
				         """执行核心流程，返回 (structure, primary_result, secondary_result, chunks)。"""
			
 
				         logger.info(f"[SimpleProcessor] 开始处理文档: {file_name}")
			
 
				 
			
 
				-        # 1. PDF 结构提取
			
 
				-        structure = self.pdf_extractor.extract(file_content)
			
 
				+        # 1. PDF 结构提取（带进度回调）
			
 
				+        def _extraction_progress(stage: str, current: int, message: str):
			
 
				+            # 版面分析阶段映射到"文档提取"阶段，进度0-50
			
 
				+            if progress_callback:
			
 
				+                # 使用 asyncio.create_task 异步执行回调避免阻塞
			
 
				+                try:
			
 
				+                    loop = asyncio.get_event_loop()
			
 
				+                    loop.create_task(self._emit_progress(
			
 
				+                        progress_callback, "文档提取", int(current * 0.5), message
			
 
				+                    ))
			
 
				+                except Exception:
			
 
				+                    pass
			
 
				+
			
 
				+        structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
			
 
				         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
			
 
				 
			
 
				         # 2. 一级分类
			
--- a/core/construction_review/workflows/document_workflow.py
+++ b/core/construction_review/workflows/document_workflow.py
@@ -32,9 +32,15 @@ class DocumentWorkflow:
 
				 
			
 
				         self.progress_manager = progress_manager
			
 
				         self.redis_duplicate_checker = redis_duplicate_checker
			
 
				+        # 从配置读取是否启用 OCR
			
 
				+        from foundation.infrastructure.config.config import config_handler
			
 
				+        use_ocr_str = config_handler.get("ocr", "enable", "false")
			
 
				+        use_ocr = use_ocr_str.lower() in ("true", "1", "yes", "on")
			
 
				+
			
 
				         self.document_processor = DocumentProcessor(
			
 
				             progress_manager=progress_manager,
			
 
				-            callback_task_id=task_file_info.callback_task_id
			
 
				+            callback_task_id=task_file_info.callback_task_id,
			
 
				+            use_ocr=use_ocr
			
 
				         )
			
 
				 
			
 
				     async def _processing_heartbeat(self, progress_state: dict) -> None: