|
@@ -0,0 +1,549 @@
|
|
|
|
|
+"""
|
|
|
|
|
+YOLO 目录页检测与 OCR 提取模块
|
|
|
|
|
+
|
|
|
|
|
+用于在文档处理流程早期检测目录页并提取目录内容,
|
|
|
|
|
+输出结构与 outline 保持一致,便于后续进行目录完整性检查。
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import io
|
|
|
|
|
+import os
|
|
|
|
|
+import re
|
|
|
|
|
+from dataclasses import dataclass
|
|
|
|
|
+from typing import Dict, Any, List, Optional, Tuple
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+
|
|
|
|
|
+import fitz
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+
|
|
|
|
|
+from foundation.observability.logger.loggering import review_logger as logger
|
|
|
|
|
+
|
|
|
|
|
# Optional dependency: ultralytics supplies the YOLO detector used for
# TOC-page detection. Detection is skipped entirely when it is missing.
try:
    from ultralytics import YOLO
    YOLO_AVAILABLE = True
except ImportError:
    YOLO_AVAILABLE = False

# Optional dependency: Pillow converts rendered PDF pages into images
# that can be fed to the YOLO model as numpy arrays.
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class CatalogItem:
    """A single table-of-contents entry (chapter or section).

    NOTE(review): the parser methods in this module build plain dicts rather
    than this dataclass; it appears unused here — confirm whether other
    modules import it before removing.
    """
    index: int            # chapter ordinal, 1-based
    title: str            # chapter/section title text
    page: str             # page number as it appeared in the TOC (kept as a string)
    original: str         # raw TOC line this entry was parsed from
    level: int = 1        # hierarchy level: 1 = chapter, 2 = section
    parent_title: str = ""  # owning chapter title, only meaningful for level 2
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class CatalogSection:
    """A second-level TOC entry (a section inside a chapter).

    NOTE(review): like CatalogItem, this dataclass is not referenced by the
    dict-building parser below — verify external usage before removal.
    """
    title: str     # section title
    page: str      # page number as printed in the TOC
    level: int     # hierarchy level (expected to be 2)
    original: str  # raw TOC line
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class CatalogChapter:
    """A first-level TOC entry (a chapter) together with its sections.

    NOTE(review): not referenced by the dict-building parser below — verify
    external usage before removal.
    """
    index: int                          # chapter ordinal, 1-based
    title: str                          # chapter title
    page: str                           # page number as printed in the TOC
    original: str                       # raw TOC line
    subsections: List[CatalogSection]   # level-2 entries belonging to this chapter
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class TOCCatalogExtractor:
    """
    Table-of-contents page detector and content extractor.

    A YOLO model locates TOC pages near the front of a PDF, GLM-OCR reads
    their text, and the text is parsed into structured data whose shape
    matches the document outline ({"chapters": [...], "total_chapters": N}),
    so downstream TOC-completeness checks can consume either source.
    """

    # --- YOLO detection settings ---
    DEFAULT_MODEL_PATH = "config/yolo/best.pt"  # weights used when no path is supplied
    CONF_THRESHOLD = 0.25   # minimum detection confidence passed to the model
    MAX_CHECK_PAGES = 50    # only the first N pages are scanned for a TOC
    DPI = 150               # render resolution for detection (speed over fidelity)

    # --- OCR settings ---
    OCR_DPI = 200           # render resolution for OCR (higher than detection DPI)
    MAX_SHORT_EDGE = 1024   # images are downscaled so the short edge fits this
    JPEG_QUALITY = 90       # JPEG quality used when uploading pages to the OCR API
|
|
|
|
|
+
|
|
|
|
|
    def __init__(
        self,
        model_path: Optional[str] = None,
        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
        ocr_api_key: str = "",
        ocr_timeout: int = 600,
    ):
        """Configure the extractor; no model loading or I/O happens here.

        Args:
            model_path: Path to the YOLO weights file. Falls back to
                DEFAULT_MODEL_PATH when None or empty.
            ocr_api_url: GLM-OCR chat-completions endpoint.
            ocr_api_key: Bearer token for the OCR service; empty means
                no Authorization header is sent.
            ocr_timeout: Per-request OCR timeout in seconds.
        """
        self.model_path = model_path or self.DEFAULT_MODEL_PATH
        self.ocr_api_url = ocr_api_url
        self.ocr_api_key = ocr_api_key
        self.ocr_timeout = ocr_timeout

        # The YOLO model is loaded lazily on first use by _load_model().
        self._model = None
        # Detection needs both ultralytics (YOLO) and Pillow (page rendering).
        self._yolo_available = YOLO_AVAILABLE and PIL_AVAILABLE
|
|
|
|
|
+
|
|
|
|
|
+ def _load_model(self) -> bool:
|
|
|
|
|
+ """加载 YOLO 模型"""
|
|
|
|
|
+ if not self._yolo_available:
|
|
|
|
|
+ logger.debug("[TOC检测] YOLO库未安装,跳过目录检测")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if not os.path.exists(self.model_path):
|
|
|
|
|
+ logger.debug(f"[TOC检测] 模型文件不存在: {self.model_path}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if self._model is None:
|
|
|
|
|
+ try:
|
|
|
|
|
+ logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
|
|
|
|
|
+ self._model = YOLO(self.model_path)
|
|
|
|
|
+ return True
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[TOC检测] 模型加载失败: {e}")
|
|
|
|
|
+ return False
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ def detect_and_extract(
|
|
|
|
|
+ self,
|
|
|
|
|
+ file_content: bytes,
|
|
|
|
|
+ progress_callback=None
|
|
|
|
|
+ ) -> Optional[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 检测目录页并提取目录内容
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ file_content: PDF文件字节流
|
|
|
|
|
+ progress_callback: 进度回调函数
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 目录结构字典,格式与 outline 保持一致:
|
|
|
|
|
+ {
|
|
|
|
|
+ "chapters": [...],
|
|
|
|
|
+ "total_chapters": N
|
|
|
|
|
+ }
|
|
|
|
|
+ """
|
|
|
|
|
+ if not self._load_model():
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ doc = fitz.open(stream=file_content)
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 1. 检测目录页范围
|
|
|
|
|
+ toc_pages = self._detect_toc_pages(doc, progress_callback)
|
|
|
|
|
+ if not toc_pages:
|
|
|
|
|
+ logger.info("[TOC检测] 未检测到目录页")
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"[TOC检测] 检测到目录页: 第{toc_pages[0]+1}页 - 第{toc_pages[-1]+1}页")
|
|
|
|
|
+
|
|
|
|
|
+ # 2. OCR 提取目录页内容
|
|
|
|
|
+ if progress_callback:
|
|
|
|
|
+ progress_callback("目录识别", 10, f"检测到{len(toc_pages)}页目录,开始OCR识别...")
|
|
|
|
|
+
|
|
|
|
|
+ toc_text = self._ocr_toc_pages(doc, toc_pages, progress_callback)
|
|
|
|
|
+
|
|
|
|
|
+ if not toc_text:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ # 3. 解析目录文本为结构化数据
|
|
|
|
|
+ if progress_callback:
|
|
|
|
|
+ progress_callback("目录识别", 80, "解析目录结构...")
|
|
|
|
|
+
|
|
|
|
|
+ catalog = self._parse_toc_text(toc_text)
|
|
|
|
|
+
|
|
|
|
|
+ if progress_callback:
|
|
|
|
|
+ progress_callback("目录识别", 100, f"目录提取完成,共{catalog['total_chapters']}章")
|
|
|
|
|
+
|
|
|
|
|
+ return catalog
|
|
|
|
|
+
|
|
|
|
|
+ finally:
|
|
|
|
|
+ doc.close()
|
|
|
|
|
+
|
|
|
|
|
+ def _detect_toc_pages(
|
|
|
|
|
+ self,
|
|
|
|
|
+ doc: fitz.Document,
|
|
|
|
|
+ progress_callback=None
|
|
|
|
|
+ ) -> List[int]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 使用 YOLO 检测目录页范围
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ 目录页索引列表(0-based)
|
|
|
|
|
+ """
|
|
|
|
|
+ toc_pages = []
|
|
|
|
|
+ total_pages = len(doc)
|
|
|
|
|
+ pages_to_check = min(total_pages, self.MAX_CHECK_PAGES)
|
|
|
|
|
+
|
|
|
|
|
+ for page_idx in range(pages_to_check):
|
|
|
|
|
+ page = doc.load_page(page_idx)
|
|
|
|
|
+
|
|
|
|
|
+ # 渲染页面
|
|
|
|
|
+ zoom = self.DPI / 72
|
|
|
|
|
+ mat = fitz.Matrix(zoom, zoom)
|
|
|
|
|
+ pix = page.get_pixmap(matrix=mat)
|
|
|
|
|
+
|
|
|
|
|
+ # 转换为 numpy 数组
|
|
|
|
|
+ img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
|
|
|
|
+ img_array = np.array(img)
|
|
|
|
|
+
|
|
|
|
|
+ # YOLO 检测
|
|
|
|
|
+ results = self._model(img_array, conf=self.CONF_THRESHOLD, verbose=False)
|
|
|
|
|
+
|
|
|
|
|
+ # 检查是否检测到 catalogs 类别
|
|
|
|
|
+ has_catalogs = False
|
|
|
|
|
+ for result in results:
|
|
|
|
|
+ if result.boxes is not None:
|
|
|
|
|
+ for box in result.boxes:
|
|
|
|
|
+ cls_id = int(box.cls.item())
|
|
|
|
|
+ class_name = self._model.names.get(cls_id, f"class_{cls_id}")
|
|
|
|
|
+ if class_name == 'catalogs':
|
|
|
|
|
+ has_catalogs = True
|
|
|
|
|
+ break
|
|
|
|
|
+ if has_catalogs:
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ if has_catalogs:
|
|
|
|
|
+ toc_pages.append(page_idx)
|
|
|
|
|
+ logger.debug(f" 第{page_idx + 1:3d}页: 检测到目录")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.debug(f" 第{page_idx + 1:3d}页: 未检测到目录")
|
|
|
|
|
+ # 如果已经检测到目录,且现在没有检测到,认为目录结束
|
|
|
|
|
+ if toc_pages:
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ if progress_callback and (page_idx + 1) % 5 == 0:
|
|
|
|
|
+ progress = int((page_idx + 1) / pages_to_check * 10)
|
|
|
|
|
+ progress_callback("目录识别", progress, f"扫描页面 {page_idx + 1}/{pages_to_check}")
|
|
|
|
|
+
|
|
|
|
|
+ return toc_pages
|
|
|
|
|
+
|
|
|
|
|
    def _ocr_toc_pages(
        self,
        doc: fitz.Document,
        toc_pages: List[int],
        progress_callback=None
    ) -> str:
        """OCR the given TOC pages via the GLM-OCR chat-completions API.

        Each page is rendered at OCR_DPI, compressed to a bounded-size JPEG,
        and sent as a base64 data URL. Requests are retried up to 3 times
        with exponential backoff on exceptions; pages that ultimately fail
        are skipped (best-effort).

        Args:
            doc: Open PyMuPDF document.
            toc_pages: 0-based page indices to OCR.
            progress_callback: Optional callable(stage, percent, message).

        Returns:
            The recognized page texts joined with newlines ("" when every
            page failed).
        """
        import base64
        import io  # NOTE(review): shadows the module-level io import; harmless here
        import requests
        import time

        all_texts = []
        total = len(toc_pages)

        for idx, page_idx in enumerate(toc_pages):
            page = doc.load_page(page_idx)

            try:
                # Render the page at OCR resolution and encode as JPEG.
                pix = page.get_pixmap(dpi=self.OCR_DPI)
                img_bytes = pix.tobytes("jpeg")

                # Downscale/recompress to keep the upload small.
                compressed = self._compress_image(img_bytes)
                img_base64 = base64.b64encode(compressed).decode('utf-8')

                # Build the OCR request (OpenAI-compatible vision payload).
                # NOTE(review): the adjacent string literals below concatenate
                # into one line WITHOUT separators between the numbered
                # instructions — confirm the prompt is intended that way.
                payload = {
                    "model": "GLM-OCR",
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "这是一份施工方案文档的目录页。请识别并提取目录内容,按原文格式输出。"
                                    "注意:"
                                    "1. 保留章节层级关系(章/节)"
                                    "2. 保留页码信息"
                                    "3. 只输出目录内容,不要其他说明"
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
                                }
                            ]
                        }
                    ],
                    "max_tokens": 4096,
                    "temperature": 0.1
                }

                headers = {"Content-Type": "application/json"}
                if self.ocr_api_key:
                    headers["Authorization"] = f"Bearer {self.ocr_api_key}"

                # Retry loop: exponential backoff (2s, 4s) on exceptions only.
                # NOTE(review): a successful HTTP response with empty content
                # falls through and re-posts immediately without sleeping —
                # confirm that is the intended behavior.
                max_retries = 3
                for attempt in range(max_retries):
                    try:
                        response = requests.post(
                            self.ocr_api_url,
                            headers=headers,
                            json=payload,
                            timeout=self.ocr_timeout
                        )
                        response.raise_for_status()
                        result = response.json()

                        content = ""
                        if "choices" in result and result["choices"]:
                            content = result["choices"][0].get("message", {}).get("content", "")

                        if content:
                            all_texts.append(content)
                            logger.debug(f" 第{page_idx + 1}页目录OCR成功")
                            break

                    except Exception as e:
                        if attempt < max_retries - 1:
                            wait_time = 2 ** (attempt + 1)
                            logger.warning(f" 第{page_idx + 1}页目录OCR失败,{wait_time}秒后重试...")
                            time.sleep(wait_time)
                        else:
                            # Out of retries: log and move on to the next page.
                            logger.error(f" 第{page_idx + 1}页目录OCR最终失败: {e}")

                # OCR spans the 10%-70% range of the overall progress bar.
                if progress_callback:
                    progress = 10 + int((idx + 1) / total * 60)
                    progress_callback("目录识别", progress, f"OCR识别中 {idx + 1}/{total}")

            except Exception as e:
                # Rendering/compression failures are per-page and non-fatal.
                logger.error(f" 第{page_idx + 1}页OCR处理出错: {e}")

        return "\n".join(all_texts)
|
|
|
|
|
+
|
|
|
|
|
+ def _compress_image(self, img_bytes: bytes) -> bytes:
|
|
|
|
|
+ """压缩图片"""
|
|
|
|
|
+ try:
|
|
|
|
|
+ from PIL import Image
|
|
|
|
|
+ img = Image.open(io.BytesIO(img_bytes))
|
|
|
|
|
+
|
|
|
|
|
+ if img.mode in ('RGBA', 'LA', 'P'):
|
|
|
|
|
+ background = Image.new('RGB', img.size, (255, 255, 255))
|
|
|
|
|
+ if img.mode == 'P':
|
|
|
|
|
+ img = img.convert('RGBA')
|
|
|
|
|
+ if img.mode in ('RGBA', 'LA'):
|
|
|
|
|
+ background.paste(img, mask=img.split()[-1])
|
|
|
|
|
+ img = background
|
|
|
|
|
+ elif img.mode != 'RGB':
|
|
|
|
|
+ img = img.convert('RGB')
|
|
|
|
|
+
|
|
|
|
|
+ min_edge = min(img.size)
|
|
|
|
|
+ if min_edge > self.MAX_SHORT_EDGE:
|
|
|
|
|
+ ratio = self.MAX_SHORT_EDGE / min_edge
|
|
|
|
|
+ new_size = (int(img.width * ratio), int(img.height * ratio))
|
|
|
|
|
+ img = img.resize(new_size, Image.Resampling.LANCZOS)
|
|
|
|
|
+
|
|
|
|
|
+ buffer = io.BytesIO()
|
|
|
|
|
+ img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
|
|
|
|
|
+ return buffer.getvalue()
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"[TOC检测] 图片压缩失败,使用原图: {e}")
|
|
|
|
|
+ return img_bytes
|
|
|
|
|
+
|
|
|
|
|
    def _parse_toc_text(self, text: str) -> Dict[str, Any]:
        """Parse OCR'd TOC text into an outline-compatible structure.

        Recognized line formats:
          - "第一章 XXX...................1"  (chapter, level 1)
          - "一、XXX......................2"  (section, level 2)
          - "1. XXX ......................3"  (generic numbered line;
            classified as chapter or section by keyword heuristics)

        Args:
            text: The concatenated OCR output of all TOC pages.

        Returns:
            {"chapters": [...], "total_chapters": N}, where each chapter dict
            has index/title/page/original/subsections keys.
        """
        lines = text.strip().split('\n')
        chapters = []
        current_chapter = None

        # Line-classification patterns. All three anchor the trailing page number.
        # NOTE(review): re.IGNORECASE has no effect on these Chinese patterns.
        chapter_pattern = re.compile(
            r'第\s*([一二三四五六七八九十百0-9]+)\s*章\s*[\s\.]*(.+?)\s*[\.\s]*(\d+)\s*$',
            re.IGNORECASE
        )
        section_pattern = re.compile(
            r'([一二三四五六七八九十]+)\s*[、\.\s]+\s*(.+?)\s*[\.\s]*(\d+)\s*$'
        )
        generic_pattern = re.compile(
            r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
        )

        for line in lines:
            line = line.strip()
            if not line or len(line) < 3:
                continue

            # Strip Markdown table pipes the OCR model sometimes emits.
            line = re.sub(r'^[\|\s]+|[\|\s]+$', '', line)
            line = line.replace('|', ' ')

            # 1) Chapter line ("第X章 ...").
            chapter_match = chapter_pattern.search(line)
            if chapter_match:
                chapter_num = chapter_match.group(1)
                title = chapter_match.group(2).strip()
                page = chapter_match.group(3).strip()

                # Flush the previous chapter before starting a new one.
                if current_chapter:
                    chapters.append(current_chapter)

                current_chapter = {
                    "index": self._chinese_to_number(chapter_num) if not chapter_num.isdigit() else int(chapter_num),
                    "title": f"第{chapter_num}章 {title}",
                    "page": page,
                    "original": line,
                    "subsections": []
                }
                continue

            # 2) Section line ("一、...") — only attached when a chapter is open.
            section_match = section_pattern.search(line)
            if section_match and current_chapter:
                section_num = section_match.group(1)
                title = section_match.group(2).strip()
                page = section_match.group(3).strip()

                current_chapter["subsections"].append({
                    "title": f"{section_num}、{title}",
                    "page": page,
                    "level": 2,
                    "original": line
                })
                continue

            # 3) Generic numbered line ("1. ..."). Classified by keywords.
            # NOTE(review): such lines are silently dropped when no chapter is
            # open yet — confirm a leading keyword line should not start one.
            generic_match = generic_pattern.search(line)
            if generic_match and current_chapter:
                title = generic_match.group(2).strip()
                page = generic_match.group(3).strip()

                # Construction-plan keywords mark chapter-like titles that lack
                # the "第X章" prefix.
                if any(kw in title for kw in ['编制依据', '工程概况', '施工计划', '施工工艺',
                                              '安全保证', '质量保证', '环境保证', '人员配备',
                                              '验收要求']):
                    # Treat as a new chapter (index assigned sequentially).
                    chapters.append(current_chapter)
                    current_chapter = {
                        "index": len(chapters) + 1,
                        "title": title,
                        "page": page,
                        "original": line,
                        "subsections": []
                    }
                else:
                    # Otherwise treat as a section of the open chapter.
                    current_chapter["subsections"].append({
                        "title": title,
                        "page": page,
                        "level": 2,
                        "original": line
                    })

        # Flush the final open chapter.
        if current_chapter:
            chapters.append(current_chapter)

        # No pattern matched anything: fall back to the heuristic parser.
        if not chapters and lines:
            chapters = self._fallback_parse(lines)

        return {
            "chapters": chapters,
            "total_chapters": len(chapters)
        }
|
|
|
|
|
+
|
|
|
|
|
+ def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
|
|
|
|
|
+ """
|
|
|
|
|
+ 降级解析策略:当正则无法匹配时使用启发式方法
|
|
|
|
|
+ """
|
|
|
|
|
+ chapters = []
|
|
|
|
|
+ idx = 0
|
|
|
|
|
+
|
|
|
|
|
+ for line in lines:
|
|
|
|
|
+ line = line.strip()
|
|
|
|
|
+ if not line:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 检查是否包含页码(行尾数字)
|
|
|
|
|
+ page_match = re.search(r'(\d+)\s*$', line)
|
|
|
|
|
+ if not page_match:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ page = page_match.group(1)
|
|
|
|
|
+ title = re.sub(r'[\.\s]+\d+\s*$', '', line).strip()
|
|
|
|
|
+
|
|
|
|
|
+ # 根据内容特征判断层级
|
|
|
|
|
+ is_chapter = any(kw in title for kw in ['编制依据', '工程概况', '施工计划',
|
|
|
|
|
+ '施工工艺', '安全保证', '质量保证',
|
|
|
|
|
+ '环境保证', '人员配备', '验收'])
|
|
|
|
|
+
|
|
|
|
|
+ if is_chapter or len(chapters) == 0:
|
|
|
|
|
+ idx += 1
|
|
|
|
|
+ chapters.append({
|
|
|
|
|
+ "index": idx,
|
|
|
|
|
+ "title": title,
|
|
|
|
|
+ "page": page,
|
|
|
|
|
+ "original": line,
|
|
|
|
|
+ "subsections": []
|
|
|
|
|
+ })
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 作为上一章的节
|
|
|
|
|
+ if chapters:
|
|
|
|
|
+ chapters[-1]["subsections"].append({
|
|
|
|
|
+ "title": title,
|
|
|
|
|
+ "page": page,
|
|
|
|
|
+ "level": 2,
|
|
|
|
|
+ "original": line
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ return chapters
|
|
|
|
|
+
|
|
|
|
|
+ def _chinese_to_number(self, chinese: str) -> int:
|
|
|
|
|
+ """中文数字转阿拉伯数字"""
|
|
|
|
|
+ chinese_nums = {
|
|
|
|
|
+ '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
|
|
|
|
|
+ '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
|
|
|
|
|
+ '十一': 11, '十二': 12
|
|
|
|
|
+ }
|
|
|
|
|
+ return chinese_nums.get(chinese, 0)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_catalog_from_pdf(
    file_content: bytes,
    model_path: Optional[str] = None,
    ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
    ocr_api_key: str = "",
    progress_callback=None,
    ocr_timeout: int = 600,
) -> Optional[Dict[str, Any]]:
    """Convenience wrapper: extract the TOC structure from PDF bytes.

    Args:
        file_content: Raw PDF bytes.
        model_path: YOLO weights path; None uses the extractor's default.
        ocr_api_url: GLM-OCR chat-completions endpoint.
        ocr_api_key: Bearer token for the OCR service ("" sends none).
        progress_callback: Optional callable(stage, percent, message).
        ocr_timeout: Per-request OCR timeout in seconds. Previously this
            wrapper could not configure the extractor's timeout; the new
            trailing keyword parameter defaults to the extractor's old
            value, so existing callers are unaffected.

    Returns:
        {"chapters": [...], "total_chapters": N}, or None when no TOC
        was detected or extraction failed.
    """
    extractor = TOCCatalogExtractor(
        model_path=model_path,
        ocr_api_url=ocr_api_url,
        ocr_api_key=ocr_api_key,
        ocr_timeout=ocr_timeout,
    )
    return extractor.detect_and_extract(file_content, progress_callback)
|