Эх сурвалжийг харах

feat(toc): YOLO目录检测与OCR提取功能

- 新增 toc_detector.py 模块,支持YOLO模型检测目录页
- 使用GLM-OCR提取目录内容并解析为结构化数据
- 目录结构与outline保持一致(chapters/subsections格式)
- 扩展PdfStructureExtractor,在文档提取阶段自动检测目录
- UnifiedDocumentStructure新增catalog字段
- 支持指数退避重试机制(OCR失败时自动重试3次)
- 添加SSE进度推送(目录识别阶段)
WangXuMing 1 долоо хоног өмнө
parent
commit
a33ee5e5d9

+ 10 - 1
core/construction_review/component/doc_worker/models/document_structure.py

@@ -191,6 +191,9 @@ class UnifiedDocumentStructure:
     # ========== 文档大纲 ==========
     outline: Outline = field(default_factory=Outline)
 
+    # ========== 目录结构(YOLO检测+OCR提取) ==========
+    catalog: Optional[Dict[str, Any]] = None
+
     # ========== 原始数据(可选) ==========
     raw_metadata: Dict[str, Any] = field(default_factory=dict)
 
@@ -467,7 +470,7 @@ class UnifiedDocumentStructure:
 
         outline_chapters = list(chapters_map.values())
 
-        return {
+        result = {
             "document_id": self.document_id,
             "document_name": self.document_name,
             "total_pages": self.total_pages,
@@ -485,3 +488,9 @@ class UnifiedDocumentStructure:
                 "tertiary_count": self.tertiary_count,
             }
         }
+
+        # 添加目录结构(如果存在)
+        if self.catalog:
+            result["catalog"] = self.catalog
+
+        return result

+ 52 - 3
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -69,6 +69,8 @@ class PdfStructureExtractor:
         ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
         ocr_timeout: int = 600,
         ocr_api_key: str = "",
+        detect_toc: bool = True,
+        toc_model_path: str = "config/yolo/best.pt",
     ):
         self.clip_top = clip_top
         self.clip_bottom = clip_bottom
@@ -80,6 +82,11 @@ class PdfStructureExtractor:
         self.ocr_api_key = ocr_api_key
         self._layout_engine: Optional[Any] = None
 
+        # 目录检测配置
+        self.detect_toc = detect_toc
+        self.toc_model_path = toc_model_path
+        self._toc_extractor = None
+
         if use_ocr and not RAPID_LAYOUT_AVAILABLE:
             logger.warning("RapidLayout 未安装,OCR 功能不可用")
 
@@ -105,17 +112,59 @@ class PdfStructureExtractor:
                         "一、xxx": {"content": "...", "page_start": 2, "page_end": 3},
                     }
                 },
-                "total_pages": N
+                "total_pages": N,
+                "catalog": {  # 目录结构(YOLO检测+OCR提取)
+                    "chapters": [...],
+                    "total_chapters": N
+                }
             }
         """
+        result = {"chapters": {}, "total_pages": 0, "catalog": None}
+
+        # === 阶段0: 目录页检测与提取(如果启用)===
+        if self.detect_toc:
+            try:
+                catalog = self._extract_catalog(file_content, progress_callback)
+                if catalog:
+                    result["catalog"] = catalog
+                    logger.info(f"[PDF提取] 目录提取完成: {catalog.get('total_chapters', 0)} 章")
+            except Exception as e:
+                logger.warning(f"[PDF提取] 目录提取失败: {e}")
+
+        # === 阶段1-3: 文档结构提取 ===
         doc = fitz.open(stream=file_content)
         try:
             structure = self._extract_from_doc(doc, progress_callback)
-            structure["total_pages"] = len(doc)
-            return structure
+            result["chapters"] = structure.get("chapters", {})
+            result["total_pages"] = len(doc)
+            return result
         finally:
             doc.close()
 
    def _extract_catalog(self, file_content: bytes, progress_callback=None) -> Optional[Dict[str, Any]]:
        """
        Extract the catalog (table of contents) via YOLO detection + OCR.

        Args:
            file_content: raw PDF bytes.
            progress_callback: optional progress hook forwarded to the extractor.

        Returns:
            {"chapters": [...], "total_chapters": N}, or None when the
            toc_detector module is unavailable or extraction finds nothing.
        """
        # Lazy import to avoid a circular dependency within this package.
        try:
            from .toc_detector import TOCCatalogExtractor
        except ImportError:
            logger.warning("[PDF提取] toc_detector 模块未找到,跳过目录检测")
            return None

        # Cache the extractor so the YOLO model is loaded at most once
        # per PdfStructureExtractor instance.
        if self._toc_extractor is None:
            self._toc_extractor = TOCCatalogExtractor(
                model_path=self.toc_model_path,
                ocr_api_url=self.ocr_api_url,
                ocr_api_key=self.ocr_api_key,
                ocr_timeout=self.ocr_timeout,
            )

        return self._toc_extractor.detect_and_extract(file_content, progress_callback)
+
     def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
         """提取文档结构(支持 OCR 异步并发)"""
 

+ 13 - 5
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -68,12 +68,14 @@ class SimpleDocumentProcessor:
         处理 PDF 文档,返回 UnifiedDocumentStructure。
         这是 document_processor 的主要入口。
         """
-        structure, primary_result, secondary_result, chunks = await self._run_pipeline(
+        structure, primary_result, secondary_result, chunks, catalog = await self._run_pipeline(
             file_content, file_name, progress_callback
         )
 
         if not chunks:
-            return self._build_empty_unified(file_name, structure.get("total_pages", 0))
+            empty_result = self._build_empty_unified(file_name, structure.get("total_pages", 0))
+            empty_result.catalog = catalog
+            return empty_result
 
         return self._build_unified_doc(
             structure=structure,
@@ -81,6 +83,7 @@ class SimpleDocumentProcessor:
             secondary_result=secondary_result,
             chunks=chunks,
             document_name=file_name,
+            catalog=catalog,
         )
 
     async def process(
@@ -100,8 +103,8 @@ class SimpleDocumentProcessor:
         file_content: bytes,
         file_name: str,
         progress_callback: Optional[callable],
-    ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any], List[Dict[str, Any]]]:
-        """执行核心流程,返回 (structure, primary_result, secondary_result, chunks)。"""
+    ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any], List[Dict[str, Any]], Optional[Dict[str, Any]]]:
+        """执行核心流程,返回 (structure, primary_result, secondary_result, chunks, catalog)。"""
         logger.info(f"[SimpleProcessor] 开始处理文档: {file_name}")
 
         # 1. PDF 结构提取(带进度回调)
@@ -118,6 +121,7 @@ class SimpleDocumentProcessor:
                     pass
 
         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
+        catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
 
         # 2. 一级分类
@@ -154,7 +158,7 @@ class SimpleDocumentProcessor:
         logger.info("[SimpleProcessor] 三级分类完成")
         await self._emit_progress(progress_callback, "文档分类", 90, "三级分类完成")
 
-        return structure, primary_result, secondary_result, chunks
+        return structure, primary_result, secondary_result, chunks, catalog
 
     async def _emit_progress(
         self,
@@ -179,6 +183,7 @@ class SimpleDocumentProcessor:
         secondary_result: Dict[str, Any],
         chunks: List[Dict[str, Any]],
         document_name: str,
+        catalog: Optional[Dict[str, Any]] = None,
     ) -> UnifiedDocumentStructure:
         """构建 UnifiedDocumentStructure 并合并三级分类结果。"""
         unified = build_unified_structure(
@@ -220,6 +225,9 @@ class SimpleDocumentProcessor:
             }
         }
 
+        # 设置目录结构(YOLO检测+OCR提取)
+        unified.catalog = catalog
+
         return unified
 
     def _merge_tertiary_to_unified(

+ 549 - 0
core/construction_review/component/minimal_pipeline/toc_detector.py

@@ -0,0 +1,549 @@
+"""
+YOLO 目录页检测与 OCR 提取模块
+
+用于在文档处理流程早期检测目录页并提取目录内容,
+输出结构与 outline 保持一致,便于后续进行目录完整性检查。
+"""
+
+import io
+import os
+import re
+from dataclasses import dataclass
+from typing import Dict, Any, List, Optional, Tuple
+from pathlib import Path
+
+import fitz
+import numpy as np
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+# 尝试导入 YOLO 相关库
+try:
+    from ultralytics import YOLO
+    YOLO_AVAILABLE = True
+except ImportError:
+    YOLO_AVAILABLE = False
+
+try:
+    from PIL import Image
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+
+
@dataclass
class CatalogItem:
    """A single parsed TOC entry.

    NOTE(review): CatalogItem / CatalogSection / CatalogChapter are not
    referenced by TOCCatalogExtractor below, which builds plain dicts —
    confirm whether callers use these types or they are dead code.
    """
    index: int           # 1-based chapter number
    title: str           # chapter title
    page: str            # page number, kept as the OCR'd string
    original: str        # raw source line
    level: int = 1       # hierarchy level (1 = chapter, 2 = section)
    parent_title: str = ""  # parent chapter title (for level-2 entries)
+
+
@dataclass
class CatalogSection:
    """A level-2 TOC entry (section within a chapter)."""
    title: str     # section title
    page: str      # page number, kept as the OCR'd string
    level: int     # hierarchy level (expected to be 2)
    original: str  # raw source line
+
+
@dataclass
class CatalogChapter:
    """A level-1 TOC entry (chapter) with its nested sections."""
    index: int     # 1-based chapter number
    title: str     # chapter title
    page: str      # page number, kept as the OCR'd string
    original: str  # raw source line
    subsections: List[CatalogSection]  # level-2 entries under this chapter
+
+
class TOCCatalogExtractor:
    """
    TOC page detection and content extraction.

    Uses a YOLO model to locate TOC ("catalogs") pages, sends each detected
    page to a GLM-OCR endpoint, and parses the recognized text into a
    structured catalog. The output format mirrors the document outline:
    {"chapters": [{"index", "title", "page", "original", "subsections"}, ...],
     "total_chapters": N}.
    """

    # --- YOLO detection configuration ---
    DEFAULT_MODEL_PATH = "config/yolo/best.pt"
    CONF_THRESHOLD = 0.25   # minimum detection confidence
    MAX_CHECK_PAGES = 50    # TOCs live in the front matter; scan at most this many pages
    DPI = 150               # render resolution for detection

    # --- OCR configuration ---
    OCR_DPI = 200           # higher render resolution for OCR legibility
    MAX_SHORT_EDGE = 1024   # downscale so the image's short edge fits this bound
    JPEG_QUALITY = 90

    # Chapter-heading keywords for construction-plan documents. Shared by the
    # regex parser and the fallback parser so both classify lines consistently
    # (previously the two methods kept slightly different copies).
    # '验收' also matches '验收要求'.
    CHAPTER_KEYWORDS = (
        '编制依据', '工程概况', '施工计划', '施工工艺',
        '安全保证', '质量保证', '环境保证', '人员配备', '验收',
    )

    def __init__(
        self,
        model_path: Optional[str] = None,
        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
        ocr_api_key: str = "",
        ocr_timeout: int = 600,
    ):
        """
        Args:
            model_path: path to the YOLO weights; None uses DEFAULT_MODEL_PATH.
            ocr_api_url: GLM-OCR chat-completions endpoint.
                NOTE(review): the default is a hard-coded IP — consider
                moving it into configuration.
            ocr_api_key: optional bearer token for the OCR endpoint.
            ocr_timeout: per-request OCR timeout in seconds.
        """
        self.model_path = model_path or self.DEFAULT_MODEL_PATH
        self.ocr_api_url = ocr_api_url
        self.ocr_api_key = ocr_api_key
        self.ocr_timeout = ocr_timeout

        self._model = None  # lazily-loaded YOLO model
        self._yolo_available = YOLO_AVAILABLE and PIL_AVAILABLE

    def _load_model(self) -> bool:
        """Lazily load the YOLO model; return True when it is ready for use."""
        if not self._yolo_available:
            logger.debug("[TOC检测] YOLO库未安装,跳过目录检测")
            return False

        if not os.path.exists(self.model_path):
            logger.debug(f"[TOC检测] 模型文件不存在: {self.model_path}")
            return False

        if self._model is None:
            try:
                logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
                self._model = YOLO(self.model_path)
                return True
            except Exception as e:
                logger.warning(f"[TOC检测] 模型加载失败: {e}")
                return False
        return True

    def detect_and_extract(
        self,
        file_content: bytes,
        progress_callback=None
    ) -> Optional[Dict[str, Any]]:
        """
        Detect TOC pages in the PDF and extract the catalog structure.

        Args:
            file_content: raw PDF bytes.
            progress_callback: optional callable(stage, percent, message).

        Returns:
            {"chapters": [...], "total_chapters": N}, or None when the model
            cannot be loaded, no TOC page is detected, or OCR yields no text.
        """
        if not self._load_model():
            return None

        doc = fitz.open(stream=file_content)
        try:
            # 1. Locate the first contiguous run of TOC pages.
            toc_pages = self._detect_toc_pages(doc, progress_callback)
            if not toc_pages:
                logger.info("[TOC检测] 未检测到目录页")
                return None

            logger.info(f"[TOC检测] 检测到目录页: 第{toc_pages[0]+1}页 - 第{toc_pages[-1]+1}页")

            # 2. OCR the detected pages.
            if progress_callback:
                progress_callback("目录识别", 10, f"检测到{len(toc_pages)}页目录,开始OCR识别...")

            toc_text = self._ocr_toc_pages(doc, toc_pages, progress_callback)
            if not toc_text:
                return None

            # 3. Parse the OCR text into the structured catalog.
            if progress_callback:
                progress_callback("目录识别", 80, "解析目录结构...")

            catalog = self._parse_toc_text(toc_text)

            if progress_callback:
                progress_callback("目录识别", 100, f"目录提取完成,共{catalog['total_chapters']}章")

            return catalog
        finally:
            doc.close()

    def _detect_toc_pages(
        self,
        doc: "fitz.Document",
        progress_callback=None
    ) -> List[int]:
        """
        Scan the leading pages with YOLO; return 0-based TOC page indices.

        Only the first contiguous run is returned: scanning stops at the
        first non-TOC page seen after at least one TOC page.
        """
        toc_pages: List[int] = []
        pages_to_check = min(len(doc), self.MAX_CHECK_PAGES)
        zoom = self.DPI / 72  # PyMuPDF's base resolution is 72 DPI
        matrix = fitz.Matrix(zoom, zoom)

        for page_idx in range(pages_to_check):
            pix = doc.load_page(page_idx).get_pixmap(matrix=matrix)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            results = self._model(np.array(img), conf=self.CONF_THRESHOLD, verbose=False)

            # A page counts as TOC when any detection maps to the 'catalogs' class.
            has_catalogs = any(
                self._model.names.get(int(box.cls.item()), "") == 'catalogs'
                for result in results
                if result.boxes is not None
                for box in result.boxes
            )

            if has_catalogs:
                toc_pages.append(page_idx)
                logger.debug(f"  第{page_idx + 1:3d}页: 检测到目录")
            else:
                logger.debug(f"  第{page_idx + 1:3d}页: 未检测到目录")
                if toc_pages:
                    break  # the contiguous TOC run has ended

            if progress_callback and (page_idx + 1) % 5 == 0:
                progress = int((page_idx + 1) / pages_to_check * 10)
                progress_callback("目录识别", progress, f"扫描页面 {page_idx + 1}/{pages_to_check}")

        return toc_pages

    def _ocr_toc_pages(
        self,
        doc: "fitz.Document",
        toc_pages: List[int],
        progress_callback=None
    ) -> str:
        """
        OCR each TOC page via the GLM-OCR endpoint and join the results.

        Each page is retried up to 3 times with exponential backoff (2s, 4s).
        Pages that still fail are skipped; the remaining text is returned.
        """
        import base64
        import time

        import requests  # third-party: imported lazily, only when OCR runs

        all_texts = []
        total = len(toc_pages)

        for idx, page_idx in enumerate(toc_pages):
            page = doc.load_page(page_idx)
            try:
                # Render and compress the page image before upload.
                pix = page.get_pixmap(dpi=self.OCR_DPI)
                compressed = self._compress_image(pix.tobytes("jpeg"))
                img_base64 = base64.b64encode(compressed).decode('utf-8')

                payload = {
                    "model": "GLM-OCR",
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "这是一份施工方案文档的目录页。请识别并提取目录内容,按原文格式输出。"
                                            "注意:"
                                            "1. 保留章节层级关系(章/节)"
                                            "2. 保留页码信息"
                                            "3. 只输出目录内容,不要其他说明"
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
                                }
                            ]
                        }
                    ],
                    "max_tokens": 4096,
                    "temperature": 0.1
                }

                headers = {"Content-Type": "application/json"}
                if self.ocr_api_key:
                    headers["Authorization"] = f"Bearer {self.ocr_api_key}"

                # Exponential-backoff retry loop.
                max_retries = 3
                for attempt in range(max_retries):
                    try:
                        response = requests.post(
                            self.ocr_api_url,
                            headers=headers,
                            json=payload,
                            timeout=self.ocr_timeout
                        )
                        response.raise_for_status()
                        result = response.json()

                        content = ""
                        if "choices" in result and result["choices"]:
                            content = result["choices"][0].get("message", {}).get("content", "")

                        if content:
                            all_texts.append(content)
                            logger.debug(f"  第{page_idx + 1}页目录OCR成功")
                        else:
                            # Fix: an empty response used to be dropped silently.
                            logger.warning(f"  第{page_idx + 1}页目录OCR返回空内容")
                        break
                    except Exception as e:
                        if attempt < max_retries - 1:
                            wait_time = 2 ** (attempt + 1)  # 2s, then 4s
                            logger.warning(f"  第{page_idx + 1}页目录OCR失败,{wait_time}秒后重试...")
                            time.sleep(wait_time)
                        else:
                            logger.error(f"  第{page_idx + 1}页目录OCR最终失败: {e}")

                if progress_callback:
                    progress = 10 + int((idx + 1) / total * 60)
                    progress_callback("目录识别", progress, f"OCR识别中 {idx + 1}/{total}")
            except Exception as e:
                logger.error(f"  第{page_idx + 1}页OCR处理出错: {e}")

        return "\n".join(all_texts)

    def _compress_image(self, img_bytes: bytes) -> bytes:
        """
        Re-encode an image as JPEG: flatten alpha onto white and downscale so
        the short edge is at most MAX_SHORT_EDGE. Returns the original bytes
        unchanged if compression fails for any reason (best-effort).
        """
        try:
            from PIL import Image
            img = Image.open(io.BytesIO(img_bytes))

            # JPEG has no alpha channel: composite transparent modes onto white.
            if img.mode in ('RGBA', 'LA', 'P'):
                background = Image.new('RGB', img.size, (255, 255, 255))
                if img.mode == 'P':
                    img = img.convert('RGBA')
                if img.mode in ('RGBA', 'LA'):
                    background.paste(img, mask=img.split()[-1])
                img = background
            elif img.mode != 'RGB':
                img = img.convert('RGB')

            min_edge = min(img.size)
            if min_edge > self.MAX_SHORT_EDGE:
                ratio = self.MAX_SHORT_EDGE / min_edge
                new_size = (int(img.width * ratio), int(img.height * ratio))
                img = img.resize(new_size, Image.Resampling.LANCZOS)

            buffer = io.BytesIO()
            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
            return buffer.getvalue()
        except Exception as e:
            logger.warning(f"[TOC检测] 图片压缩失败,使用原图: {e}")
            return img_bytes

    def _parse_toc_text(self, text: str) -> Dict[str, Any]:
        """
        Parse OCR'd TOC text into a structured catalog.

        Supported line formats:
        - 第一章 XXX...................1
        - 一、XXX......................2
        - 1. XXX ......................3

        Returns:
            {"chapters": [...], "total_chapters": N}
        """
        lines = text.strip().split('\n')
        chapters: List[Dict[str, Any]] = []
        current_chapter: Optional[Dict[str, Any]] = None

        # "第X章 title ..... page"
        chapter_pattern = re.compile(
            r'第\s*([一二三四五六七八九十百0-9]+)\s*章\s*[\s\.]*(.+?)\s*[\.\s]*(\d+)\s*$',
            re.IGNORECASE
        )
        # "一、title ..... page" (level-2 section)
        section_pattern = re.compile(
            r'([一二三四五六七八九十]+)\s*[、\.\s]+\s*(.+?)\s*[\.\s]*(\d+)\s*$'
        )
        # "1. title ..... page" (generic numbered entry)
        generic_pattern = re.compile(
            r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
        )

        for raw_line in lines:
            line = raw_line.strip()
            if not line or len(line) < 3:
                continue

            # Strip Markdown table decorations the OCR sometimes emits.
            line = re.sub(r'^[\|\s]+|[\|\s]+$', '', line)
            line = line.replace('|', ' ')

            # Chapter line?
            chapter_match = chapter_pattern.search(line)
            if chapter_match:
                chapter_num = chapter_match.group(1)
                title = chapter_match.group(2).strip()
                page = chapter_match.group(3).strip()

                if current_chapter:
                    chapters.append(current_chapter)

                current_chapter = {
                    "index": self._chinese_to_number(chapter_num),
                    "title": f"第{chapter_num}章 {title}",
                    "page": page,
                    "original": line,
                    "subsections": []
                }
                continue

            # Section (level-2) line?
            section_match = section_pattern.search(line)
            if section_match and current_chapter:
                section_num = section_match.group(1)
                title = section_match.group(2).strip()
                page = section_match.group(3).strip()

                current_chapter["subsections"].append({
                    "title": f"{section_num}、{title}",
                    "page": page,
                    "level": 2,
                    "original": line
                })
                continue

            # Generic numbered line: chapter or section depending on keywords.
            generic_match = generic_pattern.search(line)
            if generic_match:
                title = generic_match.group(2).strip()
                page = generic_match.group(3).strip()

                if self._is_chapter_title(title):
                    # Chapter heading without the "第X章" prefix.
                    # Fix: such a line was previously dropped when it appeared
                    # before any "第X章" line (current_chapter was still None).
                    if current_chapter:
                        chapters.append(current_chapter)
                    current_chapter = {
                        "index": len(chapters) + 1,
                        "title": title,
                        "page": page,
                        "original": line,
                        "subsections": []
                    }
                elif current_chapter:
                    current_chapter["subsections"].append({
                        "title": title,
                        "page": page,
                        "level": 2,
                        "original": line
                    })

        # Flush the last open chapter.
        if current_chapter:
            chapters.append(current_chapter)

        # Heuristic fallback when no pattern matched anything.
        if not chapters and lines:
            chapters = self._fallback_parse(lines)

        return {
            "chapters": chapters,
            "total_chapters": len(chapters)
        }

    def _is_chapter_title(self, title: str) -> bool:
        """Heuristic: True when the title contains a known chapter keyword."""
        return any(kw in title for kw in self.CHAPTER_KEYWORDS)

    def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
        """
        Fallback heuristic used when no regex pattern matched: any line ending
        in a page number becomes an entry, classified as a chapter or section
        by keyword; the very first entry is always a chapter.
        """
        chapters: List[Dict[str, Any]] = []

        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                continue

            # A trailing number is treated as the page; lines without one are skipped.
            page_match = re.search(r'(\d+)\s*$', line)
            if not page_match:
                continue

            page = page_match.group(1)
            title = re.sub(r'[\.\s]+\d+\s*$', '', line).strip()

            if self._is_chapter_title(title) or not chapters:
                chapters.append({
                    "index": len(chapters) + 1,
                    "title": title,
                    "page": page,
                    "original": line,
                    "subsections": []
                })
            else:
                chapters[-1]["subsections"].append({
                    "title": title,
                    "page": page,
                    "level": 2,
                    "original": line
                })

        return chapters

    def _chinese_to_number(self, chinese: str) -> int:
        """
        Convert a chapter numeral — a digit string or a Chinese numeral in
        1-99 (e.g. '三', '十三', '二十一') — to an int. Returns 0 for
        unrecognized input.

        Fix: the previous lookup table stopped at 十二, so chapters 13 and
        beyond all collapsed to index 0.
        """
        if chinese.isdigit():
            return int(chinese)

        digits = {
            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
            '六': 6, '七': 7, '八': 8, '九': 9,
        }
        if '十' not in chinese:
            return digits.get(chinese, 0)

        tens_part, _, units_part = chinese.partition('十')
        if tens_part and tens_part not in digits:
            return 0  # unrecognized tens digit (e.g. '百')
        tens = digits[tens_part] if tens_part else 1
        units = digits.get(units_part, 0) if units_part else 0
        return tens * 10 + units
+
+
def extract_catalog_from_pdf(
    file_content: bytes,
    model_path: Optional[str] = None,
    ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
    ocr_api_key: str = "",
    progress_callback=None,
    ocr_timeout: int = 600,
) -> Optional[Dict[str, Any]]:
    """
    Convenience wrapper: extract the catalog structure from a PDF.

    Args:
        file_content: raw PDF bytes.
        model_path: YOLO weights path; None uses the extractor default.
            (Fix: was annotated `str = None`.)
        ocr_api_url: GLM-OCR chat-completions endpoint.
        ocr_api_key: optional bearer token for the OCR endpoint.
        progress_callback: optional callable(stage, percent, message).
        ocr_timeout: per-request OCR timeout in seconds (new, backward
            compatible — previously the timeout could not be forwarded and
            always used the extractor's default of 600).

    Returns:
        {"chapters": [...], "total_chapters": N}, or None when no TOC is found.
    """
    extractor = TOCCatalogExtractor(
        model_path=model_path,
        ocr_api_url=ocr_api_url,
        ocr_api_key=ocr_api_key,
        ocr_timeout=ocr_timeout,
    )
    return extractor.detect_and_extract(file_content, progress_callback)