1 هفته پیش · 26179c2cb8
--- a/core/construction_review/component/minimal_pipeline/toc_detector.py
+++ b/core/construction_review/component/minimal_pipeline/toc_detector.py
@@ -17,18 +17,8 @@ import numpy as np
 
															 from foundation.observability.logger.loggering import review_logger as logger
														
 
															-# 尝试导入 YOLO 相关库
														
 
															-try:
														
 
															-    from ultralytics import YOLO
														
 
															-    YOLO_AVAILABLE = True
														
 
															-except ImportError:
														
 
															-    YOLO_AVAILABLE = False
														
 
															-
														
 
															-try:
														
 
															-    from PIL import Image
														
 
															-    PIL_AVAILABLE = True
														
 
															-except ImportError:
														
 
															-    PIL_AVAILABLE = False
														
 
															+from ultralytics import YOLO
														
 
															+from PIL import Image
														
 
															 @dataclass
														
@@ -94,26 +84,15 @@ class TOCCatalogExtractor:
 
															         self.ocr_timeout = ocr_timeout
														
 
															         self._model = None
														
 
															-        self._yolo_available = YOLO_AVAILABLE and PIL_AVAILABLE
														
 
															     def _load_model(self) -> bool:
														
 
															-        """加载 YOLO 模型"""
														
 
															-        if not self._yolo_available:
														
 
															-            logger.debug("[TOC检测] YOLO库未安装，跳过目录检测")
														
 
															-            return False
														
 
															-
														
 
															+        """加载 YOLO 模型，缺少依赖或模型文件直接报错"""
														
 
															         if not os.path.exists(self.model_path):
														
 
															-            logger.debug(f"[TOC检测] 模型文件不存在: {self.model_path}")
														
 
															-            return False
														
 
															+            raise FileNotFoundError(f"[TOC检测] YOLO模型文件不存在: {self.model_path}")
														
 
															         if self._model is None:
														
 
															-            try:
														
 
															-                logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
														
 
															-                self._model = YOLO(self.model_path)
														
 
															-                return True
														
 
															-            except Exception as e:
														
 
															-                logger.warning(f"[TOC检测] 模型加载失败: {e}")
														
 
															-                return False
														
 
															+            logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
														
 
															+            self._model = YOLO(self.model_path)
														
 
															         return True
														
 
															     def detect_and_extract(
														
@@ -361,7 +340,6 @@ class TOCCatalogExtractor:
 
															             force_smaller: 是否强制更小的尺寸（用于处理过大的图片）
														
 
															         """
														
 
															         try:
														
 
															-            from PIL import Image
														
 
															             img = Image.open(io.BytesIO(img_bytes))
														
 
															             if img.mode in ('RGBA', 'LA', 'P'):
														
@@ -396,15 +374,20 @@ class TOCCatalogExtractor:
 
															     def _parse_toc_text(self, text: str) -> Dict[str, Any]:
														
 
															         """
														
 
															-        解析目录文本为结构化数据
														
 
															+        解析目录文本为结构化数据，输出标准格式
														
 
															-        支持格式：
														
 
															-        - 第一章 XXX...................1
														
 
															-        - 一、XXX......................2
														
 
															-        - 1. XXX ......................3
														
 
															+        标准格式：
														
 
															+        第X章 XXX
														
 
															+        一、XXX
														
 
															+        二、XXX
														
 
															         Returns:
														
 
															-            {"chapters": [...], "total_chapters": N}
														
 
															+            {
														
 
															+                "chapters": [...],
														
 
															+                "total_chapters": N,
														
 
															+                "raw_ocr_text": "原始OCR文本",
														
 
															+                "formatted_text": "标准格式文本"
														
 
															+            }
														
 
															         """
														
 
															         lines = text.strip().split('\n')
														
 
															         chapters = []
														
@@ -422,6 +405,13 @@ class TOCCatalogExtractor:
 
															             r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
														
 
															         )
														
 
															+        # 中文数字映射
														
 
															+        chinese_nums = {
														
 
															+            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
														
 
															+            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
														
 
															+            '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
														
 
															+        }
														
 
															+
														
 
															         for line in lines:
														
 
															             line = line.strip()
														
 
															             if not line or len(line) < 3:
														
@@ -442,24 +432,37 @@ class TOCCatalogExtractor:
 
															                 if current_chapter:
														
 
															                     chapters.append(current_chapter)
														
 
															+                # 标准化为阿拉伯数字
														
 
															+                if chapter_num.isdigit():
														
 
															+                    idx = int(chapter_num)
														
 
															+                else:
														
 
															+                    idx = chinese_nums.get(chapter_num, len(chapters) + 1)
														
 
															+
														
 
															                 current_chapter = {
														
 
															-                    "index": self._chinese_to_number(chapter_num) if not chapter_num.isdigit() else int(chapter_num),
														
 
															-                    "title": f"第{chapter_num}章 {title}",
														
 
															+                    "index": idx,
														
 
															+                    "title": f"第{idx}章 {title}",
														
 
															                     "page": page,
														
 
															                     "original": line,
														
 
															                     "subsections": []
														
 
															                 }
														
 
															                 continue
														
 
															-            # 尝试匹配节（二级）
														
 
															+            # 尝试匹配节（二级）- 标准化为一、二、三格式
														
 
															             section_match = section_pattern.search(line)
														
 
															             if section_match and current_chapter:
														
 
															                 section_num = section_match.group(1)
														
 
															                 title = section_match.group(2).strip()
														
 
															                 page = section_match.group(3).strip()
														
 
															+                # 标准化节编号
														
 
															+                if section_num.isdigit():
														
 
															+                    section_idx = int(section_num)
														
 
															+                    section_cn = self._number_to_chinese(section_idx)
														
 
															+                else:
														
 
															+                    section_cn = section_num
														
 
															+
														
 
															                 current_chapter["subsections"].append({
														
 
															-                    "title": f"{section_num}、{title}",
														
 
															+                    "title": f"{section_cn}、{title}",
														
 
															                     "page": page,
														
 
															                     "level": 2,
														
 
															                     "original": line
														
@@ -472,23 +475,25 @@ class TOCCatalogExtractor:
 
															                 title = generic_match.group(2).strip()
														
 
															                 page = generic_match.group(3).strip()
														
 
															-                # 判断是章还是节（根据缩进或内容特征）
														
 
															+                # 判断是章还是节（根据内容特征）
														
 
															                 if any(kw in title for kw in ['编制依据', '工程概况', '施工计划', '施工工艺',
														
 
															                                                '安全保证', '质量保证', '环境保证', '人员配备',
														
 
															                                                '验收要求']):
														
 
															-                    # 可能是章标题（没有"第X章"前缀的变体）
														
 
															                     chapters.append(current_chapter)
														
 
															+                    idx = len(chapters) + 1
														
 
															                     current_chapter = {
														
 
															-                        "index": len(chapters) + 1,
														
 
															-                        "title": title,
														
 
															+                        "index": idx,
														
 
															+                        "title": f"第{idx}章 {title}",
														
 
															                         "page": page,
														
 
															                         "original": line,
														
 
															                         "subsections": []
														
 
															                     }
														
 
															                 else:
														
 
															-                    # 作为节
														
 
															+                    # 作为节，自动编号
														
 
															+                    section_idx = len(current_chapter["subsections"]) + 1
														
 
															+                    section_cn = self._number_to_chinese(section_idx)
														
 
															                     current_chapter["subsections"].append({
														
 
															-                        "title": title,
														
 
															+                        "title": f"{section_cn}、{title}",
														
 
															                         "page": page,
														
 
															                         "level": 2,
														
 
															                         "original": line
														
@@ -502,17 +507,33 @@ class TOCCatalogExtractor:
 
															         if not chapters and lines:
														
 
															             chapters = self._fallback_parse(lines)
														
 
															+        # 构建标准格式文本
														
 
															+        formatted_lines = []
														
 
															+        for ch in chapters:
														
 
															+            formatted_lines.append(ch["title"])
														
 
															+            for sub in ch.get("subsections", []):
														
 
															+                formatted_lines.append(f"  {sub['title']}")
														
 
															+
														
 
															+        formatted_text = "\n".join(formatted_lines)
														
 
															+
														
 
															+        # 日志输出完整的目录解析结果
														
 
															+        logger.info(f"[TOC解析] 共 {len(chapters)} 章，标准格式文本:\n{formatted_text}")
														
 
															+
														
 
															         return {
														
 
															             "chapters": chapters,
														
 
															-            "total_chapters": len(chapters)
														
 
															+            "total_chapters": len(chapters),
														
 
															+            "raw_ocr_text": text,
														
 
															+            "formatted_text": formatted_text
														
 
															         }
														
 
															     def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
														
 
															         """
														
 
															         降级解析策略：当正则无法匹配时使用启发式方法
														
 
															+        输出标准格式：第X章 XXX / 一、XXX
														
 
															         """
														
 
															         chapters = []
														
 
															         idx = 0
														
 
															+        section_idx = 0
														
 
															         for line in lines:
														
 
															             line = line.strip()
														
@@ -530,22 +551,26 @@ class TOCCatalogExtractor:
 
															             # 根据内容特征判断层级
														
 
															             is_chapter = any(kw in title for kw in ['编制依据', '工程概况', '施工计划',
														
 
															                                                        '施工工艺', '安全保证', '质量保证',
														
 
															-                                                       '环境保证', '人员配备', '验收'])
														
 
															+                                                       '环境保证', '人员配备', '验收',
														
 
															+                                                       '其他资料'])
														
 
															             if is_chapter or len(chapters) == 0:
														
 
															                 idx += 1
														
 
															+                section_idx = 0  # 重置节计数
														
 
															                 chapters.append({
														
 
															                     "index": idx,
														
 
															-                    "title": title,
														
 
															+                    "title": f"第{idx}章 {title}",
														
 
															                     "page": page,
														
 
															                     "original": line,
														
 
															                     "subsections": []
														
 
															                 })
														
 
															             else:
														
 
															-                # 作为上一章的节
														
 
															+                # 作为上一章的节，使用标准格式 一、二、三
														
 
															                 if chapters:
														
 
															+                    section_idx += 1
														
 
															+                    section_cn = self._number_to_chinese(section_idx)
														
 
															                     chapters[-1]["subsections"].append({
														
 
															-                        "title": title,
														
 
															+                        "title": f"{section_cn}、{title}",
														
 
															                         "page": page,
														
 
															                         "level": 2,
														
 
															                         "original": line
														
@@ -553,14 +578,14 @@ class TOCCatalogExtractor:
 
															         return chapters
														
 
															-    def _chinese_to_number(self, chinese: str) -> int:
														
 
															-        """中文数字转阿拉伯数字"""
														
 
															+    def _number_to_chinese(self, num: int) -> str:
														
 
															+        """阿拉伯数字转中文数字"""
														
 
															         chinese_nums = {
														
 
															-            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
														
 
															-            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
														
 
															-            '十一': 11, '十二': 12
														
 
															+            1: '一', 2: '二', 3: '三', 4: '四', 5: '五',
														
 
															+            6: '六', 7: '七', 8: '八', 9: '九', 10: '十',
														
 
															+            11: '十一', 12: '十二', 13: '十三', 14: '十四', 15: '十五'
														
 
															         }
														
 
															-        return chinese_nums.get(chinese, 0)
														
 
															+        return chinese_nums.get(num, str(num))
														
 
															 def extract_catalog_from_pdf(