il y a 1 mois · 26179c2cb8
--- a/core/construction_review/component/minimal_pipeline/toc_detector.py
+++ b/core/construction_review/component/minimal_pipeline/toc_detector.py
@@ -17,18 +17,8 @@ import numpy as np
 
				 
			
 
				 from foundation.observability.logger.loggering import review_logger as logger
			
 
				 
			
 
				-# 尝试导入 YOLO 相关库
			
 
				-try:
			
 
				-    from ultralytics import YOLO
			
 
				-    YOLO_AVAILABLE = True
			
 
				-except ImportError:
			
 
				-    YOLO_AVAILABLE = False
			
 
				-
			
 
				-try:
			
 
				-    from PIL import Image
			
 
				-    PIL_AVAILABLE = True
			
 
				-except ImportError:
			
 
				-    PIL_AVAILABLE = False
			
 
				+from ultralytics import YOLO
			
 
				+from PIL import Image
			
 
				 
			
 
				 
			
 
				 @dataclass
			
@@ -94,26 +84,15 @@ class TOCCatalogExtractor:
 
				         self.ocr_timeout = ocr_timeout
			
 
				 
			
 
				         self._model = None
			
 
				-        self._yolo_available = YOLO_AVAILABLE and PIL_AVAILABLE
			
 
				 
			
 
				     def _load_model(self) -> bool:
			
 
				-        """加载 YOLO 模型"""
			
 
				-        if not self._yolo_available:
			
 
				-            logger.debug("[TOC检测] YOLO库未安装，跳过目录检测")
			
 
				-            return False
			
 
				-
			
 
				+        """加载 YOLO 模型，缺少依赖或模型文件直接报错"""
			
 
				         if not os.path.exists(self.model_path):
			
 
				-            logger.debug(f"[TOC检测] 模型文件不存在: {self.model_path}")
			
 
				-            return False
			
 
				+            raise FileNotFoundError(f"[TOC检测] YOLO模型文件不存在: {self.model_path}")
			
 
				 
			
 
				         if self._model is None:
			
 
				-            try:
			
 
				-                logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
			
 
				-                self._model = YOLO(self.model_path)
			
 
				-                return True
			
 
				-            except Exception as e:
			
 
				-                logger.warning(f"[TOC检测] 模型加载失败: {e}")
			
 
				-                return False
			
 
				+            logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
			
 
				+            self._model = YOLO(self.model_path)
			
 
				         return True
			
 
				 
			
 
				     def detect_and_extract(
			
@@ -361,7 +340,6 @@ class TOCCatalogExtractor:
 
				             force_smaller: 是否强制更小的尺寸（用于处理过大的图片）
			
 
				         """
			
 
				         try:
			
 
				-            from PIL import Image
			
 
				             img = Image.open(io.BytesIO(img_bytes))
			
 
				 
			
 
				             if img.mode in ('RGBA', 'LA', 'P'):
			
@@ -396,15 +374,20 @@ class TOCCatalogExtractor:
 
				 
			
 
				     def _parse_toc_text(self, text: str) -> Dict[str, Any]:
			
 
				         """
			
 
				-        解析目录文本为结构化数据
			
 
				+        解析目录文本为结构化数据，输出标准格式
			
 
				 
			
 
				-        支持格式：
			
 
				-        - 第一章 XXX...................1
			
 
				-        - 一、XXX......................2
			
 
				-        - 1. XXX ......................3
			
 
				+        标准格式：
			
 
				+        第X章 XXX
			
 
				+        一、XXX
			
 
				+        二、XXX
			
 
				 
			
 
				         Returns:
			
 
				-            {"chapters": [...], "total_chapters": N}
			
 
				+            {
			
 
				+                "chapters": [...],
			
 
				+                "total_chapters": N,
			
 
				+                "raw_ocr_text": "原始OCR文本",
			
 
				+                "formatted_text": "标准格式文本"
			
 
				+            }
			
 
				         """
			
 
				         lines = text.strip().split('\n')
			
 
				         chapters = []
			
@@ -422,6 +405,13 @@ class TOCCatalogExtractor:
 
				             r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
			
 
				         )
			
 
				 
			
 
				+        # 中文数字映射
			
 
				+        chinese_nums = {
			
 
				+            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
			
 
				+            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
			
 
				+            '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
			
 
				+        }
			
 
				+
			
 
				         for line in lines:
			
 
				             line = line.strip()
			
 
				             if not line or len(line) < 3:
			
@@ -442,24 +432,37 @@ class TOCCatalogExtractor:
 
				                 if current_chapter:
			
 
				                     chapters.append(current_chapter)
			
 
				 
			
 
				+                # 标准化为阿拉伯数字
			
 
				+                if chapter_num.isdigit():
			
 
				+                    idx = int(chapter_num)
			
 
				+                else:
			
 
				+                    idx = chinese_nums.get(chapter_num, len(chapters) + 1)
			
 
				+
			
 
				                 current_chapter = {
			
 
				-                    "index": self._chinese_to_number(chapter_num) if not chapter_num.isdigit() else int(chapter_num),
			
 
				-                    "title": f"第{chapter_num}章 {title}",
			
 
				+                    "index": idx,
			
 
				+                    "title": f"第{idx}章 {title}",
			
 
				                     "page": page,
			
 
				                     "original": line,
			
 
				                     "subsections": []
			
 
				                 }
			
 
				                 continue
			
 
				 
			
 
				-            # 尝试匹配节（二级）
			
 
				+            # 尝试匹配节（二级）- 标准化为一、二、三格式
			
 
				             section_match = section_pattern.search(line)
			
 
				             if section_match and current_chapter:
			
 
				                 section_num = section_match.group(1)
			
 
				                 title = section_match.group(2).strip()
			
 
				                 page = section_match.group(3).strip()
			
 
				 
			
 
				+                # 标准化节编号
			
 
				+                if section_num.isdigit():
			
 
				+                    section_idx = int(section_num)
			
 
				+                    section_cn = self._number_to_chinese(section_idx)
			
 
				+                else:
			
 
				+                    section_cn = section_num
			
 
				+
			
 
				                 current_chapter["subsections"].append({
			
 
				-                    "title": f"{section_num}、{title}",
			
 
				+                    "title": f"{section_cn}、{title}",
			
 
				                     "page": page,
			
 
				                     "level": 2,
			
 
				                     "original": line
			
@@ -472,23 +475,25 @@ class TOCCatalogExtractor:
 
				                 title = generic_match.group(2).strip()
			
 
				                 page = generic_match.group(3).strip()
			
 
				 
			
 
				-                # 判断是章还是节（根据缩进或内容特征）
			
 
				+                # 判断是章还是节（根据内容特征）
			
 
				                 if any(kw in title for kw in ['编制依据', '工程概况', '施工计划', '施工工艺',
			
 
				                                                '安全保证', '质量保证', '环境保证', '人员配备',
			
 
				                                                '验收要求']):
			
 
				-                    # 可能是章标题（没有"第X章"前缀的变体）
			
 
				                     chapters.append(current_chapter)
			
 
				+                    idx = len(chapters) + 1
			
 
				                     current_chapter = {
			
 
				-                        "index": len(chapters) + 1,
			
 
				-                        "title": title,
			
 
				+                        "index": idx,
			
 
				+                        "title": f"第{idx}章 {title}",
			
 
				                         "page": page,
			
 
				                         "original": line,
			
 
				                         "subsections": []
			
 
				                     }
			
 
				                 else:
			
 
				-                    # 作为节
			
 
				+                    # 作为节，自动编号
			
 
				+                    section_idx = len(current_chapter["subsections"]) + 1
			
 
				+                    section_cn = self._number_to_chinese(section_idx)
			
 
				                     current_chapter["subsections"].append({
			
 
				-                        "title": title,
			
 
				+                        "title": f"{section_cn}、{title}",
			
 
				                         "page": page,
			
 
				                         "level": 2,
			
 
				                         "original": line
			
@@ -502,17 +507,33 @@ class TOCCatalogExtractor:
 
				         if not chapters and lines:
			
 
				             chapters = self._fallback_parse(lines)
			
 
				 
			
 
				+        # 构建标准格式文本
			
 
				+        formatted_lines = []
			
 
				+        for ch in chapters:
			
 
				+            formatted_lines.append(ch["title"])
			
 
				+            for sub in ch.get("subsections", []):
			
 
				+                formatted_lines.append(f"  {sub['title']}")
			
 
				+
			
 
				+        formatted_text = "\n".join(formatted_lines)
			
 
				+
			
 
				+        # 日志输出完整的目录解析结果
			
 
				+        logger.info(f"[TOC解析] 共 {len(chapters)} 章，标准格式文本:\n{formatted_text}")
			
 
				+
			
 
				         return {
			
 
				             "chapters": chapters,
			
 
				-            "total_chapters": len(chapters)
			
 
				+            "total_chapters": len(chapters),
			
 
				+            "raw_ocr_text": text,
			
 
				+            "formatted_text": formatted_text
			
 
				         }
			
 
				 
			
 
				     def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
			
 
				         """
			
 
				         降级解析策略：当正则无法匹配时使用启发式方法
			
 
				+        输出标准格式：第X章 XXX / 一、XXX
			
 
				         """
			
 
				         chapters = []
			
 
				         idx = 0
			
 
				+        section_idx = 0
			
 
				 
			
 
				         for line in lines:
			
 
				             line = line.strip()
			
@@ -530,22 +551,26 @@ class TOCCatalogExtractor:
 
				             # 根据内容特征判断层级
			
 
				             is_chapter = any(kw in title for kw in ['编制依据', '工程概况', '施工计划',
			
 
				                                                        '施工工艺', '安全保证', '质量保证',
			
 
				-                                                       '环境保证', '人员配备', '验收'])
			
 
				+                                                       '环境保证', '人员配备', '验收',
			
 
				+                                                       '其他资料'])
			
 
				 
			
 
				             if is_chapter or len(chapters) == 0:
			
 
				                 idx += 1
			
 
				+                section_idx = 0  # 重置节计数
			
 
				                 chapters.append({
			
 
				                     "index": idx,
			
 
				-                    "title": title,
			
 
				+                    "title": f"第{idx}章 {title}",
			
 
				                     "page": page,
			
 
				                     "original": line,
			
 
				                     "subsections": []
			
 
				                 })
			
 
				             else:
			
 
				-                # 作为上一章的节
			
 
				+                # 作为上一章的节，使用标准格式 一、二、三
			
 
				                 if chapters:
			
 
				+                    section_idx += 1
			
 
				+                    section_cn = self._number_to_chinese(section_idx)
			
 
				                     chapters[-1]["subsections"].append({
			
 
				-                        "title": title,
			
 
				+                        "title": f"{section_cn}、{title}",
			
 
				                         "page": page,
			
 
				                         "level": 2,
			
 
				                         "original": line
			
@@ -553,14 +578,14 @@ class TOCCatalogExtractor:
 
				 
			
 
				         return chapters
			
 
				 
			
 
				-    def _chinese_to_number(self, chinese: str) -> int:
			
 
				-        """中文数字转阿拉伯数字"""
			
 
				+    def _number_to_chinese(self, num: int) -> str:
			
 
				+        """阿拉伯数字转中文数字"""
			
 
				         chinese_nums = {
			
 
				-            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
			
 
				-            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
			
 
				-            '十一': 11, '十二': 12
			
 
				+            1: '一', 2: '二', 3: '三', 4: '四', 5: '五',
			
 
				+            6: '六', 7: '七', 8: '八', 9: '九', 10: '十',
			
 
				+            11: '十一', 12: '十二', 13: '十三', 14: '十四', 15: '十五'
			
 
				         }
			
 
				-        return chinese_nums.get(chinese, 0)
			
 
				+        return chinese_nums.get(num, str(num))
			
 
				 
			
 
				 
			
 
				 def extract_catalog_from_pdf(