Parcourir la source

refactor(toc_detector): 移除 YOLO 降级保护,强制依赖生效

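- 将 ultralytics 和 PIL 的 try-except 导入改为强制导入
- _load_model() 失败时直接抛出异常,不再静默跳过
- 确保目录页 YOLO 检测必须在依赖完整时运行,避免环境差异导致效果不一致
- _parse_toc_text() / _fallback_parse() 统一输出标准格式标题(章: 第X章 XXX;节: 一、XXX),返回值新增 raw_ocr_text 与 formatted_text 字段,并在日志中输出标准格式目录
- 以 _number_to_chinese() 取代 _chinese_to_number(),中文数字映射扩展到十五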
WangXuMing il y a 1 semaine
Parent
commit
26179c2cb8

+ 80 - 55
core/construction_review/component/minimal_pipeline/toc_detector.py

@@ -17,18 +17,8 @@ import numpy as np
 
 from foundation.observability.logger.loggering import review_logger as logger
 
-# 尝试导入 YOLO 相关库
-try:
-    from ultralytics import YOLO
-    YOLO_AVAILABLE = True
-except ImportError:
-    YOLO_AVAILABLE = False
-
-try:
-    from PIL import Image
-    PIL_AVAILABLE = True
-except ImportError:
-    PIL_AVAILABLE = False
+from ultralytics import YOLO
+from PIL import Image
 
 
 @dataclass
@@ -94,26 +84,15 @@ class TOCCatalogExtractor:
         self.ocr_timeout = ocr_timeout
 
         self._model = None
-        self._yolo_available = YOLO_AVAILABLE and PIL_AVAILABLE
 
     def _load_model(self) -> bool:
-        """加载 YOLO 模型"""
-        if not self._yolo_available:
-            logger.debug("[TOC检测] YOLO库未安装,跳过目录检测")
-            return False
-
+        """加载 YOLO 模型,缺少依赖或模型文件直接报错"""
         if not os.path.exists(self.model_path):
-            logger.debug(f"[TOC检测] 模型文件不存在: {self.model_path}")
-            return False
+            raise FileNotFoundError(f"[TOC检测] YOLO模型文件不存在: {self.model_path}")
 
         if self._model is None:
-            try:
-                logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
-                self._model = YOLO(self.model_path)
-                return True
-            except Exception as e:
-                logger.warning(f"[TOC检测] 模型加载失败: {e}")
-                return False
+            logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
+            self._model = YOLO(self.model_path)
         return True
 
     def detect_and_extract(
@@ -361,7 +340,6 @@ class TOCCatalogExtractor:
             force_smaller: 是否强制更小的尺寸(用于处理过大的图片)
         """
         try:
-            from PIL import Image
             img = Image.open(io.BytesIO(img_bytes))
 
             if img.mode in ('RGBA', 'LA', 'P'):
@@ -396,15 +374,20 @@ class TOCCatalogExtractor:
 
     def _parse_toc_text(self, text: str) -> Dict[str, Any]:
         """
-        解析目录文本为结构化数据
+        解析目录文本为结构化数据,输出标准格式
 
-        支持格式:
-        - 第一章 XXX...................1
-        - 一、XXX......................2
-        - 1. XXX ......................3
+        标准格式:
+        第X章 XXX
+        一、XXX
+        二、XXX
 
         Returns:
-            {"chapters": [...], "total_chapters": N}
+            {
+                "chapters": [...],
+                "total_chapters": N,
+                "raw_ocr_text": "原始OCR文本",
+                "formatted_text": "标准格式文本"
+            }
         """
         lines = text.strip().split('\n')
         chapters = []
@@ -422,6 +405,13 @@ class TOCCatalogExtractor:
             r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
         )
 
+        # 中文数字映射
+        chinese_nums = {
+            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
+            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
+            '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
+        }
+
         for line in lines:
             line = line.strip()
             if not line or len(line) < 3:
@@ -442,24 +432,37 @@ class TOCCatalogExtractor:
                 if current_chapter:
                     chapters.append(current_chapter)
 
+                # 标准化为阿拉伯数字
+                if chapter_num.isdigit():
+                    idx = int(chapter_num)
+                else:
+                    idx = chinese_nums.get(chapter_num, len(chapters) + 1)
+
                 current_chapter = {
-                    "index": self._chinese_to_number(chapter_num) if not chapter_num.isdigit() else int(chapter_num),
-                    "title": f"第{chapter_num}章 {title}",
+                    "index": idx,
+                    "title": f"第{idx}章 {title}",
                     "page": page,
                     "original": line,
                     "subsections": []
                 }
                 continue
 
-            # 尝试匹配节(二级)
+            # 尝试匹配节(二级)- 标准化为一、二、三格式
             section_match = section_pattern.search(line)
             if section_match and current_chapter:
                 section_num = section_match.group(1)
                 title = section_match.group(2).strip()
                 page = section_match.group(3).strip()
 
+                # 标准化节编号
+                if section_num.isdigit():
+                    section_idx = int(section_num)
+                    section_cn = self._number_to_chinese(section_idx)
+                else:
+                    section_cn = section_num
+
                 current_chapter["subsections"].append({
-                    "title": f"{section_num}、{title}",
+                    "title": f"{section_cn}、{title}",
                     "page": page,
                     "level": 2,
                     "original": line
@@ -472,23 +475,25 @@ class TOCCatalogExtractor:
                 title = generic_match.group(2).strip()
                 page = generic_match.group(3).strip()
 
-                # 判断是章还是节(根据缩进或内容特征)
+                # 判断是章还是节(根据内容特征)
                 if any(kw in title for kw in ['编制依据', '工程概况', '施工计划', '施工工艺',
                                                '安全保证', '质量保证', '环境保证', '人员配备',
                                                '验收要求']):
-                    # 可能是章标题(没有"第X章"前缀的变体)
                     chapters.append(current_chapter)
+                    idx = len(chapters) + 1
                     current_chapter = {
-                        "index": len(chapters) + 1,
-                        "title": title,
+                        "index": idx,
+                        "title": f"第{idx}章 {title}",
                         "page": page,
                         "original": line,
                         "subsections": []
                     }
                 else:
-                    # 作为节
+                    # 作为节,自动编号
+                    section_idx = len(current_chapter["subsections"]) + 1
+                    section_cn = self._number_to_chinese(section_idx)
                     current_chapter["subsections"].append({
-                        "title": title,
+                        "title": f"{section_cn}、{title}",
                         "page": page,
                         "level": 2,
                         "original": line
@@ -502,17 +507,33 @@ class TOCCatalogExtractor:
         if not chapters and lines:
             chapters = self._fallback_parse(lines)
 
+        # 构建标准格式文本
+        formatted_lines = []
+        for ch in chapters:
+            formatted_lines.append(ch["title"])
+            for sub in ch.get("subsections", []):
+                formatted_lines.append(f"  {sub['title']}")
+
+        formatted_text = "\n".join(formatted_lines)
+
+        # 日志输出完整的目录解析结果
+        logger.info(f"[TOC解析] 共 {len(chapters)} 章,标准格式文本:\n{formatted_text}")
+
         return {
             "chapters": chapters,
-            "total_chapters": len(chapters)
+            "total_chapters": len(chapters),
+            "raw_ocr_text": text,
+            "formatted_text": formatted_text
         }
 
     def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
         """
         降级解析策略:当正则无法匹配时使用启发式方法
+        输出标准格式:第X章 XXX / 一、XXX
         """
         chapters = []
         idx = 0
+        section_idx = 0
 
         for line in lines:
             line = line.strip()
@@ -530,22 +551,26 @@ class TOCCatalogExtractor:
             # 根据内容特征判断层级
             is_chapter = any(kw in title for kw in ['编制依据', '工程概况', '施工计划',
                                                        '施工工艺', '安全保证', '质量保证',
-                                                       '环境保证', '人员配备', '验收'])
+                                                       '环境保证', '人员配备', '验收',
+                                                       '其他资料'])
 
             if is_chapter or len(chapters) == 0:
                 idx += 1
+                section_idx = 0  # 重置节计数
                 chapters.append({
                     "index": idx,
-                    "title": title,
+                    "title": f"第{idx}章 {title}",
                     "page": page,
                     "original": line,
                     "subsections": []
                 })
             else:
-                # 作为上一章的节
+                # 作为上一章的节,使用标准格式 一、二、三
                 if chapters:
+                    section_idx += 1
+                    section_cn = self._number_to_chinese(section_idx)
                     chapters[-1]["subsections"].append({
-                        "title": title,
+                        "title": f"{section_cn}、{title}",
                         "page": page,
                         "level": 2,
                         "original": line
@@ -553,14 +578,14 @@ class TOCCatalogExtractor:
 
         return chapters
 
-    def _chinese_to_number(self, chinese: str) -> int:
-        """中文数字转阿拉伯数字"""
+    def _number_to_chinese(self, num: int) -> str:
+        """阿拉伯数字转中文数字"""
         chinese_nums = {
-            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
-            '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
-            '十一': 11, '十二': 12
+            1: '一', 2: '二', 3: '三', 4: '四', 5: '五',
+            6: '六', 7: '七', 8: '八', 9: '九', 10: '十',
+            11: '十一', 12: '十二', 13: '十三', 14: '十四', 15: '十五'
         }
-        return chinese_nums.get(chinese, 0)
+        return chinese_nums.get(num, str(num))
 
 
 def extract_catalog_from_pdf(