Pārlūkot izejas kodu

fix(catalog): 修复目录完整性检查,直接使用分类代码对比

- 重写 check_outline_catalogue,直接从 catalog 读取分类代码
- 对比 CSV 标准分类表,code 精确匹配,移除模糊匹配
- 修复 document_structure 序列化,添加 catalog 字段
- 修复 simple_processor 目录分类结果写回逻辑
- 统一 catalog 和 outline 的字段格式
WangXuMing 1 nedēļu atpakaļ
vecāks
revīzija
6e4807d82e

+ 71 - 118
core/construction_review/component/ai_review_engine.py

@@ -1012,8 +1012,6 @@ class AIReviewEngine(BaseReviewer):
         Returns:
             Dict[str, Any]: 包含缺失一级、二级目录的统计结果
         """
-        from .outline_catalogue_matcher import OutlineCatalogueMatcher
-
         start_time = time.time()
         name = "outline_catalogue_check"
 
@@ -1025,149 +1023,104 @@ class AIReviewEngine(BaseReviewer):
                 Path(__file__).parent / 'doc_worker' / 'config' /
                 'StandardCategoryTable.csv'
             )
-            raw_content_csv = str(
-                Path(__file__).parent / 'doc_worker' / 'config' /
-                'construction_plan_standards.csv'
-            )
 
-            # 获取 catalog(YOLO+OCR提取的原始目录)和 outline(分类后的结构
+            # 获取 catalog(YOLO+OCR提取的目录,已包含分类代码)
             catalog_chapters = []
-            outline_chapters = []
 
             if outline_data and isinstance(outline_data, dict):
-                # 从 outline_data 获取 catalog(优先)
                 catalog_raw = outline_data.get('catalog')
                 if catalog_raw and isinstance(catalog_raw, dict):
                     catalog_chapters = catalog_raw.get('chapters', [])
-                # 获取 outline(用于分类代码映射)
-                outline_raw = outline_data.get('outline')
-                if isinstance(outline_raw, dict):
-                    outline_chapters = outline_raw.get('chapters', [])
-                elif isinstance(outline_raw, list):
-                    outline_chapters = outline_raw
 
             # 从 state 回退获取
-            if state and isinstance(state, dict):
+            if not catalog_chapters and state and isinstance(state, dict):
                 structured = state.get('structured_content', {})
+                catalog_raw = structured.get('catalog')
+                if catalog_raw and isinstance(catalog_raw, dict):
+                    catalog_chapters = catalog_raw.get('chapters', [])
 
-                # 获取 catalog
-                if not catalog_chapters:
-                    catalog_raw = structured.get('catalog')
-                    if catalog_raw and isinstance(catalog_raw, dict):
-                        catalog_chapters = catalog_raw.get('chapters', [])
-
-                # 获取 outline
-                if not outline_chapters:
-                    outline_raw = structured.get('outline', {})
-                    if isinstance(outline_raw, dict):
-                        outline_chapters = outline_raw.get('chapters', [])
-                    elif isinstance(outline_raw, list):
-                        outline_chapters = outline_raw
-
-            logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章, outline: {len(outline_chapters)} 章")
-
-            # 使用模糊匹配器
-            matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
-
-            # 构建 outline 标题到分类代码的映射
-            # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
-            outline_first_map = {}  # title -> chapter_classification
-            outline_second_map = {}  # (first_title, sub_title) -> secondary_category_code
-
-            for chapter in outline_chapters:
+            logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章")
+
+            # 读取 CSV 标准分类表
+            import pandas as pd
+            df = pd.read_csv(csv_path)
+
+            # 构建标准一级和二级目录
+            standard_first = {}  # code -> name
+            standard_second = {}  # (first_code, second_code) -> name
+            for _, row in df.iterrows():
+                first_code = row.get('first_code', '')
+                first_name = row.get('first_name', '')
+                second_code = row.get('second_code', '')
+                second_name = row.get('second_name', '')
+
+                if first_code and first_name:
+                    standard_first[first_code] = first_name
+                if first_code and second_code and second_name:
+                    standard_second[(first_code, second_code)] = second_name
+
+            # 从 catalog 收集实际存在的一级和二级 code
+            actual_first = set()
+            actual_second = set()
+            for chapter in catalog_chapters:
                 if not isinstance(chapter, dict):
                     continue
                 first_code = chapter.get('chapter_classification', '')
-                first_title = chapter.get('title', '')
-                if first_code and first_title:
-                    outline_first_map[first_title] = first_code
+                if first_code:
+                    actual_first.add(first_code)
 
                 for sub in chapter.get('subsections', []):
                     if isinstance(sub, dict):
-                        sub_title = sub.get('title', '')
                         second_code = sub.get('secondary_category_code', '')
-                        if first_title and sub_title and second_code:
-                            outline_second_map[(first_title, sub_title)] = second_code
-
-            # 使用 catalog 的标题,匹配 outline 的分类代码
-            outline_first = set()
-            outline_secondary = {}
-
-            for chapter in catalog_chapters:
-                if not isinstance(chapter, dict):
-                    continue
-
-                catalog_title = chapter.get('title', '')
-                if not catalog_title:
-                    continue
-
-                # 尝试从 outline 匹配一级分类代码
-                first_code = outline_first_map.get(catalog_title)
+                        if first_code and second_code:
+                            actual_second.add((first_code, second_code))
+
+            # 计算缺失项
+            missing_first = []
+            matched_first = set()
+            for code, name in standard_first.items():
+                if code in actual_first:
+                    matched_first.add(code)
+                else:
+                    missing_first.append({'first_code': code, 'first_name': name})
 
-                # 如果精确匹配失败,尝试模糊匹配
-                if not first_code:
-                    for outline_title, code in outline_first_map.items():
-                        if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
-                            first_code = code
-                            break
+            missing_second = []
+            matched_second = set()
+            for (fc, sc), name in standard_second.items():
+                if (fc, sc) in actual_second:
+                    matched_second.add((fc, sc))
+                else:
+                    missing_second.append({
+                        'first_code': fc,
+                        'second_code': sc,
+                        'second_name': name
+                    })
 
-                if first_code:
-                    outline_first.add(first_code)
+            logger.info(f"[{name}] 标准一级: {len(standard_first)} 个, 实际: {len(matched_first)} 个, 缺失: {len(missing_first)} 个")
+            logger.info(f"[{name}] 标准二级: {len(standard_second)} 个, 实际: {len(matched_second)} 个, 缺失: {len(missing_second)} 个")
 
-                # 匹配二级分类
-                for sub in chapter.get('subsections', []):
-                    if not isinstance(sub, dict):
-                        continue
-                    sub_title = sub.get('title', '')
-                    if not sub_title:
-                        continue
-
-                    # 尝试精确匹配
-                    second_code = outline_second_map.get((catalog_title, sub_title))
-
-                    # 模糊匹配
-                    if not second_code and first_code:
-                        for (outline_first, outline_sub), code in outline_second_map.items():
-                            if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
-                               matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
-                                second_code = code
-                                break
-
-                    if first_code and second_code:
-                        outline_secondary[(first_code, second_code)] = sub_title
-
-            logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
-
-            # 执行标准目录匹配检查
-            match_result = matcher.match_catalogue(
-                outline_first=outline_first,
-                outline_secondary=outline_secondary,
-                threshold=0.6
-            )
-            
             catalogue_result = {
                 "level": "primary_and_secondary",
-                "is_complete": match_result['missing_first_count'] == 0 and match_result['missing_second_count'] == 0,
+                "is_complete": len(missing_first) == 0 and len(missing_second) == 0,
                 "first_level": {
-                    "total_required": len(matcher.first_names),
-                    "actual_present": len(match_result['matched_first']),
-                    "missing_count": match_result['missing_first_count'],
-                    "missing": match_result['missing_first']
+                    "total_required": len(standard_first),
+                    "actual_present": len(matched_first),
+                    "missing_count": len(missing_first),
+                    "missing": missing_first
                 },
                 "second_level": {
-                    "total_required": len(matcher.second_names),
-                    "actual_present": len(match_result['matched_second']),
-                    "missing_count": match_result['missing_second_count'],
-                    "missing": match_result['missing_second']
-                },
-                "match_details": match_result['match_details']
+                    "total_required": len(standard_second),
+                    "actual_present": len(matched_second),
+                    "missing_count": len(missing_second),
+                    "missing": missing_second
+                }
             }
 
             execution_time = time.time() - start_time
             logger.info(
                 f"[{name}] 检查完成,耗时: {execution_time:.2f}s, "
-                f"缺失一级: {match_result['missing_first_count']} 个, "
-                f"缺失二级: {match_result['missing_second_count']} 个"
+                f"缺失一级: {len(missing_first)} 个, "
+                f"缺失二级: {len(missing_second)} 个"
             )
 
             return {
@@ -1175,10 +1128,10 @@ class AIReviewEngine(BaseReviewer):
                 "execution_time": execution_time,
                 "details": {
                     "name": name,
-                    "missing_first_count": match_result['missing_first_count'],
-                    "missing_second_count": match_result['missing_second_count'],
-                    "missing_first": match_result['missing_first'],
-                    "missing_second": match_result['missing_second'],
+                    "missing_first_count": len(missing_first),
+                    "missing_second_count": len(missing_second),
+                    "missing_first": missing_first,
+                    "missing_second": missing_second,
                     "catalogue_check": catalogue_result
                 }
             }

+ 3 - 1
core/construction_review/component/doc_worker/models/document_structure.py

@@ -281,7 +281,8 @@ class UnifiedDocumentStructure:
                 }
                 for t in self.tertiary_classifications
             ],
-            "outline": self.outline.to_dict()
+            "outline": self.outline.to_dict(),
+            "catalog": self.catalog
         }
 
     @classmethod
@@ -349,6 +350,7 @@ class UnifiedDocumentStructure:
                 for t in data.get("tertiary_classifications", [])
             ],
             outline=Outline.from_dict(data.get("outline", [])),
+            catalog=data.get("catalog"),
             raw_metadata=data.get("raw_metadata", {})
         )
 

+ 129 - 1
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -122,6 +122,18 @@ class SimpleDocumentProcessor:
 
         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
         catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
+
+        # 对 catalog 进行分类(如果存在)
+        if catalog and catalog.get("chapters"):
+            try:
+                catalog = await self._classify_catalog(catalog)
+                logger.info(f"[SimpleProcessor] Catalog分类完成")
+                # 验证一级分类是否写入
+                for ch in catalog.get("chapters", [])[:2]:
+                    logger.info(f"[SimpleProcessor] Catalog章节验证: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
+            except Exception as e:
+                logger.warning(f"[SimpleProcessor] Catalog分类失败: {e}")
+
         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
 
         # 2. 一级分类
@@ -139,7 +151,7 @@ class SimpleDocumentProcessor:
         chunks = assemble_chunks(structure, primary_result, secondary_result)
         if not chunks:
             logger.warning("[SimpleProcessor] 无可用的 chunks")
-            return structure, primary_result, secondary_result, chunks
+            return structure, primary_result, secondary_result, chunks, catalog
         await self._emit_progress(progress_callback, "文档切分", 50, f"组装 {len(chunks)} 个内容块")
 
         # 5. 三级分类
@@ -158,6 +170,12 @@ class SimpleDocumentProcessor:
         logger.info("[SimpleProcessor] 三级分类完成")
         await self._emit_progress(progress_callback, "文档分类", 90, "三级分类完成")
 
+        # 验证返回前的catalog
+        if catalog:
+            logger.info(f"[SimpleProcessor] 返回前Catalog验证: {len(catalog.get('chapters', []))} 章")
+            for ch in catalog.get("chapters", [])[:2]:
+                logger.info(f"[SimpleProcessor] 返回前章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
+
         return structure, primary_result, secondary_result, chunks, catalog
 
     async def _emit_progress(
@@ -226,6 +244,10 @@ class SimpleDocumentProcessor:
         }
 
         # 设置目录结构(YOLO检测+OCR提取)
+        if catalog:
+            logger.info(f"[_build_unified_doc] 设置catalog: {len(catalog.get('chapters', []))} 章")
+            for ch in catalog.get("chapters", [])[:2]:
+                logger.info(f"[_build_unified_doc] catalog章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
         unified.catalog = catalog
 
         return unified
@@ -317,3 +339,109 @@ class SimpleDocumentProcessor:
             total_pages=total_pages,
             secondary_classifications=[],
         )
+
+    async def _classify_catalog(self, catalog: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Run first- and second-level classification over ``catalog`` with
+        HierarchyClassifier and write the resulting codes back onto it.
+
+        The chapters and their subsections are flattened into TOC items,
+        classified, and the results are matched back to the catalog entries by
+        title (exact match first, then a substring match in either direction).
+
+        Args:
+            catalog: Dict holding a ``chapters`` list. Each chapter may carry
+                ``title``, ``page``, ``original``, ``index`` and ``subsections``.
+
+        Returns:
+            The same ``catalog`` object, mutated in place. Matched chapters gain
+            ``chapter_classification`` and ``first_name``; matched subsections
+            gain ``secondary_category_code``; ``original`` is rewritten on both
+            levels. If there is nothing to classify it is returned untouched.
+        """
+        from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+
+        def _clean(value: Any) -> str:
+            # Titles may be missing or None; normalise to a stripped string.
+            return value.strip() if isinstance(value, str) else ""
+
+        def _overlaps(left: str, right: str) -> bool:
+            # Substring match in either direction. Empty strings are rejected
+            # explicitly: ``"" in s`` is always True, so an empty title would
+            # otherwise "match" the first entry and receive a wrong code.
+            return bool(left) and bool(right) and (left in right or right in left)
+
+        # Ignore malformed (non-dict) entries instead of failing on ``.get``.
+        chapters = [ch for ch in (catalog.get("chapters") or []) if isinstance(ch, dict)]
+
+        # Flatten the catalog into the toc_items format expected by the classifier.
+        toc_items = []
+        for chapter in chapters:
+            # First-level entry
+            toc_items.append({
+                "title": chapter.get("title", ""),
+                "page": str(chapter.get("page", "0")),
+                "level": 1,
+                "original": chapter.get("original", "")
+            })
+            # Second-level entries
+            for sub in chapter.get("subsections") or []:
+                if not isinstance(sub, dict):
+                    continue
+                toc_items.append({
+                    "title": sub.get("title", ""),
+                    "page": str(sub.get("page", "0")),
+                    "level": 2,
+                    "original": sub.get("original", "")
+                })
+
+        if not toc_items:
+            return catalog
+
+        classifier = HierarchyClassifier()
+
+        # First-level classification, then second-level based on its result.
+        primary_result = await classifier.classify_async(toc_items, target_level=1)
+        secondary_result = await classifier.classify_secondary_async(primary_result)
+
+        primary_items = primary_result.get("items", [])
+        secondary_items = secondary_result.get("items", [])
+
+        # First-level lookup: stripped title -> {"code", "name"}
+        first_classification = {}
+        for item in primary_items:
+            first_classification[_clean(item.get("title"))] = {
+                "code": item.get("category_code", ""),
+                "name": item.get("category", "")
+            }
+
+        # Second-level lookup: (first_title, sub_title) -> code
+        second_classification = {}
+        for first_item in secondary_items:
+            first_title = _clean(
+                first_item.get("original_title", "") or first_item.get("first_category", "")
+            )
+            for cls in first_item.get("classifications", []):
+                key = (first_title, _clean(cls.get("title")))
+                second_classification[key] = cls.get("category_code", "")
+
+        logger.info(f"[_classify_catalog] 一级分类映射: {first_classification}")
+        logger.info(f"[_classify_catalog] 二级分类映射: {list(second_classification.keys())}")
+
+        # Write the results back and align the ``original`` format with outline.
+        for position, chapter in enumerate(chapters, start=1):
+            ch_title = _clean(chapter.get("title"))
+            logger.info(f"[_classify_catalog] 处理章节: '{ch_title}'")
+            logger.info(f"[_classify_catalog] 查找一级分类, keys={list(first_classification.keys())}")
+
+            if ch_title in first_classification:
+                chapter["chapter_classification"] = first_classification[ch_title]["code"]
+                chapter["first_name"] = first_classification[ch_title]["name"]
+                logger.info(f"[_classify_catalog] 精确匹配成功: {ch_title} -> {first_classification[ch_title]}")
+            else:
+                # Fall back to a substring match against the classified titles.
+                matched = False
+                for cls_title, cls_data in first_classification.items():
+                    if _overlaps(cls_title, ch_title):
+                        chapter["chapter_classification"] = cls_data["code"]
+                        chapter["first_name"] = cls_data["name"]
+                        logger.info(f"[_classify_catalog] 模糊匹配成功: '{ch_title}' ~ '{cls_title}' -> {cls_data}")
+                        matched = True
+                        break
+                if not matched:
+                    logger.warning(f"[_classify_catalog] 未找到匹配: '{ch_title}' 不在 {list(first_classification.keys())}")
+
+            # Use the chapter's own ``index`` when present; otherwise fall back to
+            # its 1-based position so chapters are not all labelled chapter 1.
+            chapter_no = chapter.get('index', position)
+
+            # ``original`` keeps only the chapter ordinal.
+            chapter["original"] = f"第{chapter_no}章"
+
+            # Second level
+            for sub in chapter.get("subsections") or []:
+                if not isinstance(sub, dict):
+                    continue
+                sub_title = _clean(sub.get("title"))
+                key = (ch_title, sub_title)
+                if key in second_classification:
+                    sub["secondary_category_code"] = second_classification[key]
+                else:
+                    # Substring fallback; both levels must overlap.
+                    for (ft, st), code in second_classification.items():
+                        if _overlaps(ft, ch_title) and _overlaps(st, sub_title):
+                            sub["secondary_category_code"] = code
+                            break
+                # ``original`` becomes "<chapter ordinal>-><section title>".
+                sub["original"] = f"第{chapter_no}章->{sub_title}"
+
+        return catalog