1 mēnesi atpakaļ · 6e4807d82e
--- a/core/construction_review/component/ai_review_engine.py
+++ b/core/construction_review/component/ai_review_engine.py
@@ -1012,8 +1012,6 @@ class AIReviewEngine(BaseReviewer):
 
				         Returns:
			
 
				             Dict[str, Any]: 包含缺失一级、二级目录的统计结果
			
 
				         """
			
 
				-        from .outline_catalogue_matcher import OutlineCatalogueMatcher
			
 
				-
			
 
				         start_time = time.time()
			
 
				         name = "outline_catalogue_check"
			
 
				 
			
@@ -1025,149 +1023,104 @@ class AIReviewEngine(BaseReviewer):
 
				                 Path(__file__).parent / 'doc_worker' / 'config' /
			
 
				                 'StandardCategoryTable.csv'
			
 
				             )
			
 
				-            raw_content_csv = str(
			
 
				-                Path(__file__).parent / 'doc_worker' / 'config' /
			
 
				-                'construction_plan_standards.csv'
			
 
				-            )
			
 
				 
			
 
				-            # 获取 catalog（YOLO+OCR提取的原始目录）和 outline（分类后的结构）
			
 
				+            # 获取 catalog（YOLO+OCR提取的目录，已包含分类代码）
			
 
				             catalog_chapters = []
			
 
				-            outline_chapters = []
			
 
				 
			
 
				             if outline_data and isinstance(outline_data, dict):
			
 
				-                # 从 outline_data 获取 catalog（优先）
			
 
				                 catalog_raw = outline_data.get('catalog')
			
 
				                 if catalog_raw and isinstance(catalog_raw, dict):
			
 
				                     catalog_chapters = catalog_raw.get('chapters', [])
			
 
				-                # 获取 outline（用于分类代码映射）
			
 
				-                outline_raw = outline_data.get('outline')
			
 
				-                if isinstance(outline_raw, dict):
			
 
				-                    outline_chapters = outline_raw.get('chapters', [])
			
 
				-                elif isinstance(outline_raw, list):
			
 
				-                    outline_chapters = outline_raw
			
 
				 
			
 
				             # 从 state 回退获取
			
 
				-            if state and isinstance(state, dict):
			
 
				+            if not catalog_chapters and state and isinstance(state, dict):
			
 
				                 structured = state.get('structured_content', {})
			
 
				+                catalog_raw = structured.get('catalog')
			
 
				+                if catalog_raw and isinstance(catalog_raw, dict):
			
 
				+                    catalog_chapters = catalog_raw.get('chapters', [])
			
 
				 
			
 
				-                # 获取 catalog
			
 
				-                if not catalog_chapters:
			
 
				-                    catalog_raw = structured.get('catalog')
			
 
				-                    if catalog_raw and isinstance(catalog_raw, dict):
			
 
				-                        catalog_chapters = catalog_raw.get('chapters', [])
			
 
				-
			
 
				-                # 获取 outline
			
 
				-                if not outline_chapters:
			
 
				-                    outline_raw = structured.get('outline', {})
			
 
				-                    if isinstance(outline_raw, dict):
			
 
				-                        outline_chapters = outline_raw.get('chapters', [])
			
 
				-                    elif isinstance(outline_raw, list):
			
 
				-                        outline_chapters = outline_raw
			
 
				-
			
 
				-            logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章, outline: {len(outline_chapters)} 章")
			
 
				-
			
 
				-            # 使用模糊匹配器
			
 
				-            matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
			
 
				-
			
 
				-            # 构建 outline 标题到分类代码的映射
			
 
				-            # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
			
 
				-            outline_first_map = {}  # title -> chapter_classification
			
 
				-            outline_second_map = {}  # (first_title, sub_title) -> secondary_category_code
			
 
				-
			
 
				-            for chapter in outline_chapters:
			
 
				+            logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章")
			
 
				+
			
 
				+            # 读取 CSV 标准分类表
			
 
				+            import pandas as pd
			
 
				+            df = pd.read_csv(csv_path)
			
 
				+
			
 
				+            # 构建标准一级和二级目录
			
 
				+            standard_first = {}  # code -> name
			
 
				+            standard_second = {}  # (first_code, second_code) -> name
			
 
				+            for _, row in df.iterrows():
			
 
				+                first_code = row.get('first_code', '')
			
 
				+                first_name = row.get('first_name', '')
			
 
				+                second_code = row.get('second_code', '')
			
 
				+                second_name = row.get('second_name', '')
			
 
				+
			
 
				+                if first_code and first_name:
			
 
				+                    standard_first[first_code] = first_name
			
 
				+                if first_code and second_code and second_name:
			
 
				+                    standard_second[(first_code, second_code)] = second_name
			
 
				+
			
 
				+            # 从 catalog 收集实际存在的一级和二级 code
			
 
				+            actual_first = set()
			
 
				+            actual_second = set()
			
 
				+            for chapter in catalog_chapters:
			
 
				                 if not isinstance(chapter, dict):
			
 
				                     continue
			
 
				                 first_code = chapter.get('chapter_classification', '')
			
 
				-                first_title = chapter.get('title', '')
			
 
				-                if first_code and first_title:
			
 
				-                    outline_first_map[first_title] = first_code
			
 
				+                if first_code:
			
 
				+                    actual_first.add(first_code)
			
 
				 
			
 
				                 for sub in chapter.get('subsections', []):
			
 
				                     if isinstance(sub, dict):
			
 
				-                        sub_title = sub.get('title', '')
			
 
				                         second_code = sub.get('secondary_category_code', '')
			
 
				-                        if first_title and sub_title and second_code:
			
 
				-                            outline_second_map[(first_title, sub_title)] = second_code
			
 
				-
			
 
				-            # 使用 catalog 的标题，匹配 outline 的分类代码
			
 
				-            outline_first = set()
			
 
				-            outline_secondary = {}
			
 
				-
			
 
				-            for chapter in catalog_chapters:
			
 
				-                if not isinstance(chapter, dict):
			
 
				-                    continue
			
 
				-
			
 
				-                catalog_title = chapter.get('title', '')
			
 
				-                if not catalog_title:
			
 
				-                    continue
			
 
				-
			
 
				-                # 尝试从 outline 匹配一级分类代码
			
 
				-                first_code = outline_first_map.get(catalog_title)
			
 
				+                        if first_code and second_code:
			
 
				+                            actual_second.add((first_code, second_code))
			
 
				+
			
 
				+            # 计算缺失项
			
 
				+            missing_first = []
			
 
				+            matched_first = set()
			
 
				+            for code, name in standard_first.items():
			
 
				+                if code in actual_first:
			
 
				+                    matched_first.add(code)
			
 
				+                else:
			
 
				+                    missing_first.append({'first_code': code, 'first_name': name})
			
 
				 
			
 
				-                # 如果精确匹配失败，尝试模糊匹配
			
 
				-                if not first_code:
			
 
				-                    for outline_title, code in outline_first_map.items():
			
 
				-                        if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
			
 
				-                            first_code = code
			
 
				-                            break
			
 
				+            missing_second = []
			
 
				+            matched_second = set()
			
 
				+            for (fc, sc), name in standard_second.items():
			
 
				+                if (fc, sc) in actual_second:
			
 
				+                    matched_second.add((fc, sc))
			
 
				+                else:
			
 
				+                    missing_second.append({
			
 
				+                        'first_code': fc,
			
 
				+                        'second_code': sc,
			
 
				+                        'second_name': name
			
 
				+                    })
			
 
				 
			
 
				-                if first_code:
			
 
				-                    outline_first.add(first_code)
			
 
				+            logger.info(f"[{name}] 标准一级: {len(standard_first)} 个, 实际: {len(matched_first)} 个, 缺失: {len(missing_first)} 个")
			
 
				+            logger.info(f"[{name}] 标准二级: {len(standard_second)} 个, 实际: {len(matched_second)} 个, 缺失: {len(missing_second)} 个")
			
 
				 
			
 
				-                # 匹配二级分类
			
 
				-                for sub in chapter.get('subsections', []):
			
 
				-                    if not isinstance(sub, dict):
			
 
				-                        continue
			
 
				-                    sub_title = sub.get('title', '')
			
 
				-                    if not sub_title:
			
 
				-                        continue
			
 
				-
			
 
				-                    # 尝试精确匹配
			
 
				-                    second_code = outline_second_map.get((catalog_title, sub_title))
			
 
				-
			
 
				-                    # 模糊匹配
			
 
				-                    if not second_code and first_code:
			
 
				-                        for (outline_first, outline_sub), code in outline_second_map.items():
			
 
				-                            if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
			
 
				-                               matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
			
 
				-                                second_code = code
			
 
				-                                break
			
 
				-
			
 
				-                    if first_code and second_code:
			
 
				-                        outline_secondary[(first_code, second_code)] = sub_title
			
 
				-
			
 
				-            logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
			
 
				-
			
 
				-            # 执行标准目录匹配检查
			
 
				-            match_result = matcher.match_catalogue(
			
 
				-                outline_first=outline_first,
			
 
				-                outline_secondary=outline_secondary,
			
 
				-                threshold=0.6
			
 
				-            )
			
 
				-            
			
 
				             catalogue_result = {
			
 
				                 "level": "primary_and_secondary",
			
 
				-                "is_complete": match_result['missing_first_count'] == 0 and match_result['missing_second_count'] == 0,
			
 
				+                "is_complete": len(missing_first) == 0 and len(missing_second) == 0,
			
 
				                 "first_level": {
			
 
				-                    "total_required": len(matcher.first_names),
			
 
				-                    "actual_present": len(match_result['matched_first']),
			
 
				-                    "missing_count": match_result['missing_first_count'],
			
 
				-                    "missing": match_result['missing_first']
			
 
				+                    "total_required": len(standard_first),
			
 
				+                    "actual_present": len(matched_first),
			
 
				+                    "missing_count": len(missing_first),
			
 
				+                    "missing": missing_first
			
 
				                 },
			
 
				                 "second_level": {
			
 
				-                    "total_required": len(matcher.second_names),
			
 
				-                    "actual_present": len(match_result['matched_second']),
			
 
				-                    "missing_count": match_result['missing_second_count'],
			
 
				-                    "missing": match_result['missing_second']
			
 
				-                },
			
 
				-                "match_details": match_result['match_details']
			
 
				+                    "total_required": len(standard_second),
			
 
				+                    "actual_present": len(matched_second),
			
 
				+                    "missing_count": len(missing_second),
			
 
				+                    "missing": missing_second
			
 
				+                }
			
 
				             }
			
 
				 
			
 
				             execution_time = time.time() - start_time
			
 
				             logger.info(
			
 
				                 f"[{name}] 检查完成，耗时: {execution_time:.2f}s, "
			
 
				-                f"缺失一级: {match_result['missing_first_count']} 个, "
			
 
				-                f"缺失二级: {match_result['missing_second_count']} 个"
			
 
				+                f"缺失一级: {len(missing_first)} 个, "
			
 
				+                f"缺失二级: {len(missing_second)} 个"
			
 
				             )
			
 
				 
			
 
				             return {
			
@@ -1175,10 +1128,10 @@ class AIReviewEngine(BaseReviewer):
 
				                 "execution_time": execution_time,
			
 
				                 "details": {
			
 
				                     "name": name,
			
 
				-                    "missing_first_count": match_result['missing_first_count'],
			
 
				-                    "missing_second_count": match_result['missing_second_count'],
			
 
				-                    "missing_first": match_result['missing_first'],
			
 
				-                    "missing_second": match_result['missing_second'],
			
 
				+                    "missing_first_count": len(missing_first),
			
 
				+                    "missing_second_count": len(missing_second),
			
 
				+                    "missing_first": missing_first,
			
 
				+                    "missing_second": missing_second,
			
 
				                     "catalogue_check": catalogue_result
			
 
				                 }
			
 
				             }
			
--- a/core/construction_review/component/doc_worker/models/document_structure.py
+++ b/core/construction_review/component/doc_worker/models/document_structure.py
@@ -281,7 +281,8 @@ class UnifiedDocumentStructure:
 
				                 }
			
 
				                 for t in self.tertiary_classifications
			
 
				             ],
			
 
				-            "outline": self.outline.to_dict()
			
 
				+            "outline": self.outline.to_dict(),
			
 
				+            "catalog": self.catalog
			
 
				         }
			
 
				 
			
 
				     @classmethod
			
@@ -349,6 +350,7 @@ class UnifiedDocumentStructure:
 
				                 for t in data.get("tertiary_classifications", [])
			
 
				             ],
			
 
				             outline=Outline.from_dict(data.get("outline", [])),
			
 
				+            catalog=data.get("catalog"),
			
 
				             raw_metadata=data.get("raw_metadata", {})
			
 
				         )
			
 
				 
			
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -122,6 +122,18 @@ class SimpleDocumentProcessor:
 
				 
			
 
				         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
			
 
				         catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
			
 
				+
			
 
				+        # 对 catalog 进行分类（如果存在）
			
 
				+        if catalog and catalog.get("chapters"):
			
 
				+            try:
			
 
				+                catalog = await self._classify_catalog(catalog)
			
 
				+                logger.info(f"[SimpleProcessor] Catalog分类完成")
			
 
				+                # 验证一级分类是否写入
			
 
				+                for ch in catalog.get("chapters", [])[:2]:
			
 
				+                    logger.info(f"[SimpleProcessor] Catalog章节验证: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
			
 
				+            except Exception as e:
			
 
				+                logger.warning(f"[SimpleProcessor] Catalog分类失败: {e}")
			
 
				+
			
 
				         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
			
 
				 
			
 
				         # 2. 一级分类
			
@@ -139,7 +151,7 @@ class SimpleDocumentProcessor:
 
				         chunks = assemble_chunks(structure, primary_result, secondary_result)
			
 
				         if not chunks:
			
 
				             logger.warning("[SimpleProcessor] 无可用的 chunks")
			
 
				-            return structure, primary_result, secondary_result, chunks
			
 
				+            return structure, primary_result, secondary_result, chunks, catalog
			
 
				         await self._emit_progress(progress_callback, "文档切分", 50, f"组装 {len(chunks)} 个内容块")
			
 
				 
			
 
				         # 5. 三级分类
			
@@ -158,6 +170,12 @@ class SimpleDocumentProcessor:
 
				         logger.info("[SimpleProcessor] 三级分类完成")
			
 
				         await self._emit_progress(progress_callback, "文档分类", 90, "三级分类完成")
			
 
				 
			
 
				+        # 验证返回前的catalog
			
 
				+        if catalog:
			
 
				+            logger.info(f"[SimpleProcessor] 返回前Catalog验证: {len(catalog.get('chapters', []))} 章")
			
 
				+            for ch in catalog.get("chapters", [])[:2]:
			
 
				+                logger.info(f"[SimpleProcessor] 返回前章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
			
 
				+
			
 
				         return structure, primary_result, secondary_result, chunks, catalog
			
 
				 
			
 
				     async def _emit_progress(
			
@@ -226,6 +244,10 @@ class SimpleDocumentProcessor:
 
				         }
			
 
				 
			
 
				         # 设置目录结构（YOLO检测+OCR提取）
			
 
				+        if catalog:
			
 
				+            logger.info(f"[_build_unified_doc] 设置catalog: {len(catalog.get('chapters', []))} 章")
			
 
				+            for ch in catalog.get("chapters", [])[:2]:
			
 
				+                logger.info(f"[_build_unified_doc] catalog章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
			
 
				         unified.catalog = catalog
			
 
				 
			
 
				         return unified
			
@@ -317,3 +339,109 @@ class SimpleDocumentProcessor:
 
				             total_pages=total_pages,
			
 
				             secondary_classifications=[],
			
 
				         )
			
 
				+
			
 
    async def _classify_catalog(self, catalog: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify the catalog's chapters and sections with HierarchyClassifier.

        The catalog is flattened into TOC items (level 1 for chapters, level 2
        for their subsections), run through the first- and second-level
        classifiers, and the resulting codes are written back onto the catalog
        entries in place.

        Args:
            catalog: Catalog dict with a "chapters" list; each chapter may carry
                a "subsections" list.

        Returns:
            The same ``catalog`` object. Matched chapters gain
            "chapter_classification" and "first_name"; matched subsections gain
            "secondary_category_code"; every chapter and subsection gets its
            "original" field rewritten.
        """

        def _clean(value: Any) -> str:
            # Titles may be missing or explicitly None; treat both as empty.
            return value.strip() if isinstance(value, str) else ""

        def _overlaps(left: str, right: str) -> bool:
            # The empty string is a substring of every string, so an empty
            # title must never count as a fuzzy match.
            return bool(left) and bool(right) and (left in right or right in left)

        chapters = [
            chapter
            for chapter in (catalog.get("chapters") or [])
            if isinstance(chapter, dict)
        ]

        # Flatten into the toc_items format expected by the classifier.
        toc_items = []
        for chapter in chapters:
            # First-level entry
            toc_items.append({
                "title": chapter.get("title", ""),
                "page": str(chapter.get("page", "0")),
                "level": 1,
                "original": chapter.get("original", "")
            })
            # Second-level entries
            for sub in chapter.get("subsections") or []:
                if not isinstance(sub, dict):
                    continue
                toc_items.append({
                    "title": sub.get("title", ""),
                    "page": str(sub.get("page", "0")),
                    "level": 2,
                    "original": sub.get("original", "")
                })

        if not toc_items:
            return catalog

        # Imported lazily so an empty catalog never needs the classifier module.
        from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier

        classifier = HierarchyClassifier()

        # First-level classification, then second-level based on its result.
        primary_result = await classifier.classify_async(toc_items, target_level=1)
        secondary_result = await classifier.classify_secondary_async(primary_result)

        primary_items = (primary_result or {}).get("items") or []
        secondary_items = (secondary_result or {}).get("items") or []

        # First-level mapping: stripped title -> {"code", "name"}
        first_classification = {}
        for item in primary_items:
            if not isinstance(item, dict):
                continue
            first_classification[_clean(item.get("title"))] = {
                "code": item.get("category_code", ""),
                "name": item.get("category", "")
            }

        # Second-level mapping: (chapter title, section title) -> code
        second_classification = {}
        for first_item in secondary_items:
            if not isinstance(first_item, dict):
                continue
            first_title = _clean(
                first_item.get("original_title") or first_item.get("first_category")
            )
            for cls in first_item.get("classifications") or []:
                if not isinstance(cls, dict):
                    continue
                key = (first_title, _clean(cls.get("title")))
                second_classification[key] = cls.get("category_code", "")

        logger.info(f"[_classify_catalog] 一级分类映射: {first_classification}")
        logger.info(f"[_classify_catalog] 二级分类映射: {list(second_classification.keys())}")

        # Write results back and align the "original" format with the outline.
        for position, chapter in enumerate(chapters, start=1):
            ch_title = _clean(chapter.get("title"))
            logger.info(f"[_classify_catalog] 处理章节: '{ch_title}'")
            logger.info(f"[_classify_catalog] 查找一级分类, keys={list(first_classification.keys())}")

            exact_hit = first_classification.get(ch_title)
            if exact_hit is not None:
                chapter["chapter_classification"] = exact_hit["code"]
                chapter["first_name"] = exact_hit["name"]
                logger.info(f"[_classify_catalog] 精确匹配成功: {ch_title} -> {exact_hit}")
            else:
                # Fuzzy fallback: one title contains the other.
                for cls_title, cls_data in first_classification.items():
                    if _overlaps(cls_title, ch_title):
                        chapter["chapter_classification"] = cls_data["code"]
                        chapter["first_name"] = cls_data["name"]
                        logger.info(f"[_classify_catalog] 模糊匹配成功: '{ch_title}' ~ '{cls_title}' -> {cls_data}")
                        break
                else:
                    logger.warning(f"[_classify_catalog] 未找到匹配: '{ch_title}' 不在 {list(first_classification.keys())}")

            # Chapters without an explicit "index" fall back to their 1-based
            # position instead of all being labelled as chapter 1.
            chapter_no = chapter.get("index", position)

            # Keep only the chapter ordinal in "original".
            chapter["original"] = f"第{chapter_no}章"

            # Second level
            for sub in chapter.get("subsections") or []:
                if not isinstance(sub, dict):
                    continue
                sub_title = _clean(sub.get("title"))
                key = (ch_title, sub_title)
                if key in second_classification:
                    sub["secondary_category_code"] = second_classification[key]
                else:
                    # Fuzzy fallback on both the chapter and the section title.
                    for (ft, st), code in second_classification.items():
                        if _overlaps(ft, ch_title) and _overlaps(st, sub_title):
                            sub["secondary_category_code"] = code
                            break
                # "original" for sections: "<chapter ordinal>-><section title>"
                sub["original"] = f"第{chapter_no}章->{sub_title}"

        return catalog