hace 1 semana · 6e4807d82e
--- a/core/construction_review/component/ai_review_engine.py
+++ b/core/construction_review/component/ai_review_engine.py
@@ -1012,8 +1012,6 @@ class AIReviewEngine(BaseReviewer):
 
															         Returns:
														
 
															             Dict[str, Any]: 包含缺失一级、二级目录的统计结果
														
 
															         """
														
 
															-        from .outline_catalogue_matcher import OutlineCatalogueMatcher
														
 
															-
														
 
															         start_time = time.time()
														
 
															         name = "outline_catalogue_check"
														
@@ -1025,149 +1023,104 @@ class AIReviewEngine(BaseReviewer):
 
															                 Path(__file__).parent / 'doc_worker' / 'config' /
														
 
															                 'StandardCategoryTable.csv'
														
 
															             )
														
 
															-            raw_content_csv = str(
														
 
															-                Path(__file__).parent / 'doc_worker' / 'config' /
														
 
															-                'construction_plan_standards.csv'
														
 
															-            )
														
 
															-            # 获取 catalog（YOLO+OCR提取的原始目录）和 outline（分类后的结构）
														
 
															+            # 获取 catalog（YOLO+OCR提取的目录，已包含分类代码）
														
 
															             catalog_chapters = []
														
 
															-            outline_chapters = []
														
 
															             if outline_data and isinstance(outline_data, dict):
														
 
															-                # 从 outline_data 获取 catalog（优先）
														
 
															                 catalog_raw = outline_data.get('catalog')
														
 
															                 if catalog_raw and isinstance(catalog_raw, dict):
														
 
															                     catalog_chapters = catalog_raw.get('chapters', [])
														
 
															-                # 获取 outline（用于分类代码映射）
														
 
															-                outline_raw = outline_data.get('outline')
														
 
															-                if isinstance(outline_raw, dict):
														
 
															-                    outline_chapters = outline_raw.get('chapters', [])
														
 
															-                elif isinstance(outline_raw, list):
														
 
															-                    outline_chapters = outline_raw
														
 
															             # 从 state 回退获取
														
 
															-            if state and isinstance(state, dict):
														
 
															+            if not catalog_chapters and state and isinstance(state, dict):
														
 
															                 structured = state.get('structured_content', {})
														
 
															+                catalog_raw = structured.get('catalog')
														
 
															+                if catalog_raw and isinstance(catalog_raw, dict):
														
 
															+                    catalog_chapters = catalog_raw.get('chapters', [])
														
 
															-                # 获取 catalog
														
 
															-                if not catalog_chapters:
														
 
															-                    catalog_raw = structured.get('catalog')
														
 
															-                    if catalog_raw and isinstance(catalog_raw, dict):
														
 
															-                        catalog_chapters = catalog_raw.get('chapters', [])
														
 
															-
														
 
															-                # 获取 outline
														
 
															-                if not outline_chapters:
														
 
															-                    outline_raw = structured.get('outline', {})
														
 
															-                    if isinstance(outline_raw, dict):
														
 
															-                        outline_chapters = outline_raw.get('chapters', [])
														
 
															-                    elif isinstance(outline_raw, list):
														
 
															-                        outline_chapters = outline_raw
														
 
															-
														
 
															-            logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章, outline: {len(outline_chapters)} 章")
														
 
															-
														
 
															-            # 使用模糊匹配器
														
 
															-            matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
														
 
															-
														
 
															-            # 构建 outline 标题到分类代码的映射
														
 
															-            # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
														
 
															-            outline_first_map = {}  # title -> chapter_classification
														
 
															-            outline_second_map = {}  # (first_title, sub_title) -> secondary_category_code
														
 
															-
														
 
															-            for chapter in outline_chapters:
														
 
															+            logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章")
														
 
															+
														
 
															+            # 读取 CSV 标准分类表
														
 
															+            import pandas as pd
														
 
															+            df = pd.read_csv(csv_path)
														
 
															+
														
 
															+            # 构建标准一级和二级目录
														
 
															+            standard_first = {}  # code -> name
														
 
															+            standard_second = {}  # (first_code, second_code) -> name
														
 
															+            for _, row in df.iterrows():
														
 
															+                first_code = row.get('first_code', '')
														
 
															+                first_name = row.get('first_name', '')
														
 
															+                second_code = row.get('second_code', '')
														
 
															+                second_name = row.get('second_name', '')
														
 
															+
														
 
															+                if first_code and first_name:
														
 
															+                    standard_first[first_code] = first_name
														
 
															+                if first_code and second_code and second_name:
														
 
															+                    standard_second[(first_code, second_code)] = second_name
														
 
															+
														
 
															+            # 从 catalog 收集实际存在的一级和二级 code
														
 
															+            actual_first = set()
														
 
															+            actual_second = set()
														
 
															+            for chapter in catalog_chapters:
														
 
															                 if not isinstance(chapter, dict):
														
 
															                     continue
														
 
															                 first_code = chapter.get('chapter_classification', '')
														
 
															-                first_title = chapter.get('title', '')
														
 
															-                if first_code and first_title:
														
 
															-                    outline_first_map[first_title] = first_code
														
 
															+                if first_code:
														
 
															+                    actual_first.add(first_code)
														
 
															                 for sub in chapter.get('subsections', []):
														
 
															                     if isinstance(sub, dict):
														
 
															-                        sub_title = sub.get('title', '')
														
 
															                         second_code = sub.get('secondary_category_code', '')
														
 
															-                        if first_title and sub_title and second_code:
														
 
															-                            outline_second_map[(first_title, sub_title)] = second_code
														
 
															-
														
 
															-            # 使用 catalog 的标题，匹配 outline 的分类代码
														
 
															-            outline_first = set()
														
 
															-            outline_secondary = {}
														
 
															-
														
 
															-            for chapter in catalog_chapters:
														
 
															-                if not isinstance(chapter, dict):
														
 
															-                    continue
														
 
															-
														
 
															-                catalog_title = chapter.get('title', '')
														
 
															-                if not catalog_title:
														
 
															-                    continue
														
 
															-
														
 
															-                # 尝试从 outline 匹配一级分类代码
														
 
															-                first_code = outline_first_map.get(catalog_title)
														
 
															+                        if first_code and second_code:
														
 
															+                            actual_second.add((first_code, second_code))
														
 
															+
														
 
															+            # 计算缺失项
														
 
															+            missing_first = []
														
 
															+            matched_first = set()
														
 
															+            for code, name in standard_first.items():
														
 
															+                if code in actual_first:
														
 
															+                    matched_first.add(code)
														
 
															+                else:
														
 
															+                    missing_first.append({'first_code': code, 'first_name': name})
														
 
															-                # 如果精确匹配失败，尝试模糊匹配
														
 
															-                if not first_code:
														
 
															-                    for outline_title, code in outline_first_map.items():
														
 
															-                        if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
														
 
															-                            first_code = code
														
 
															-                            break
														
 
															+            missing_second = []
														
 
															+            matched_second = set()
														
 
															+            for (fc, sc), name in standard_second.items():
														
 
															+                if (fc, sc) in actual_second:
														
 
															+                    matched_second.add((fc, sc))
														
 
															+                else:
														
 
															+                    missing_second.append({
														
 
															+                        'first_code': fc,
														
 
															+                        'second_code': sc,
														
 
															+                        'second_name': name
														
 
															+                    })
														
 
															-                if first_code:
														
 
															-                    outline_first.add(first_code)
														
 
															+            logger.info(f"[{name}] 标准一级: {len(standard_first)} 个, 实际: {len(matched_first)} 个, 缺失: {len(missing_first)} 个")
														
 
															+            logger.info(f"[{name}] 标准二级: {len(standard_second)} 个, 实际: {len(matched_second)} 个, 缺失: {len(missing_second)} 个")
														
 
															-                # 匹配二级分类
														
 
															-                for sub in chapter.get('subsections', []):
														
 
															-                    if not isinstance(sub, dict):
														
 
															-                        continue
														
 
															-                    sub_title = sub.get('title', '')
														
 
															-                    if not sub_title:
														
 
															-                        continue
														
 
															-
														
 
															-                    # 尝试精确匹配
														
 
															-                    second_code = outline_second_map.get((catalog_title, sub_title))
														
 
															-
														
 
															-                    # 模糊匹配
														
 
															-                    if not second_code and first_code:
														
 
															-                        for (outline_first, outline_sub), code in outline_second_map.items():
														
 
															-                            if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
														
 
															-                               matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
														
 
															-                                second_code = code
														
 
															-                                break
														
 
															-
														
 
															-                    if first_code and second_code:
														
 
															-                        outline_secondary[(first_code, second_code)] = sub_title
														
 
															-
														
 
															-            logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
														
 
															-
														
 
															-            # 执行标准目录匹配检查
														
 
															-            match_result = matcher.match_catalogue(
														
 
															-                outline_first=outline_first,
														
 
															-                outline_secondary=outline_secondary,
														
 
															-                threshold=0.6
														
 
															-            )
														
 
															-            
														
 
															             catalogue_result = {
														
 
															                 "level": "primary_and_secondary",
														
 
															-                "is_complete": match_result['missing_first_count'] == 0 and match_result['missing_second_count'] == 0,
														
 
															+                "is_complete": len(missing_first) == 0 and len(missing_second) == 0,
														
 
															                 "first_level": {
														
 
															-                    "total_required": len(matcher.first_names),
														
 
															-                    "actual_present": len(match_result['matched_first']),
														
 
															-                    "missing_count": match_result['missing_first_count'],
														
 
															-                    "missing": match_result['missing_first']
														
 
															+                    "total_required": len(standard_first),
														
 
															+                    "actual_present": len(matched_first),
														
 
															+                    "missing_count": len(missing_first),
														
 
															+                    "missing": missing_first
														
 
															                 },
														
 
															                 "second_level": {
														
 
															-                    "total_required": len(matcher.second_names),
														
 
															-                    "actual_present": len(match_result['matched_second']),
														
 
															-                    "missing_count": match_result['missing_second_count'],
														
 
															-                    "missing": match_result['missing_second']
														
 
															-                },
														
 
															-                "match_details": match_result['match_details']
														
 
															+                    "total_required": len(standard_second),
														
 
															+                    "actual_present": len(matched_second),
														
 
															+                    "missing_count": len(missing_second),
														
 
															+                    "missing": missing_second
														
 
															+                }
														
 
															             }
														
 
															             execution_time = time.time() - start_time
														
 
															             logger.info(
														
 
															                 f"[{name}] 检查完成，耗时: {execution_time:.2f}s, "
														
 
															-                f"缺失一级: {match_result['missing_first_count']} 个, "
														
 
															-                f"缺失二级: {match_result['missing_second_count']} 个"
														
 
															+                f"缺失一级: {len(missing_first)} 个, "
														
 
															+                f"缺失二级: {len(missing_second)} 个"
														
 
															             )
														
 
															             return {
														
@@ -1175,10 +1128,10 @@ class AIReviewEngine(BaseReviewer):
 
															                 "execution_time": execution_time,
														
 
															                 "details": {
														
 
															                     "name": name,
														
 
															-                    "missing_first_count": match_result['missing_first_count'],
														
 
															-                    "missing_second_count": match_result['missing_second_count'],
														
 
															-                    "missing_first": match_result['missing_first'],
														
 
															-                    "missing_second": match_result['missing_second'],
														
 
															+                    "missing_first_count": len(missing_first),
														
 
															+                    "missing_second_count": len(missing_second),
														
 
															+                    "missing_first": missing_first,
														
 
															+                    "missing_second": missing_second,
														
 
															                     "catalogue_check": catalogue_result
														
 
															                 }
														
 
															             }
														
--- a/core/construction_review/component/doc_worker/models/document_structure.py
+++ b/core/construction_review/component/doc_worker/models/document_structure.py
@@ -281,7 +281,8 @@ class UnifiedDocumentStructure:
 
															                 }
														
 
															                 for t in self.tertiary_classifications
														
 
															             ],
														
 
															-            "outline": self.outline.to_dict()
														
 
															+            "outline": self.outline.to_dict(),
														
 
															+            "catalog": self.catalog
														
 
															         }
														
 
															     @classmethod
														
@@ -349,6 +350,7 @@ class UnifiedDocumentStructure:
 
															                 for t in data.get("tertiary_classifications", [])
														
 
															             ],
														
 
															             outline=Outline.from_dict(data.get("outline", [])),
														
 
															+            catalog=data.get("catalog"),
														
 
															             raw_metadata=data.get("raw_metadata", {})
														
 
															         )
														
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -122,6 +122,18 @@ class SimpleDocumentProcessor:
 
															         structure = self.pdf_extractor.extract(file_content, progress_callback=_extraction_progress)
														
 
															         catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
														
 
															+
														
 
															+        # 对 catalog 进行分类（如果存在）
														
 
															+        if catalog and catalog.get("chapters"):
														
 
															+            try:
														
 
															+                catalog = await self._classify_catalog(catalog)
														
 
															+                logger.info(f"[SimpleProcessor] Catalog分类完成")
														
 
															+                # 验证一级分类是否写入
														
 
															+                for ch in catalog.get("chapters", [])[:2]:
														
 
															+                    logger.info(f"[SimpleProcessor] Catalog章节验证: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
														
 
															+            except Exception as e:
														
 
															+                logger.warning(f"[SimpleProcessor] Catalog分类失败: {e}")
														
 
															+
														
 
															         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
														
 
															         # 2. 一级分类
														
@@ -139,7 +151,7 @@ class SimpleDocumentProcessor:
 
															         chunks = assemble_chunks(structure, primary_result, secondary_result)
														
 
															         if not chunks:
														
 
															             logger.warning("[SimpleProcessor] 无可用的 chunks")
														
 
															-            return structure, primary_result, secondary_result, chunks
														
 
															+            return structure, primary_result, secondary_result, chunks, catalog
														
 
															         await self._emit_progress(progress_callback, "文档切分", 50, f"组装 {len(chunks)} 个内容块")
														
 
															         # 5. 三级分类
														
@@ -158,6 +170,12 @@ class SimpleDocumentProcessor:
 
															         logger.info("[SimpleProcessor] 三级分类完成")
														
 
															         await self._emit_progress(progress_callback, "文档分类", 90, "三级分类完成")
														
 
															+        # 验证返回前的catalog
														
 
															+        if catalog:
														
 
															+            logger.info(f"[SimpleProcessor] 返回前Catalog验证: {len(catalog.get('chapters', []))} 章")
														
 
															+            for ch in catalog.get("chapters", [])[:2]:
														
 
															+                logger.info(f"[SimpleProcessor] 返回前章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
														
 
															+
														
 
															         return structure, primary_result, secondary_result, chunks, catalog
														
 
															     async def _emit_progress(
														
@@ -226,6 +244,10 @@ class SimpleDocumentProcessor:
 
															         }
														
 
															         # 设置目录结构（YOLO检测+OCR提取）
														
 
															+        if catalog:
														
 
															+            logger.info(f"[_build_unified_doc] 设置catalog: {len(catalog.get('chapters', []))} 章")
														
 
															+            for ch in catalog.get("chapters", [])[:2]:
														
 
															+                logger.info(f"[_build_unified_doc] catalog章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
														
 
															         unified.catalog = catalog
														
 
															         return unified
														
@@ -317,3 +339,109 @@ class SimpleDocumentProcessor:
 
															             total_pages=total_pages,
														
 
															             secondary_classifications=[],
														
 
															         )
														
 
															+
														
 
															+    async def _classify_catalog(self, catalog: Dict[str, Any]) -> Dict[str, Any]:
														
 
															+        """
														
 
															+        使用 HierarchyClassifier 对 catalog 进行一二级分类
														
 
															+        """
														
 
															+        from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier
														
 
															+
														
 
															+        # 转换为 toc_items 格式
														
 
															+        toc_items = []
														
 
															+        for idx, chapter in enumerate(catalog.get("chapters", [])):
														
 
															+            # 一级目录
														
 
															+            toc_items.append({
														
 
															+                "title": chapter.get("title", ""),
														
 
															+                "page": str(chapter.get("page", "0")),
														
 
															+                "level": 1,
														
 
															+                "original": chapter.get("original", "")
														
 
															+            })
														
 
															+            # 二级目录
														
 
															+            for sub in chapter.get("subsections", []):
														
 
															+                toc_items.append({
														
 
															+                    "title": sub.get("title", ""),
														
 
															+                    "page": str(sub.get("page", "0")),
														
 
															+                    "level": 2,
														
 
															+                    "original": sub.get("original", "")
														
 
															+                })
														
 
															+
														
 
															+        if not toc_items:
														
 
															+            return catalog
														
 
															+
														
 
															+        # 调用分类器
														
 
															+        classifier = HierarchyClassifier()
														
 
															+
														
 
															+        # 一级分类
														
 
															+        primary_result = await classifier.classify_async(toc_items, target_level=1)
														
 
															+
														
 
															+        # 二级分类
														
 
															+        secondary_result = await classifier.classify_secondary_async(primary_result)
														
 
															+
														
 
															+        # 将分类结果写回 catalog
														
 
															+        primary_items = primary_result.get("items", [])
														
 
															+        secondary_items = secondary_result.get("items", [])
														
 
															+
														
 
															+        # 一级分类结果映射 (normalized_title -> code/name)
														
 
															+        first_classification = {}
														
 
															+        for item in primary_items:
														
 
															+            title = item.get("title", "").strip()
														
 
															+            first_classification[title] = {
														
 
															+                "code": item.get("category_code", ""),
														
 
															+                "name": item.get("category", "")
														
 
															+            }
														
 
															+
														
 
															+        # 二级分类结果映射 ((first_title, sub_title) -> code)
														
 
															+        second_classification = {}
														
 
															+        for first_item in secondary_items:
														
 
															+            first_title = (first_item.get("original_title", "") or first_item.get("first_category", "")).strip()
														
 
															+            classifications = first_item.get("classifications", [])
														
 
															+            for cls in classifications:
														
 
															+                sub_title = cls.get("title", "").strip()
														
 
															+                key = (first_title, sub_title)
														
 
															+                second_classification[key] = cls.get("category_code", "")
														
 
															+
														
 
															+        logger.info(f"[_classify_catalog] 一级分类映射: {first_classification}")
														
 
															+        logger.info(f"[_classify_catalog] 二级分类映射: {list(second_classification.keys())}")
														
 
															+
														
 
															+        # 写回 catalog，并调整格式与 outline 一致
														
 
															+        for chapter in catalog.get("chapters", []):
														
 
															+            ch_title = chapter.get("title", "").strip()
														
 
															+            logger.info(f"[_classify_catalog] 处理章节: '{ch_title}'")
														
 
															+            logger.info(f"[_classify_catalog] 查找一级分类, keys={list(first_classification.keys())}")
														
 
															+
														
 
															+            if ch_title in first_classification:
														
 
															+                chapter["chapter_classification"] = first_classification[ch_title]["code"]
														
 
															+                chapter["first_name"] = first_classification[ch_title]["name"]
														
 
															+                logger.info(f"[_classify_catalog] 精确匹配成功: {ch_title} -> {first_classification[ch_title]}")
														
 
															+            else:
														
 
															+                # 尝试模糊匹配
														
 
															+                matched = False
														
 
															+                for cls_title, cls_data in first_classification.items():
														
 
															+                    if cls_title in ch_title or ch_title in cls_title:
														
 
															+                        chapter["chapter_classification"] = cls_data["code"]
														
 
															+                        chapter["first_name"] = cls_data["name"]
														
 
															+                        logger.info(f"[_classify_catalog] 模糊匹配成功: '{ch_title}' ~ '{cls_title}' -> {cls_data}")
														
 
															+                        matched = True
														
 
															+                        break
														
 
															+                if not matched:
														
 
															+                    logger.warning(f"[_classify_catalog] 未找到匹配: '{ch_title}' 不在 {list(first_classification.keys())}")
														
 
															+
														
 
															+            # 调整 original 格式：只保留 "第X章"
														
 
															+            chapter["original"] = f"第{chapter.get('index', 1)}章"
														
 
															+
														
 
															+            # 二级
														
 
															+            for sub in chapter.get("subsections", []):
														
 
															+                sub_title = sub.get("title", "").strip()
														
 
															+                key = (ch_title, sub_title)
														
 
															+                if key in second_classification:
														
 
															+                    sub["secondary_category_code"] = second_classification[key]
														
 
															+                else:
														
 
															+                    # 尝试模糊匹配
														
 
															+                    for (ft, st), code in second_classification.items():
														
 
															+                        if (ft in ch_title or ch_title in ft) and (st in sub_title or sub_title in st):
														
 
															+                            sub["secondary_category_code"] = code
														
 
															+                            break
														
 
															+                # 调整 original 格式："第X章->节标题"
														
 
															+                sub["original"] = f"第{chapter.get('index', 1)}章->{sub_title}"
														
 
															+
														
 
															+        return catalog