1 неделя назад · cb5c4551f5
--- a/core/construction_review/component/ai_review_engine.py
+++ b/core/construction_review/component/ai_review_engine.py
@@ -1047,34 +1047,41 @@ class AIReviewEngine(BaseReviewer):
 
															                 elif isinstance(outline_raw, list):
														
 
															                     outline_chapters = outline_raw
														
 
															-            # 提取一级和二级信息
														
 
															-            outline_first = set()
														
 
															-            outline_secondary = {}
														
 
															+            # 🆕 提取一级和二级标题（用于独立模糊匹配）
														
 
															+            # 结构：{first_code: {'title': '章节标题', 'subsections': ['二级标题1', '二级标题2']}}
														
 
															+            outline_by_first: Dict[str, Dict] = {}
														
 
															             for chapter in outline_chapters:
														
 
															                 if not isinstance(chapter, dict):
														
 
															                     continue
														
 
															                 first_code = chapter.get('chapter_classification', '')
														
 
															-                if first_code:
														
 
															-                    outline_first.add(first_code)
														
 
															+                first_title = chapter.get('title', '')
														
 
															-                # 提取 subsections 中的二级章节
														
 
															+                if not first_code:
														
 
															+                    continue
														
 
															+                
														
 
															+                if first_code not in outline_by_first:
														
 
															+                    outline_by_first[first_code] = {
														
 
															+                        'title': first_title,
														
 
															+                        'subsections': []
														
 
															+                    }
														
 
															+                
														
 
															+                # 提取二级标题列表
														
 
															                 for sub in chapter.get('subsections', []):
														
 
															                     if not isinstance(sub, dict):
														
 
															                         continue
														
 
															-                    second_code = sub.get('secondary_category_code', '')
														
 
															-                    if first_code and second_code:
														
 
															-                        outline_secondary[(first_code, second_code)] = sub.get('title', '')
														
 
															+                    sub_title = sub.get('title', '')
														
 
															+                    if sub_title:
														
 
															+                        outline_by_first[first_code]['subsections'].append(sub_title)
														
 
															-            logger.info(f"[{name}] 获取到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
														
 
															-
														
 
															-            # 使用模糊匹配
														
 
															+            logger.info(f"[{name}] 获取到 {len(outline_by_first)} 个一级章节")
														
 
															+            
														
 
															+            # 使用模糊匹配（基于标题）
														
 
															             matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
														
 
															-            match_result = matcher.match_catalogue(
														
 
															-                outline_first=outline_first,
														
 
															-                outline_secondary=outline_secondary,
														
 
															-                threshold=0.6
														
 
															+            match_result = matcher.match_catalogue_by_title(
														
 
															+                outline_by_first=outline_by_first,
														
 
															+                threshold=0.6  # 阈值0.6
														
 
															             )
														
 
															             catalogue_result = {
														
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -99,7 +99,8 @@ SECONDARY_CATEGORY_KEYWORDS = {
 
															     "safety": {
														
 
															         "SafetySystem": ["安全保证体系"],  # 严格匹配标准目录名
														
 
															         "Organization": ["组织保证措施"],  # 严格匹配
														
 
															-        "TechMeasures": ["技术保障措施"],  # 严格匹配
														
 
															+        "TechMeasures": ["技术保障措施", "技术保证措施"],  # 严格匹配（包含常见变体）
														
 
															+        "Protection": ["安全防护措施"],  # 🆕 新增缺失的分类
														
 
															         "Monitoring": ["监测监控措施"],  # 严格匹配
														
 
															         "Emergency": ["应急处置措施"],  # 严格匹配
														
 
															     },
														
--- a/core/construction_review/component/outline_catalogue_matcher.py
+++ b/core/construction_review/component/outline_catalogue_matcher.py
@@ -8,6 +8,7 @@
 
															 """
														
 
															 import difflib
														
 
															+import logging
														
 
															 import re
														
 
															 from typing import Dict, List, Optional, Set, Tuple, Any
														
 
															 from collections import defaultdict
														
@@ -15,6 +16,8 @@ from pathlib import Path
 
															 import pandas as pd
														
 
															+logger = logging.getLogger(__name__)
														
 
															+
														
 
															 class OutlineCatalogueMatcher:
														
 
															     """
														
@@ -227,161 +230,198 @@ class OutlineCatalogueMatcher:
 
															         return min(sum(scores), 1.0)
														
 
															-    def match_catalogue(
														
 
															+    def _match_by_title_fuzzy(
														
 
															+        self,
														
 
															+        standard_name: str,
														
 
															+        candidate_titles: List[str],
														
 
															+        threshold: float
														
 
															+    ) -> Tuple[bool, float, Optional[str]]:
														
 
															+        """
														
 
															+        在候选标题中找到与标准名称最相似的一个
														
 
															+        
														
 
															+        Returns:
														
 
															+            (是否匹配, 最佳分数, 匹配的标题)
														
 
															+        """
														
 
															+        best_score = 0.0
														
 
															+        best_title = None
														
 
															+        
														
 
															+        for title in candidate_titles:
														
 
															+            score = self._calculate_enhanced_similarity(standard_name, title)
														
 
															+            if score > best_score:
														
 
															+                best_score = score
														
 
															+                best_title = title
														
 
															+        
														
 
															+        is_match = best_score >= threshold
														
 
															+        return is_match, best_score, best_title
														
 
															+    
														
 
															+    def match_catalogue_by_title(
														
 
															         self,
														
 
															-        outline_first: Set[str],
														
 
															-        outline_secondary: Dict[Tuple[str, str], str],
														
 
															+        outline_by_first: Dict[str, Dict[str, any]],
														
 
															         threshold: float = 0.6
														
 
															     ) -> Dict[str, Any]:
														
 
															         """
														
 
															-        执行目录匹配
														
 
															+        🆕 基于标题的独立模糊匹配（一二级都独立）
														
 
															         Args:
														
 
															-            outline_first: 从outline中提取的一级code集合
														
 
															-            outline_secondary: 从outline中提取的二级 {(first_code, second_code): title}
														
 
															-            threshold: 模糊匹配阈值（默认0.6）
														
 
															+            outline_by_first: {
														
 
															+                first_code: {
														
 
															+                    'title': '一级标题',
														
 
															+                    'subsections': ['二级标题1', '二级标题2', ...]
														
 
															+                }
														
 
															+            }
														
 
															+            threshold: 匹配阈值，默认0.6
														
 
															         Returns:
														
 
															-            匹配结果，包含：
														
 
															-            - matched_first: 匹配的一级code集合
														
 
															-            - matched_second: 匹配的二级key集合
														
 
															-            - missing_first: 缺失的一级列表
														
 
															-            - missing_second: 缺失的二级列表
														
 
															-            - match_details: 匹配详情
														
 
															+            匹配结果
														
 
															         """
														
 
															-        required_first = set(self.first_names.keys())
														
 
															-        required_second = set(self.second_names.keys())
														
 
															+        logger.info(f"[独立模糊匹配] 开始，阈值={threshold}")
														
 
															+        
														
 
															+        # ========== 一级目录匹配（独立模糊）==========
														
 
															+        actual_first_titles = {
														
 
															+            code: info['title'] 
														
 
															+            for code, info in outline_by_first.items()
														
 
															+        }
														
 
															-        # 一级匹配
														
 
															-        matched_first = outline_first & required_first
														
 
															-        missing_first = required_first - matched_first
														
 
															+        matched_first = set()
														
 
															+        missing_first = []
														
 
															+        
														
 
															+        for req_code, req_name in self.first_names.items():
														
 
															+            # 优先：直接用code精确匹配，因为一级分类通常较准
														
 
															+            if req_code in actual_first_titles:
														
 
															+                matched_first.add(req_code)
														
 
															+                logger.debug(f"[一级匹配] {req_name}: 存在")
														
 
															+            else:
														
 
															+                # 尝试用标题模糊匹配
														
 
															+                is_match, score, matched_title = self._match_by_title_fuzzy(
														
 
															+                    req_name,
														
 
															+                    list(actual_first_titles.values()),
														
 
															+                    threshold
														
 
															+                )
														
 
															+                if is_match:
														
 
															+                    # 找到匹配的标题，反向查找code
														
 
															+                    for code, title in actual_first_titles.items():
														
 
															+                        if title == matched_title:
														
 
															+                            matched_first.add(req_code)
														
 
															+                            logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
														
 
															+                            break
														
 
															+                else:
														
 
															+                    missing_first.append({
														
 
															+                        'first_code': req_code,
														
 
															+                        'first_name': req_name,
														
 
															+                        'first_seq': self.first_seq.get(req_code, 0)
														
 
															+                    })
														
 
															+                    logger.debug(f"[一级缺失] {req_name}")
														
 
															+        
														
 
															+        # ========== 二级目录匹配（结合一级 + 全局兜底）==========
														
 
															+        # 🆕 先收集所有二级标题用于全局兜底
														
 
															+        all_actual_second_titles = []
														
 
															+        for fc, info in outline_by_first.items():
														
 
															+            for sub_title in info.get('subsections', []):
														
 
															+                all_actual_second_titles.append({
														
 
															+                    'first_code': fc,
														
 
															+                    'title': sub_title
														
 
															+                })
														
 
															-        # 二级匹配
														
 
															         matched_second = set()
														
 
															-        missing_second = set()
														
 
															+        missing_second = []
														
 
															         match_details = []
														
 
															+        matched_actual_titles = set()  # 防重复
														
 
															-        # 精确匹配
														
 
															-        outline_second_keys = set(outline_secondary.keys())
														
 
															-        exact_matches = outline_second_keys & required_second
														
 
															-        matched_second.update(exact_matches)
														
 
															-        
														
 
															-        for key in exact_matches:
														
 
															-            first_code, second_code = key
														
 
															+        for req_key, req_name in self.second_names.items():
														
 
															+            first_code, second_code = req_key
														
 
															+            
														
 
															+            # 🆕 步骤1：优先在同一一级下匹配
														
 
															+            same_group_titles = outline_by_first.get(first_code, {}).get('subsections', [])
														
 
															+            best_score_same = 0.0
														
 
															+            best_match_same = None
														
 
															+            
														
 
															+            for title in same_group_titles:
														
 
															+                if title in matched_actual_titles:
														
 
															+                    continue
														
 
															+                score = self._calculate_enhanced_similarity(req_name, title)
														
 
															+                if score > best_score_same:
														
 
															+                    best_score_same = score
														
 
															+                    best_match_same = title
														
 
															+            
														
 
															+            # 同组匹配成功
														
 
															+            if best_score_same >= threshold and best_match_same:
														
 
															+                matched_second.add(req_key)
														
 
															+                matched_actual_titles.add(best_match_same)
														
 
															+                match_details.append({
														
 
															+                    'level': 'second',
														
 
															+                    'required_first_code': first_code,
														
 
															+                    'required_second_code': second_code,
														
 
															+                    'required_second_name': req_name,
														
 
															+                    'matched': True,
														
 
															+                    'match_type': 'same_group_fuzzy',
														
 
															+                    'similarity': best_score_same,
														
 
															+                    'matched_title': best_match_same
														
 
															+                })
														
 
															+                logger.debug(f"[二级同组匹配] {req_name} -> {best_match_same} ({best_score_same:.3f})")
														
 
															+                continue
														
 
															+            
														
 
															+            # 🆕 步骤2：同组失败，尝试全局匹配（提高阈值防误匹配）
														
 
															+            GLOBAL_THRESHOLD = 0.7  # 全局匹配阈值更高
														
 
															+            best_score_global = 0.0
														
 
															+            best_match_global = None
														
 
															+            best_match_fc = None
														
 
															+            
														
 
															+            for actual in all_actual_second_titles:
														
 
															+                if actual['title'] in matched_actual_titles:
														
 
															+                    continue
														
 
															+                score = self._calculate_enhanced_similarity(req_name, actual['title'])
														
 
															+                if score > best_score_global:
														
 
															+                    best_score_global = score
														
 
															+                    best_match_global = actual['title']
														
 
															+                    best_match_fc = actual['first_code']
														
 
															+            
														
 
															+            # 全局匹配成功（且跨组）
														
 
															+            if best_score_global >= GLOBAL_THRESHOLD and best_match_global:
														
 
															+                matched_second.add(req_key)
														
 
															+                matched_actual_titles.add(best_match_global)
														
 
															+                match_details.append({
														
 
															+                    'level': 'second',
														
 
															+                    'required_first_code': first_code,
														
 
															+                    'required_second_code': second_code,
														
 
															+                    'required_second_name': req_name,
														
 
															+                    'matched': True,
														
 
															+                    'match_type': 'cross_group_fuzzy',  # 标记为跨组匹配
														
 
															+                    'similarity': best_score_global,
														
 
															+                    'matched_title': best_match_global,
														
 
															+                    'matched_actual_first': best_match_fc  # 实际匹配到的一级
														
 
															+                })
														
 
															+                logger.warning(f"[二级跨组匹配] {req_name}(应在{first_code}) -> {best_match_global}(实际在{best_match_fc}) ({best_score_global:.3f})")
														
 
															+                continue
														
 
															+            
														
 
															+            # 都失败，记为缺失
														
 
															+            best_score = max(best_score_same, best_score_global)
														
 
															+            best_attempt = best_match_same or best_match_global
														
 
															+            missing_second.append({
														
 
															+                'first_code': first_code,
														
 
															+                'first_name': self.first_names.get(first_code, ''),
														
 
															+                'secondary_code': second_code,
														
 
															+                'secondary_name': req_name,
														
 
															+                'second_seq': self.second_seq.get(req_key, 0)
														
 
															+            })
														
 
															             match_details.append({
														
 
															                 'level': 'second',
														
 
															                 'required_first_code': first_code,
														
 
															                 'required_second_code': second_code,
														
 
															-                'required_second_name': self.second_names.get(key, ''),
														
 
															-                'matched': True,
														
 
															-                'match_type': 'exact',
														
 
															-                'similarity': 1.0
														
 
															-            })
														
 
															-        
														
 
															-        # 模糊匹配（对未精确匹配的）
														
 
															-        required_remaining = required_second - exact_matches
														
 
															-        outline_remaining = outline_second_keys - exact_matches
														
 
															-        
														
 
															-        if required_remaining and outline_remaining:
														
 
															-            # 准备outline数据
														
 
															-            outline_list = []
														
 
															-            for key in outline_remaining:
														
 
															-                first_code, second_code = key
														
 
															-                title = outline_secondary.get(key, "")
														
 
															-                outline_list.append({
														
 
															-                    'key': key,
														
 
															-                    'first_code': first_code,
														
 
															-                    'second_code': second_code,
														
 
															-                    'title': title
														
 
															-                })
														
 
															-            
														
 
															-            # 对每个required进行模糊匹配
														
 
															-            for req_key in required_remaining:
														
 
															-                first_code, second_code = req_key
														
 
															-                second_name = self.second_names.get(req_key, '')
														
 
															-                first_name = self.first_names.get(first_code, '')
														
 
															-                
														
 
															-                # 获取详细定义
														
 
															-                raw_content = self.second_raw_content.get((first_name, second_name))
														
 
															-                
														
 
															-                best_match = None
														
 
															-                best_score = 0.0
														
 
															-                
														
 
															-                for item in outline_list:
														
 
															-                    # 计算相似度
														
 
															-                    score1 = self._calculate_enhanced_similarity(second_name, item['title'])
														
 
															-                    score2 = self._calculate_enhanced_similarity(
														
 
															-                        f"{first_name}{second_name}",
														
 
															-                        item['title']
														
 
															-                    )
														
 
															-                    score = max(score1, score2)
														
 
															-                    
														
 
															-                    # 如果有详细定义，也计算
														
 
															-                    if raw_content:
														
 
															-                        score3 = self._calculate_enhanced_similarity(
														
 
															-                            second_name,
														
 
															-                            item['title'],
														
 
															-                            raw_content
														
 
															-                        )
														
 
															-                        score = max(score, score3)
														
 
															-                    
														
 
															-                    if score > best_score:
														
 
															-                        best_score = score
														
 
															-                        best_match = item
														
 
															-                
														
 
															-                if best_score >= threshold:
														
 
															-                    matched_second.add(req_key)
														
 
															-                    match_details.append({
														
 
															-                        'level': 'second',
														
 
															-                        'required_first_code': first_code,
														
 
															-                        'required_second_code': second_code,
														
 
															-                        'required_second_name': second_name,
														
 
															-                        'matched': True,
														
 
															-                        'match_type': 'fuzzy',
														
 
															-                        'similarity': best_score,
														
 
															-                        'matched_title': best_match['title'] if best_match else None,
														
 
															-                        'used_raw_content': raw_content is not None
														
 
															-                    })
														
 
															-                else:
														
 
															-                    missing_second.add(req_key)
														
 
															-                    match_details.append({
														
 
															-                        'level': 'second',
														
 
															-                        'required_first_code': first_code,
														
 
															-                        'required_second_code': second_code,
														
 
															-                        'required_second_name': second_name,
														
 
															-                        'matched': False,
														
 
															-                        'match_type': 'none',
														
 
															-                        'similarity': best_score
														
 
															-                    })
														
 
															-        else:
														
 
															-            missing_second = required_remaining
														
 
															-        
														
 
															-        # 构建缺失详情
														
 
															-        missing_first_details = []
														
 
															-        for code in sorted(missing_first, key=lambda x: self.first_seq.get(x, 0)):
														
 
															-            missing_first_details.append({
														
 
															-                'first_code': code,
														
 
															-                'first_name': self.first_names.get(code, code),
														
 
															-                'first_seq': self.first_seq.get(code, 0)
														
 
															+                'required_second_name': req_name,
														
 
															+                'matched': False,
														
 
															+                'match_type': 'none',
														
 
															+                'similarity': best_score,
														
 
															+                'best_attempt': best_attempt
														
 
															             })
														
 
															+            logger.debug(f"[二级缺失] {req_name} (最佳尝试: {best_attempt}, {best_score:.3f})")
														
 
															-        missing_second_details = []
														
 
															-        for key in sorted(missing_second, key=lambda x: (self.first_seq.get(x[0], 0), self.second_seq.get(x, 0))):
														
 
															-            first_code, second_code = key
														
 
															-            missing_second_details.append({
														
 
															-                'first_code': first_code,
														
 
															-                'first_name': self.first_names.get(first_code, first_code),
														
 
															-                'first_seq': self.first_seq.get(first_code, 0),
														
 
															-                'secondary_code': second_code,
														
 
															-                'secondary_name': self.second_names.get(key, ''),
														
 
															-                'second_seq': self.second_seq.get(key, 0)
														
 
															-            })
														
 
															+        logger.info(f"[独立模糊匹配] 完成：一级缺失 {len(missing_first)} 个，二级缺失 {len(missing_second)} 个")
														
 
															         return {
														
 
															             'matched_first': matched_first,
														
 
															             'matched_second': matched_second,
														
 
															-            'missing_first': missing_first_details,
														
 
															-            'missing_second': missing_second_details,
														
 
															+            'missing_first': missing_first,
														
 
															+            'missing_second': missing_second,
														
 
															             'missing_first_count': len(missing_first),
														
 
															             'missing_second_count': len(missing_second),
														
 
															             'match_details': match_details