Forráskód Böngészése

Merge branch 'dev' of http://47.109.151.80:15030/CRBC-MaaS-Platform-Project/LQAgentPlatform into dev_sgsc_wxm

WangXuMing 1 hete
szülő
commit
78bcae0dcb

+ 6 - 5
config/config.ini.template

@@ -145,13 +145,10 @@ MYSQL_HOST=192.168.92.61
 MYSQL_PORT=13306
 MYSQL_USER=root
 MYSQL_PASSWORD=Lq123456!
-MYSQL_DB=lq_db
+MYSQL_DB=lq_db_dev
 MYSQL_MIN_SIZE=1
 MYSQL_MAX_SIZE=5
 MYSQL_AUTO_COMMIT=True
-MYSQL_CONNECT_TIMEOUT=30
-MYSQL_READ_TIMEOUT=60
-MYSQL_WRITE_TIMEOUT=30
 
 
 [pgvector]
@@ -241,5 +238,9 @@ MAX_TOKENS=1024
 [construction_review]
 MAX_CELERY_TASKS=1
 
-
+[timeliness_review]
+# 时效性审查中用于匹配前需要去除的符号(第二轮处理)
+# 这些符号会在基础规范化(去除空白、书名号、括号、HTML标签)之后去除
+# 包含各种连接符:半角连字符(-)、全角连接号(-)、全角破折号(—)
+REMOVE_SYMBOLS=),-,.,/,,:,[,],【,】,〔,〕,(,),-,—
 

+ 23 - 16
core/construction_review/component/ai_review_engine.py

@@ -1047,34 +1047,41 @@ class AIReviewEngine(BaseReviewer):
                 elif isinstance(outline_raw, list):
                     outline_chapters = outline_raw
 
-            # 提取一级和二级信息
-            outline_first = set()
-            outline_secondary = {}
+            # 🆕 提取一级和二级标题(用于独立模糊匹配)
+            # 结构:{first_code: {'title': '章节标题', 'subsections': ['二级标题1', '二级标题2']}}
+            outline_by_first: Dict[str, Dict] = {}
             
             for chapter in outline_chapters:
                 if not isinstance(chapter, dict):
                     continue
                 
                 first_code = chapter.get('chapter_classification', '')
-                if first_code:
-                    outline_first.add(first_code)
+                first_title = chapter.get('title', '')
                 
-                # 提取 subsections 中的二级章节
+                if not first_code:
+                    continue
+                
+                if first_code not in outline_by_first:
+                    outline_by_first[first_code] = {
+                        'title': first_title,
+                        'subsections': []
+                    }
+                
+                # 提取二级标题列表
                 for sub in chapter.get('subsections', []):
                     if not isinstance(sub, dict):
                         continue
-                    second_code = sub.get('secondary_category_code', '')
-                    if first_code and second_code:
-                        outline_secondary[(first_code, second_code)] = sub.get('title', '')
+                    sub_title = sub.get('title', '')
+                    if sub_title:
+                        outline_by_first[first_code]['subsections'].append(sub_title)
             
-            logger.info(f"[{name}] 获取到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
-
-            # 使用模糊匹配
+            logger.info(f"[{name}] 获取到 {len(outline_by_first)} 个一级章节")
+            
+            # 使用模糊匹配(基于标题)
             matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
-            match_result = matcher.match_catalogue(
-                outline_first=outline_first,
-                outline_secondary=outline_secondary,
-                threshold=0.6
+            match_result = matcher.match_catalogue_by_title(
+                outline_by_first=outline_by_first,
+                threshold=0.6  # 阈值0.6
             )
             
             catalogue_result = {

+ 86 - 0
core/construction_review/component/document_processor.py

@@ -66,6 +66,91 @@ class DocumentComponents:
     text_splitter: TextSplitter
 
 
+# Keyword map for secondary-category titles (used to classify outline subsections)
+# Based on StandardCategoryTable.csv; keywords strictly match the standard catalogue names
+SECONDARY_CATEGORY_KEYWORDS = {
+    # Compilation basis (basis)
+    "basis": {
+        "LawsAndRegulations": ["法律法规"],  # strict match
+        "StandardsAndSpecifications": ["标准规范"],  # strict match
+        "DocumentSystems": ["文件制度"],  # strict match
+        "CompilationPrinciples": ["编制原则"],  # strict match
+        "CompilationScope": ["编制范围"],  # strict match
+    },
+    # Project overview (overview)
+    "overview": {
+        "DesignSummary": ["设计概况"],  # strict match
+        "GeologyWeather": ["工程地质与水文气象"],  # strict match on the standard catalogue name
+        "Surroundings": ["周边环境"],  # strict match
+        "LayoutPlan": ["施工平面及立面布置"],  # strict match on the standard catalogue name
+        "RequirementsTech": ["施工要求和技术保证条件"],  # strict match on the standard catalogue name
+        "RiskLevel": ["风险辨识与分级"],  # strict match on the standard catalogue name
+        "Stakeholders": ["参建各方责任主体单位"],  # strict match on the standard catalogue name
+    },
+    # Construction plan (plan)
+    "plan": {
+        "Schedule": ["施工进度计划"],  # strict match on the standard catalogue name
+        "Materials": ["施工材料计划"],  # strict match on the standard catalogue name
+        "Equipment": ["施工设备计划"],  # strict match on the standard catalogue name
+        "Workforce": ["劳动力计划"],  # strict match
+        "SafetyCost": ["安全生产费用使用计划"],  # strict match on the standard catalogue name
+    },
+    # Construction process technology (technology)
+    "technology": {
+        # Strictly follow the standard catalogue; full names come first to avoid ambiguity
+        "MethodsOverview": ["主要施工方法概述", "施工方法概述"],  # omits "施工方法" to avoid clashing with Operations
+        "TechParams": ["技术参数"],  # omits "参数" because it would be too broad
+        "Process": ["工艺流程"],  # omits "流程" because it would be too broad
+        "PrepWork": ["施工准备"],  # omits "准备" because it would be too broad
+        "Operations": ["施工方法及操作要求", "施工方案及操作要求", "操作要求", "施工方案"],  # most specific entries first
+        "Inspection": ["检查要求"],  # omits "检查"/"验收" to avoid clashing with other chapters
+    },
+    # Safety assurance measures (safety)
+    "safety": {
+        "SafetySystem": ["安全保证体系"],  # strict match on the standard catalogue name
+        "Organization": ["组织保证措施"],  # strict match
+        "TechMeasures": ["技术保障措施", "技术保证措施"],  # strict match (includes a common variant)
+        "Protection": ["安全防护措施"],  # newly added category that was previously missing
+        "Monitoring": ["监测监控措施"],  # strict match
+        "Emergency": ["应急处置措施"],  # strict match
+    },
+    # Quality assurance measures (quality)
+    "quality": {
+        "QualitySystem": ["质量保证体系"],  # strict match
+        "QualityGoals": ["质量目标"],  # strict match
+        "Excellence": ["工程创优规划"],  # strict match
+        "QualityControl": ["质量控制程序与具体措施"],  # strict match on the standard catalogue name
+    },
+    # Environmental assurance measures (environment)
+    "environment": {
+        "EnvSystem": ["环境保证体系"],  # strict match
+        "EnvOrg": ["环境保护组织机构"],  # strict match
+        "EnvProtection": ["环境保护及文明施工措施"],  # strict match on the standard catalogue name
+    },
+    # Construction management, workforce staffing and division of duties (management)
+    "management": {
+        "Managers": ["施工管理人员"],  # strict match
+        "SafetyStaff": ["专职安全生产管理人员"],  # strict match on the standard catalogue name
+        "SpecialWorkers": ["特种作业人员"],  # strict match
+        "OtherWorkers": ["其他作业人员"],  # strict match
+    },
+    # Acceptance requirements (acceptance)
+    "acceptance": {
+        "Standards": ["验收标准"],  # strict match
+        "Procedure": ["验收程序"],  # strict match
+        "Content": ["验收内容"],  # strict match
+        "Timing": ["验收时间"],  # strict match
+        "Personnel": ["验收人员"],  # strict match
+    },
+    # Other materials (other)
+    "other": {
+        "Calculations": ["计算书"],  # strict match
+        "Drawings": ["相关施工图纸"],  # strict match on the standard catalogue name
+        "Tables": ["附图附表"],  # strict match
+        "Team": ["编制及审核人员情况"],  # strict match on the standard catalogue name
+    },
+}
+
 class DocumentProcessor:
     """
     文档处理器
@@ -734,3 +819,4 @@ class DocumentProcessor:
         except Exception as e:
             logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
             raise
+

+ 171 - 131
core/construction_review/component/outline_catalogue_matcher.py

@@ -8,6 +8,7 @@
 """
 
 import difflib
+import logging
 import re
 from typing import Dict, List, Optional, Set, Tuple, Any
 from collections import defaultdict
@@ -15,6 +16,8 @@ from pathlib import Path
 
 import pandas as pd
 
+logger = logging.getLogger(__name__)
+
 
 class OutlineCatalogueMatcher:
     """
@@ -227,161 +230,198 @@ class OutlineCatalogueMatcher:
         
         return min(sum(scores), 1.0)
     
-    def match_catalogue(
+    def _match_by_title_fuzzy(
+        self,
+        standard_name: str,
+        candidate_titles: List[str],
+        threshold: float
+    ) -> Tuple[bool, float, Optional[str]]:
+        """
+        Find the candidate title that is most similar to the standard name.
+        
+        Returns:
+            (is_match, best_score, best_title); best_title is None when no candidate scores above 0.
+        """
+        best_score = 0.0
+        best_title = None  # stays None if candidate_titles is empty or every score is <= 0
+        
+        for title in candidate_titles:
+            score = self._calculate_enhanced_similarity(standard_name, title)
+            if score > best_score:  # strict '>' keeps the first of equally scored titles
+                best_score = score
+                best_title = title
+        
+        is_match = best_score >= threshold  # NOTE(review): with threshold <= 0 this is True even when best_title is None
+        return is_match, best_score, best_title
+    
+    def match_catalogue_by_title(
         self,
-        outline_first: Set[str],
-        outline_secondary: Dict[Tuple[str, str], str],
+        outline_by_first: Dict[str, Dict[str, any]],
         threshold: float = 0.6
     ) -> Dict[str, Any]:
         """
-        执行目录匹配
+        🆕 基于标题的独立模糊匹配(一二级都独立)
         
         Args:
-            outline_first: 从outline中提取的一级code集合
-            outline_secondary: 从outline中提取的二级 {(first_code, second_code): title}
-            threshold: 模糊匹配阈值(默认0.6)
+            outline_by_first: {
+                first_code: {
+                    'title': '一级标题',
+                    'subsections': ['二级标题1', '二级标题2', ...]
+                }
+            }
+            threshold: 匹配阈值,默认0.6
             
         Returns:
-            匹配结果,包含:
-            - matched_first: 匹配的一级code集合
-            - matched_second: 匹配的二级key集合
-            - missing_first: 缺失的一级列表
-            - missing_second: 缺失的二级列表
-            - match_details: 匹配详情
+            匹配结果
         """
-        required_first = set(self.first_names.keys())
-        required_second = set(self.second_names.keys())
+        logger.info(f"[独立模糊匹配] 开始,阈值={threshold}")
+        
+        # ========== 一级目录匹配(独立模糊)==========
+        actual_first_titles = {
+            code: info['title'] 
+            for code, info in outline_by_first.items()
+        }
         
-        # 一级匹配
-        matched_first = outline_first & required_first
-        missing_first = required_first - matched_first
+        matched_first = set()
+        missing_first = []
+        
+        for req_code, req_name in self.first_names.items():
+            # 优先:直接用code精确匹配,因为一级分类通常较准
+            if req_code in actual_first_titles:
+                matched_first.add(req_code)
+                logger.debug(f"[一级匹配] {req_name}: 存在")
+            else:
+                # 尝试用标题模糊匹配
+                is_match, score, matched_title = self._match_by_title_fuzzy(
+                    req_name,
+                    list(actual_first_titles.values()),
+                    threshold
+                )
+                if is_match:
+                    # 找到匹配的标题,反向查找code
+                    for code, title in actual_first_titles.items():
+                        if title == matched_title:
+                            matched_first.add(req_code)
+                            logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
+                            break
+                else:
+                    missing_first.append({
+                        'first_code': req_code,
+                        'first_name': req_name,
+                        'first_seq': self.first_seq.get(req_code, 0)
+                    })
+                    logger.debug(f"[一级缺失] {req_name}")
+        
+        # ========== 二级目录匹配(结合一级 + 全局兜底)==========
+        # 🆕 先收集所有二级标题用于全局兜底
+        all_actual_second_titles = []
+        for fc, info in outline_by_first.items():
+            for sub_title in info.get('subsections', []):
+                all_actual_second_titles.append({
+                    'first_code': fc,
+                    'title': sub_title
+                })
         
-        # 二级匹配
         matched_second = set()
-        missing_second = set()
+        missing_second = []
         match_details = []
+        matched_actual_titles = set()  # 防重复
         
-        # 精确匹配
-        outline_second_keys = set(outline_secondary.keys())
-        exact_matches = outline_second_keys & required_second
-        matched_second.update(exact_matches)
-        
-        for key in exact_matches:
-            first_code, second_code = key
+        for req_key, req_name in self.second_names.items():
+            first_code, second_code = req_key
+            
+            # 🆕 步骤1:优先在同一一级下匹配
+            same_group_titles = outline_by_first.get(first_code, {}).get('subsections', [])
+            best_score_same = 0.0
+            best_match_same = None
+            
+            for title in same_group_titles:
+                if title in matched_actual_titles:
+                    continue
+                score = self._calculate_enhanced_similarity(req_name, title)
+                if score > best_score_same:
+                    best_score_same = score
+                    best_match_same = title
+            
+            # 同组匹配成功
+            if best_score_same >= threshold and best_match_same:
+                matched_second.add(req_key)
+                matched_actual_titles.add(best_match_same)
+                match_details.append({
+                    'level': 'second',
+                    'required_first_code': first_code,
+                    'required_second_code': second_code,
+                    'required_second_name': req_name,
+                    'matched': True,
+                    'match_type': 'same_group_fuzzy',
+                    'similarity': best_score_same,
+                    'matched_title': best_match_same
+                })
+                logger.debug(f"[二级同组匹配] {req_name} -> {best_match_same} ({best_score_same:.3f})")
+                continue
+            
+            # 🆕 步骤2:同组失败,尝试全局匹配(提高阈值防误匹配)
+            GLOBAL_THRESHOLD = 0.7  # 全局匹配阈值更高
+            best_score_global = 0.0
+            best_match_global = None
+            best_match_fc = None
+            
+            for actual in all_actual_second_titles:
+                if actual['title'] in matched_actual_titles:
+                    continue
+                score = self._calculate_enhanced_similarity(req_name, actual['title'])
+                if score > best_score_global:
+                    best_score_global = score
+                    best_match_global = actual['title']
+                    best_match_fc = actual['first_code']
+            
+            # 全局匹配成功(且跨组)
+            if best_score_global >= GLOBAL_THRESHOLD and best_match_global:
+                matched_second.add(req_key)
+                matched_actual_titles.add(best_match_global)
+                match_details.append({
+                    'level': 'second',
+                    'required_first_code': first_code,
+                    'required_second_code': second_code,
+                    'required_second_name': req_name,
+                    'matched': True,
+                    'match_type': 'cross_group_fuzzy',  # 标记为跨组匹配
+                    'similarity': best_score_global,
+                    'matched_title': best_match_global,
+                    'matched_actual_first': best_match_fc  # 实际匹配到的一级
+                })
+                logger.warning(f"[二级跨组匹配] {req_name}(应在{first_code}) -> {best_match_global}(实际在{best_match_fc}) ({best_score_global:.3f})")
+                continue
+            
+            # 都失败,记为缺失
+            best_score = max(best_score_same, best_score_global)
+            best_attempt = best_match_same or best_match_global
+            missing_second.append({
+                'first_code': first_code,
+                'first_name': self.first_names.get(first_code, ''),
+                'secondary_code': second_code,
+                'secondary_name': req_name,
+                'second_seq': self.second_seq.get(req_key, 0)
+            })
             match_details.append({
                 'level': 'second',
                 'required_first_code': first_code,
                 'required_second_code': second_code,
-                'required_second_name': self.second_names.get(key, ''),
-                'matched': True,
-                'match_type': 'exact',
-                'similarity': 1.0
-            })
-        
-        # 模糊匹配(对未精确匹配的)
-        required_remaining = required_second - exact_matches
-        outline_remaining = outline_second_keys - exact_matches
-        
-        if required_remaining and outline_remaining:
-            # 准备outline数据
-            outline_list = []
-            for key in outline_remaining:
-                first_code, second_code = key
-                title = outline_secondary.get(key, "")
-                outline_list.append({
-                    'key': key,
-                    'first_code': first_code,
-                    'second_code': second_code,
-                    'title': title
-                })
-            
-            # 对每个required进行模糊匹配
-            for req_key in required_remaining:
-                first_code, second_code = req_key
-                second_name = self.second_names.get(req_key, '')
-                first_name = self.first_names.get(first_code, '')
-                
-                # 获取详细定义
-                raw_content = self.second_raw_content.get((first_name, second_name))
-                
-                best_match = None
-                best_score = 0.0
-                
-                for item in outline_list:
-                    # 计算相似度
-                    score1 = self._calculate_enhanced_similarity(second_name, item['title'])
-                    score2 = self._calculate_enhanced_similarity(
-                        f"{first_name}{second_name}",
-                        item['title']
-                    )
-                    score = max(score1, score2)
-                    
-                    # 如果有详细定义,也计算
-                    if raw_content:
-                        score3 = self._calculate_enhanced_similarity(
-                            second_name,
-                            item['title'],
-                            raw_content
-                        )
-                        score = max(score, score3)
-                    
-                    if score > best_score:
-                        best_score = score
-                        best_match = item
-                
-                if best_score >= threshold:
-                    matched_second.add(req_key)
-                    match_details.append({
-                        'level': 'second',
-                        'required_first_code': first_code,
-                        'required_second_code': second_code,
-                        'required_second_name': second_name,
-                        'matched': True,
-                        'match_type': 'fuzzy',
-                        'similarity': best_score,
-                        'matched_title': best_match['title'] if best_match else None,
-                        'used_raw_content': raw_content is not None
-                    })
-                else:
-                    missing_second.add(req_key)
-                    match_details.append({
-                        'level': 'second',
-                        'required_first_code': first_code,
-                        'required_second_code': second_code,
-                        'required_second_name': second_name,
-                        'matched': False,
-                        'match_type': 'none',
-                        'similarity': best_score
-                    })
-        else:
-            missing_second = required_remaining
-        
-        # 构建缺失详情
-        missing_first_details = []
-        for code in sorted(missing_first, key=lambda x: self.first_seq.get(x, 0)):
-            missing_first_details.append({
-                'first_code': code,
-                'first_name': self.first_names.get(code, code),
-                'first_seq': self.first_seq.get(code, 0)
+                'required_second_name': req_name,
+                'matched': False,
+                'match_type': 'none',
+                'similarity': best_score,
+                'best_attempt': best_attempt
             })
+            logger.debug(f"[二级缺失] {req_name} (最佳尝试: {best_attempt}, {best_score:.3f})")
         
-        missing_second_details = []
-        for key in sorted(missing_second, key=lambda x: (self.first_seq.get(x[0], 0), self.second_seq.get(x, 0))):
-            first_code, second_code = key
-            missing_second_details.append({
-                'first_code': first_code,
-                'first_name': self.first_names.get(first_code, first_code),
-                'first_seq': self.first_seq.get(first_code, 0),
-                'secondary_code': second_code,
-                'secondary_name': self.second_names.get(key, ''),
-                'second_seq': self.second_seq.get(key, 0)
-            })
+        logger.info(f"[独立模糊匹配] 完成:一级缺失 {len(missing_first)} 个,二级缺失 {len(missing_second)} 个")
         
         return {
             'matched_first': matched_first,
             'matched_second': matched_second,
-            'missing_first': missing_first_details,
-            'missing_second': missing_second_details,
+            'missing_first': missing_first,
+            'missing_second': missing_second,
             'missing_first_count': len(missing_first),
             'missing_second_count': len(missing_second),
             'match_details': match_details

+ 133 - 37
core/construction_review/component/reviewers/standard_timeliness_reviewer.py

@@ -26,6 +26,10 @@
         results = reviewer.review_standards(standards_list)
 """
 import asyncio
+import json
+import os
+import threading
+from datetime import datetime
 from typing import List, Dict, Any, Optional
 from dataclasses import dataclass, asdict
 
@@ -67,13 +71,14 @@ class StandardTimelinessReviewer:
     对标准列表进行时效性审查。
     """
 
-    def __init__(self, db_pool=None, standard_service: Optional[StandardMatchingService] = None):
+    def __init__(self, db_pool=None, standard_service: Optional[StandardMatchingService] = None, callback_task_id: Optional[str] = None):
         """
         初始化审查器
 
         Args:
             db_pool: 数据库连接池,用于初始化 StandardMatchingService(如未提供standard_service则必填)
             standard_service: 已初始化的 StandardMatchingService 实例(优先级高于 db_pool)
+            callback_task_id: 回调任务ID,用于持久化判定结果
 
         Raises:
             RuntimeError: 当db_pool和standard_service都为None时抛出异常
@@ -86,6 +91,8 @@ class StandardTimelinessReviewer:
         self.db_pool = db_pool
         self._service = standard_service
         self._own_service = False  # 标记是否由本实例创建 service
+        self.callback_task_id = callback_task_id
+        self._log_lock = threading.Lock()
 
     async def __aenter__(self):
         """异步上下文管理器入口"""
@@ -102,6 +109,38 @@ class StandardTimelinessReviewer:
             await self._service.close()
         return False
 
+    def _log_determination_results(self, review_results: List["TimelinessReviewResult"]) -> None:
+        """Append timeliness results to temp/construction_review/timeliness_result/<callback_task_id>.json; any failure is only logged as a warning."""
+        if not self.callback_task_id:  # without a task id there is no file name to write to
+            return
+        try:
+            with self._log_lock:  # guards the read-modify-write below; NOTE(review): the lock is per instance, so other instances writing the same file are not excluded
+                log_dir = os.path.join("temp", "construction_review", "timeliness_result")  # relative path, resolved against the current working directory
+                os.makedirs(log_dir, exist_ok=True)
+                log_path = os.path.join(log_dir, f"{self.callback_task_id}.json")
+
+                records = []
+                if os.path.exists(log_path):
+                    try:
+                        with open(log_path, "r", encoding="utf-8") as f:
+                            records = json.load(f)
+                            if not isinstance(records, list):  # unexpected top-level JSON type: start over
+                                records = []
+                    except Exception:  # unreadable or corrupt file: earlier records are discarded
+                        records = []
+
+                for result in review_results:
+                    records.append({
+                        "timestamp": datetime.now().isoformat(),  # naive local time, no timezone offset
+                        "callback_task_id": self.callback_task_id,
+                        **result.to_dict()  # unpacked last, so same-named keys from to_dict() override the two above
+                    })
+
+                with open(log_path, "w", encoding="utf-8") as f:  # rewrites the whole file with the extended list
+                    json.dump(records, f, ensure_ascii=False, indent=2)
+        except Exception as e:  # best effort: persistence problems must not break the review flow
+            logger.warning(f"记录时效性判定结果失败: {e}")
+
     def review_standards(self, standards: List[Dict[str, str]]) -> List[TimelinessReviewResult]:
         """
         审查标准列表的时效性
@@ -112,7 +151,7 @@ class StandardTimelinessReviewer:
                 - standard_number: 标准号
 
         Returns:
-            List[TimelinessReviewResult]: 审查结果列表
+            List[TimelinessReviewResult]: 审查结果列表(文件名为空的会被过滤掉)
         """
         if not self._service:
             raise RuntimeError("服务未初始化,请使用异步上下文管理器或调用 initialize()")
@@ -123,12 +162,15 @@ class StandardTimelinessReviewer:
         # 转换为时效性审查结果
         review_results = []
         for match_result in match_results:
-            review_result = self._convert_match_to_review_result(match_result)
-            review_results.append(review_result)
+            # 跳过 match 返回 None 的情况(文件名为空)
+            if match_result is not None:
+                review_result = self._convert_match_to_review_result(match_result)
+                review_results.append(review_result)
 
+        self._log_determination_results(review_results)
         return review_results
 
-    def review_single(self, standard_name: str, standard_number: str, seq_no: int = 1) -> TimelinessReviewResult:
+    def review_single(self, standard_name: str, standard_number: str, seq_no: int = 1) -> Optional[TimelinessReviewResult]:
         """
         审查单个标准的时效性
 
@@ -139,12 +181,18 @@ class StandardTimelinessReviewer:
 
         Returns:
             TimelinessReviewResult: 审查结果
+            None: 当文件名为空时返回 None,表示跳过审查
         """
         if not self._service:
             raise RuntimeError("服务未初始化,请使用异步上下文管理器或调用 initialize()")
 
         match_result = self._service.check_single(seq_no, standard_name, standard_number)
-        return self._convert_match_to_review_result(match_result)
+        # 如果 match 返回 None(文件名为空),则返回 None
+        if match_result is None:
+            return None
+        review_result = self._convert_match_to_review_result(match_result)
+        self._log_determination_results([review_result])
+        return review_result
 
     def _convert_match_to_review_result(self, match_result: StandardMatchResult) -> TimelinessReviewResult:
         """
@@ -163,8 +211,8 @@ class StandardTimelinessReviewer:
             # 正常状态 - 无风险
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=False,
@@ -176,8 +224,8 @@ class StandardTimelinessReviewer:
             # 被替代 - high(与原有逻辑一致)
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=True,
@@ -194,8 +242,8 @@ class StandardTimelinessReviewer:
             # 废止无替代 - high(与原有逻辑一致)
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=True,
@@ -210,8 +258,8 @@ class StandardTimelinessReviewer:
             # 不匹配 - high(与原有逻辑一致:编号错误属于high)
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=True,
@@ -228,8 +276,8 @@ class StandardTimelinessReviewer:
             # 标准库不存在 - 直接过滤,不返回问题
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=False,
@@ -242,8 +290,8 @@ class StandardTimelinessReviewer:
             logger.warning(f"未知的匹配状态码: {status_code}")
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result="未知",
                 status_code=status_code,
                 has_issue=True,
@@ -253,6 +301,43 @@ class StandardTimelinessReviewer:
                 final_result=match_result.final_result
             )
 
+    def _normalize_text(self, text: str) -> str:
+        """
+        Normalize text for comparison (intended to mirror StandardRepository._normalize_for_matching).
+        Strips HTML tags, whitespace, book-title marks, parentheses and the configured symbols.
+        The second-round symbol list is read from config section 'timeliness_review', key 'REMOVE_SYMBOLS'.
+        """
+        if not text:  # None or empty input normalizes to an empty string
+            return ""
+        import re
+
+        # Basic normalization (meant to match StandardRepository)
+        # Remove HTML tags
+        text = re.sub(r'<[^>]+>', '', text)
+        # Remove all whitespace (for str patterns \s also matches Unicode whitespace)
+        text = re.sub(r'\s+', '', text)
+        # Remove book-title marks and parentheses (first round); NOTE(review): '(' and ')' are replaced twice as shown here - confirm whether one pair was meant to be the full-width form
+        text = text.replace('《', '').replace('》', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
+
+        # Second round: remove the symbols listed in configuration (comma-separated)
+        default_symbols = '),-,.,/,,:,[,],【,】,〔,〕,(,),-,—'
+
+        # Try to read the list from configuration
+        symbols_str = default_symbols
+        try:
+            from foundation.infrastructure.config.config import config_handler
+            symbols_str = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
+        except Exception:
+            pass  # fall back to the default symbols
+
+        # Parse the list and strip each symbol; empty items (e.g. from ",,") are dropped, so a comma itself cannot be listed
+        if symbols_str:
+            symbols_to_remove = [s.strip() for s in symbols_str.split(',') if s.strip()]
+            for symbol in symbols_to_remove:
+                text = text.replace(symbol, '')
+
+        return text
+
     def convert_to_standardized_format(
         self,
         review_results: List[TimelinessReviewResult],
@@ -278,25 +363,36 @@ class StandardTimelinessReviewer:
             # 标准库不存在或无问题的结果直接过滤,不返回
             if result.status_code == MatchResultCode.NOT_FOUND.value or not result.has_issue:
                 continue
-            else:
-                # 有问题
-                standardized_results.append({
-                    "check_item": check_item,
-                    "chapter_code": chapter_code,
-                    "check_item_code": check_item_code,
-                    "check_result": {
-                        "location": f"《{result.standard_name}》({result.standard_number})",
-                        "description": result.reason or result.final_result,
-                        "suggestion": result.suggestion,
-                        "issue_type": result.issue_type,
-                        "standard_name": result.standard_name,
-                        "standard_number": result.standard_number,
-                        "replacement_name": result.replacement_name,
-                        "replacement_number": result.replacement_number,
-                    },
-                    "exist_issue": True,
-                    "risk_info": {"risk_level": result.risk_level}
-                })
+
+            # 【兜底逻辑】检查替代标准是否和原始标准实质相同(规范化后比较)
+            if result.replacement_name and result.replacement_number:
+                original_combined = self._normalize_text(f"{result.standard_name}{result.standard_number}")
+                replacement_combined = self._normalize_text(f"{result.replacement_name}{result.replacement_number}")
+
+                if original_combined == replacement_combined:
+                    logger.info(f"[兜底过滤] 替代标准与原始标准实质相同,跳过: "
+                                f"{result.standard_name}({result.standard_number}) ~ "
+                                f"{result.replacement_name}({result.replacement_number})")
+                    continue  # 跳过这条问题,视为无风险
+
+            # 有问题
+            standardized_results.append({
+                "check_item": check_item,
+                "chapter_code": chapter_code,
+                "check_item_code": check_item_code,
+                "check_result": {
+                    "location": f"《{result.standard_name}》({result.standard_number})",
+                    "description": result.reason or result.final_result,
+                    "suggestion": result.suggestion,
+                    "issue_type": result.issue_type,
+                    "standard_name": result.standard_name,
+                    "standard_number": result.standard_number,
+                    "replacement_name": result.replacement_name,
+                    "replacement_number": result.replacement_number,
+                },
+                "exist_issue": True,
+                "risk_info": {"risk_level": result.risk_level}
+            })
 
         return standardized_results
 

+ 4 - 0
core/construction_review/component/reviewers/timeliness_basis_reviewer.py

@@ -388,6 +388,10 @@ class BasisReviewService:
 
         start_time = time.time()
         total_batches = (len(items) + 2) // 3  # 计算总批次数
+
+        # 绑定 callback_task_id 到时效性审查器,用于记录判定结果
+        if self._timeliness_reviewer and callback_task_id:
+            self._timeliness_reviewer.callback_task_id = callback_task_id
         
         # 发送开始审查的SSE推送(使用独立命名空间,避免与主流程进度冲突)
         if progress_manager and callback_task_id:

+ 3 - 0
core/construction_review/component/reviewers/timeliness_content_reviewer.py

@@ -276,6 +276,9 @@ class ContentTimelinessReviewer:
 
         try:
             async with self._semaphore:
+                # 绑定 callback_task_id,用于记录判定结果
+                if callback_task_id:
+                    self._timeliness_reviewer.callback_task_id = callback_task_id
                 # 执行规则匹配审查
                 review_results = self._timeliness_reviewer.review_standards(standards_list)
 

+ 2 - 1
core/construction_review/component/reviewers/utils/directory_extraction.py

@@ -43,7 +43,8 @@ SYSTEM = """
 1) 只抽取包含书名号《 》的条目。
 2) 每条条目包括:title(《》内名称,去掉书名号)、suffix(《》后面的版本/日期/修订说明,可为空)、raw(该条目原文)。
 3) 忽略标题行、段落说明、无《》的行。
-4) 输出必须严格符合格式要求,不要输出任何额外文字。
+4) **重要:title 和 raw 必须保留原文的所有空格和格式,不要修改或去除任何空格。**
+5) 输出必须严格符合格式要求,不要输出任何额外文字。
 """
 HUMAN ="""
 文本如下:

+ 1 - 1
core/construction_review/component/standard_matching/standard_dao.py

@@ -12,7 +12,7 @@ class StandardDAO:
 
     def __init__(self, db_pool):
         self.db_pool = db_pool
-        self.table_name = "t_samp_standard_base_info"
+        self.table_name = "t_samp_standard_base_info_status"
 
     async def load_all_standards(self) -> List[Dict]:
         """

+ 265 - 137
core/construction_review/component/standard_matching/standard_service.py

@@ -15,6 +15,14 @@ from enum import Enum
 
 from foundation.observability.logger.loggering import review_logger as logger
 
+# 导入配置处理器
+try:
+    from foundation.infrastructure.config.config import config_handler
+    _CONFIG_AVAILABLE = True
+except ImportError:
+    _CONFIG_AVAILABLE = False
+    config_handler = None
+
 
 class ValidityStatus(Enum):
     """时效性状态"""
@@ -36,8 +44,12 @@ class MatchResultCode(Enum):
 class StandardMatchResult:
     """标准匹配结果数据结构"""
     seq_no: int = 0                             # 序号
-    original_name: str = ""                      # 原始标准名称
-    original_number: str = ""                    # 原始标准号
+    raw_name: str = ""                           # 原始输入名称(未修改,用于返回)
+    raw_number: str = ""                         # 原始输入标准号(未修改,用于返回)
+    normalized_name: str = ""                    # 规范化名称(用于匹配)
+    normalized_number: str = ""                  # 规范化标准号(用于匹配)
+    matched_name: str = ""                       # 匹配到的数据库原始名称
+    matched_number: str = ""                     # 匹配到的数据库原始标准号
     substitute_number: Optional[str] = None      # 替代标准号(如果有)
     substitute_name: Optional[str] = None        # 替代标准名称(如果有)
     process_result: str = ""                     # 处理结果状态
@@ -49,8 +61,10 @@ class StandardMatchResult:
 class StandardRecord:
     """标准记录数据结构"""
     id: int
-    standard_name: str
-    standard_number: str
+    standard_name: str           # 原始名称(数据库中的值,用于返回)
+    standard_number: str         # 原始标准号(用于返回)
+    normalized_name: str         # 规范化名称(用于匹配)
+    normalized_number: str       # 规范化标准号(用于匹配)
     validity: str
 
 
@@ -64,14 +78,19 @@ class StandardRepository:
         # 原始数据列表
         self._records: List[StandardRecord] = []
 
-        # 索引结构,加速查询
+        # 原始索引(用于返回数据)
         self._number_index: Dict[str, StandardRecord] = {}  # 标准号 -> 记录
         self._name_index: Dict[str, List[StandardRecord]] = {}  # 名称 -> 记录列表
         self._current_records: List[StandardRecord] = []  # 现行/试行标准列表
 
+        # 规范化索引(用于匹配)
+        self._normalized_number_index: Dict[str, StandardRecord] = {}  # 规范化标准号 -> 记录
+        self._normalized_name_index: Dict[str, List[StandardRecord]] = {}  # 规范化名称 -> 记录列表
+
     def load_data(self, raw_data: List[Dict]):
         """
         加载原始数据到内存并建立索引
+        同时创建规范化索引用于匹配
 
         Args:
             raw_data: 从数据库查询的原始标准数据列表
@@ -80,6 +99,8 @@ class StandardRepository:
         self._number_index = {}
         self._name_index = {}
         self._current_records = []
+        self._normalized_number_index = {}
+        self._normalized_name_index = {}
 
         for item in raw_data:
             # 跳过无效数据
@@ -88,28 +109,37 @@ class StandardRepository:
             if not standard_number or not standard_name:
                 continue
 
+            # 创建规范化版本(用于匹配)
+            normalized_name = self._normalize_for_matching(standard_name)
+            normalized_number = self._normalize_for_matching(standard_number)
+
             record = StandardRecord(
                 id=item.get("id", 0),
-                standard_name=standard_name,
-                standard_number=standard_number,
+                standard_name=standard_name,           # 原始名称(用于返回)
+                standard_number=standard_number,       # 原始标准号(用于返回)
+                normalized_name=normalized_name,       # 规范化名称(用于匹配)
+                normalized_number=normalized_number,   # 规范化标准号(用于匹配)
                 validity=item.get("validity", "")
             )
             self._records.append(record)
 
-            # 建立标准号索引
+            # 建立原始索引(用于返回数据)
             self._number_index[record.standard_number] = record
-
-            # 建立名称索引(一个名称可能对应多个标准号)
             if record.standard_name not in self._name_index:
                 self._name_index[record.standard_name] = []
             self._name_index[record.standard_name].append(record)
 
+            # 建立规范化索引(用于匹配)
+            self._normalized_number_index[record.normalized_number] = record
+            if record.normalized_name not in self._normalized_name_index:
+                self._normalized_name_index[record.normalized_name] = []
+            self._normalized_name_index[record.normalized_name].append(record)
+
             # 收集现行/试行标准
             if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
                 self._current_records.append(record)
 
         # 对现行标准按标准号降序排序(用于找最新替代标准)
-        # 处理可能的 None 值
         self._current_records.sort(
             key=lambda r: r.standard_number or "",
             reverse=True
@@ -134,40 +164,98 @@ class StandardRepository:
         return results
 
     def find_by_number_fuzzy(self, standard_number: str) -> List[StandardRecord]:
-        """模糊匹配标准号"""
+        """模糊匹配标准号(使用规范化数据)"""
         results = []
-        # 提取前缀(如 GB/T 5972)
-        parts = standard_number.split("-")
-        prefix = parts[0] if parts else standard_number
+        # 规范化输入的标准号
+        normalized_input = self._normalize_for_matching(standard_number)
 
-        for number, record in self._number_index.items():
-            # 前缀匹配
-            if number.startswith(prefix):
+        # 使用规范化索引进行前缀匹配
+        for normalized_number, record in self._normalized_number_index.items():
+            # 前缀匹配:检查是否以规范化后的输入开头,或包含关系
+            if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
                 results.append(record)
         return results
 
-    def find_current_by_name(self, standard_name: str) -> List[StandardRecord]:
-        """查询指定名称的现行/试行标准(支持模糊匹配)"""
+    def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
+        """查询指定名称的现行/试行标准(使用规范化名称匹配)"""
         results = []
         for record in self._current_records:
-            # 精确匹配
-            if record.standard_name == standard_name:
-                results.append(record)
-            # 模糊匹配(忽略空格、书名号等)
-            elif self._is_name_fuzzy_match_for_repo(record.standard_name, standard_name):
+            # 使用规范化名称匹配
+            if record.normalized_name == normalized_standard_name:
                 results.append(record)
         return results
 
-    def _is_name_fuzzy_match_for_repo(self, name1: str, name2: str) -> bool:
-        """判断两个标准名称是否模糊匹配"""
-        clean1 = name1.replace("《", "").replace("》", "").replace(" ", "").replace(" ", "")
-        clean2 = name2.replace("《", "").replace("》", "").replace(" ", "").replace(" ", "")
-        return clean1 == clean2
+    def _is_name_fuzzy_match_for_repo(self, normalized_name1: str, normalized_name2: str) -> bool:
+        """判断两个标准名称是否模糊匹配(使用规范化名称)"""
+        return normalized_name1 == normalized_name2
 
     def get_all_records(self) -> List[StandardRecord]:
         """获取所有记录"""
         return self._records.copy()
 
+    def _normalize_for_matching(self, text: str) -> str:
+        """
+        规范化文本用于匹配
+        第一轮:去除所有空白字符(包括空格、不间断空格、换行符等)、书名号、括号和 HTML 标签
+        第二轮:从配置读取并去除指定符号
+
+        Args:
+            text: 原始文本
+
+        Returns:
+            规范化后的字符串(去除所有空白、分隔符、HTML 标签和配置指定的符号)
+        """
+        if not text:
+            return ""
+
+        import re
+
+        # ========== 第一轮:基础规范化 ==========
+        # 去除 HTML 标签(如 <1680>)
+        text = re.sub(r'<[^>]+>', '', text)
+        # 去除所有 Unicode 空白字符(包括普通空格、不间断空格、换行等)
+        text = re.sub(r'\s+', '', text)
+        # 去除书名号和括号
+        text = text.replace('《', '').replace('》', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
+
+        # ========== 第二轮:从配置读取并去除指定符号 ==========
+        # 读取配置中的符号列表,默认使用常见符号
+        # 包含各种连接符:半角连字符(-)、全角连接号(－)、全角破折号(—)
+        default_symbols = '),-,.,/,,:,[,],【,】,〔,〕,(,),-,—'
+
+        if _CONFIG_AVAILABLE and config_handler:
+            try:
+                symbols_str = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
+            except Exception:
+                symbols_str = default_symbols
+        else:
+            symbols_str = default_symbols
+
+        # 解析符号列表(按逗号分割)
+        if symbols_str:
+            symbols_to_remove = [s.strip() for s in symbols_str.split(',') if s.strip()]
+            # 去除每个符号
+            for symbol in symbols_to_remove:
+                text = text.replace(symbol, '')
+
+        return text
+
+    def find_by_normalized_number(self, normalized_number: str) -> Optional[StandardRecord]:
+        """通过规范化标准号精确匹配"""
+        return self._normalized_number_index.get(normalized_number)
+
+    def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
+        """通过规范化名称匹配"""
+        return self._normalized_name_index.get(normalized_name, [])
+
+    def find_current_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
+        """查询指定规范化名称的现行/试行标准"""
+        results = []
+        for record in self._current_records:
+            if record.normalized_name == normalized_name:
+                results.append(record)
+        return results
+
 
 class StandardMatcher:
     """
@@ -178,52 +266,70 @@ class StandardMatcher:
     def __init__(self, repository: StandardRepository):
         self.repo = repository
 
-    def match(self, seq_no: int, input_name: str, input_number: str) -> StandardMatchResult:
+    def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
         """
         执行标准匹配
 
         匹配流程:
-        1. 标准号精确匹配
-        2. 根据匹配结果进入不同分支处理
+        1. 保存原始输入(用于返回)
+        2. 创建规范化版本(用于匹配)
+        3. 如果规范化后文件名为空,返回 None(跳过审查)
+        4. 使用规范化数据进行匹配
+        5. 返回结果中使用原始数据
+
+        Returns:
+            StandardMatchResult: 匹配结果
+            None: 当规范化文件名为空时返回 None,表示跳过审查
         """
-        # 去除前后空格
-        input_name = input_name.strip() if input_name else input_name
-        input_number = input_number.strip() if input_number else input_number
+        # 1. 保存原始输入
+        raw_name = input_name.strip() if input_name else ""
+        raw_number = input_number.strip() if input_number else ""
+
+        # 2. 创建规范化版本(去除 HTML 标签、空白、书名号、括号及配置指定的符号;其余字符如字母、数字保留)
+        normalized_name = self.repo._normalize_for_matching(raw_name)
+        normalized_number = self.repo._normalize_for_matching(raw_number)
 
-        # 清洗书名号和括号
-        input_name = self._clean_brackets_and_booknames(input_name)
-        input_number = self._clean_brackets_and_booknames(input_number)
+        # 3. 如果规范化后文件名为空,跳过审查
+        if not normalized_name:
+            logger.info(f"文件名规范化后为空,跳过审查。原始名称: '{raw_name}'")
+            return None
 
+        # 4. 初始化结果(保存原始和规范化数据)
         result = StandardMatchResult(
             seq_no=seq_no,
-            original_name=input_name,
-            original_number=input_number
+            raw_name=raw_name,
+            raw_number=raw_number,
+            normalized_name=normalized_name,
+            normalized_number=normalized_number
         )
 
-        # 步骤1: 精确匹配标准号
-        match_by_number = self.repo.find_by_number_exact(input_number)
+        # 5. 使用规范化数据进行匹配
+        match_by_number = self.repo.find_by_normalized_number(normalized_number)
 
         if match_by_number:
             # 分支A: 标准号匹配成功
-            return self._handle_number_matched(result, match_by_number, input_name)
+            return self._handle_number_matched(result, match_by_number)
         else:
             # 分支B: 标准号未匹配
-            return self._handle_number_not_matched(result, input_name, input_number)
+            return self._handle_number_not_matched(result, normalized_name, normalized_number)
 
     def _handle_number_matched(
         self,
         result: StandardMatchResult,
-        db_record: StandardRecord,
-        input_name: str
+        db_record: StandardRecord
     ) -> StandardMatchResult:
         """处理标准号匹配成功的情况"""
-        # 检查名称是否匹配
-        if db_record.standard_name == input_name:
+        # 保存匹配到的数据库原始数据
+        result.matched_name = db_record.standard_name
+        result.matched_number = db_record.standard_number
+
+        # 使用规范化名称进行比较
+        if db_record.normalized_name == result.normalized_name:
             # 名称也匹配
             return self._handle_full_match(result, db_record)
         else:
             # 名称不匹配
-            return self._handle_name_mismatch(result, db_record, input_name)
+            return self._handle_name_mismatch(result, db_record)
 
     def _handle_full_match(
         self,
@@ -231,39 +337,45 @@ class StandardMatcher:
         db_record: StandardRecord
     ) -> StandardMatchResult:
         """处理名称和标准号都完全匹配的情况"""
+        # 【调试日志】
+        logger.info(f"[_handle_full_match] 匹配记录: name={db_record.standard_name}, "
+                    f"number={db_record.standard_number}, validity={db_record.validity} "
+                    f"(期望: {ValidityStatus.CURRENT.value}/{ValidityStatus.TRIAL.value}, "
+                    f"实际是否匹配: {db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]})")
+
         if db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
             # 情况1: 现行或试行 - 状态正常
             return self._set_ok_result(result)
         else:
             # 废止状态 - 查找替代标准
+            logger.info(f"[_handle_full_match] 进入废止处理流程")
             return self._handle_abolished(result, db_record)
 
     def _handle_name_mismatch(
         self,
         result: StandardMatchResult,
-        db_record: StandardRecord,
-        input_name: str
+        db_record: StandardRecord
     ) -> StandardMatchResult:
         """处理标准号匹配但名称不匹配的情况"""
-        # 首先检查是否是名称模糊匹配(忽略空格、书名号等
-        if self._is_name_fuzzy_match(db_record.standard_name, input_name):
-            # 名称模糊匹配成功,按完全匹配处理
+        # 首先检查是否是名称模糊匹配(使用规范化名称)
+        if self._is_name_fuzzy_match(db_record.normalized_name, result.normalized_name):
+            # 名称规范化后匹配成功,按完全匹配处理
             return self._handle_full_match(result, db_record)
 
-        # 尝试用输入的名称模糊匹配
-        name_matches = self.repo.find_by_name_fuzzy(input_name)
+        # 按规范化名称在索引中查找同名记录(规范化后精确相等,并非模糊匹配)
+        name_matches = self.repo.find_by_normalized_name(result.normalized_name)
 
-        # 查找精确名称匹配
-        exact_match = self._find_exact_name_match(name_matches, input_name)
+        # 查找精确名称匹配(使用规范化名称)
+        exact_match = self._find_exact_name_match(name_matches, result.normalized_name)
 
         if exact_match:
-            # 找到名称匹配的记录
-            return self._handle_fuzzy_name_match(result, exact_match)
-
-        # 尝试在模糊匹配结果中查找模糊名称匹配
-        for match in name_matches:
-            if self._is_name_fuzzy_match(match.standard_name, input_name):
-                return self._handle_fuzzy_name_match(result, match)
+            # 找到名称匹配的记录,检查标准号是否一致
+            if result.normalized_number == exact_match.normalized_number:
+                # 标准号实质一致,按完全匹配处理
+                return self._handle_full_match(result, exact_match)
+            else:
+                # 名称匹配但标准号不一致 = 标准号错误
+                return self._set_mismatch_result(result, exact_match)
 
         # 名称完全不匹配,但标准号已匹配成功
         # 说明该标准存在于库中,应返回不匹配而非不存在
@@ -277,42 +389,34 @@ class StandardMatcher:
     def _handle_number_not_matched(
         self,
         result: StandardMatchResult,
-        input_name: str,
-        input_number: str
+        normalized_name: str,
+        normalized_number: str
     ) -> StandardMatchResult:
         """处理标准号未匹配的情况"""
-        # 尝试模糊匹配标准号
-        fuzzy_number_matches = self.repo.find_by_number_fuzzy(input_number)
+        # 尝试模糊匹配标准号(find_by_number_fuzzy 基于规范化索引做前缀/包含匹配)
+        fuzzy_number_matches = self.repo.find_by_number_fuzzy(normalized_number)
 
         if fuzzy_number_matches:
             # 检查名称是否匹配
-            return self._check_name_in_records(result, fuzzy_number_matches, input_name)
+            return self._check_name_in_records(result, fuzzy_number_matches, normalized_name)
         else:
             # 尝试直接按名称查询
-            return self._search_by_name_only(result, input_name)
+            return self._search_by_name_only(result, normalized_name)
 
     def _check_name_in_records(
         self,
         result: StandardMatchResult,
         records: List[StandardRecord],
-        input_name: str
+        normalized_name: str
     ) -> StandardMatchResult:
-        """在一批记录中查找名称匹配"""
-        # 首先尝试精确匹配
+        """在一批记录中查找名称匹配(标准号已模糊匹配成功)"""
+        # 首先尝试精确匹配(使用规范化名称)
         for record in records:
-            if record.standard_name == input_name:
+            if record.normalized_name == normalized_name:
                 # 名称匹配,检查状态
                 if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                    return self._set_mismatch_result(result, record)
-                elif record.validity == ValidityStatus.ABOLISHED.value:
-                    return self._handle_abolished(result, record)
-
-        # 尝试模糊名称匹配(忽略空格和书名号)
-        for record in records:
-            if self._is_name_fuzzy_match(record.standard_name, input_name):
-                # 名称模糊匹配成功
-                if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                    return self._set_mismatch_result(result, record)
+                    # 标准号模糊匹配成功 + 名称匹配 + 现行/试行 = 正常
+                    return self._set_ok_result(result)
                 elif record.validity == ValidityStatus.ABOLISHED.value:
                     return self._handle_abolished(result, record)
 
@@ -322,35 +426,22 @@ class StandardMatcher:
     def _search_by_name_only(
         self,
         result: StandardMatchResult,
-        input_name: str
+        normalized_name: str
     ) -> StandardMatchResult:
-        """仅通过名称查询"""
-        # 精确匹配名称
-        name_match = self.repo.find_by_name_exact(input_name)
+        """仅通过名称查询(标准号未匹配)"""
+        # 精确匹配规范化名称
+        name_matches = self.repo.find_by_normalized_name(normalized_name)
 
-        if name_match:
+        if name_matches:
+            # 取第一个匹配的记录
+            name_match = name_matches[0]
             if name_match.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
+                # 标准号不匹配但名称匹配 + 现行/试行 = 标准号错误(不匹配)
                 return self._set_mismatch_result(result, name_match)
             elif name_match.validity == ValidityStatus.ABOLISHED.value:
-                return self._set_not_found_result(result)
-
-        # 模糊匹配名称
-        fuzzy_matches = self.repo.find_by_name_fuzzy(input_name)
-
-        # 首先尝试精确匹配
-        exact_match = self._find_exact_name_match(fuzzy_matches, input_name)
-        if exact_match:
-            if exact_match.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                return self._set_mismatch_result(result, exact_match)
-
-        # 尝试模糊名称匹配(忽略空格、书名号等)
-        for match in fuzzy_matches:
-            if self._is_name_fuzzy_match(match.standard_name, input_name):
-                if match.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                    return self._set_mismatch_result(result, match)
-                elif match.validity == ValidityStatus.ABOLISHED.value:
-                    return self._handle_abolished(result, match)
+                return self._handle_abolished(result, name_match)
 
+        # 名称未找到
         return self._set_not_found_result(result)
 
     def _handle_fuzzy_name_match(
@@ -358,9 +449,10 @@ class StandardMatcher:
         result: StandardMatchResult,
         match_record: StandardRecord
     ) -> StandardMatchResult:
-        """处理模糊名称匹配成功的情况"""
+        """处理模糊名称匹配成功的情况(标准号已匹配)"""
         if match_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-            return self._set_mismatch_result(result, match_record)
+            # 标准号匹配 + 名称模糊匹配 + 现行/试行 = 正常
+            return self._set_ok_result(result)
         elif match_record.validity == ValidityStatus.ABOLISHED.value:
             return self._handle_abolished(result, match_record)
         return self._set_not_found_result(result)
@@ -371,8 +463,8 @@ class StandardMatcher:
         abolished_record: StandardRecord
     ) -> StandardMatchResult:
         """处理已废止标准的情况"""
-        # 查询同名现行标准作为替代
-        substitutes = self.repo.find_current_by_name(abolished_record.standard_name)
+        # 查询同名现行标准作为替代(使用规范化名称)
+        substitutes = self.repo.find_current_by_normalized_name(abolished_record.normalized_name)
 
         if substitutes:
             # 有替代标准,取最新的(已按标准号降序)
@@ -422,26 +514,27 @@ class StandardMatcher:
         result: StandardMatchResult,
         substitute: StandardRecord
     ) -> StandardMatchResult:
-        """设置被替代的结果"""
+        """设置被替代的结果 - 使用原始数据显示"""
         result.substitute_name = self._format_standard_name(substitute.standard_name)
         result.substitute_number = self._format_standard_number(substitute.standard_number)
         result.process_result = "被替代"
         result.status_code = MatchResultCode.SUBSTITUTED.value
+        # 使用 raw_name/raw_number(原始输入)和替代标准 substitute 的数据库原始名称/标准号显示
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}已废止,"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}已废止,"
             f"替代{self._format_standard_name(substitute.standard_name)}"
             f"{self._format_standard_number(substitute.standard_number)}"
         )
         return result
 
     def _set_abolished_result(self, result: StandardMatchResult) -> StandardMatchResult:
-        """设置废止无替代的结果"""
+        """设置废止无替代的结果 - 使用原始数据显示"""
         result.process_result = "废止无现行"
         result.status_code = MatchResultCode.ABOLISHED.value
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}已废止,无现行状态"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}已废止,无现行状态"
         )
         return result
 
@@ -450,40 +543,37 @@ class StandardMatcher:
         result: StandardMatchResult,
         actual: StandardRecord
     ) -> StandardMatchResult:
-        """设置不匹配的结果"""
+        """设置不匹配的结果 - 使用原始数据显示"""
         result.substitute_name = self._format_standard_name(actual.standard_name)
         result.substitute_number = self._format_standard_number(actual.standard_number)
         result.process_result = "不匹配"
         result.status_code = MatchResultCode.MISMATCH.value
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}"
             f"与实际{self._format_standard_name(actual.standard_name)}"
             f"{self._format_standard_number(actual.standard_number)}不匹配"
         )
         return result
 
     def _set_not_found_result(self, result: StandardMatchResult) -> StandardMatchResult:
-        """设置不存在的结果"""
+        """设置不存在的结果 - 使用原始数据显示"""
         result.process_result = "标准库不存在"
         result.status_code = MatchResultCode.NOT_FOUND.value
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}标准库不存在,请确认"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}标准库不存在,请确认"
         )
         return result
 
     # ========== 工具方法 ==========
 
-    def _is_name_fuzzy_match(self, name1: str, name2: str) -> bool:
+    def _is_name_fuzzy_match(self, normalized_name1: str, normalized_name2: str) -> bool:
         """
         判断两个标准名称是否模糊匹配
-        只去除书名号,保留中间空格(中间空格属于名称的一部分
+        使用规范化后的名称进行比较(已去除空格、括号、书名号等)
         """
-        # 清理书名号,但保留中间空格
-        clean1 = name1.replace("《", "").replace("》", "")
-        clean2 = name2.replace("《", "").replace("》", "")
-        return clean1 == clean2
+        return normalized_name1 == normalized_name2
 
     def _clean_brackets_and_booknames(self, text: str) -> str:
         """
@@ -545,14 +635,49 @@ class StandardMatcher:
 
         return text
 
+    def _extract_chinese_chars(self, text: str) -> str:
+        """
+        提取字符串中的中文字符和空格
+        保留:中文字符(\u4e00-\u9fa5)、中文标点、空格(无换行符时)
+        删除:英文、数字、特殊符号、换行符等
+        特殊处理:如果存在换行符,则去除所有空格
+        """
+        if not text:
+            return text
+
+        import re
+
+        # 检查是否存在换行符(在清洗前检查)
+        has_newline = '\n' in text or '\r' in text
+
+        # 首先去除换行符及其旁边的所有空格
+        text = re.sub(r'\s*[\n\r]+\s*', '', text)
+        # 去除制表符
+        text = text.replace('\t', '')
+
+        if has_newline:
+            # 有换行符时:提取中文字符,去除所有空格
+            chinese_pattern = re.compile(r'[\u4e00-\u9fa5\u3000-\u303F\uFF00-\uFFEF]+')
+            matches = chinese_pattern.findall(text)
+            result = ''.join(matches)
+            # 去除所有空格(包括全角空格)
+            result = result.replace(' ', '').replace(' ', '')
+            return result.strip()
+        else:
+            # 无换行符时:提取中文字符和空格,保留中间空格
+            chinese_pattern = re.compile(r'[\u4e00-\u9fa5\u3000-\u303F\uFF00-\uFFEF\s]+')
+            matches = chinese_pattern.findall(text)
+            result = ''.join(matches)
+            return result.strip()
+
     def _find_exact_name_match(
         self,
         records: List[StandardRecord],
-        target_name: str
+        target_normalized_name: str
     ) -> Optional[StandardRecord]:
-        """在记录列表中查找精确名称匹配"""
+        """在记录列表中查找规范化名称精确匹配"""
         for record in records:
-            if record.standard_name == target_name:
+            if record.normalized_name == target_normalized_name:
                 return record
         return None
 
@@ -632,7 +757,7 @@ class StandardMatchingService:
                 - standard_number: 标准号(原始)
 
         Returns:
-            List[StandardMatchResult]: 匹配结果列表
+            List[StandardMatchResult]: 匹配结果列表(文件名为空的会被过滤掉)
         """
         if not self._initialized:
             raise RuntimeError("服务未初始化,请先调用 initialize()")
@@ -644,7 +769,9 @@ class StandardMatchingService:
                 input_name=std.get("standard_name", ""),
                 input_number=std.get("standard_number", "")
             )
-            results.append(result)
+            # 跳过文件名为空的情况(match 返回 None)
+            if result is not None:
+                results.append(result)
         return results
 
     def check_single(
@@ -652,7 +779,7 @@ class StandardMatchingService:
         seq_no: int,
         standard_name: str,
         standard_number: str
-    ) -> StandardMatchResult:
+    ) -> Optional[StandardMatchResult]:
         """
         检查单个标准
 
@@ -663,6 +790,7 @@ class StandardMatchingService:
 
         Returns:
             StandardMatchResult: 匹配结果
+            None: 当文件名为空时返回 None,表示跳过审查
         """
         if not self._initialized:
             raise RuntimeError("服务未初始化,请先调用 initialize()")