Răsfoiți Sursa

Merge branch 'dev_sgsc_lpl' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev

fix(sgsc-时效性审查模型-xth): 修复空格被判错bug
LingMin 1 săptămână în urmă
părinte
comite
1a9f6ad321

+ 1 - 4
config/config.ini.template

@@ -145,13 +145,10 @@ MYSQL_HOST=192.168.92.61
 MYSQL_PORT=13306
 MYSQL_USER=root
 MYSQL_PASSWORD=Lq123456!
-MYSQL_DB=lq_db
+MYSQL_DB=lq_db_dev
 MYSQL_MIN_SIZE=1
 MYSQL_MAX_SIZE=5
 MYSQL_AUTO_COMMIT=True
-MYSQL_CONNECT_TIMEOUT=30
-MYSQL_READ_TIMEOUT=60
-MYSQL_WRITE_TIMEOUT=30
 
 
 [pgvector]

+ 22 - 16
core/construction_review/component/reviewers/standard_timeliness_reviewer.py

@@ -112,7 +112,7 @@ class StandardTimelinessReviewer:
                 - standard_number: 标准号
 
         Returns:
-            List[TimelinessReviewResult]: 审查结果列表
+            List[TimelinessReviewResult]: 审查结果列表(文件名为空的会被过滤掉)
         """
         if not self._service:
             raise RuntimeError("服务未初始化,请使用异步上下文管理器或调用 initialize()")
@@ -123,12 +123,14 @@ class StandardTimelinessReviewer:
         # 转换为时效性审查结果
         review_results = []
         for match_result in match_results:
-            review_result = self._convert_match_to_review_result(match_result)
-            review_results.append(review_result)
+            # 跳过 match 返回 None 的情况(文件名为空)
+            if match_result is not None:
+                review_result = self._convert_match_to_review_result(match_result)
+                review_results.append(review_result)
 
         return review_results
 
-    def review_single(self, standard_name: str, standard_number: str, seq_no: int = 1) -> TimelinessReviewResult:
+    def review_single(self, standard_name: str, standard_number: str, seq_no: int = 1) -> Optional[TimelinessReviewResult]:
         """
         审查单个标准的时效性
 
@@ -139,11 +141,15 @@ class StandardTimelinessReviewer:
 
         Returns:
             TimelinessReviewResult: 审查结果
+            None: 当文件名为空时返回 None,表示跳过审查
         """
         if not self._service:
             raise RuntimeError("服务未初始化,请使用异步上下文管理器或调用 initialize()")
 
         match_result = self._service.check_single(seq_no, standard_name, standard_number)
+        # 如果 match 返回 None(文件名为空),则返回 None
+        if match_result is None:
+            return None
         return self._convert_match_to_review_result(match_result)
 
     def _convert_match_to_review_result(self, match_result: StandardMatchResult) -> TimelinessReviewResult:
@@ -163,8 +169,8 @@ class StandardTimelinessReviewer:
             # 正常状态 - 无风险
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=False,
@@ -176,8 +182,8 @@ class StandardTimelinessReviewer:
             # 被替代 - high(与原有逻辑一致)
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=True,
@@ -194,8 +200,8 @@ class StandardTimelinessReviewer:
             # 废止无替代 - high(与原有逻辑一致)
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=True,
@@ -210,8 +216,8 @@ class StandardTimelinessReviewer:
             # 不匹配 - high(与原有逻辑一致:编号错误属于high)
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=True,
@@ -228,8 +234,8 @@ class StandardTimelinessReviewer:
             # 标准库不存在 - 直接过滤,不返回问题
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result=match_result.process_result,
                 status_code=status_code,
                 has_issue=False,
@@ -242,8 +248,8 @@ class StandardTimelinessReviewer:
             logger.warning(f"未知的匹配状态码: {status_code}")
             return TimelinessReviewResult(
                 seq_no=match_result.seq_no,
-                standard_name=match_result.original_name,
-                standard_number=match_result.original_number,
+                standard_name=match_result.raw_name,
+                standard_number=match_result.raw_number,
                 process_result="未知",
                 status_code=status_code,
                 has_issue=True,

+ 2 - 1
core/construction_review/component/reviewers/utils/directory_extraction.py

@@ -43,7 +43,8 @@ SYSTEM = """
 1) 只抽取包含书名号《 》的条目。
 2) 每条条目包括:title(《》内名称,去掉书名号)、suffix(《》后面的版本/日期/修订说明,可为空)、raw(该条目原文)。
 3) 忽略标题行、段落说明、无《》的行。
-4) 输出必须严格符合格式要求,不要输出任何额外文字。
+4) **重要:title 和 raw 必须保留原文的所有空格和格式,不要修改或去除任何空格。**
+5) 输出必须严格符合格式要求,不要输出任何额外文字。
 """
 HUMAN ="""
 文本如下:

+ 219 - 135
core/construction_review/component/standard_matching/standard_service.py

@@ -36,8 +36,12 @@ class MatchResultCode(Enum):
 class StandardMatchResult:
     """标准匹配结果数据结构"""
     seq_no: int = 0                             # 序号
-    original_name: str = ""                      # 原始标准名称
-    original_number: str = ""                    # 原始标准号
+    raw_name: str = ""                           # 原始输入名称(未修改,用于返回)
+    raw_number: str = ""                         # 原始输入标准号(未修改,用于返回)
+    normalized_name: str = ""                    # 规范化名称(用于匹配)
+    normalized_number: str = ""                  # 规范化标准号(用于匹配)
+    matched_name: str = ""                       # 匹配到的数据库原始名称
+    matched_number: str = ""                     # 匹配到的数据库原始标准号
     substitute_number: Optional[str] = None      # 替代标准号(如果有)
     substitute_name: Optional[str] = None        # 替代标准名称(如果有)
     process_result: str = ""                     # 处理结果状态
@@ -49,8 +53,10 @@ class StandardMatchResult:
 class StandardRecord:
     """标准记录数据结构"""
     id: int
-    standard_name: str
-    standard_number: str
+    standard_name: str           # 原始名称(数据库中的值,用于返回)
+    standard_number: str         # 原始标准号(用于返回)
+    normalized_name: str         # 规范化名称(用于匹配)
+    normalized_number: str       # 规范化标准号(用于匹配)
     validity: str
 
 
@@ -64,14 +70,19 @@ class StandardRepository:
         # 原始数据列表
         self._records: List[StandardRecord] = []
 
-        # 索引结构,加速查询
+        # 原始索引(用于返回数据)
         self._number_index: Dict[str, StandardRecord] = {}  # 标准号 -> 记录
         self._name_index: Dict[str, List[StandardRecord]] = {}  # 名称 -> 记录列表
         self._current_records: List[StandardRecord] = []  # 现行/试行标准列表
 
+        # 规范化索引(用于匹配)
+        self._normalized_number_index: Dict[str, StandardRecord] = {}  # 规范化标准号 -> 记录
+        self._normalized_name_index: Dict[str, List[StandardRecord]] = {}  # 规范化名称 -> 记录列表
+
     def load_data(self, raw_data: List[Dict]):
         """
         加载原始数据到内存并建立索引
+        同时创建规范化索引用于匹配
 
         Args:
             raw_data: 从数据库查询的原始标准数据列表
@@ -80,6 +91,8 @@ class StandardRepository:
         self._number_index = {}
         self._name_index = {}
         self._current_records = []
+        self._normalized_number_index = {}
+        self._normalized_name_index = {}
 
         for item in raw_data:
             # 跳过无效数据
@@ -88,28 +101,37 @@ class StandardRepository:
             if not standard_number or not standard_name:
                 continue
 
+            # 创建规范化版本(用于匹配)
+            normalized_name = self._normalize_for_matching(standard_name)
+            normalized_number = self._normalize_for_matching(standard_number)
+
             record = StandardRecord(
                 id=item.get("id", 0),
-                standard_name=standard_name,
-                standard_number=standard_number,
+                standard_name=standard_name,           # 原始名称(用于返回)
+                standard_number=standard_number,       # 原始标准号(用于返回)
+                normalized_name=normalized_name,       # 规范化名称(用于匹配)
+                normalized_number=normalized_number,   # 规范化标准号(用于匹配)
                 validity=item.get("validity", "")
             )
             self._records.append(record)
 
-            # 建立标准号索引
+            # 建立原始索引(用于返回数据)
             self._number_index[record.standard_number] = record
-
-            # 建立名称索引(一个名称可能对应多个标准号)
             if record.standard_name not in self._name_index:
                 self._name_index[record.standard_name] = []
             self._name_index[record.standard_name].append(record)
 
+            # 建立规范化索引(用于匹配)
+            self._normalized_number_index[record.normalized_number] = record
+            if record.normalized_name not in self._normalized_name_index:
+                self._normalized_name_index[record.normalized_name] = []
+            self._normalized_name_index[record.normalized_name].append(record)
+
             # 收集现行/试行标准
             if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
                 self._current_records.append(record)
 
         # 对现行标准按标准号降序排序(用于找最新替代标准)
-        # 处理可能的 None 值
         self._current_records.sort(
             key=lambda r: r.standard_number or "",
             reverse=True
@@ -134,40 +156,74 @@ class StandardRepository:
         return results
 
     def find_by_number_fuzzy(self, standard_number: str) -> List[StandardRecord]:
-        """模糊匹配标准号"""
+        """模糊匹配标准号(使用规范化数据)"""
         results = []
-        # 提取前缀(如 GB/T 5972)
-        parts = standard_number.split("-")
-        prefix = parts[0] if parts else standard_number
+        # 规范化输入的标准号
+        normalized_input = self._normalize_for_matching(standard_number)
 
-        for number, record in self._number_index.items():
-            # 前缀匹配
-            if number.startswith(prefix):
+        # 使用规范化索引进行前缀匹配
+        for normalized_number, record in self._normalized_number_index.items():
+            # 包含匹配(startswith 已被 in 覆盖);注意:规范化后的输入为空串时会命中所有记录
+            if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
                 results.append(record)
         return results
 
-    def find_current_by_name(self, standard_name: str) -> List[StandardRecord]:
-        """查询指定名称的现行/试行标准(支持模糊匹配)"""
+    def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
+        """查询指定名称的现行/试行标准(使用规范化名称匹配)"""
         results = []
         for record in self._current_records:
-            # 精确匹配
-            if record.standard_name == standard_name:
-                results.append(record)
-            # 模糊匹配(忽略空格、书名号等)
-            elif self._is_name_fuzzy_match_for_repo(record.standard_name, standard_name):
+            # 使用规范化名称匹配
+            if record.normalized_name == normalized_standard_name:
                 results.append(record)
         return results
 
-    def _is_name_fuzzy_match_for_repo(self, name1: str, name2: str) -> bool:
-        """判断两个标准名称是否模糊匹配"""
-        clean1 = name1.replace("《", "").replace("》", "").replace(" ", "").replace(" ", "")
-        clean2 = name2.replace("《", "").replace("》", "").replace(" ", "").replace(" ", "")
-        return clean1 == clean2
+    def _is_name_fuzzy_match_for_repo(self, normalized_name1: str, normalized_name2: str) -> bool:
+        """判断两个标准名称是否模糊匹配(使用规范化名称)"""
+        return normalized_name1 == normalized_name2
 
     def get_all_records(self) -> List[StandardRecord]:
         """获取所有记录"""
         return self._records.copy()
 
+    def _normalize_for_matching(self, text: str) -> str:
+        """
+        规范化文本用于匹配
+        去除所有空白字符(包括空格、不间断空格、换行符等)、书名号、括号和 HTML 标签
+
+        Args:
+            text: 原始文本
+
+        Returns:
+            规范化后的字符串(去除所有空白、分隔符和 HTML 标签)
+        """
+        if not text:
+            return ""
+
+        import re
+        # 去除 HTML 标签(如 <1680>)
+        text = re.sub(r'<[^>]+>', '', text)
+        # 去除所有 Unicode 空白字符(包括普通空格、不间断空格、换行等)
+        text = re.sub(r'\s+', '', text)
+        # 去除书名号和括号
+        text = text.replace('《', '').replace('》', '').replace('(', '').replace(')', '').replace('(', '').replace(')', '')
+        return text
+
+    def find_by_normalized_number(self, normalized_number: str) -> Optional[StandardRecord]:
+        """通过规范化标准号精确匹配"""
+        return self._normalized_number_index.get(normalized_number)
+
+    def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
+        """通过规范化名称匹配"""
+        return self._normalized_name_index.get(normalized_name, [])
+
+    def find_current_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
+        """查询指定规范化名称的现行/试行标准"""
+        results = []
+        for record in self._current_records:
+            if record.normalized_name == normalized_name:
+                results.append(record)
+        return results
+
 
 class StandardMatcher:
     """
@@ -178,52 +234,70 @@ class StandardMatcher:
     def __init__(self, repository: StandardRepository):
         self.repo = repository
 
-    def match(self, seq_no: int, input_name: str, input_number: str) -> StandardMatchResult:
+    def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
         """
         执行标准匹配
 
         匹配流程:
-        1. 标准号精确匹配
-        2. 根据匹配结果进入不同分支处理
+        1. 保存原始输入(用于返回)
+        2. 创建规范化版本(用于匹配)
+        3. 如果规范化后文件名为空,返回 None(跳过审查)
+        4. 使用规范化数据进行匹配
+        5. 返回结果中使用原始数据
+
+        Returns:
+            StandardMatchResult: 匹配结果
+            None: 当规范化文件名为空时返回 None,表示跳过审查
         """
-        # 去除前后空格
-        input_name = input_name.strip() if input_name else input_name
-        input_number = input_number.strip() if input_number else input_number
+        # 1. 保存原始输入
+        raw_name = input_name.strip() if input_name else ""
+        raw_number = input_number.strip() if input_number else ""
+
+        # 2. 创建规范化版本(去除空白字符、书名号、括号和尖括号标签,其余字符保留)
+        normalized_name = self.repo._normalize_for_matching(raw_name)
+        normalized_number = self.repo._normalize_for_matching(raw_number)
 
-        # 清洗书名号和括号
-        input_name = self._clean_brackets_and_booknames(input_name)
-        input_number = self._clean_brackets_and_booknames(input_number)
+        # 3. 如果规范化后文件名为空,跳过审查
+        if not normalized_name:
+            logger.info(f"文件名规范化后为空,跳过审查。原始名称: '{raw_name}'")
+            return None
 
+        # 4. 初始化结果(保存原始和规范化数据)
         result = StandardMatchResult(
             seq_no=seq_no,
-            original_name=input_name,
-            original_number=input_number
+            raw_name=raw_name,
+            raw_number=raw_number,
+            normalized_name=normalized_name,
+            normalized_number=normalized_number
         )
 
-        # 步骤1: 精确匹配标准号
-        match_by_number = self.repo.find_by_number_exact(input_number)
+        # 5. 使用规范化数据进行匹配
+        match_by_number = self.repo.find_by_normalized_number(normalized_number)
 
         if match_by_number:
             # 分支A: 标准号匹配成功
-            return self._handle_number_matched(result, match_by_number, input_name)
+            return self._handle_number_matched(result, match_by_number)
         else:
             # 分支B: 标准号未匹配
-            return self._handle_number_not_matched(result, input_name, input_number)
+            return self._handle_number_not_matched(result, normalized_name, normalized_number)
 
     def _handle_number_matched(
         self,
         result: StandardMatchResult,
-        db_record: StandardRecord,
-        input_name: str
+        db_record: StandardRecord
     ) -> StandardMatchResult:
         """处理标准号匹配成功的情况"""
-        # 检查名称是否匹配
-        if db_record.standard_name == input_name:
+        # 保存匹配到的数据库原始数据
+        result.matched_name = db_record.standard_name
+        result.matched_number = db_record.standard_number
+
+        # 使用规范化名称进行比较
+        if db_record.normalized_name == result.normalized_name:
             # 名称也匹配
             return self._handle_full_match(result, db_record)
         else:
             # 名称不匹配
-            return self._handle_name_mismatch(result, db_record, input_name)
+            return self._handle_name_mismatch(result, db_record)
 
     def _handle_full_match(
         self,
@@ -241,30 +315,24 @@ class StandardMatcher:
     def _handle_name_mismatch(
         self,
         result: StandardMatchResult,
-        db_record: StandardRecord,
-        input_name: str
+        db_record: StandardRecord
     ) -> StandardMatchResult:
         """处理标准号匹配但名称不匹配的情况"""
-        # 首先检查是否是名称模糊匹配(忽略空格、书名号等
-        if self._is_name_fuzzy_match(db_record.standard_name, input_name):
-            # 名称模糊匹配成功,按完全匹配处理
+        # 首先检查是否是名称模糊匹配(使用规范化名称)
+        if self._is_name_fuzzy_match(db_record.normalized_name, result.normalized_name):
+            # 名称规范化后匹配成功,按完全匹配处理
             return self._handle_full_match(result, db_record)
 
-        # 尝试用输入的名称模糊匹配
-        name_matches = self.repo.find_by_name_fuzzy(input_name)
+        # 尝试用规范化名称模糊匹配
+        name_matches = self.repo.find_by_normalized_name(result.normalized_name)
 
-        # 查找精确名称匹配
-        exact_match = self._find_exact_name_match(name_matches, input_name)
+        # 查找精确名称匹配(使用规范化名称)
+        exact_match = self._find_exact_name_match(name_matches, result.normalized_name)
 
         if exact_match:
             # 找到名称匹配的记录
             return self._handle_fuzzy_name_match(result, exact_match)
 
-        # 尝试在模糊匹配结果中查找模糊名称匹配
-        for match in name_matches:
-            if self._is_name_fuzzy_match(match.standard_name, input_name):
-                return self._handle_fuzzy_name_match(result, match)
-
         # 名称完全不匹配,但标准号已匹配成功
         # 说明该标准存在于库中,应返回不匹配而非不存在
         if db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
@@ -277,42 +345,34 @@ class StandardMatcher:
     def _handle_number_not_matched(
         self,
         result: StandardMatchResult,
-        input_name: str,
-        input_number: str
+        normalized_name: str,
+        normalized_number: str
     ) -> StandardMatchResult:
         """处理标准号未匹配的情况"""
-        # 尝试模糊匹配标准号
-        fuzzy_number_matches = self.repo.find_by_number_fuzzy(input_number)
+        # 尝试模糊匹配标准号(find_by_number_fuzzy 会再次规范化输入,并在规范化索引上做包含匹配)
+        fuzzy_number_matches = self.repo.find_by_number_fuzzy(normalized_number)
 
         if fuzzy_number_matches:
             # 检查名称是否匹配
-            return self._check_name_in_records(result, fuzzy_number_matches, input_name)
+            return self._check_name_in_records(result, fuzzy_number_matches, normalized_name)
         else:
             # 尝试直接按名称查询
-            return self._search_by_name_only(result, input_name)
+            return self._search_by_name_only(result, normalized_name)
 
     def _check_name_in_records(
         self,
         result: StandardMatchResult,
         records: List[StandardRecord],
-        input_name: str
+        normalized_name: str
     ) -> StandardMatchResult:
-        """在一批记录中查找名称匹配"""
-        # 首先尝试精确匹配
+        """在一批记录中查找名称匹配(标准号已模糊匹配成功)"""
+        # 首先尝试精确匹配(使用规范化名称)
         for record in records:
-            if record.standard_name == input_name:
+            if record.normalized_name == normalized_name:
                 # 名称匹配,检查状态
                 if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                    return self._set_mismatch_result(result, record)
-                elif record.validity == ValidityStatus.ABOLISHED.value:
-                    return self._handle_abolished(result, record)
-
-        # 尝试模糊名称匹配(忽略空格和书名号)
-        for record in records:
-            if self._is_name_fuzzy_match(record.standard_name, input_name):
-                # 名称模糊匹配成功
-                if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                    return self._set_mismatch_result(result, record)
+                    # 标准号模糊匹配成功 + 名称匹配 + 现行/试行 = 正常
+                    return self._set_ok_result(result)
                 elif record.validity == ValidityStatus.ABOLISHED.value:
                     return self._handle_abolished(result, record)
 
@@ -322,35 +382,22 @@ class StandardMatcher:
     def _search_by_name_only(
         self,
         result: StandardMatchResult,
-        input_name: str
+        normalized_name: str
     ) -> StandardMatchResult:
-        """仅通过名称查询"""
-        # 精确匹配名称
-        name_match = self.repo.find_by_name_exact(input_name)
+        """仅通过名称查询(标准号未匹配)"""
+        # 精确匹配规范化名称
+        name_matches = self.repo.find_by_normalized_name(normalized_name)
 
-        if name_match:
+        if name_matches:
+            # 取第一个匹配的记录
+            name_match = name_matches[0]
             if name_match.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
+                # 标准号不匹配但名称匹配 + 现行/试行 = 标准号错误(不匹配)
                 return self._set_mismatch_result(result, name_match)
             elif name_match.validity == ValidityStatus.ABOLISHED.value:
-                return self._set_not_found_result(result)
-
-        # 模糊匹配名称
-        fuzzy_matches = self.repo.find_by_name_fuzzy(input_name)
-
-        # 首先尝试精确匹配
-        exact_match = self._find_exact_name_match(fuzzy_matches, input_name)
-        if exact_match:
-            if exact_match.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                return self._set_mismatch_result(result, exact_match)
-
-        # 尝试模糊名称匹配(忽略空格、书名号等)
-        for match in fuzzy_matches:
-            if self._is_name_fuzzy_match(match.standard_name, input_name):
-                if match.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-                    return self._set_mismatch_result(result, match)
-                elif match.validity == ValidityStatus.ABOLISHED.value:
-                    return self._handle_abolished(result, match)
+                return self._handle_abolished(result, name_match)
 
+        # 名称未找到
         return self._set_not_found_result(result)
 
     def _handle_fuzzy_name_match(
@@ -358,9 +405,10 @@ class StandardMatcher:
         result: StandardMatchResult,
         match_record: StandardRecord
     ) -> StandardMatchResult:
-        """处理模糊名称匹配成功的情况"""
+        """处理模糊名称匹配成功的情况(标准号已匹配)"""
         if match_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
-            return self._set_mismatch_result(result, match_record)
+            # 按名称查到的记录为现行/试行 = 正常;NOTE(review): match_record 的标准号未必与输入一致,请确认此处不应判为不匹配
+            return self._set_ok_result(result)
         elif match_record.validity == ValidityStatus.ABOLISHED.value:
             return self._handle_abolished(result, match_record)
         return self._set_not_found_result(result)
@@ -371,8 +419,8 @@ class StandardMatcher:
         abolished_record: StandardRecord
     ) -> StandardMatchResult:
         """处理已废止标准的情况"""
-        # 查询同名现行标准作为替代
-        substitutes = self.repo.find_current_by_name(abolished_record.standard_name)
+        # 查询同名现行标准作为替代(使用规范化名称)
+        substitutes = self.repo.find_current_by_normalized_name(abolished_record.normalized_name)
 
         if substitutes:
             # 有替代标准,取最新的(已按标准号降序)
@@ -422,26 +470,27 @@ class StandardMatcher:
         result: StandardMatchResult,
         substitute: StandardRecord
     ) -> StandardMatchResult:
-        """设置被替代的结果"""
+        """设置被替代的结果 - 使用原始数据显示"""
         result.substitute_name = self._format_standard_name(substitute.standard_name)
         result.substitute_number = self._format_standard_number(substitute.standard_number)
         result.process_result = "被替代"
         result.status_code = MatchResultCode.SUBSTITUTED.value
+        # 使用 raw_name(原始输入)和 matched_name(数据库原始值)显示
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}已废止,"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}已废止,"
             f"替代{self._format_standard_name(substitute.standard_name)}"
             f"{self._format_standard_number(substitute.standard_number)}"
         )
         return result
 
     def _set_abolished_result(self, result: StandardMatchResult) -> StandardMatchResult:
-        """设置废止无替代的结果"""
+        """设置废止无替代的结果 - 使用原始数据显示"""
         result.process_result = "废止无现行"
         result.status_code = MatchResultCode.ABOLISHED.value
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}已废止,无现行状态"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}已废止,无现行状态"
         )
         return result
 
@@ -450,40 +499,37 @@ class StandardMatcher:
         result: StandardMatchResult,
         actual: StandardRecord
     ) -> StandardMatchResult:
-        """设置不匹配的结果"""
+        """设置不匹配的结果 - 使用原始数据显示"""
         result.substitute_name = self._format_standard_name(actual.standard_name)
         result.substitute_number = self._format_standard_number(actual.standard_number)
         result.process_result = "不匹配"
         result.status_code = MatchResultCode.MISMATCH.value
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}"
             f"与实际{self._format_standard_name(actual.standard_name)}"
             f"{self._format_standard_number(actual.standard_number)}不匹配"
         )
         return result
 
     def _set_not_found_result(self, result: StandardMatchResult) -> StandardMatchResult:
-        """设置不存在的结果"""
+        """设置不存在的结果 - 使用原始数据显示"""
         result.process_result = "标准库不存在"
         result.status_code = MatchResultCode.NOT_FOUND.value
         result.final_result = (
-            f"{self._format_standard_name(result.original_name)}"
-            f"{self._format_standard_number(result.original_number)}标准库不存在,请确认"
+            f"{self._format_standard_name(result.raw_name)}"
+            f"{self._format_standard_number(result.raw_number)}标准库不存在,请确认"
         )
         return result
 
     # ========== 工具方法 ==========
 
-    def _is_name_fuzzy_match(self, name1: str, name2: str) -> bool:
+    def _is_name_fuzzy_match(self, normalized_name1: str, normalized_name2: str) -> bool:
         """
         判断两个标准名称是否模糊匹配
-        只去除书名号,保留中间空格(中间空格属于名称的一部分
+        使用规范化后的名称进行比较(已去除空格、括号、书名号等)
         """
-        # 清理书名号,但保留中间空格
-        clean1 = name1.replace("《", "").replace("》", "")
-        clean2 = name2.replace("《", "").replace("》", "")
-        return clean1 == clean2
+        return normalized_name1 == normalized_name2
 
     def _clean_brackets_and_booknames(self, text: str) -> str:
         """
@@ -545,14 +591,49 @@ class StandardMatcher:
 
         return text
 
+    def _extract_chinese_chars(self, text: str) -> str:
+        """
+        提取字符串中的中文字符和空格
+        保留:中文字符(\u4e00-\u9fa5)、中文标点、空格(无换行符时)
+        删除:英文、数字、特殊符号、换行符等
+        特殊处理:如果存在换行符,则去除所有空格
+        """
+        if not text:
+            return text
+
+        import re
+
+        # 检查是否存在换行符(在清洗前检查)
+        has_newline = '\n' in text or '\r' in text
+
+        # 首先去除换行符及其旁边的所有空格
+        text = re.sub(r'\s*[\n\r]+\s*', '', text)
+        # 去除制表符
+        text = text.replace('\t', '')
+
+        if has_newline:
+            # 有换行符时:提取中文字符,去除所有空格
+            chinese_pattern = re.compile(r'[\u4e00-\u9fa5\u3000-\u303F\uFF00-\uFFEF]+')
+            matches = chinese_pattern.findall(text)
+            result = ''.join(matches)
+            # 去除所有空格(包括全角空格)
+            result = result.replace(' ', '').replace(' ', '')
+            return result.strip()
+        else:
+            # 无换行符时:提取中文字符和空格,保留中间空格
+            chinese_pattern = re.compile(r'[\u4e00-\u9fa5\u3000-\u303F\uFF00-\uFFEF\s]+')
+            matches = chinese_pattern.findall(text)
+            result = ''.join(matches)
+            return result.strip()
+
     def _find_exact_name_match(
         self,
         records: List[StandardRecord],
-        target_name: str
+        target_normalized_name: str
     ) -> Optional[StandardRecord]:
-        """在记录列表中查找精确名称匹配"""
+        """在记录列表中查找规范化名称精确匹配"""
         for record in records:
-            if record.standard_name == target_name:
+            if record.normalized_name == target_normalized_name:
                 return record
         return None
 
@@ -632,7 +713,7 @@ class StandardMatchingService:
                 - standard_number: 标准号(原始)
 
         Returns:
-            List[StandardMatchResult]: 匹配结果列表
+            List[StandardMatchResult]: 匹配结果列表(文件名为空的会被过滤掉)
         """
         if not self._initialized:
             raise RuntimeError("服务未初始化,请先调用 initialize()")
@@ -644,7 +725,9 @@ class StandardMatchingService:
                 input_name=std.get("standard_name", ""),
                 input_number=std.get("standard_number", "")
             )
-            results.append(result)
+            # 跳过文件名为空的情况(match 返回 None)
+            if result is not None:
+                results.append(result)
         return results
 
     def check_single(
@@ -652,7 +735,7 @@ class StandardMatchingService:
         seq_no: int,
         standard_name: str,
         standard_number: str
-    ) -> StandardMatchResult:
+    ) -> Optional[StandardMatchResult]:
         """
         检查单个标准
 
@@ -663,6 +746,7 @@ class StandardMatchingService:
 
         Returns:
             StandardMatchResult: 匹配结果
+            None: 当文件名为空时返回 None,表示跳过审查
         """
         if not self._initialized:
             raise RuntimeError("服务未初始化,请先调用 initialize()")