|
|
@@ -15,6 +15,14 @@ from enum import Enum
|
|
|
|
|
|
from foundation.observability.logger.loggering import review_logger as logger
|
|
|
|
|
|
+# 导入配置处理器
|
|
|
+try:
|
|
|
+ from foundation.infrastructure.config.config import config_handler
|
|
|
+ _CONFIG_AVAILABLE = True
|
|
|
+except ImportError:
|
|
|
+ _CONFIG_AVAILABLE = False
|
|
|
+ config_handler = None
|
|
|
+
|
|
|
|
|
|
class ValidityStatus(Enum):
|
|
|
"""时效性状态"""
|
|
|
@@ -36,8 +44,12 @@ class MatchResultCode(Enum):
|
|
|
class StandardMatchResult:
|
|
|
"""标准匹配结果数据结构"""
|
|
|
seq_no: int = 0 # 序号
|
|
|
- original_name: str = "" # 原始标准名称
|
|
|
- original_number: str = "" # 原始标准号
|
|
|
+ raw_name: str = "" # 原始输入名称(未修改,用于返回)
|
|
|
+ raw_number: str = "" # 原始输入标准号(未修改,用于返回)
|
|
|
+ normalized_name: str = "" # 规范化名称(用于匹配)
|
|
|
+ normalized_number: str = "" # 规范化标准号(用于匹配)
|
|
|
+ matched_name: str = "" # 匹配到的数据库原始名称
|
|
|
+ matched_number: str = "" # 匹配到的数据库原始标准号
|
|
|
substitute_number: Optional[str] = None # 替代标准号(如果有)
|
|
|
substitute_name: Optional[str] = None # 替代标准名称(如果有)
|
|
|
process_result: str = "" # 处理结果状态
|
|
|
@@ -49,8 +61,10 @@ class StandardMatchResult:
|
|
|
class StandardRecord:
|
|
|
"""标准记录数据结构"""
|
|
|
id: int
|
|
|
- standard_name: str
|
|
|
- standard_number: str
|
|
|
+ standard_name: str # 原始名称(数据库中的值,用于返回)
|
|
|
+ standard_number: str # 原始标准号(用于返回)
|
|
|
+ normalized_name: str # 规范化名称(用于匹配)
|
|
|
+ normalized_number: str # 规范化标准号(用于匹配)
|
|
|
validity: str
|
|
|
|
|
|
|
|
|
@@ -64,14 +78,19 @@ class StandardRepository:
|
|
|
# 原始数据列表
|
|
|
self._records: List[StandardRecord] = []
|
|
|
|
|
|
- # 索引结构,加速查询
|
|
|
+ # 原始索引(用于返回数据)
|
|
|
self._number_index: Dict[str, StandardRecord] = {} # 标准号 -> 记录
|
|
|
self._name_index: Dict[str, List[StandardRecord]] = {} # 名称 -> 记录列表
|
|
|
self._current_records: List[StandardRecord] = [] # 现行/试行标准列表
|
|
|
|
|
|
+ # 规范化索引(用于匹配)
|
|
|
+ self._normalized_number_index: Dict[str, StandardRecord] = {} # 规范化标准号 -> 记录
|
|
|
+ self._normalized_name_index: Dict[str, List[StandardRecord]] = {} # 规范化名称 -> 记录列表
|
|
|
+
|
|
|
def load_data(self, raw_data: List[Dict]):
|
|
|
"""
|
|
|
加载原始数据到内存并建立索引
|
|
|
+ 同时创建规范化索引用于匹配
|
|
|
|
|
|
Args:
|
|
|
raw_data: 从数据库查询的原始标准数据列表
|
|
|
@@ -80,6 +99,8 @@ class StandardRepository:
|
|
|
self._number_index = {}
|
|
|
self._name_index = {}
|
|
|
self._current_records = []
|
|
|
+ self._normalized_number_index = {}
|
|
|
+ self._normalized_name_index = {}
|
|
|
|
|
|
for item in raw_data:
|
|
|
# 跳过无效数据
|
|
|
@@ -88,28 +109,37 @@ class StandardRepository:
|
|
|
if not standard_number or not standard_name:
|
|
|
continue
|
|
|
|
|
|
+ # 创建规范化版本(用于匹配)
|
|
|
+ normalized_name = self._normalize_for_matching(standard_name)
|
|
|
+ normalized_number = self._normalize_for_matching(standard_number)
|
|
|
+
|
|
|
record = StandardRecord(
|
|
|
id=item.get("id", 0),
|
|
|
- standard_name=standard_name,
|
|
|
- standard_number=standard_number,
|
|
|
+ standard_name=standard_name, # 原始名称(用于返回)
|
|
|
+ standard_number=standard_number, # 原始标准号(用于返回)
|
|
|
+ normalized_name=normalized_name, # 规范化名称(用于匹配)
|
|
|
+ normalized_number=normalized_number, # 规范化标准号(用于匹配)
|
|
|
validity=item.get("validity", "")
|
|
|
)
|
|
|
self._records.append(record)
|
|
|
|
|
|
- # 建立标准号索引
|
|
|
+ # 建立原始索引(用于返回数据)
|
|
|
self._number_index[record.standard_number] = record
|
|
|
-
|
|
|
- # 建立名称索引(一个名称可能对应多个标准号)
|
|
|
if record.standard_name not in self._name_index:
|
|
|
self._name_index[record.standard_name] = []
|
|
|
self._name_index[record.standard_name].append(record)
|
|
|
|
|
|
+ # 建立规范化索引(用于匹配)
|
|
|
+ self._normalized_number_index[record.normalized_number] = record
|
|
|
+ if record.normalized_name not in self._normalized_name_index:
|
|
|
+ self._normalized_name_index[record.normalized_name] = []
|
|
|
+ self._normalized_name_index[record.normalized_name].append(record)
|
|
|
+
|
|
|
# 收集现行/试行标准
|
|
|
if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
|
|
|
self._current_records.append(record)
|
|
|
|
|
|
# 对现行标准按标准号降序排序(用于找最新替代标准)
|
|
|
- # 处理可能的 None 值
|
|
|
self._current_records.sort(
|
|
|
key=lambda r: r.standard_number or "",
|
|
|
reverse=True
|
|
|
@@ -134,40 +164,98 @@ class StandardRepository:
|
|
|
return results
|
|
|
|
|
|
def find_by_number_fuzzy(self, standard_number: str) -> List[StandardRecord]:
    """Return all records whose normalized number contains the normalized query.

    The query is passed through ``_normalize_for_matching`` and then compared
    as a substring against every key of ``_normalized_number_index``. A prefix
    is just a special case of a substring, so no separate prefix test is made.

    Args:
        standard_number: Standard number to look for (raw or already
            normalized; it is normalized here either way).

    Returns:
        Matching records in index iteration order. An empty list is returned
        when the query normalizes to an empty string.
    """
    needle = self._normalize_for_matching(standard_number)
    if not needle:
        # "" is a substring of every string: without this guard a blank
        # number would report the entire repository as fuzzy matches.
        return []

    return [
        record
        for indexed_number, record in self._normalized_number_index.items()
        if needle in indexed_number
    ]
|
|
|
|
|
|
def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
    """Return the current/trial records whose normalized name equals the argument.

    Only ``self._current_records`` is scanned, and the comparison is an exact
    equality test on each record's ``normalized_name``.

    Args:
        normalized_standard_name: Name already reduced to its normalized form.

    Returns:
        A new list of matching records, in the order they appear in
        ``self._current_records``.
    """
    return [
        candidate
        for candidate in self._current_records
        if candidate.normalized_name == normalized_standard_name
    ]
|
|
|
|
|
|
- def _is_name_fuzzy_match_for_repo(self, name1: str, name2: str) -> bool:
|
|
|
- """判断两个标准名称是否模糊匹配"""
|
|
|
- clean1 = name1.replace("《", "").replace("》", "").replace(" ", "").replace(" ", "")
|
|
|
- clean2 = name2.replace("《", "").replace("》", "").replace(" ", "").replace(" ", "")
|
|
|
- return clean1 == clean2
|
|
|
+ def _is_name_fuzzy_match_for_repo(self, normalized_name1: str, normalized_name2: str) -> bool:
|
|
|
+ """判断两个标准名称是否模糊匹配(使用规范化名称)"""
|
|
|
+ return normalized_name1 == normalized_name2
|
|
|
|
|
|
def get_all_records(self) -> List[StandardRecord]:
    """Return a shallow copy of every loaded record.

    Returns:
        A new list holding the same record objects as ``self._records``;
        changing the returned list does not change the repository's own list.
    """
    snapshot = self._records[:]
    return snapshot
|
|
|
|
|
|
def _normalize_for_matching(self, text: str) -> str:
    """Reduce *text* to the canonical form used only for comparisons.

    Pass 1 removes angle-bracket tags (``<...>``), every character matched by
    the regex ``\\s`` class, book-title marks and parentheses (ASCII and
    full-width). Pass 2 removes each entry of a comma-separated symbol list
    read from the ``timeliness_review`` / ``REMOVE_SYMBOLS`` config entry; the
    built-in default list is used when the config handler is unavailable,
    raises, or yields a non-string value.

    Because the list is comma-separated, a comma itself cannot be expressed
    as a symbol, and empty entries are ignored.

    Args:
        text: Raw name or standard number. Falsy values yield ``""``; other
            non-string values are converted with ``str()``.

    Returns:
        The normalized string (possibly empty).
    """
    if not text:
        return ""
    if not isinstance(text, str):
        # re.sub() raises TypeError on non-string input; coerce instead.
        text = str(text)

    import re

    # ---- Pass 1: structural noise ----
    text = re.sub(r'<[^>]+>', '', text)   # tags such as <1680>
    text = re.sub(r'\s+', '', text)       # all whitespace matched by \s
    # Escapes keep the full-width parentheses distinguishable from their
    # ASCII look-alikes in source form.
    for mark in ('《', '》', '(', ')', '\uff08', '\uff09'):
        text = text.replace(mark, '')

    # ---- Pass 2: configurable symbol list ----
    default_symbols = '),-,.,/,,:,[,],【,】,〔,〕,(,),-,—'
    symbols_str = default_symbols
    if _CONFIG_AVAILABLE and config_handler:
        try:
            configured = config_handler.get('timeliness_review', 'REMOVE_SYMBOLS', default_symbols)
        except Exception:
            # Deliberate best-effort: a broken config must not stop matching.
            configured = default_symbols
        # Anything that is not a string cannot be split; keep the defaults.
        if isinstance(configured, str):
            symbols_str = configured

    # An empty configured string means "remove nothing" in the second pass.
    if symbols_str:
        for symbol in (part.strip() for part in symbols_str.split(',')):
            if symbol:
                text = text.replace(symbol, '')

    return text
|
|
|
+
|
|
|
def find_by_normalized_number(self, normalized_number: str) -> Optional[StandardRecord]:
    """Look up a record by its exact normalized standard number.

    Args:
        normalized_number: Key to look up in ``_normalized_number_index``.

    Returns:
        The indexed record, or ``None`` when the key is absent.
    """
    index = self._normalized_number_index
    return index[normalized_number] if normalized_number in index else None
|
|
|
+
|
|
|
def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
    """Return the records indexed under an exact normalized name.

    Args:
        normalized_name: Key to look up in ``_normalized_name_index``.

    Returns:
        A new list of the matching records (empty when the key is absent).
        A copy is returned so that callers cannot alter the index by
        mutating the result.
    """
    return list(self._normalized_name_index.get(normalized_name, ()))
|
|
|
+
|
|
|
def find_current_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
    """Collect current/trial records whose normalized name equals the argument.

    Args:
        normalized_name: Normalized name to compare against each entry of
            ``self._current_records``.

    Returns:
        A new list of the matching records, preserving the order of
        ``self._current_records``.
    """
    wanted = normalized_name
    return [entry for entry in self._current_records if entry.normalized_name == wanted]
|
|
|
+
|
|
|
|
|
|
class StandardMatcher:
|
|
|
"""
|
|
|
@@ -178,52 +266,70 @@ class StandardMatcher:
|
|
|
def __init__(self, repository: StandardRepository):
    """Bind the matcher to the repository it queries.

    Args:
        repository: Repository instance; stored as ``self.repo``.
    """
    self.repo = repository
|
|
|
|
|
|
def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
    """Match one (name, number) pair against the repository.

    Steps:
        1. Keep stripped copies of the inputs; these are what the result
           reports back.
        2. Normalize both values with the repository's
           ``_normalize_for_matching``.
        3. If the normalized name is empty, log it and skip the item.
        4. Look the normalized number up exactly and hand over to the
           matched / not-matched branch.

    Args:
        seq_no: Sequence number copied into the result.
        input_name: Standard name as supplied; falsy values become ``""``.
        input_number: Standard number as supplied; falsy values become ``""``.

    Returns:
        The populated ``StandardMatchResult``, or ``None`` when the name is
        empty after normalization (meaning the item is skipped).
    """
    # Raw (only stripped) values, kept for display.
    raw_name = "" if not input_name else input_name.strip()
    raw_number = "" if not input_number else input_number.strip()

    # Normalized values, used for every comparison below.
    normalize = self.repo._normalize_for_matching
    normalized_name = normalize(raw_name)
    normalized_number = normalize(raw_number)

    # Nothing left of the name after normalization: skip this item.
    if not normalized_name:
        logger.info(f"文件名规范化后为空,跳过审查。原始名称: '{raw_name}'")
        return None

    result = StandardMatchResult(
        seq_no=seq_no,
        raw_name=raw_name,
        raw_number=raw_number,
        normalized_name=normalized_name,
        normalized_number=normalized_number,
    )

    match_by_number = self.repo.find_by_normalized_number(normalized_number)
    if not match_by_number:
        # Branch B: the number is not in the index.
        return self._handle_number_not_matched(result, normalized_name, normalized_number)
    # Branch A: the number was found.
    return self._handle_number_matched(result, match_by_number)
|
|
|
|
|
|
def _handle_number_matched(
|
|
|
self,
|
|
|
result: StandardMatchResult,
|
|
|
- db_record: StandardRecord,
|
|
|
- input_name: str
|
|
|
+ db_record: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
"""处理标准号匹配成功的情况"""
|
|
|
- # 检查名称是否匹配
|
|
|
- if db_record.standard_name == input_name:
|
|
|
+ # 保存匹配到的数据库原始数据
|
|
|
+ result.matched_name = db_record.standard_name
|
|
|
+ result.matched_number = db_record.standard_number
|
|
|
+
|
|
|
+ # 使用规范化名称进行比较
|
|
|
+ if db_record.normalized_name == result.normalized_name:
|
|
|
# 名称也匹配
|
|
|
return self._handle_full_match(result, db_record)
|
|
|
else:
|
|
|
# 名称不匹配
|
|
|
- return self._handle_name_mismatch(result, db_record, input_name)
|
|
|
+ return self._handle_name_mismatch(result, db_record)
|
|
|
|
|
|
def _handle_full_match(
|
|
|
self,
|
|
|
@@ -231,39 +337,45 @@ class StandardMatcher:
|
|
|
db_record: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
"""处理名称和标准号都完全匹配的情况"""
|
|
|
+ # 【调试日志】
|
|
|
+ logger.info(f"[_handle_full_match] 匹配记录: name={db_record.standard_name}, "
|
|
|
+ f"number={db_record.standard_number}, validity={db_record.validity} "
|
|
|
+ f"(期望: {ValidityStatus.CURRENT.value}/{ValidityStatus.TRIAL.value}, "
|
|
|
+ f"实际是否匹配: {db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]})")
|
|
|
+
|
|
|
if db_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
|
|
|
# 情况1: 现行或试行 - 状态正常
|
|
|
return self._set_ok_result(result)
|
|
|
else:
|
|
|
# 废止状态 - 查找替代标准
|
|
|
+ logger.info(f"[_handle_full_match] 进入废止处理流程")
|
|
|
return self._handle_abolished(result, db_record)
|
|
|
|
|
|
def _handle_name_mismatch(
|
|
|
self,
|
|
|
result: StandardMatchResult,
|
|
|
- db_record: StandardRecord,
|
|
|
- input_name: str
|
|
|
+ db_record: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
"""处理标准号匹配但名称不匹配的情况"""
|
|
|
- # 首先检查是否是名称模糊匹配(忽略空格、书名号等)
|
|
|
- if self._is_name_fuzzy_match(db_record.standard_name, input_name):
|
|
|
- # 名称模糊匹配成功,按完全匹配处理
|
|
|
+ # 首先检查是否是名称模糊匹配(使用规范化名称)
|
|
|
+ if self._is_name_fuzzy_match(db_record.normalized_name, result.normalized_name):
|
|
|
+ # 名称规范化后匹配成功,按完全匹配处理
|
|
|
return self._handle_full_match(result, db_record)
|
|
|
|
|
|
- # 尝试用输入的名称模糊匹配
|
|
|
- name_matches = self.repo.find_by_name_fuzzy(input_name)
|
|
|
+ # 尝试用规范化名称模糊匹配
|
|
|
+ name_matches = self.repo.find_by_normalized_name(result.normalized_name)
|
|
|
|
|
|
- # 查找精确名称匹配
|
|
|
- exact_match = self._find_exact_name_match(name_matches, input_name)
|
|
|
+ # 查找精确名称匹配(使用规范化名称)
|
|
|
+ exact_match = self._find_exact_name_match(name_matches, result.normalized_name)
|
|
|
|
|
|
if exact_match:
|
|
|
- # 找到名称匹配的记录
|
|
|
- return self._handle_fuzzy_name_match(result, exact_match)
|
|
|
-
|
|
|
- # 尝试在模糊匹配结果中查找模糊名称匹配
|
|
|
- for match in name_matches:
|
|
|
- if self._is_name_fuzzy_match(match.standard_name, input_name):
|
|
|
- return self._handle_fuzzy_name_match(result, match)
|
|
|
+ # 找到名称匹配的记录,检查标准号是否一致
|
|
|
+ if result.normalized_number == exact_match.normalized_number:
|
|
|
+ # 标准号实质一致,按完全匹配处理
|
|
|
+ return self._handle_full_match(result, exact_match)
|
|
|
+ else:
|
|
|
+ # 名称匹配但标准号不一致 = 标准号错误
|
|
|
+ return self._set_mismatch_result(result, exact_match)
|
|
|
|
|
|
# 名称完全不匹配,但标准号已匹配成功
|
|
|
# 说明该标准存在于库中,应返回不匹配而非不存在
|
|
|
@@ -277,42 +389,34 @@ class StandardMatcher:
|
|
|
def _handle_number_not_matched(
|
|
|
self,
|
|
|
result: StandardMatchResult,
|
|
|
- input_name: str,
|
|
|
- input_number: str
|
|
|
+ normalized_name: str,
|
|
|
+ normalized_number: str
|
|
|
) -> StandardMatchResult:
|
|
|
"""处理标准号未匹配的情况"""
|
|
|
- # 尝试模糊匹配标准号
|
|
|
- fuzzy_number_matches = self.repo.find_by_number_fuzzy(input_number)
|
|
|
+ # 尝试模糊匹配标准号(使用原始数据的方法,可能需要改进)
|
|
|
+ fuzzy_number_matches = self.repo.find_by_number_fuzzy(normalized_number)
|
|
|
|
|
|
if fuzzy_number_matches:
|
|
|
# 检查名称是否匹配
|
|
|
- return self._check_name_in_records(result, fuzzy_number_matches, input_name)
|
|
|
+ return self._check_name_in_records(result, fuzzy_number_matches, normalized_name)
|
|
|
else:
|
|
|
# 尝试直接按名称查询
|
|
|
- return self._search_by_name_only(result, input_name)
|
|
|
+ return self._search_by_name_only(result, normalized_name)
|
|
|
|
|
|
def _check_name_in_records(
|
|
|
self,
|
|
|
result: StandardMatchResult,
|
|
|
records: List[StandardRecord],
|
|
|
- input_name: str
|
|
|
+ normalized_name: str
|
|
|
) -> StandardMatchResult:
|
|
|
- """在一批记录中查找名称匹配"""
|
|
|
- # 首先尝试精确匹配
|
|
|
+ """在一批记录中查找名称匹配(标准号已模糊匹配成功)"""
|
|
|
+ # 首先尝试精确匹配(使用规范化名称)
|
|
|
for record in records:
|
|
|
- if record.standard_name == input_name:
|
|
|
+ if record.normalized_name == normalized_name:
|
|
|
# 名称匹配,检查状态
|
|
|
if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
|
|
|
- return self._set_mismatch_result(result, record)
|
|
|
- elif record.validity == ValidityStatus.ABOLISHED.value:
|
|
|
- return self._handle_abolished(result, record)
|
|
|
-
|
|
|
- # 尝试模糊名称匹配(忽略空格和书名号)
|
|
|
- for record in records:
|
|
|
- if self._is_name_fuzzy_match(record.standard_name, input_name):
|
|
|
- # 名称模糊匹配成功
|
|
|
- if record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
|
|
|
- return self._set_mismatch_result(result, record)
|
|
|
+ # 标准号模糊匹配成功 + 名称匹配 + 现行/试行 = 正常
|
|
|
+ return self._set_ok_result(result)
|
|
|
elif record.validity == ValidityStatus.ABOLISHED.value:
|
|
|
return self._handle_abolished(result, record)
|
|
|
|
|
|
@@ -322,35 +426,22 @@ class StandardMatcher:
|
|
|
def _search_by_name_only(
    self,
    result: StandardMatchResult,
    normalized_name: str
) -> StandardMatchResult:
    """Resolve an item by name alone, after the number failed to match.

    All records indexed under the normalized name are considered, so the
    outcome does not depend on which record happens to be first in the index:

    * a current/trial record wins: the input number is wrong (mismatch);
    * otherwise an abolished record is passed to ``_handle_abolished``;
    * otherwise the standard is reported as not found.

    Args:
        result: Result object being filled in.
        normalized_name: Normalized input name.

    Returns:
        The result produced by the selected ``_set_*`` / ``_handle_*`` helper.
    """
    name_matches = self.repo.find_by_normalized_name(normalized_name)

    live_states = (ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value)
    live_record = next(
        (record for record in name_matches if record.validity in live_states),
        None,
    )
    if live_record is not None:
        # Name exists and is in force, so the supplied number is the problem.
        return self._set_mismatch_result(result, live_record)

    abolished_record = next(
        (
            record
            for record in name_matches
            if record.validity == ValidityStatus.ABOLISHED.value
        ),
        None,
    )
    if abolished_record is not None:
        return self._handle_abolished(result, abolished_record)

    # No usable record carries this name.
    return self._set_not_found_result(result)
|
|
|
|
|
|
def _handle_fuzzy_name_match(
|
|
|
@@ -358,9 +449,10 @@ class StandardMatcher:
|
|
|
result: StandardMatchResult,
|
|
|
match_record: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
- """处理模糊名称匹配成功的情况"""
|
|
|
+ """处理模糊名称匹配成功的情况(标准号已匹配)"""
|
|
|
if match_record.validity in [ValidityStatus.CURRENT.value, ValidityStatus.TRIAL.value]:
|
|
|
- return self._set_mismatch_result(result, match_record)
|
|
|
+ # 标准号匹配 + 名称模糊匹配 + 现行/试行 = 正常
|
|
|
+ return self._set_ok_result(result)
|
|
|
elif match_record.validity == ValidityStatus.ABOLISHED.value:
|
|
|
return self._handle_abolished(result, match_record)
|
|
|
return self._set_not_found_result(result)
|
|
|
@@ -371,8 +463,8 @@ class StandardMatcher:
|
|
|
abolished_record: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
"""处理已废止标准的情况"""
|
|
|
- # 查询同名现行标准作为替代
|
|
|
- substitutes = self.repo.find_current_by_name(abolished_record.standard_name)
|
|
|
+ # 查询同名现行标准作为替代(使用规范化名称)
|
|
|
+ substitutes = self.repo.find_current_by_normalized_name(abolished_record.normalized_name)
|
|
|
|
|
|
if substitutes:
|
|
|
# 有替代标准,取最新的(已按标准号降序)
|
|
|
@@ -422,26 +514,27 @@ class StandardMatcher:
|
|
|
result: StandardMatchResult,
|
|
|
substitute: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
- """设置被替代的结果"""
|
|
|
+ """设置被替代的结果 - 使用原始数据显示"""
|
|
|
result.substitute_name = self._format_standard_name(substitute.standard_name)
|
|
|
result.substitute_number = self._format_standard_number(substitute.standard_number)
|
|
|
result.process_result = "被替代"
|
|
|
result.status_code = MatchResultCode.SUBSTITUTED.value
|
|
|
+ # 使用 raw_name(原始输入)和 matched_name(数据库原始值)显示
|
|
|
result.final_result = (
|
|
|
- f"{self._format_standard_name(result.original_name)}"
|
|
|
- f"{self._format_standard_number(result.original_number)}已废止,"
|
|
|
+ f"{self._format_standard_name(result.raw_name)}"
|
|
|
+ f"{self._format_standard_number(result.raw_number)}已废止,"
|
|
|
f"替代{self._format_standard_name(substitute.standard_name)}"
|
|
|
f"{self._format_standard_number(substitute.standard_number)}"
|
|
|
)
|
|
|
return result
|
|
|
|
|
|
def _set_abolished_result(self, result: StandardMatchResult) -> StandardMatchResult:
    """Mark ``result`` as abolished with no current replacement.

    The message is built from the raw (as-entered) name and number, each
    passed through the matcher's display formatters.

    Args:
        result: Result object to update in place.

    Returns:
        The same ``result`` object.
    """
    result.process_result = "废止无现行"
    result.status_code = MatchResultCode.ABOLISHED.value

    shown_name = self._format_standard_name(result.raw_name)
    shown_number = self._format_standard_number(result.raw_number)
    result.final_result = f"{shown_name}{shown_number}已废止,无现行状态"
    return result
|
|
|
|
|
|
@@ -450,40 +543,37 @@ class StandardMatcher:
|
|
|
result: StandardMatchResult,
|
|
|
actual: StandardRecord
|
|
|
) -> StandardMatchResult:
|
|
|
- """设置不匹配的结果"""
|
|
|
+ """设置不匹配的结果 - 使用原始数据显示"""
|
|
|
result.substitute_name = self._format_standard_name(actual.standard_name)
|
|
|
result.substitute_number = self._format_standard_number(actual.standard_number)
|
|
|
result.process_result = "不匹配"
|
|
|
result.status_code = MatchResultCode.MISMATCH.value
|
|
|
result.final_result = (
|
|
|
- f"{self._format_standard_name(result.original_name)}"
|
|
|
- f"{self._format_standard_number(result.original_number)}"
|
|
|
+ f"{self._format_standard_name(result.raw_name)}"
|
|
|
+ f"{self._format_standard_number(result.raw_number)}"
|
|
|
f"与实际{self._format_standard_name(actual.standard_name)}"
|
|
|
f"{self._format_standard_number(actual.standard_number)}不匹配"
|
|
|
)
|
|
|
return result
|
|
|
|
|
|
def _set_not_found_result(self, result: StandardMatchResult) -> StandardMatchResult:
    """Mark ``result`` as absent from the standards library.

    The message is built from the raw (as-entered) name and number, each
    passed through the matcher's display formatters.

    Args:
        result: Result object to update in place.

    Returns:
        The same ``result`` object.
    """
    result.process_result = "标准库不存在"
    result.status_code = MatchResultCode.NOT_FOUND.value

    shown_name = self._format_standard_name(result.raw_name)
    shown_number = self._format_standard_number(result.raw_number)
    result.final_result = f"{shown_name}{shown_number}标准库不存在,请确认"
    return result
|
|
|
|
|
|
# ========== 工具方法 ==========
|
|
|
|
|
|
- def _is_name_fuzzy_match(self, name1: str, name2: str) -> bool:
|
|
|
+ def _is_name_fuzzy_match(self, normalized_name1: str, normalized_name2: str) -> bool:
|
|
|
"""
|
|
|
判断两个标准名称是否模糊匹配
|
|
|
- 只去除书名号,保留中间空格(中间空格属于名称的一部分)
|
|
|
+ 使用规范化后的名称进行比较(已去除空格、括号、书名号等)
|
|
|
"""
|
|
|
- # 清理书名号,但保留中间空格
|
|
|
- clean1 = name1.replace("《", "").replace("》", "")
|
|
|
- clean2 = name2.replace("《", "").replace("》", "")
|
|
|
- return clean1 == clean2
|
|
|
+ return normalized_name1 == normalized_name2
|
|
|
|
|
|
def _clean_brackets_and_booknames(self, text: str) -> str:
|
|
|
"""
|
|
|
@@ -545,14 +635,49 @@ class StandardMatcher:
|
|
|
|
|
|
return text
|
|
|
|
|
|
+ def _extract_chinese_chars(self, text: str) -> str:
|
|
|
+ """
|
|
|
+ 提取字符串中的中文字符和空格
|
|
|
+ 保留:中文字符(\u4e00-\u9fa5)、中文标点、空格(无换行符时)
|
|
|
+ 删除:英文、数字、特殊符号、换行符等
|
|
|
+ 特殊处理:如果存在换行符,则去除所有空格
|
|
|
+ """
|
|
|
+ if not text:
|
|
|
+ return text
|
|
|
+
|
|
|
+ import re
|
|
|
+
|
|
|
+ # 检查是否存在换行符(在清洗前检查)
|
|
|
+ has_newline = '\n' in text or '\r' in text
|
|
|
+
|
|
|
+ # 首先去除换行符及其旁边的所有空格
|
|
|
+ text = re.sub(r'\s*[\n\r]+\s*', '', text)
|
|
|
+ # 去除制表符
|
|
|
+ text = text.replace('\t', '')
|
|
|
+
|
|
|
+ if has_newline:
|
|
|
+ # 有换行符时:提取中文字符,去除所有空格
|
|
|
+ chinese_pattern = re.compile(r'[\u4e00-\u9fa5\u3000-\u303F\uFF00-\uFFEF]+')
|
|
|
+ matches = chinese_pattern.findall(text)
|
|
|
+ result = ''.join(matches)
|
|
|
+ # 去除所有空格(包括全角空格)
|
|
|
+ result = result.replace(' ', '').replace(' ', '')
|
|
|
+ return result.strip()
|
|
|
+ else:
|
|
|
+ # 无换行符时:提取中文字符和空格,保留中间空格
|
|
|
+ chinese_pattern = re.compile(r'[\u4e00-\u9fa5\u3000-\u303F\uFF00-\uFFEF\s]+')
|
|
|
+ matches = chinese_pattern.findall(text)
|
|
|
+ result = ''.join(matches)
|
|
|
+ return result.strip()
|
|
|
+
|
|
|
def _find_exact_name_match(
|
|
|
self,
|
|
|
records: List[StandardRecord],
|
|
|
- target_name: str
|
|
|
+ target_normalized_name: str
|
|
|
) -> Optional[StandardRecord]:
|
|
|
- """在记录列表中查找精确名称匹配"""
|
|
|
+ """在记录列表中查找规范化名称精确匹配"""
|
|
|
for record in records:
|
|
|
- if record.standard_name == target_name:
|
|
|
+ if record.normalized_name == target_normalized_name:
|
|
|
return record
|
|
|
return None
|
|
|
|
|
|
@@ -632,7 +757,7 @@ class StandardMatchingService:
|
|
|
- standard_number: 标准号(原始)
|
|
|
|
|
|
Returns:
|
|
|
- List[StandardMatchResult]: 匹配结果列表
|
|
|
+ List[StandardMatchResult]: 匹配结果列表(文件名为空的会被过滤掉)
|
|
|
"""
|
|
|
if not self._initialized:
|
|
|
raise RuntimeError("服务未初始化,请先调用 initialize()")
|
|
|
@@ -644,7 +769,9 @@ class StandardMatchingService:
|
|
|
input_name=std.get("standard_name", ""),
|
|
|
input_number=std.get("standard_number", "")
|
|
|
)
|
|
|
- results.append(result)
|
|
|
+ # 跳过文件名为空的情况(match 返回 None)
|
|
|
+ if result is not None:
|
|
|
+ results.append(result)
|
|
|
return results
|
|
|
|
|
|
def check_single(
|
|
|
@@ -652,7 +779,7 @@ class StandardMatchingService:
|
|
|
seq_no: int,
|
|
|
standard_name: str,
|
|
|
standard_number: str
|
|
|
- ) -> StandardMatchResult:
|
|
|
+ ) -> Optional[StandardMatchResult]:
|
|
|
"""
|
|
|
检查单个标准
|
|
|
|
|
|
@@ -663,6 +790,7 @@ class StandardMatchingService:
|
|
|
|
|
|
Returns:
|
|
|
StandardMatchResult: 匹配结果
|
|
|
+ None: 当文件名为空时返回 None,表示跳过审查
|
|
|
"""
|
|
|
if not self._initialized:
|
|
|
raise RuntimeError("服务未初始化,请先调用 initialize()")
|