|
@@ -9,6 +9,7 @@
|
|
|
- StandardMatcher: 匹配规则逻辑
|
|
- StandardMatcher: 匹配规则逻辑
|
|
|
- StandardMatchingService: 对外服务接口
|
|
- StandardMatchingService: 对外服务接口
|
|
|
"""
|
|
"""
|
|
|
|
|
+import re
|
|
|
from typing import List, Dict, Optional
|
|
from typing import List, Dict, Optional
|
|
|
from dataclasses import dataclass, field
|
|
from dataclasses import dataclass, field
|
|
|
from enum import Enum
|
|
from enum import Enum
|
|
@@ -85,6 +86,7 @@ class StandardRepository:
|
|
|
|
|
|
|
|
# 规范化索引(用于匹配)
|
|
# 规范化索引(用于匹配)
|
|
|
self._normalized_number_index: Dict[str, StandardRecord] = {} # 规范化标准号 -> 记录
|
|
self._normalized_number_index: Dict[str, StandardRecord] = {} # 规范化标准号 -> 记录
|
|
|
|
|
+ self._normalized_number_records_index: Dict[str, List[StandardRecord]] = {} # 规范化标准号 -> 多条记录
|
|
|
self._normalized_name_index: Dict[str, List[StandardRecord]] = {} # 规范化名称 -> 记录列表
|
|
self._normalized_name_index: Dict[str, List[StandardRecord]] = {} # 规范化名称 -> 记录列表
|
|
|
|
|
|
|
|
def load_data(self, raw_data: List[Dict]):
|
|
def load_data(self, raw_data: List[Dict]):
|
|
@@ -100,6 +102,7 @@ class StandardRepository:
|
|
|
self._name_index = {}
|
|
self._name_index = {}
|
|
|
self._current_records = []
|
|
self._current_records = []
|
|
|
self._normalized_number_index = {}
|
|
self._normalized_number_index = {}
|
|
|
|
|
+ self._normalized_number_records_index = {}
|
|
|
self._normalized_name_index = {}
|
|
self._normalized_name_index = {}
|
|
|
|
|
|
|
|
for item in raw_data:
|
|
for item in raw_data:
|
|
@@ -131,6 +134,9 @@ class StandardRepository:
|
|
|
|
|
|
|
|
# 建立规范化索引(用于匹配)
|
|
# 建立规范化索引(用于匹配)
|
|
|
self._normalized_number_index[record.normalized_number] = record
|
|
self._normalized_number_index[record.normalized_number] = record
|
|
|
|
|
+ if record.normalized_number not in self._normalized_number_records_index:
|
|
|
|
|
+ self._normalized_number_records_index[record.normalized_number] = []
|
|
|
|
|
+ self._normalized_number_records_index[record.normalized_number].append(record)
|
|
|
if record.normalized_name not in self._normalized_name_index:
|
|
if record.normalized_name not in self._normalized_name_index:
|
|
|
self._normalized_name_index[record.normalized_name] = []
|
|
self._normalized_name_index[record.normalized_name] = []
|
|
|
self._normalized_name_index[record.normalized_name].append(record)
|
|
self._normalized_name_index[record.normalized_name].append(record)
|
|
@@ -170,10 +176,10 @@ class StandardRepository:
|
|
|
normalized_input = self._normalize_for_matching(standard_number)
|
|
normalized_input = self._normalize_for_matching(standard_number)
|
|
|
|
|
|
|
|
# 使用规范化索引进行前缀匹配
|
|
# 使用规范化索引进行前缀匹配
|
|
|
- for normalized_number, record in self._normalized_number_index.items():
|
|
|
|
|
|
|
+ for normalized_number, records in self._normalized_number_records_index.items():
|
|
|
# 前缀匹配:检查是否以规范化后的输入开头,或包含关系
|
|
# 前缀匹配:检查是否以规范化后的输入开头,或包含关系
|
|
|
if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
|
|
if normalized_number.startswith(normalized_input) or normalized_input in normalized_number:
|
|
|
- results.append(record)
|
|
|
|
|
|
|
+ results.extend(records)
|
|
|
return results
|
|
return results
|
|
|
|
|
|
|
|
def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
|
|
def find_current_by_name(self, normalized_standard_name: str) -> List[StandardRecord]:
|
|
@@ -246,6 +252,10 @@ class StandardRepository:
|
|
|
"""通过规范化标准号精确匹配"""
|
|
"""通过规范化标准号精确匹配"""
|
|
|
return self._normalized_number_index.get(normalized_number)
|
|
return self._normalized_number_index.get(normalized_number)
|
|
|
|
|
|
|
|
|
|
def find_all_by_normalized_number(self, normalized_number: str) -> List[StandardRecord]:
    """Return every record indexed under an exact normalized standard number.

    Args:
        normalized_number: Key to look up in the normalized-number index
            that holds a list of records per key.

    Returns:
        A new list containing the records stored under ``normalized_number``,
        in the order they are held in the index, or an empty list when the
        key is absent.
    """
    # Hand back a copy: the index owns its lists, and returning the stored
    # list would let a caller's in-place edits silently corrupt the index.
    return list(self._normalized_number_records_index.get(normalized_number, ()))
|
|
|
|
|
+
|
|
|
def find_by_normalized_name(self, normalized_name: str) -> List[StandardRecord]:
    """Return every record indexed under an exact normalized name.

    Args:
        normalized_name: Key to look up in the normalized-name index.

    Returns:
        A new list containing the records stored under ``normalized_name``,
        or an empty list when the key is absent.
    """
    # Copy so that callers cannot mutate the list held inside the index.
    return list(self._normalized_name_index.get(normalized_name, ()))
|
|
@@ -267,6 +277,10 @@ class StandardMatcher:
|
|
|
|
|
|
|
|
def __init__(self, repository: StandardRepository):
|
|
def __init__(self, repository: StandardRepository):
|
|
|
self.repo = repository
|
|
self.repo = repository
|
|
|
|
|
+ self._year_version_only_name_pattern = re.compile(
|
|
|
|
|
+ r"^(?:(?:20(?:0\d|1\d|2[0-6]))(?:年版|年|版)?)+$"
|
|
|
|
|
+ )
|
|
|
|
|
+ self._year_only_number_pattern = re.compile(r"^20(?:0\d|1\d|2[0-6])$")
|
|
|
|
|
|
|
|
def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
|
|
def match(self, seq_no: int, input_name: str, input_number: str) -> Optional[StandardMatchResult]:
|
|
|
"""
|
|
"""
|
|
@@ -287,6 +301,13 @@ class StandardMatcher:
|
|
|
raw_name = input_name.strip() if input_name else ""
|
|
raw_name = input_name.strip() if input_name else ""
|
|
|
raw_number = input_number.strip() if input_number else ""
|
|
raw_number = input_number.strip() if input_number else ""
|
|
|
|
|
|
|
|
|
|
+ if self._is_year_only_number(raw_number):
|
|
|
|
|
+ logger.info(
|
|
|
|
|
+ "[skip_year_only_number] "
|
|
|
|
|
+ f"raw_name={raw_name}, raw_number={raw_number}"
|
|
|
|
|
+ )
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
# 2. 创建规范化版本(去除所有符号,只保留中文字符)
|
|
# 2. 创建规范化版本(去除所有符号,只保留中文字符)
|
|
|
normalized_name = self.repo._normalize_for_matching(raw_name)
|
|
normalized_name = self.repo._normalize_for_matching(raw_name)
|
|
|
normalized_number = self.repo._normalize_for_matching(raw_number)
|
|
normalized_number = self.repo._normalize_for_matching(raw_number)
|
|
@@ -306,7 +327,11 @@ class StandardMatcher:
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
# 5. 使用规范化数据进行匹配
|
|
# 5. 使用规范化数据进行匹配
|
|
|
- match_by_number = self.repo.find_by_normalized_number(normalized_number)
|
|
|
|
|
|
|
+ exact_number_matches = self.repo.find_all_by_normalized_number(normalized_number)
|
|
|
|
|
+ if self._should_skip_ambiguous_numeric_version_name(raw_name, raw_number, exact_number_matches):
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ match_by_number = exact_number_matches[0] if exact_number_matches else None
|
|
|
if match_by_number:
|
|
if match_by_number:
|
|
|
logger.info(
|
|
logger.info(
|
|
|
"[standard_number_exact_match] "
|
|
"[standard_number_exact_match] "
|
|
@@ -584,6 +609,49 @@ class StandardMatcher:
|
|
|
"""
|
|
"""
|
|
|
return normalized_name1 == normalized_name2
|
|
return normalized_name1 == normalized_name2
|
|
|
|
|
|
|
|
|
|
+ def _should_skip_ambiguous_numeric_version_name(
|
|
|
|
|
+ self,
|
|
|
|
|
+ raw_name: str,
|
|
|
|
|
+ raw_number: str,
|
|
|
|
|
+ records: List[StandardRecord]
|
|
|
|
|
+ ) -> bool:
|
|
|
|
|
+ """标准号对应多个中文名且提取名称仅为年份/版次时,跳过时效性审查。"""
|
|
|
|
|
+ if len(records) < 2:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ unique_names = {record.standard_name.strip() for record in records if record.standard_name}
|
|
|
|
|
+ if len(unique_names) < 2:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if not self._is_year_version_only_name(raw_name):
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(
|
|
|
|
|
+ "[skip_ambiguous_numeric_version_name] "
|
|
|
|
|
+ f"raw_name={raw_name}, raw_number={raw_number}, "
|
|
|
|
|
+ f"candidate_names={sorted(unique_names)}"
|
|
|
|
|
+ )
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ def _is_year_version_only_name(self, raw_name: str) -> bool:
|
|
|
|
|
+ """判断提取出的名称是否只是 2000-2026 年份及“年/版”的噪声文本。"""
|
|
|
|
|
+ if not raw_name:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ compact_name = re.sub(r"\s+", "", raw_name)
|
|
|
|
|
+ if re.search(r"[A-Za-z]", compact_name):
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ return bool(self._year_version_only_name_pattern.fullmatch(compact_name))
|
|
|
|
|
+
|
|
|
|
|
+ def _is_year_only_number(self, raw_number: str) -> bool:
|
|
|
|
|
+ """判断提取出的标准号是否只是 2000-2026 年份。"""
|
|
|
|
|
+ if not raw_number:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ compact_number = re.sub(r"\s+", "", raw_number)
|
|
|
|
|
+ return bool(self._year_only_number_pattern.fullmatch(compact_number))
|
|
|
|
|
+
|
|
|
def _clean_brackets_and_booknames(self, text: str) -> str:
|
|
def _clean_brackets_and_booknames(self, text: str) -> str:
|
|
|
"""
|
|
"""
|
|
|
清洗字符串前后的书名号和括号
|
|
清洗字符串前后的书名号和括号
|