|
|
@@ -67,6 +67,8 @@ HUMAN = """
|
|
|
|
|
|
3. **has_exact_match**(是否有名称编号都相同的文件)
|
|
|
- 参考文件中的编号和文件名与审查规范完全匹配,返回 true
|
|
|
+ - **重要**:比较时忽略括号格式差异(半角()和全角()视为相同)
|
|
|
+ - 例如:《规范》(GB 1234-2020)与《规范》(GB 1234-2020)视为完全匹配
|
|
|
- 否则返回 false
|
|
|
|
|
|
4. **exact_match_info**(名称编号相同的文件及状态)
|
|
|
@@ -163,6 +165,98 @@ def _extract_regulation_info(text: str) -> Tuple[str, Optional[str]]:
|
|
|
return name, number
|
|
|
|
|
|
|
|
|
+def _normalize_text(text: str) -> str:
|
|
|
+ """
|
|
|
+ 标准化文本,统一括号格式用于比较
|
|
|
+ 将全角括号转换为半角括号,去除多余空格
|
|
|
+ """
|
|
|
+ if not text:
|
|
|
+ return text
|
|
|
+ # 全角括号转为半角括号
|
|
|
+ text = text.replace('(', '(').replace(')', ')')
|
|
|
+ # 统一书名号(中文书名号保持不变,但统一全角半角)
|
|
|
+ text = text.replace('『', '《').replace('』', '》')
|
|
|
+ text = text.replace('﹄', '《').replace('﹃', '》')
|
|
|
+ # 去除多余空格
|
|
|
+ text = ' '.join(text.split())
|
|
|
+ return text.strip()
|
|
|
+
|
|
|
+
|
|
|
+def _extract_core_number(number: str) -> str:
|
|
|
+ """
|
|
|
+ 提取规范编号的核心部分(去掉年份)
|
|
|
+ 例如:JGJ 65-2013 -> JGJ65, GB/T 50010-2010 -> GB/T50010
|
|
|
+
|
|
|
+ Args:
|
|
|
+ number: 规范编号,如 "JGJ 65-2013"
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 核心编号,如 "JGJ65"
|
|
|
+ """
|
|
|
+ if not number:
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # 标准化:转大写、去空格
|
|
|
+ normalized = number.upper().replace(' ', '')
|
|
|
+
|
|
|
+ # 去掉年份部分(-YYYY 或 —YYYY)
|
|
|
+ # 匹配末尾的年份 -4位数字 或 —4位数字 或 - 4位数字
|
|
|
+ normalized = re.sub(r'[-—]\s*\d{4}$', '', normalized)
|
|
|
+
|
|
|
+ return normalized
|
|
|
+
|
|
|
+
|
|
|
def _is_same_regulation_family(original_number: str, generated_number: str, threshold: int = 100) -> bool:
    """Decide whether two regulation numbers belong to the same family.

    Both numbers are reduced to their year-less core via
    ``_extract_core_number``. Identical cores always count as the same
    family. Otherwise each core is split into a letter prefix and a numeric
    part; the numbers are treated as one family only when the prefixes are
    equal and the numeric parts differ by less than ``threshold``.

    Note that an empty core on either side yields False, so callers that
    have no original number get False for any generated number.

    Args:
        original_number: The number taken from the item under review.
        generated_number: The number produced by validation/generation.
        threshold: Numeric distance at or above which the two numbers are
            considered unrelated. Defaults to 100.

    Returns:
        True when the two numbers are judged to be in the same family,
        False otherwise.
    """

    def _prefix_and_digits(core: str) -> tuple:
        """Split a core such as "GB/T50010" into ("GB/T", "50010")."""
        hit = re.match(r'^([A-Z]+(?:/[A-Z])?)(\d+(?:\.\d+)?)$', core)
        if hit is None:
            # Unrecognized layout: keep the whole core as the prefix and
            # report no numeric part.
            return core, ""
        return hit.group(1), hit.group(2)

    first_core = _extract_core_number(original_number)
    second_core = _extract_core_number(generated_number)

    # Nothing to compare when either side is empty.
    if not (first_core and second_core):
        return False

    # Same core (year ignored) means the same regulation.
    if first_core == second_core:
        return True

    first_prefix, first_digits = _prefix_and_digits(first_core)
    second_prefix, second_digits = _prefix_and_digits(second_core)

    # A shared prefix alone is not enough (e.g. JGJ65 vs JGJ300 are both in
    # the JGJ series but are different regulations); both sides also need a
    # numeric part to measure the distance.
    if first_prefix != second_prefix or not first_digits or not second_digits:
        return False

    try:
        gap = abs(float(first_digits) - float(second_digits))
        # Key threshold: a gap at or above it means unrelated regulations.
        return not (gap >= threshold)
    except ValueError:
        # The numeric part could not be converted; treat as not related.
        return False
|
|
|
+
|
|
|
+
|
|
|
# ===== 9) 新流程:验证并生成正确编号 =====
|
|
|
async def validate_and_generate_number(
|
|
|
review_item: str,
|
|
|
@@ -189,6 +283,21 @@ async def validate_and_generate_number(
|
|
|
if existing_number:
|
|
|
logger.info(f"[时效性验证] 验证编号: 《{regulation_name}》 {existing_number}")
|
|
|
|
|
|
+ # 先进行本地标准化比较:检查参考候选中是否有编号完全匹配(忽略括号差异)的
|
|
|
+ normalized_existing = _normalize_text(existing_number)
|
|
|
+ for candidate in reference_candidates:
|
|
|
+ # 从候选中提取编号
|
|
|
+ _, candidate_number = _extract_regulation_info(candidate)
|
|
|
+ if candidate_number and _normalize_text(candidate_number) == normalized_existing:
|
|
|
+ logger.info(f"[时效性验证] 本地验证通过(编号匹配): 《{regulation_name}》 {existing_number}")
|
|
|
+ return ValidationMatchResult(
|
|
|
+ review_item=review_item,
|
|
|
+ reference_candidates=reference_candidates,
|
|
|
+ is_valid=True,
|
|
|
+ validated_number=existing_number,
|
|
|
+ status="验证通过"
|
|
|
+ )
|
|
|
+
|
|
|
# 调用3模型验证
|
|
|
validation = await validate_reference_number(
|
|
|
regulation_name=regulation_name,
|
|
|
@@ -323,7 +432,44 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
|
|
|
exact_info = raw_item.get("exact_match_info", "")
|
|
|
same_name_current = raw_item.get("same_name_current", "")
|
|
|
|
|
|
- # 如果有精确匹配,直接接受
|
|
|
+ # 【校正逻辑】如果LLM判断has_exact_match=false,但本地比较发现编号相同(忽略括号差异),则校正为true
|
|
|
+ if not has_exact and exact_info:
|
|
|
+ _, review_number = _extract_regulation_info(review_item)
|
|
|
+ _, exact_number = _extract_regulation_info(exact_info)
|
|
|
+ if review_number and exact_number and _normalize_text(review_number) == _normalize_text(exact_number):
|
|
|
+ logger.info(f"[规范匹配校正] review_item='{review_item}' 编号实质相同,校正has_exact_match为true")
|
|
|
+ has_exact = True
|
|
|
+
|
|
|
+ # 【第一步】先检查向量搜索候选中是否有精确匹配(编号完全相同)
|
|
|
+ # ref_candidates 是 List[List[str]],需要获取当前项对应的候选列表
|
|
|
+ current_candidates = ref_candidates[i] if i < len(ref_candidates) else []
|
|
|
+ _, review_number = _extract_regulation_info(review_item)
|
|
|
+
|
|
|
+ if review_number and current_candidates:
|
|
|
+ normalized_review_number = _normalize_text(review_number)
|
|
|
+ exact_match_found = False
|
|
|
+
|
|
|
+ for candidate in current_candidates:
|
|
|
+ if isinstance(candidate, str):
|
|
|
+ _, candidate_number = _extract_regulation_info(candidate)
|
|
|
+ if candidate_number and _normalize_text(candidate_number) == normalized_review_number:
|
|
|
+ # 向量库中找到精确匹配,直接使用,不需要AI投票
|
|
|
+ logger.info(f"[规范匹配] 向量库中找到精确匹配: '{review_item}' -> '{candidate}'")
|
|
|
+ final_results.append({
|
|
|
+ "review_item": review_item,
|
|
|
+ "has_related_file": True,
|
|
|
+ "has_exact_match": True,
|
|
|
+ "exact_match_info": candidate,
|
|
|
+ "same_name_current": candidate
|
|
|
+ })
|
|
|
+ exact_match_found = True
|
|
|
+ break
|
|
|
+
|
|
|
+ # 如果找到了精确匹配,跳过本次循环
|
|
|
+ if exact_match_found:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 如果有精确匹配(由LLM判断),直接接受
|
|
|
if has_exact and exact_info:
|
|
|
final_results.append({
|
|
|
"review_item": review_item,
|
|
|
@@ -334,15 +480,35 @@ async def match_reference_files(reference_text: str, review_text: str) -> str:
|
|
|
})
|
|
|
continue
|
|
|
|
|
|
- # 如果没有精确匹配,但有相关文件,进行验证/生成
|
|
|
- if has_related or ref_candidates:
|
|
|
+ # 【第二步】如果没有精确匹配,但有相关文件,进行验证/生成
|
|
|
+ # 使用当前项的候选列表(不是整个二维列表)
|
|
|
+ if has_related or current_candidates:
|
|
|
try:
|
|
|
validation_result = await validate_and_generate_number(
|
|
|
review_item=review_item,
|
|
|
- reference_candidates=ref_candidates
|
|
|
+ reference_candidates=current_candidates
|
|
|
)
|
|
|
|
|
|
if validation_result.validated_number:
|
|
|
+ # 【关键逻辑】检查生成的编号与原始编号是否属于同一规范家族
|
|
|
+ is_same_family = _is_same_regulation_family(
|
|
|
+ review_number or "",
|
|
|
+ validation_result.validated_number
|
|
|
+ )
|
|
|
+
|
|
|
+ if not is_same_family:
|
|
|
+ # 生成的编号与原始编号完全不同,说明参考库中找到的文件实际上不相关
|
|
|
+ logger.info(f"[规范匹配] '{review_item}' 生成的编号({validation_result.validated_number})"
|
|
|
+ f"与原始编号({review_number})不属于同一规范家族,判定为无相关文件")
|
|
|
+ final_results.append({
|
|
|
+ "review_item": review_item,
|
|
|
+ "has_related_file": False, # 【关键】标记为无相关文件
|
|
|
+ "has_exact_match": False,
|
|
|
+ "exact_match_info": "",
|
|
|
+ "same_name_current": ""
|
|
|
+ })
|
|
|
+ continue
|
|
|
+
|
|
|
if validation_result.is_valid:
|
|
|
# 验证通过,原始编号正确
|
|
|
final_results.append({
|