|
|
@@ -147,11 +147,11 @@ def extract_first_json(text: str) -> dict:
|
|
|
|
|
|
# ===== 6.5) 辅助函数:提取规范名称 =====
|
|
|
def _extract_regulation_name(text: str) -> str:
|
|
|
- """从原文中提取规范名称(书名号内的内容)"""
|
|
|
+ """Return the normalized regulation name: the first 《…》 content, or the whole text when no title marks are found."""
|
|
|
match = re.search(r'《([^《》]+)》', text)
|
|
|
if match:
|
|
|
- return match.group(1).strip()
|
|
|
- return text.strip()
|
|
|
+ return _normalize_name(match.group(1))
|
|
|
+ return _normalize_name(text)
|
|
|
|
|
|
|
|
|
# ===== 6.6) 辅助函数:提取现有编号 =====
|
|
|
@@ -262,6 +262,20 @@ _REFERENCE_DB_PATH = os.path.join(
|
|
|
os.path.dirname(__file__), "..", "gauge", "basic_reference_db.csv"
|
|
|
)
|
|
|
|
|
|
+# Characters stripped before name matching: whitespace plus ASCII and CJK punctuation.
|
|
|
+# Full-width forms are written as \u escapes so an editor or encoding round-trip
|
|
|
+# cannot silently fold them into their ASCII look-alikes.
|
|
|
+_NAME_STRIP_TABLE = str.maketrans('', '', (
|
|
|
+    ' \t\n\r,.\'\"<>()[]{}`~!@#$%^&*+=|\\/:;?'
|
|
|
+    '、。, ();:!?‘’“”'
|
|
|
+    '《》「」『』【】〔〕'
|
|
|
+    '—–―- ̄…'
|
|
|
+    '\xb7'
|
|
|
+    # ideographic space, then full-width , . ( ) ; : ! ? - and macron
|
|
|
+    '\u3000\uff0c\uff0e\uff08\uff09\uff1b\uff1a\uff01\uff1f\uff0d\uffe3'
|
|
|
+))
|
|
|
+
|
|
|
+
|
|
|
+def _normalize_name(name: str) -> str:
|
|
|
+    """Return *name* with every character listed in _NAME_STRIP_TABLE removed.
|
|
|
+
|
|
|
+    Used to build comparable keys: reference-DB names and names extracted from
|
|
|
+    the reviewed text are both passed through this function before lookup.
|
|
|
+    """
|
|
|
+    return name.translate(_NAME_STRIP_TABLE)
|
|
|
+
|
|
|
|
|
|
def _load_reference_db() -> Dict[str, bool]:
|
|
|
"""
|
|
|
@@ -281,7 +295,7 @@ def _load_reference_db() -> Dict[str, bool]:
|
|
|
if not raw_name:
|
|
|
continue
|
|
|
# 去掉书名号
|
|
|
- name = raw_name.replace("《", "").replace("》", "").strip()
|
|
|
+ name = _normalize_name(raw_name)
|
|
|
number_exists_str = (row.get("number_exists") or "").strip().lower()
|
|
|
db[name] = number_exists_str == "true"
|
|
|
except Exception:
|
|
|
@@ -298,18 +312,39 @@ def _check_number_required(original_text: str) -> Optional[bool]:
|
|
|
Returns:
|
|
|
True — 在库中且 number_exists=true,应有编号
|
|
|
False — 在库中且 number_exists=false,不需要编号
|
|
|
- None — 未在库中找到,不确定
|
|
|
+ None — 未在库中找到,不确定(视为不要求标准编号)
|
|
|
"""
|
|
|
ref_db = _load_reference_db()
|
|
|
- # 从原文中提取名称(去掉书名号)
|
|
|
+ # 从原文中提取名称并标准化
|
|
|
name_match = re.search(r'《([^》]+)》', original_text)
|
|
|
- name_key = name_match.group(1).strip() if name_match else original_text.strip()
|
|
|
+ name_key = _normalize_name(name_match.group(1)) if name_match else _normalize_name(original_text)
|
|
|
|
|
|
if name_key in ref_db:
|
|
|
return ref_db[name_key]
|
|
|
return None
|
|
|
|
|
|
|
|
|
+def _save_debug(debug_records: List[dict]) -> None:
|
|
|
+    """Append reference-DB hit records to the debug JSON file (best effort).
|
|
|
+
|
|
|
+    The file stores a single JSON list; new records are appended to the list
|
|
|
+    already stored there. Debug output must never break the review itself, so
|
|
|
+    I/O and serialization failures are logged and swallowed.
|
|
|
+
|
|
|
+    Args:
|
|
|
+        debug_records: JSON-serializable dicts, one per reviewed entry.
|
|
|
+    """
|
|
|
+    import logging  # local import: only this helper needs it
|
|
|
+
|
|
|
+    debug_dir = os.path.join(
|
|
|
+        os.path.dirname(__file__), "..", "..", "..", "..", "..",
|
|
|
+        "temp", "construction_review", "debug"
|
|
|
+    )
|
|
|
+    debug_path = os.path.join(debug_dir, "规范性命中审查debug.json")
|
|
|
+    try:
|
|
|
+        os.makedirs(debug_dir, exist_ok=True)
|
|
|
+        existing = []
|
|
|
+        try:
|
|
|
+            with open(debug_path, "r", encoding="utf-8") as f:
|
|
|
+                loaded = json.load(f)
|
|
|
+            # Only a list can be extended; any other JSON value is treated as corrupt.
|
|
|
+            if isinstance(loaded, list):
|
|
|
+                existing = loaded
|
|
|
+        except (OSError, ValueError):
|
|
|
+            pass  # missing, unreadable or corrupt file: start a fresh list
|
|
|
+        existing.extend(debug_records)
|
|
|
+        # Write to a sibling temp file and swap it in, so an interrupted write
|
|
|
+        # cannot leave a truncated debug file behind.
|
|
|
+        tmp_path = debug_path + ".tmp"
|
|
|
+        with open(tmp_path, "w", encoding="utf-8") as f:
|
|
|
+            json.dump(existing, f, ensure_ascii=False, indent=2)
|
|
|
+        os.replace(tmp_path, debug_path)
|
|
|
+    except (OSError, TypeError, ValueError) as exc:
|
|
|
+        logging.getLogger(__name__).warning("failed to save debug records: %s", exc)
|
|
|
+
|
|
|
+
|
|
|
# ===== 6.8) 辅助函数:生成建议 =====
|
|
|
async def _generate_suggestion_with_validation(
|
|
|
original_text: str,
|
|
|
@@ -338,60 +373,54 @@ async def process_punctuation_results(check_results: str) -> str:
|
|
|
|
|
|
# 为每个检查结果生成问题分析
|
|
|
results = []
|
|
|
+ debug_records = []
|
|
|
for item in check_data:
|
|
|
original_text = item.get("original_text", "")
|
|
|
title_status = item.get("title_mark_status", False)
|
|
|
bracket_status = item.get("bracket_status")
|
|
|
|
|
|
- # 根据状态生成问题分析
|
|
|
- issue_point = "编制依据格式错误"
|
|
|
- risk_level = "中风险"
|
|
|
-
|
|
|
- # 判断问题类型并生成建议
|
|
|
invalid_number_format = item.get("invalid_number_format", False)
|
|
|
invalid_number_content = item.get("invalid_number_content", "")
|
|
|
|
|
|
- if title_status is not True:
|
|
|
- # 书名号问题
|
|
|
+ # 先查库:命中且 number_exists=true 才检查格式,否则一律不提示
|
|
|
+ number_required = _check_number_required(original_text)
|
|
|
+
|
|
|
+ if number_required is not True:
|
|
|
+ issue_point = "编制依据格式正确"
|
|
|
+ risk_level = "无风险"
|
|
|
+ reason = "该条目不要求标准编号"
|
|
|
+ suggestion = "无"
|
|
|
+ elif title_status is not True:
|
|
|
+ issue_point = "编制依据格式错误"
|
|
|
+ risk_level = "中风险"
|
|
|
reason = "格式错误!正确格式:《规范名称》(编号)"
|
|
|
suggestion = "将规范名称用书名号《》包裹,正确格式:《规范名称》(编号)"
|
|
|
elif bracket_status is None:
|
|
|
- # 缺少编号:查编制依据参考库判断是否应有编号
|
|
|
- number_required = _check_number_required(original_text)
|
|
|
-
|
|
|
- if number_required is True:
|
|
|
- # 在参考库中且 number_exists=true,确实应有编号但缺失
|
|
|
- reason = "缺少规范编号"
|
|
|
- base_suggestion = "补充规范编号,正确格式:《规范名称》(编号)"
|
|
|
- suggestion = await _generate_suggestion_with_validation(
|
|
|
- original_text, "missing_number", base_suggestion
|
|
|
- )
|
|
|
- else:
|
|
|
- # 不在参考库中,或 number_exists=false → 不报错
|
|
|
- issue_point = "编制依据格式正确"
|
|
|
- reason = "该条目不要求标准编号"
|
|
|
- suggestion = "无"
|
|
|
- risk_level = "无风险"
|
|
|
+ issue_point = "编制依据格式错误"
|
|
|
+ risk_level = "中风险"
|
|
|
+ reason = "缺少规范编号"
|
|
|
+ base_suggestion = "补充规范编号,正确格式:《规范名称》(编号)"
|
|
|
+ suggestion = await _generate_suggestion_with_validation(
|
|
|
+ original_text, "missing_number", base_suggestion
|
|
|
+ )
|
|
|
elif invalid_number_format:
|
|
|
- # 括号内有编号,但格式不正确(纯数字)
|
|
|
+ issue_point = "编制依据格式错误"
|
|
|
+ risk_level = "中风险"
|
|
|
reason = f"格式错误!当前编号为纯数字"
|
|
|
suggestion = f"规范编号应为英文加数字或中文加数字,而不是纯数字"
|
|
|
elif bracket_status is False:
|
|
|
- # 有编号但无括号
|
|
|
+ issue_point = "编制依据格式错误"
|
|
|
+ risk_level = "中风险"
|
|
|
reason = "格式错误!正确格式:《规范名称》(编号)"
|
|
|
base_suggestion = "将规范编号用括号包裹,正确格式:《规范名称》(编号)"
|
|
|
- # 调用新流程生成建议(验证+生成)
|
|
|
suggestion = await _generate_suggestion_with_validation(
|
|
|
original_text, "missing_bracket", base_suggestion
|
|
|
)
|
|
|
else:
|
|
|
- # 格式正确:《文件名》(文件编号)
|
|
|
- # 编制依据格式检查只检查格式,不检查编号内容正确性
|
|
|
- # 编号是否正确(如是否已废止)由时效性检查处理
|
|
|
issue_point = "编制依据格式正确"
|
|
|
+ risk_level = "无风险"
|
|
|
reason = "规范名称和编号的标点符号使用规范"
|
|
|
suggestion = "无"
|
|
|
- risk_level = "无风险"
|
|
|
|
|
|
# 构建结果
|
|
|
result_item = {
|
|
|
@@ -403,6 +432,28 @@ async def process_punctuation_results(check_results: str) -> str:
|
|
|
}
|
|
|
results.append(result_item)
|
|
|
|
|
|
+ # 记录 debug 信息(所有条目都保存)
|
|
|
+ ref_db = _load_reference_db()
|
|
|
+ name_match = re.search(r'《([^》]+)》', original_text)
|
|
|
+ name_key = _normalize_name(name_match.group(1)) if name_match else _normalize_name(original_text)
|
|
|
+ debug_records.append({
|
|
|
+ "original_text": original_text,
|
|
|
+ "extracted_name": name_key,
|
|
|
+ "in_db": name_key in ref_db,
|
|
|
+ "db_number_exists": ref_db.get(name_key),
|
|
|
+ "title_mark_status": title_status,
|
|
|
+ "bracket_status": bracket_status,
|
|
|
+ "invalid_number_format": invalid_number_format,
|
|
|
+ "number_required": number_required,
|
|
|
+ "final_issue_point": issue_point,
|
|
|
+ "final_risk_level": risk_level,
|
|
|
+ "final_reason": reason,
|
|
|
+ })
|
|
|
+
|
|
|
+ # 保存 debug 信息
|
|
|
+ if debug_records:
|
|
|
+ _save_debug(debug_records)
|
|
|
+
|
|
|
# 返回JSON格式结果
|
|
|
return json.dumps(results, ensure_ascii=False, indent=2)
|
|
|
|