Просмотр исходного кода

Merge branch 'dev' into dev-planWrite

tangle 2 недели назад
Родитель
Коммит
ebfbb4bcdd

+ 88 - 37
core/construction_review/component/reviewers/reference_basis_reviewer/punctuation_result_processor.py

@@ -147,11 +147,11 @@ def extract_first_json(text: str) -> dict:
 
 # ===== 6.5) 辅助函数:提取规范名称 =====
 def _extract_regulation_name(text: str) -> str:
-    """从原文中提取规范名称(书名号内的内容)"""
+    """从原文中提取规范名称(书名号内的内容,标准化后返回)"""
     match = re.search(r'《([^《》]+)》', text)
     if match:
-        return match.group(1).strip()
-    return text.strip()
+        return _normalize_name(match.group(1))
+    return _normalize_name(text)
 
 
 # ===== 6.6) 辅助函数:提取现有编号 =====
@@ -262,6 +262,20 @@ _REFERENCE_DB_PATH = os.path.join(
     os.path.dirname(__file__), "..", "gauge", "basic_reference_db.csv"
 )
 
+# 名称匹配时要去除的特殊字符(中英文标点,逐个字符列出避免转义问题)
+_NAME_STRIP_TABLE = str.maketrans('', '', (
+    ' \t\n\r,.\'\"<>()[]{}`~!@#$%^&*+=|\\/:;?'
+    '、。, ();:!?‘’“”'
+    '《》「」『』【】〔〕'
+    '—–―- ̄…'
+    '\xb7'
+))
+
+
+def _normalize_name(name: str) -> str:
+    """标准化名称:去除空白和标点特殊字符,用于库匹配"""
+    return name.translate(_NAME_STRIP_TABLE)
+
 
 def _load_reference_db() -> Dict[str, bool]:
     """
@@ -281,7 +295,7 @@ def _load_reference_db() -> Dict[str, bool]:
                 if not raw_name:
                     continue
                 # 去掉书名号
-                name = raw_name.replace("《", "").replace("》", "").strip()
+                name = _normalize_name(raw_name)
                 number_exists_str = (row.get("number_exists") or "").strip().lower()
                 db[name] = number_exists_str == "true"
     except Exception:
@@ -298,18 +312,39 @@ def _check_number_required(original_text: str) -> Optional[bool]:
     Returns:
         True  — 在库中且 number_exists=true,应有编号
         False — 在库中且 number_exists=false,不需要编号
-        None  — 未在库中找到,不确定
+        None  — 未在库中找到,不确定(视为不要求标准编号)
     """
     ref_db = _load_reference_db()
-    # 从原文中提取名称(去掉书名号)
+    # 从原文中提取名称并标准化
     name_match = re.search(r'《([^》]+)》', original_text)
-    name_key = name_match.group(1).strip() if name_match else original_text.strip()
+    name_key = _normalize_name(name_match.group(1)) if name_match else _normalize_name(original_text)
 
     if name_key in ref_db:
         return ref_db[name_key]
     return None
 
 
+def _save_debug(debug_records: List[dict]):
+    """保存编制依据参考库命中情况的 debug 信息"""
+    debug_dir = os.path.join(
+        os.path.dirname(__file__), "..", "..", "..", "..", "..",
+        "temp", "construction_review", "debug"
+    )
+    os.makedirs(debug_dir, exist_ok=True)
+    debug_path = os.path.join(debug_dir, "规范性命中审查debug.json")
+    # 追加模式:如果已有文件,合并写入
+    existing = []
+    if os.path.exists(debug_path):
+        try:
+            with open(debug_path, "r", encoding="utf-8") as f:
+                existing = json.load(f)
+        except Exception:
+            pass
+    existing.extend(debug_records)
+    with open(debug_path, "w", encoding="utf-8") as f:
+        json.dump(existing, f, ensure_ascii=False, indent=2)
+
+
 # ===== 6.8) 辅助函数:生成建议 =====
 async def _generate_suggestion_with_validation(
     original_text: str,
@@ -338,60 +373,54 @@ async def process_punctuation_results(check_results: str) -> str:
 
     # 为每个检查结果生成问题分析
     results = []
+    debug_records = []
     for item in check_data:
         original_text = item.get("original_text", "")
         title_status = item.get("title_mark_status", False)
         bracket_status = item.get("bracket_status")
 
-        # 根据状态生成问题分析
-        issue_point = "编制依据格式错误"
-        risk_level = "中风险"
-
-        # 判断问题类型并生成建议
         invalid_number_format = item.get("invalid_number_format", False)
         invalid_number_content = item.get("invalid_number_content", "")
 
-        if title_status is not True:
-            # 书名号问题
+        # 先查库:命中且 number_exists=true 才检查格式,否则一律不提示
+        number_required = _check_number_required(original_text)
+
+        if number_required is not True:
+            issue_point = "编制依据格式正确"
+            risk_level = "无风险"
+            reason = "该条目不要求标准编号"
+            suggestion = "无"
+        elif title_status is not True:
+            issue_point = "编制依据格式错误"
+            risk_level = "中风险"
             reason = "格式错误!正确格式:《规范名称》(编号)"
             suggestion = "将规范名称用书名号《》包裹,正确格式:《规范名称》(编号)"
         elif bracket_status is None:
-            # 缺少编号:查编制依据参考库判断是否应有编号
-            number_required = _check_number_required(original_text)
-
-            if number_required is True:
-                # 在参考库中且 number_exists=true,确实应有编号但缺失
-                reason = "缺少规范编号"
-                base_suggestion = "补充规范编号,正确格式:《规范名称》(编号)"
-                suggestion = await _generate_suggestion_with_validation(
-                    original_text, "missing_number", base_suggestion
-                )
-            else:
-                # 不在参考库中,或 number_exists=false → 不报错
-                issue_point = "编制依据格式正确"
-                reason = "该条目不要求标准编号"
-                suggestion = "无"
-                risk_level = "无风险"
+            issue_point = "编制依据格式错误"
+            risk_level = "中风险"
+            reason = "缺少规范编号"
+            base_suggestion = "补充规范编号,正确格式:《规范名称》(编号)"
+            suggestion = await _generate_suggestion_with_validation(
+                original_text, "missing_number", base_suggestion
+            )
         elif invalid_number_format:
-            # 括号内有编号,但格式不正确(纯数字)
+            issue_point = "编制依据格式错误"
+            risk_level = "中风险"
             reason = f"格式错误!当前编号为纯数字"
             suggestion = f"规范编号应为英文加数字或中文加数字,而不是纯数字"
         elif bracket_status is False:
-            # 有编号但无括号
+            issue_point = "编制依据格式错误"
+            risk_level = "中风险"
             reason = "格式错误!正确格式:《规范名称》(编号)"
             base_suggestion = "将规范编号用括号包裹,正确格式:《规范名称》(编号)"
-            # 调用新流程生成建议(验证+生成)
             suggestion = await _generate_suggestion_with_validation(
                 original_text, "missing_bracket", base_suggestion
             )
         else:
-            # 格式正确:《文件名》(文件编号)
-            # 编制依据格式检查只检查格式,不检查编号内容正确性
-            # 编号是否正确(如是否已废止)由时效性检查处理
             issue_point = "编制依据格式正确"
+            risk_level = "无风险"
             reason = "规范名称和编号的标点符号使用规范"
             suggestion = "无"
-            risk_level = "无风险"
 
         # 构建结果
         result_item = {
@@ -403,6 +432,28 @@ async def process_punctuation_results(check_results: str) -> str:
         }
         results.append(result_item)
 
+        # 记录 debug 信息(所有条目都保存)
+        ref_db = _load_reference_db()
+        name_match = re.search(r'《([^》]+)》', original_text)
+        name_key = _normalize_name(name_match.group(1)) if name_match else _normalize_name(original_text)
+        debug_records.append({
+            "original_text": original_text,
+            "extracted_name": name_key,
+            "in_db": name_key in ref_db,
+            "db_number_exists": ref_db.get(name_key),
+            "title_mark_status": title_status,
+            "bracket_status": bracket_status,
+            "invalid_number_format": invalid_number_format,
+            "number_required": number_required,
+            "final_issue_point": issue_point,
+            "final_risk_level": risk_level,
+            "final_reason": reason,
+        })
+
+    # 保存 debug 信息
+    if debug_records:
+        _save_debug(debug_records)
+
     # 返回JSON格式结果
     return json.dumps(results, ensure_ascii=False, indent=2)
 

+ 2 - 2
core/construction_review/component/reviewers/utils/directory_extraction.py

@@ -80,8 +80,8 @@ def fallback_regex(text: str) -> BasisItems:
     """
     items: List[BasisItem] = []
     
-    # 标准化换行符:将换行符替换为空格
-    text = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
+    # 去掉换行符和空白字符,避免 pattern4 因空格/换行提前截断条目
+    text = re.sub(r'\s+', '', text)
     
     # 使用正则表达式匹配所有编制依据条目
     # 优化后的模式: