|
@@ -11,8 +11,6 @@ import difflib
|
|
|
import logging
|
|
import logging
|
|
|
import re
|
|
import re
|
|
|
from typing import Dict, List, Optional, Set, Tuple, Any
|
|
from typing import Dict, List, Optional, Set, Tuple, Any
|
|
|
-from collections import defaultdict
|
|
|
|
|
-from pathlib import Path
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
|
|
|
|
@@ -129,10 +127,28 @@ class OutlineCatalogueMatcher:
|
|
|
pass # 加载失败不影响主功能
|
|
pass # 加载失败不影响主功能
|
|
|
|
|
|
|
|
def _normalize_text(self, text: str) -> str:
    """Normalize a heading so that two titles can be compared.

    Strips a leading section-number prefix, removes whitespace and common
    ASCII / CJK punctuation, and lower-cases the result.

    Recognized prefixes (each form is stripped at most once, in this order):
    "第X章" / "第X节", Chinese ordinals ("一、", "(一)"), dotted numbers
    ("1.", "1.1."), bracketed numbers ("(1)", "[1]"), a number followed by a
    closing bracket ("1)", "1】"), and a number followed by whitespace ("5 ").

    Args:
        text: Raw heading text. Any falsy value yields "".

    Returns:
        The cleaned, lower-cased text.
    """
    if not text:
        return ""

    cn_digits = '一二三四五六七八九十百千'
    # Separators that may trail a numbering prefix (ASCII and full-width dot).
    trailing = r'[、..\s]*'

    # Leading whitespace would otherwise defeat every '^'-anchored pattern.
    text = text.strip()

    # 1. Strip leading section numbering.
    # "第X章" / "第X节"
    text = re.sub(rf'^第[{cn_digits}\d]+[章节]{trailing}', '', text)
    # Chinese ordinals: "一、", "(一)", "一)". The numeral must be followed
    # by a closing bracket or a separator; otherwise it is part of the title
    # itself (e.g. "一般规定") and must not be stripped.
    text = re.sub(
        rf'^[((]?[{cn_digits}]+(?:[))]|(?=[、..\s])){trailing}',
        '',
        text,
    )
    # Dotted numbers: every leading "<digits>." level is removed ("1.1.2 x"
    # becomes "2 x"; the last level is handled by the whitespace rule below).
    text = re.sub(r'^(?:\d+[..])+', '', text)
    # Bracketed numbers: "(1)", "(1)", "[1]"
    text = re.sub(rf'^[((\[]\d+[))\]]{trailing}', '', text)
    # Number followed by a closing bracket: "1)", "1]", "1】"
    text = re.sub(rf'^\d+[))\]】]{trailing}', '', text)
    # Bare number followed by whitespace: "5 "
    text = re.sub(r'^\d+\s+', '', text)

    # 2. Drop whitespace and punctuation (ASCII and full-width forms).
    text = re.sub(
        r'''[\s.,;:!?,。;:!?、"'“”‘’()()【】\[\]《》<>]''',
        '',
        text,
    )

    return text.lower()
|
|
|
|
|
|
|
|
def _calculate_similarity(self, text1: str, text2: str) -> float:
|
|
def _calculate_similarity(self, text1: str, text2: str) -> float:
|
|
@@ -284,11 +300,14 @@ class OutlineCatalogueMatcher:
|
|
|
|
|
|
|
|
matched_first = set()
|
|
matched_first = set()
|
|
|
missing_first = []
|
|
missing_first = []
|
|
|
|
|
+ # 🆕 建立标准code到实际code的映射(用于二级匹配时找到正确的subsections)
|
|
|
|
|
+ first_code_mapping: Dict[str, str] = {} # {标准req_code: 实际outline_code}
|
|
|
|
|
|
|
|
for req_code, req_name in self.first_names.items():
|
|
for req_code, req_name in self.first_names.items():
|
|
|
# 优先:直接用code精确匹配,因为一级分类通常较准
|
|
# 优先:直接用code精确匹配,因为一级分类通常较准
|
|
|
if req_code in actual_first_titles:
|
|
if req_code in actual_first_titles:
|
|
|
matched_first.add(req_code)
|
|
matched_first.add(req_code)
|
|
|
|
|
+ first_code_mapping[req_code] = req_code # 精确匹配,映射到自己
|
|
|
logger.debug(f"[一级匹配] {req_name}: 存在")
|
|
logger.debug(f"[一级匹配] {req_name}: 存在")
|
|
|
else:
|
|
else:
|
|
|
# 尝试用标题模糊匹配
|
|
# 尝试用标题模糊匹配
|
|
@@ -302,6 +321,7 @@ class OutlineCatalogueMatcher:
|
|
|
for code, title in actual_first_titles.items():
|
|
for code, title in actual_first_titles.items():
|
|
|
if title == matched_title:
|
|
if title == matched_title:
|
|
|
matched_first.add(req_code)
|
|
matched_first.add(req_code)
|
|
|
|
|
+ first_code_mapping[req_code] = code # 记录映射关系
|
|
|
logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
|
|
logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
|
|
|
break
|
|
break
|
|
|
else:
|
|
else:
|
|
@@ -331,7 +351,9 @@ class OutlineCatalogueMatcher:
|
|
|
first_code, second_code = req_key
|
|
first_code, second_code = req_key
|
|
|
|
|
|
|
|
# 🆕 步骤1:优先在同一一级下匹配
|
|
# 🆕 步骤1:优先在同一一级下匹配
|
|
|
- same_group_titles = outline_by_first.get(first_code, {}).get('subsections', [])
|
|
|
|
|
|
|
+ # 使用映射找到实际的code(处理一级模糊匹配后的映射关系)
|
|
|
|
|
+ actual_first_code = first_code_mapping.get(first_code, first_code)
|
|
|
|
|
+ same_group_titles = outline_by_first.get(actual_first_code, {}).get('subsections', [])
|
|
|
best_score_same = 0.0
|
|
best_score_same = 0.0
|
|
|
best_match_same = None
|
|
best_match_same = None
|
|
|
|
|
|