|
|
@@ -1069,79 +1069,46 @@ class AIReviewEngine(BaseReviewer):
|
|
|
# 使用模糊匹配器
|
|
|
matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
|
|
|
|
|
|
- # 构建 outline 标题到分类代码的映射
|
|
|
+ # 🆕 构建 match_catalogue_by_title 需要的 outline_by_first 结构
|
|
|
# outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
|
|
|
- outline_first_map = {} # title -> chapter_classification
|
|
|
- outline_second_map = {} # (first_title, sub_title) -> secondary_category_code
|
|
|
+ outline_by_first = {}
|
|
|
|
|
|
for chapter in outline_chapters:
|
|
|
if not isinstance(chapter, dict):
|
|
|
continue
|
|
|
first_code = chapter.get('chapter_classification', '')
|
|
|
first_title = chapter.get('title', '')
|
|
|
- if first_code and first_title:
|
|
|
- outline_first_map[first_title] = first_code
|
|
|
+
|
|
|
+ # 如果code为空但有title,用title的hash作为临时key
|
|
|
+ if not first_code and first_title:
|
|
|
+ import hashlib
|
|
|
+ first_code = f"__title_{hashlib.md5(first_title.encode()).hexdigest()[:8]}"
|
|
|
+ logger.debug(f"[{name}] 一级code为空,使用临时key: {first_code} -> {first_title}")
|
|
|
+
|
|
|
+ if not first_code:
|
|
|
+ continue
|
|
|
|
|
|
+ # 收集二级标题
|
|
|
+ subsections = []
|
|
|
for sub in chapter.get('subsections', []):
|
|
|
if isinstance(sub, dict):
|
|
|
sub_title = sub.get('title', '')
|
|
|
- second_code = sub.get('secondary_category_code', '')
|
|
|
- if first_title and sub_title and second_code:
|
|
|
- outline_second_map[(first_title, sub_title)] = second_code
|
|
|
-
|
|
|
- # 使用 catalog 的标题,匹配 outline 的分类代码
|
|
|
- outline_first = set()
|
|
|
- outline_secondary = {}
|
|
|
-
|
|
|
- for chapter in catalog_chapters:
|
|
|
- if not isinstance(chapter, dict):
|
|
|
- continue
|
|
|
-
|
|
|
- catalog_title = chapter.get('title', '')
|
|
|
- if not catalog_title:
|
|
|
- continue
|
|
|
-
|
|
|
- # 尝试从 outline 匹配一级分类代码
|
|
|
- first_code = outline_first_map.get(catalog_title)
|
|
|
-
|
|
|
- # 如果精确匹配失败,尝试模糊匹配
|
|
|
- if not first_code:
|
|
|
- for outline_title, code in outline_first_map.items():
|
|
|
- if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
|
|
|
- first_code = code
|
|
|
- break
|
|
|
-
|
|
|
- if first_code:
|
|
|
- outline_first.add(first_code)
|
|
|
-
|
|
|
- # 匹配二级分类
|
|
|
- for sub in chapter.get('subsections', []):
|
|
|
- if not isinstance(sub, dict):
|
|
|
- continue
|
|
|
- sub_title = sub.get('title', '')
|
|
|
- if not sub_title:
|
|
|
- continue
|
|
|
-
|
|
|
- # 尝试精确匹配
|
|
|
- second_code = outline_second_map.get((catalog_title, sub_title))
|
|
|
-
|
|
|
- # 模糊匹配
|
|
|
- if not second_code and first_code:
|
|
|
- for (outline_first, outline_sub), code in outline_second_map.items():
|
|
|
- if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
|
|
|
- matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
|
|
|
- second_code = code
|
|
|
- break
|
|
|
-
|
|
|
- if first_code and second_code:
|
|
|
- outline_secondary[(first_code, second_code)] = sub_title
|
|
|
+ if sub_title:
|
|
|
+ subsections.append(sub_title)
|
|
|
+ elif isinstance(sub, str):
|
|
|
+ if sub:
|
|
|
+ subsections.append(sub)
|
|
|
+
|
|
|
+ outline_by_first[first_code] = {
|
|
|
+ 'title': first_title,
|
|
|
+ 'subsections': subsections
|
|
|
+ }
|
|
|
|
|
|
- logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
|
|
|
+ logger.info(f"[{name}] 提取到 {len(outline_by_first)} 个一级目录")
|
|
|
|
|
|
# 执行标准目录匹配检查
|
|
|
- match_result = matcher.match_catalogue(
|
|
|
- outline_first=outline_first,
|
|
|
- outline_secondary=outline_secondary,
|
|
|
+ match_result = matcher.match_catalogue_by_title(
|
|
|
+ outline_by_first=outline_by_first,
|
|
|
threshold=0.6
|
|
|
)
|
|
|
|