|
@@ -1012,8 +1012,6 @@ class AIReviewEngine(BaseReviewer):
|
|
|
Returns:
|
|
Returns:
|
|
|
Dict[str, Any]: 包含缺失一级、二级目录的统计结果
|
|
Dict[str, Any]: 包含缺失一级、二级目录的统计结果
|
|
|
"""
|
|
"""
|
|
|
- from .outline_catalogue_matcher import OutlineCatalogueMatcher
|
|
|
|
|
-
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
|
name = "outline_catalogue_check"
|
|
name = "outline_catalogue_check"
|
|
|
|
|
|
|
@@ -1025,149 +1023,104 @@ class AIReviewEngine(BaseReviewer):
|
|
|
Path(__file__).parent / 'doc_worker' / 'config' /
|
|
Path(__file__).parent / 'doc_worker' / 'config' /
|
|
|
'StandardCategoryTable.csv'
|
|
'StandardCategoryTable.csv'
|
|
|
)
|
|
)
|
|
|
- raw_content_csv = str(
|
|
|
|
|
- Path(__file__).parent / 'doc_worker' / 'config' /
|
|
|
|
|
- 'construction_plan_standards.csv'
|
|
|
|
|
- )
|
|
|
|
|
|
|
|
|
|
- # 获取 catalog(YOLO+OCR提取的原始目录)和 outline(分类后的结构)
|
|
|
|
|
|
|
+ # 获取 catalog(YOLO+OCR提取的目录,已包含分类代码)
|
|
|
catalog_chapters = []
|
|
catalog_chapters = []
|
|
|
- outline_chapters = []
|
|
|
|
|
|
|
|
|
|
if outline_data and isinstance(outline_data, dict):
|
|
if outline_data and isinstance(outline_data, dict):
|
|
|
- # 从 outline_data 获取 catalog(优先)
|
|
|
|
|
catalog_raw = outline_data.get('catalog')
|
|
catalog_raw = outline_data.get('catalog')
|
|
|
if catalog_raw and isinstance(catalog_raw, dict):
|
|
if catalog_raw and isinstance(catalog_raw, dict):
|
|
|
catalog_chapters = catalog_raw.get('chapters', [])
|
|
catalog_chapters = catalog_raw.get('chapters', [])
|
|
|
- # 获取 outline(用于分类代码映射)
|
|
|
|
|
- outline_raw = outline_data.get('outline')
|
|
|
|
|
- if isinstance(outline_raw, dict):
|
|
|
|
|
- outline_chapters = outline_raw.get('chapters', [])
|
|
|
|
|
- elif isinstance(outline_raw, list):
|
|
|
|
|
- outline_chapters = outline_raw
|
|
|
|
|
|
|
|
|
|
# 从 state 回退获取
|
|
# 从 state 回退获取
|
|
|
- if state and isinstance(state, dict):
|
|
|
|
|
|
|
+ if not catalog_chapters and state and isinstance(state, dict):
|
|
|
structured = state.get('structured_content', {})
|
|
structured = state.get('structured_content', {})
|
|
|
|
|
+ catalog_raw = structured.get('catalog')
|
|
|
|
|
+ if catalog_raw and isinstance(catalog_raw, dict):
|
|
|
|
|
+ catalog_chapters = catalog_raw.get('chapters', [])
|
|
|
|
|
|
|
|
- # 获取 catalog
|
|
|
|
|
- if not catalog_chapters:
|
|
|
|
|
- catalog_raw = structured.get('catalog')
|
|
|
|
|
- if catalog_raw and isinstance(catalog_raw, dict):
|
|
|
|
|
- catalog_chapters = catalog_raw.get('chapters', [])
|
|
|
|
|
-
|
|
|
|
|
- # 获取 outline
|
|
|
|
|
- if not outline_chapters:
|
|
|
|
|
- outline_raw = structured.get('outline', {})
|
|
|
|
|
- if isinstance(outline_raw, dict):
|
|
|
|
|
- outline_chapters = outline_raw.get('chapters', [])
|
|
|
|
|
- elif isinstance(outline_raw, list):
|
|
|
|
|
- outline_chapters = outline_raw
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章, outline: {len(outline_chapters)} 章")
|
|
|
|
|
-
|
|
|
|
|
- # 使用模糊匹配器
|
|
|
|
|
- matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
|
|
|
|
|
-
|
|
|
|
|
- # 构建 outline 标题到分类代码的映射
|
|
|
|
|
- # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
|
|
|
|
|
- outline_first_map = {} # title -> chapter_classification
|
|
|
|
|
- outline_second_map = {} # (first_title, sub_title) -> secondary_category_code
|
|
|
|
|
-
|
|
|
|
|
- for chapter in outline_chapters:
|
|
|
|
|
|
|
+ logger.info(f"[{name}] catalog: {len(catalog_chapters)} 章")
|
|
|
|
|
+
|
|
|
|
|
+ # 读取 CSV 标准分类表
|
|
|
|
|
+ import pandas as pd
|
|
|
|
|
+ df = pd.read_csv(csv_path)
|
|
|
|
|
+
|
|
|
|
|
+ # 构建标准一级和二级目录
|
|
|
|
|
+ standard_first = {} # code -> name
|
|
|
|
|
+ standard_second = {} # (first_code, second_code) -> name
|
|
|
|
|
+ for _, row in df.iterrows():
|
|
|
|
|
+ first_code = row.get('first_code', '')
|
|
|
|
|
+ first_name = row.get('first_name', '')
|
|
|
|
|
+ second_code = row.get('second_code', '')
|
|
|
|
|
+ second_name = row.get('second_name', '')
|
|
|
|
|
+
|
|
|
|
|
+ if first_code and first_name:
|
|
|
|
|
+ standard_first[first_code] = first_name
|
|
|
|
|
+ if first_code and second_code and second_name:
|
|
|
|
|
+ standard_second[(first_code, second_code)] = second_name
|
|
|
|
|
+
|
|
|
|
|
+ # 从 catalog 收集实际存在的一级和二级 code
|
|
|
|
|
+ actual_first = set()
|
|
|
|
|
+ actual_second = set()
|
|
|
|
|
+ for chapter in catalog_chapters:
|
|
|
if not isinstance(chapter, dict):
|
|
if not isinstance(chapter, dict):
|
|
|
continue
|
|
continue
|
|
|
first_code = chapter.get('chapter_classification', '')
|
|
first_code = chapter.get('chapter_classification', '')
|
|
|
- first_title = chapter.get('title', '')
|
|
|
|
|
- if first_code and first_title:
|
|
|
|
|
- outline_first_map[first_title] = first_code
|
|
|
|
|
|
|
+ if first_code:
|
|
|
|
|
+ actual_first.add(first_code)
|
|
|
|
|
|
|
|
for sub in chapter.get('subsections', []):
|
|
for sub in chapter.get('subsections', []):
|
|
|
if isinstance(sub, dict):
|
|
if isinstance(sub, dict):
|
|
|
- sub_title = sub.get('title', '')
|
|
|
|
|
second_code = sub.get('secondary_category_code', '')
|
|
second_code = sub.get('secondary_category_code', '')
|
|
|
- if first_title and sub_title and second_code:
|
|
|
|
|
- outline_second_map[(first_title, sub_title)] = second_code
|
|
|
|
|
-
|
|
|
|
|
- # 使用 catalog 的标题,匹配 outline 的分类代码
|
|
|
|
|
- outline_first = set()
|
|
|
|
|
- outline_secondary = {}
|
|
|
|
|
-
|
|
|
|
|
- for chapter in catalog_chapters:
|
|
|
|
|
- if not isinstance(chapter, dict):
|
|
|
|
|
- continue
|
|
|
|
|
-
|
|
|
|
|
- catalog_title = chapter.get('title', '')
|
|
|
|
|
- if not catalog_title:
|
|
|
|
|
- continue
|
|
|
|
|
-
|
|
|
|
|
- # 尝试从 outline 匹配一级分类代码
|
|
|
|
|
- first_code = outline_first_map.get(catalog_title)
|
|
|
|
|
|
|
+ if first_code and second_code:
|
|
|
|
|
+ actual_second.add((first_code, second_code))
|
|
|
|
|
+
|
|
|
|
|
+ # 计算缺失项
|
|
|
|
|
+ missing_first = []
|
|
|
|
|
+ matched_first = set()
|
|
|
|
|
+ for code, name in standard_first.items():
|
|
|
|
|
+ if code in actual_first:
|
|
|
|
|
+ matched_first.add(code)
|
|
|
|
|
+ else:
|
|
|
|
|
+ missing_first.append({'first_code': code, 'first_name': name})
|
|
|
|
|
|
|
|
- # 如果精确匹配失败,尝试模糊匹配
|
|
|
|
|
- if not first_code:
|
|
|
|
|
- for outline_title, code in outline_first_map.items():
|
|
|
|
|
- if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
|
|
|
|
|
- first_code = code
|
|
|
|
|
- break
|
|
|
|
|
|
|
+ missing_second = []
|
|
|
|
|
+ matched_second = set()
|
|
|
|
|
+ for (fc, sc), name in standard_second.items():
|
|
|
|
|
+ if (fc, sc) in actual_second:
|
|
|
|
|
+ matched_second.add((fc, sc))
|
|
|
|
|
+ else:
|
|
|
|
|
+ missing_second.append({
|
|
|
|
|
+ 'first_code': fc,
|
|
|
|
|
+ 'second_code': sc,
|
|
|
|
|
+ 'second_name': name
|
|
|
|
|
+ })
|
|
|
|
|
|
|
|
- if first_code:
|
|
|
|
|
- outline_first.add(first_code)
|
|
|
|
|
|
|
+ logger.info(f"[{name}] 标准一级: {len(standard_first)} 个, 实际: {len(matched_first)} 个, 缺失: {len(missing_first)} 个")
|
|
|
|
|
+ logger.info(f"[{name}] 标准二级: {len(standard_second)} 个, 实际: {len(matched_second)} 个, 缺失: {len(missing_second)} 个")
|
|
|
|
|
|
|
|
- # 匹配二级分类
|
|
|
|
|
- for sub in chapter.get('subsections', []):
|
|
|
|
|
- if not isinstance(sub, dict):
|
|
|
|
|
- continue
|
|
|
|
|
- sub_title = sub.get('title', '')
|
|
|
|
|
- if not sub_title:
|
|
|
|
|
- continue
|
|
|
|
|
-
|
|
|
|
|
- # 尝试精确匹配
|
|
|
|
|
- second_code = outline_second_map.get((catalog_title, sub_title))
|
|
|
|
|
-
|
|
|
|
|
- # 模糊匹配
|
|
|
|
|
- if not second_code and first_code:
|
|
|
|
|
- for (outline_first, outline_sub), code in outline_second_map.items():
|
|
|
|
|
- if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
|
|
|
|
|
- matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
|
|
|
|
|
- second_code = code
|
|
|
|
|
- break
|
|
|
|
|
-
|
|
|
|
|
- if first_code and second_code:
|
|
|
|
|
- outline_secondary[(first_code, second_code)] = sub_title
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
|
|
|
|
|
-
|
|
|
|
|
- # 执行标准目录匹配检查
|
|
|
|
|
- match_result = matcher.match_catalogue(
|
|
|
|
|
- outline_first=outline_first,
|
|
|
|
|
- outline_secondary=outline_secondary,
|
|
|
|
|
- threshold=0.6
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
catalogue_result = {
|
|
catalogue_result = {
|
|
|
"level": "primary_and_secondary",
|
|
"level": "primary_and_secondary",
|
|
|
- "is_complete": match_result['missing_first_count'] == 0 and match_result['missing_second_count'] == 0,
|
|
|
|
|
|
|
+ "is_complete": len(missing_first) == 0 and len(missing_second) == 0,
|
|
|
"first_level": {
|
|
"first_level": {
|
|
|
- "total_required": len(matcher.first_names),
|
|
|
|
|
- "actual_present": len(match_result['matched_first']),
|
|
|
|
|
- "missing_count": match_result['missing_first_count'],
|
|
|
|
|
- "missing": match_result['missing_first']
|
|
|
|
|
|
|
+ "total_required": len(standard_first),
|
|
|
|
|
+ "actual_present": len(matched_first),
|
|
|
|
|
+ "missing_count": len(missing_first),
|
|
|
|
|
+ "missing": missing_first
|
|
|
},
|
|
},
|
|
|
"second_level": {
|
|
"second_level": {
|
|
|
- "total_required": len(matcher.second_names),
|
|
|
|
|
- "actual_present": len(match_result['matched_second']),
|
|
|
|
|
- "missing_count": match_result['missing_second_count'],
|
|
|
|
|
- "missing": match_result['missing_second']
|
|
|
|
|
- },
|
|
|
|
|
- "match_details": match_result['match_details']
|
|
|
|
|
|
|
+ "total_required": len(standard_second),
|
|
|
|
|
+ "actual_present": len(matched_second),
|
|
|
|
|
+ "missing_count": len(missing_second),
|
|
|
|
|
+ "missing": missing_second
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
execution_time = time.time() - start_time
|
|
execution_time = time.time() - start_time
|
|
|
logger.info(
|
|
logger.info(
|
|
|
f"[{name}] 检查完成,耗时: {execution_time:.2f}s, "
|
|
f"[{name}] 检查完成,耗时: {execution_time:.2f}s, "
|
|
|
- f"缺失一级: {match_result['missing_first_count']} 个, "
|
|
|
|
|
- f"缺失二级: {match_result['missing_second_count']} 个"
|
|
|
|
|
|
|
+ f"缺失一级: {len(missing_first)} 个, "
|
|
|
|
|
+ f"缺失二级: {len(missing_second)} 个"
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
return {
|
|
return {
|
|
@@ -1175,10 +1128,10 @@ class AIReviewEngine(BaseReviewer):
|
|
|
"execution_time": execution_time,
|
|
"execution_time": execution_time,
|
|
|
"details": {
|
|
"details": {
|
|
|
"name": name,
|
|
"name": name,
|
|
|
- "missing_first_count": match_result['missing_first_count'],
|
|
|
|
|
- "missing_second_count": match_result['missing_second_count'],
|
|
|
|
|
- "missing_first": match_result['missing_first'],
|
|
|
|
|
- "missing_second": match_result['missing_second'],
|
|
|
|
|
|
|
+ "missing_first_count": len(missing_first),
|
|
|
|
|
+ "missing_second_count": len(missing_second),
|
|
|
|
|
+ "missing_first": missing_first,
|
|
|
|
|
+ "missing_second": missing_second,
|
|
|
"catalogue_check": catalogue_result
|
|
"catalogue_check": catalogue_result
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|