# compare_test.py (21 KB)
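"""
Completeness-review comparison test.

Compares two approaches:
    Method A (current): classify first (three-level LLM classification) ->
        decide completeness with set operations -> LLM generates suggestions.
    Method B (alternative): direct LLM interpretation that outputs, in one
        pass, whether each item is satisfied plus the evidence and the reason.

Evaluation dimensions:
    1. Accuracy: coverage / false-positive rate compared with human annotation.
    2. Consistency: stability of the results over repeated runs on the same input.
    3. Latency: end-to-end elapsed time.
    4. Quality: actionability of the suggestions, accuracy of the evidence.
    5. Cost: number of LLM calls, token consumption.

Usage:
    cd <project root>
    $env:PYTHONPATH = (Get-Location)
    python utils_test/Completeness_Compare_Test/compare_test.py
"""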
  17. import asyncio
  18. import json
  19. import sys
  20. import time
  21. from pathlib import Path
  22. from typing import Any, Dict, List, Optional, Tuple
  23. from dataclasses import dataclass, field
  24. PROJECT_ROOT = str(Path(__file__).parent.parent.parent)
  25. if PROJECT_ROOT not in sys.path:
  26. from foundation.observability.logger.loggering import review_logger as logger
  27. from utils_test.Completeness_Compare_Test.method_b_direct_llm import (
  28. run_direct_llm_check,
  29. direct_result_to_dict,
  30. DirectCheckResult,
  31. )
# ─────────────────────────────────────────────────────────────────────────────
# Data loading
# ─────────────────────────────────────────────────────────────────────────────
  35. def load_final_result(json_path: str) -> Dict[str, Any]:
  36. """加载 final_result JSON"""
  37. with open(json_path, "r", encoding="utf-8") as f:
  38. return json.load(f)
  39. def extract_chunks_by_chapter(
  40. data: Dict[str, Any],
  41. chapter_code: str
  42. ) -> List[Dict[str, Any]]:
  43. """从 final_result 中提取指定一级章节的所有 chunks"""
  44. structured = data.get("document_result", {}).get("structured_content", {})
  45. all_chunks = structured.get("chunks", [])
  46. return [
  47. c for c in all_chunks
  48. if c.get("chapter_classification") == chapter_code
  49. ]
  50. def get_all_chapter_codes(data: Dict[str, Any]) -> List[str]:
  51. """获取文档中所有一级章节代码"""
  52. structured = data.get("document_result", {}).get("structured_content", {})
  53. all_chunks = structured.get("chunks", [])
  54. codes = []
  55. seen = set()
  56. for c in all_chunks:
  57. code = c.get("chapter_classification", "")
  58. if code and code not in seen and code not in ("quality_check", "catalog", "metadata"):
  59. codes.append(code)
  60. seen.add(code)
  61. return codes
  62. def load_standard_items_for_chapter(
  63. csv_path: str,
  64. chapter_code: str
  65. ) -> List[Dict[str, Any]]:
  66. """从 StandardCategoryTable.csv 加载指定章节的标准三级项"""
  67. import pandas as pd
  68. encodings = ['utf-8-sig', 'utf-16', 'gbk', 'utf-8']
  69. df = None
  70. for enc in encodings:
  71. try:
  72. df = pd.read_csv(csv_path, encoding=enc, sep=None, engine='python')
  73. break
  74. except UnicodeDecodeError:
  75. continue
  76. if df is None:
  77. raise ValueError(f"无法读取CSV: {csv_path}")
  78. df.columns = [c.strip().lower().replace(' ', '_') for c in df.columns]
  79. items = []
  80. for _, row in df.iterrows():
  81. fc = str(row.get('first_code', '')).strip()
  82. if fc != chapter_code:
  83. continue
  84. items.append({
  85. "first_code": fc,
  86. "first_name": str(row.get('first_name', '')).strip(),
  87. "second_code": str(row.get('second_code', '')).strip(),
  88. "second_name": str(row.get('second_name', '')).strip(),
  89. "third_code": str(row.get('third_code', '')).strip(),
  90. "third_name": str(row.get('third_name', '')).strip(),
  91. "third_focus": str(row.get('third_focus', '')).strip(),
  92. })
  93. return items
# ─────────────────────────────────────────────────────────────────────────────
# Method A: current system (reuses LightweightCompletenessChecker)
# ─────────────────────────────────────────────────────────────────────────────
  97. async def run_method_a(
  98. chunks: List[Dict[str, Any]],
  99. csv_path: str,
  100. chapter_code: str,
  101. outline: Optional[List[Dict]] = None,
  102. model_client=None,
  103. ) -> Tuple[Dict[str, Any], float, int]:
  104. """
  105. 执行方案A:先分类再比对
  106. Returns:
  107. (result_dict, execution_time, llm_call_count)
  108. """
  109. from core.construction_review.component.reviewers.completeness_reviewer import (
  110. LightweightCompletenessChecker,
  111. result_to_dict,
  112. )
  113. start_time = time.time()
  114. checker = LightweightCompletenessChecker(csv_path, model_client=model_client)
  115. result = await checker.check(
  116. chunks=chunks,
  117. outline=outline,
  118. chapter_classification=chapter_code if chapter_code != "all" else None,
  119. )
  120. result_dict = result_to_dict(result)
  121. execution_time = time.time() - start_time
  122. # 方案A的LLM调用:仅在生成建议时调用(issue_point/reason是模板拼接)
  123. # 统计recommendations中使用LLM的数量
  124. llm_call_count = len([
  125. r for r in result_dict.get("recommendations", [])
  126. if r.get("level") != "通过"
  127. ])
  128. return result_dict, execution_time, llm_call_count
# ─────────────────────────────────────────────────────────────────────────────
# Comparison and evaluation
# ─────────────────────────────────────────────────────────────────────────────
  132. @dataclass
  133. class CompareResult:
  134. """单章节对比结果"""
  135. chapter_code: str
  136. chapter_name: str
  137. # 方案A结果
  138. a_total_required: int = 0
  139. a_present: int = 0
  140. a_missing: int = 0
  141. a_completeness_rate: float = 0.0
  142. a_execution_time: float = 0.0
  143. a_llm_calls: int = 0
  144. a_missing_details: List[Dict] = field(default_factory=list)
  145. a_recommendations: List[Dict] = field(default_factory=list)
  146. # 方案B结果
  147. b_total_required: int = 0
  148. b_covered: int = 0
  149. b_missing: int = 0
  150. b_completeness_rate: float = 0.0
  151. b_execution_time: float = 0.0
  152. b_llm_calls: int = 0
  153. b_items: List[Dict] = field(default_factory=list)
  154. # 差异分析
  155. agreement_count: int = 0 # 两方案判断一致的数量
  156. disagreement_count: int = 0 # 两方案判断不一致的数量
  157. a_only_missing: List[str] = field(default_factory=list) # A认为缺失但B认为覆盖
  158. b_only_missing: List[str] = field(default_factory=list) # B认为缺失但A认为覆盖
  159. def compare_results(
  160. chapter_code: str,
  161. chapter_name: str,
  162. method_a: Dict[str, Any],
  163. method_b: DirectCheckResult,
  164. a_time: float,
  165. a_llm_calls: int,
  166. ) -> CompareResult:
  167. """对比两种方案的结果"""
  168. cr = CompareResult(chapter_code=chapter_code, chapter_name=chapter_name)
  169. # 方案A统计
  170. tertiary = method_a.get("tertiary_completeness", {})
  171. cr.a_total_required = tertiary.get("total", 0)
  172. cr.a_present = tertiary.get("present", 0)
  173. cr.a_missing = tertiary.get("missing", 0)
  174. rate_str = tertiary.get("completeness_rate", "0%").rstrip("%")
  175. try:
  176. cr.a_completeness_rate = float(rate_str)
  177. except ValueError:
  178. cr.a_completeness_rate = 0.0
  179. cr.a_execution_time = a_time
  180. cr.a_llm_calls = a_llm_calls
  181. cr.a_missing_details = tertiary.get("missing_details", [])
  182. cr.a_recommendations = method_a.get("recommendations", [])
  183. # 方案B统计
  184. cr.b_total_required = method_b.total_required
  185. cr.b_covered = method_b.covered_count
  186. cr.b_missing = method_b.missing_count
  187. cr.b_completeness_rate = method_b.completeness_rate
  188. cr.b_execution_time = method_b.execution_time
  189. cr.b_llm_calls = method_b.llm_call_count
  190. cr.b_items = [
  191. {
  192. "standard_code": item.standard_code,
  193. "standard_name": item.standard_name,
  194. "is_covered": item.is_covered,
  195. "evidence": item.evidence,
  196. "reason": item.reason,
  197. "confidence": item.confidence,
  198. }
  199. for item in method_b.items
  200. ]
  201. # 差异分析
  202. a_missing_codes = {d["tertiary_code"] for d in cr.a_missing_details}
  203. b_missing_codes = {
  204. item.standard_code for item in method_b.items if not item.is_covered
  205. }
  206. cr.a_only_missing = sorted(a_missing_codes - b_missing_codes)
  207. cr.b_only_missing = sorted(b_missing_codes - a_missing_codes)
  208. all_codes = a_missing_codes | b_missing_codes
  209. cr.disagreement_count = len(cr.a_only_missing) + len(cr.b_only_missing)
  210. cr.agreement_count = (
  211. cr.a_total_required - len(a_missing_codes) - len(cr.b_only_missing)
  212. )
  213. return cr
# ─────────────────────────────────────────────────────────────────────────────
# Report generation
# ─────────────────────────────────────────────────────────────────────────────
  217. def generate_report(
  218. compare_results: List[CompareResult],
  219. output_path: str
  220. ) -> str:
  221. """生成对比报告"""
  222. lines = []
  223. lines.append("=" * 80)
  224. lines.append("完整性审查方案对比报告")
  225. lines.append("=" * 80)
  226. lines.append(f"生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}")
  227. lines.append(f"对比章节数: {len(compare_results)}")
  228. lines.append("")
  229. # ── 总览表 ──
  230. lines.append("─" * 80)
  231. lines.append("【总览】")
  232. lines.append("─" * 80)
  233. lines.append(f"{'章节':<15} {'A完整率':>8} {'B完整率':>8} {'A耗时':>8} {'B耗时':>8} "
  234. f" {'A缺失':>6} {'B缺失':>6} {'一致':>6} {'分歧':>6}")
  235. lines.append("─" * 80)
  236. total_a_time = 0
  237. total_b_time = 0
  238. total_agree = 0
  239. total_disagree = 0
  240. for cr in compare_results:
  241. lines.append(
  242. f"{cr.chapter_code:<15} "
  243. f"{cr.a_completeness_rate:>7.1f}% "
  244. f"{cr.b_completeness_rate:>7.1f}% "
  245. f"{cr.a_execution_time:>7.2f}s "
  246. f"{cr.b_execution_time:>7.2f}s "
  247. f"{cr.a_missing:>6} "
  248. f"{cr.b_missing:>6} "
  249. f"{cr.agreement_count:>6} "
  250. f"{cr.disagreement_count:>6}"
  251. )
  252. total_a_time += cr.a_execution_time
  253. total_b_time += cr.b_execution_time
  254. total_agree += cr.agreement_count
  255. total_disagree += cr.disagreement_count
  256. lines.append("─" * 80)
  257. lines.append(
  258. f"{'合计':<15} {'':>8} {'':>8} "
  259. f"{total_a_time:>7.2f}s "
  260. f"{total_b_time:>7.2f}s "
  261. f"{'':>6} {'':>6} "
  262. f"{total_agree:>6} "
  263. f"{total_disagree:>6}"
  264. )
  265. lines.append("")
  266. # ── 详细对比 ──
  267. for cr in compare_results:
  268. lines.append("─" * 80)
  269. lines.append(f"【{cr.chapter_code}】{cr.chapter_name}")
  270. lines.append("─" * 80)
  271. lines.append(f" 方案A: 缺失 {cr.a_missing}/{cr.a_total_required} "
  272. f"({cr.a_completeness_rate:.1f}%), 耗时 {cr.a_execution_time:.2f}s, "
  273. f"LLM调用 {cr.a_llm_calls}次")
  274. lines.append(f" 方案B: 缺失 {cr.b_missing}/{cr.b_total_required} "
  275. f"({cr.b_completeness_rate:.1f}%), 耗时 {cr.b_execution_time:.2f}s, "
  276. f"LLM调用 {cr.b_llm_calls}次")
  277. lines.append("")
  278. # 方案A缺失项
  279. if cr.a_missing_details:
  280. lines.append(" 方案A认为缺失:")
  281. for d in cr.a_missing_details[:10]:
  282. lines.append(f" - [{d.get('tertiary_code', '')}] {d.get('tertiary_name', '')}")
  283. if len(cr.a_missing_details) > 10:
  284. lines.append(f" ... 共 {len(cr.a_missing_details)} 项")
  285. lines.append("")
  286. # 方案B缺失项
  287. b_missing_items = [item for item in cr.b_items if not item["is_covered"]]
  288. if b_missing_items:
  289. lines.append(" 方案B认为缺失:")
  290. for item in b_missing_items[:10]:
  291. lines.append(f" - [{item['standard_code']}] {item['standard_name']}")
  292. lines.append(f" 原因: {item['reason'][:80]}")
  293. if len(b_missing_items) > 10:
  294. lines.append(f" ... 共 {len(b_missing_items)} 项")
  295. lines.append("")
  296. # 分歧项
  297. if cr.a_only_missing:
  298. lines.append(" 分歧:仅方案A认为缺失(B认为已覆盖):")
  299. for code in cr.a_only_missing[:5]:
  300. b_item = next((i for i in cr.b_items if i["standard_code"] == code), None)
  301. if b_item:
  302. lines.append(f" - [{code}] {b_item['standard_name']}")
  303. lines.append(f" B的证据: {b_item['evidence'][:100]}")
  304. lines.append("")
  305. if cr.b_only_missing:
  306. lines.append(" 分歧:仅方案B认为缺失(A认为已覆盖):")
  307. for code in cr.b_only_missing[:5]:
  308. b_item = next((i for i in cr.b_items if i["standard_code"] == code), None)
  309. if b_item:
  310. lines.append(f" - [{code}] {b_item['standard_name']}")
  311. lines.append(f" B的原因: {b_item['reason'][:100]}")
  312. lines.append("")
  313. # ── 结论 ──
  314. lines.append("=" * 80)
  315. lines.append("【对比结论】")
  316. lines.append("=" * 80)
  317. lines.append(f" 总判断一致数: {total_agree}")
  318. lines.append(f" 总分歧数: {total_disagree}")
  319. if total_agree + total_disagree > 0:
  320. agree_rate = total_agree / (total_agree + total_disagree) * 100
  321. lines.append(f" 一致率: {agree_rate:.1f}%")
  322. lines.append(f" 方案A总耗时: {total_a_time:.2f}s")
  323. lines.append(f" 方案B总耗时: {total_b_time:.2f}s")
  324. lines.append("")
  325. report_text = "\n".join(lines)
  326. # 写入文件
  327. with open(output_path, "w", encoding="utf-8") as f:
  328. f.write(report_text)
  329. return report_text
# ─────────────────────────────────────────────────────────────────────────────
# Main test flow
# ─────────────────────────────────────────────────────────────────────────────
  333. async def run_comparison(
  334. json_path: str,
  335. csv_path: str,
  336. chapter_codes: Optional[List[str]] = None,
  337. output_dir: Optional[str] = None,
  338. ):
  339. """
  340. 执行完整对比测试
  341. Args:
  342. json_path: final_result JSON 文件路径
  343. csv_path: StandardCategoryTable.csv 路径
  344. chapter_codes: 要测试的章节代码列表,None则测试所有章节
  345. output_dir: 输出目录
  346. """
  347. logger.info("=" * 60)
  348. logger.info("开始完整性审查方案对比测试")
  349. logger.info("=" * 60)
  350. # 加载数据
  351. data = load_final_result(json_path)
  352. file_name = data.get("file_name", "unknown")
  353. logger.info(f"测试文件: {file_name}")
  354. # 确定测试章节
  355. if chapter_codes is None:
  356. chapter_codes = get_all_chapter_codes(data)
  357. logger.info(f"测试章节: {chapter_codes}")
  358. # 输出目录
  359. if output_dir is None:
  360. output_dir = str(Path(__file__).parent / "output")
  361. Path(output_dir).mkdir(parents=True, exist_ok=True)
  362. # 逐章节对比
  363. all_results: List[CompareResult] = []
  364. for chapter_code in chapter_codes:
  365. logger.info(f"\n{'─' * 40}")
  366. logger.info(f"测试章节: {chapter_code}")
  367. logger.info(f"{'─' * 40}")
  368. chunks = extract_chunks_by_chapter(data, chapter_code)
  369. if not chunks:
  370. logger.warning(f"章节 {chapter_code} 无chunks,跳过")
  371. continue
  372. chapter_name = chunks[0].get("first_name", chapter_code)
  373. logger.info(f" chunks数量: {len(chunks)}")
  374. # 加载标准要求
  375. standard_items = load_standard_items_for_chapter(csv_path, chapter_code)
  376. logger.info(f" 标准要求项数: {len(standard_items)}")
  377. if not standard_items:
  378. logger.warning(f"章节 {chapter_code} 无标准要求,跳过")
  379. continue
  380. # ── 方案A ──
  381. logger.info(" [方案A] 执行轻量级完整性审查...")
  382. a_result, a_time, a_llm_calls = await run_method_a(
  383. chunks=chunks,
  384. csv_path=csv_path,
  385. chapter_code=chapter_code,
  386. )
  387. logger.info(f" [方案A] 完成: 缺失{a_result.get('tertiary_completeness', {}).get('missing', 0)}项, "
  388. f"耗时{a_time:.2f}s")
  389. # ── 方案B ──
  390. logger.info(" [方案B] 执行直接LLM审查...")
  391. b_result = await run_direct_llm_check(
  392. chunks=chunks,
  393. standard_items=standard_items,
  394. chapter_code=chapter_code,
  395. chapter_name=chapter_name,
  396. )
  397. logger.info(f" [方案B] 完成: 缺失{b_result.missing_count}项, "
  398. f"耗时{b_result.execution_time:.2f}s")
  399. # ── 对比 ──
  400. cr = compare_results(
  401. chapter_code=chapter_code,
  402. chapter_name=chapter_name,
  403. method_a=a_result,
  404. method_b=b_result,
  405. a_time=a_time,
  406. a_llm_calls=a_llm_calls,
  407. )
  408. all_results.append(cr)
  409. # 保存单章节详细结果
  410. detail_path = Path(output_dir) / f"detail_{chapter_code}.json"
  411. with open(detail_path, "w", encoding="utf-8") as f:
  412. json.dump({
  413. "chapter_code": chapter_code,
  414. "chapter_name": chapter_name,
  415. "method_a": a_result,
  416. "method_b": direct_result_to_dict(b_result),
  417. "comparison": {
  418. "a_only_missing": cr.a_only_missing,
  419. "b_only_missing": cr.b_only_missing,
  420. "agreement_count": cr.agreement_count,
  421. "disagreement_count": cr.disagreement_count,
  422. }
  423. }, f, ensure_ascii=False, indent=2)
  424. # 生成报告
  425. report_path = Path(output_dir) / "comparison_report.txt"
  426. report_text = generate_report(all_results, str(report_path))
  427. logger.info(f"\n报告已保存: {report_path}")
  428. # 保存汇总JSON
  429. summary_path = Path(output_dir) / "comparison_summary.json"
  430. with open(summary_path, "w", encoding="utf-8") as f:
  431. json.dump({
  432. "file_name": file_name,
  433. "chapter_count": len(all_results),
  434. "chapters": [
  435. {
  436. "code": cr.chapter_code,
  437. "name": cr.chapter_name,
  438. "a_missing": cr.a_missing,
  439. "b_missing": cr.b_missing,
  440. "a_time": round(cr.a_execution_time, 2),
  441. "b_time": round(cr.b_execution_time, 2),
  442. "agreement": cr.agreement_count,
  443. "disagreement": cr.disagreement_count,
  444. "a_only_missing": cr.a_only_missing,
  445. "b_only_missing": cr.b_only_missing,
  446. }
  447. for cr in all_results
  448. ]
  449. }, f, ensure_ascii=False, indent=2)
  450. # 打印报告
  451. print(report_text)
  452. return all_results
# ─────────────────────────────────────────────────────────────────────────────
# Entry point
# ─────────────────────────────────────────────────────────────────────────────
  456. if __name__ == "__main__":
  457. # 默认使用最新的测试数据
  458. RESULT_DIR = Path(PROJECT_ROOT) / "temp" / "construction_review" / "final_result"
  459. CSV_PATH = Path(PROJECT_ROOT) / "core" / "construction_review" / "component" / "doc_worker" / "config" / "StandardCategoryTable.csv"
  460. # 选择一个测试文件(最新的)
  461. result_files = sorted(RESULT_DIR.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
  462. if not result_files:
  463. print("未找到测试数据文件")
  464. sys.exit(1)
  465. test_file = result_files[0]
  466. print(f"使用测试文件: {test_file.name}")
  467. # 可以指定章节,None表示全部
  468. # test_chapters = ["basis", "overview", "technology"]
  469. test_chapters = None # 全部章节
  470. asyncio.run(run_comparison(
  471. json_path=str(test_file),
  472. csv_path=str(CSV_PATH),
  473. chapter_codes=test_chapters,
  474. ))