test_e2e_completeness.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 端到端完整性审查测试
  5. 验证数据流程:
  6. 1. document_processor._build_parse_result() 生成 chunks
  7. 2. structure_content() 处理 chunks
  8. 3. completeness_reviewer 读取并统计
  9. 测试目标:验证 tertiary_classification_details 字段在整个流程中不丢失
  10. """
  11. import asyncio
  12. import json
  13. import sys
  14. from pathlib import Path
  15. # 添加项目路径
  16. project_root = Path(__file__).parent.parent.parent
  17. sys.path.insert(0, str(project_root))
  18. def test_build_parse_result():
  19. """测试 _build_parse_result 方法"""
  20. print("\n" + "=" * 60)
  21. print("测试 1: document_processor._build_parse_result()")
  22. print("=" * 60)
  23. from core.construction_review.component.document_processor import DocumentProcessor
  24. processor = DocumentProcessor()
  25. # 模拟三级分类后的 chunks
  26. mock_chunks = [
  27. {
  28. 'chunk_id': 'test_chunk_1',
  29. 'element_tag': {'page': 1},
  30. 'review_chunk_content': '测试内容',
  31. 'section_label': '第一章->四、编制原则',
  32. 'project_plan_type': 'construction_plan',
  33. 'chapter_classification': 'basis',
  34. 'secondary_category_cn': '编制原则',
  35. 'secondary_category_code': 'CompilationPrinciples',
  36. 'tertiary_category_cn': '国家方针、政策、标准和设计文件',
  37. 'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
  38. # 关键:这是三级分类详情列表
  39. 'tertiary_classification_details': [
  40. {"third_category_code": "NationalPoliciesStandardsAndDesignDocument", "third_category_name": "国家方针、政策、标准和设计文件"},
  41. {"third_category_code": "BasicConstructionProcedures", "third_category_name": "基本建设程序"},
  42. {"third_category_code": "ProjectFunctionImplementation", "third_category_name": "工程项目功能实现"},
  43. {"third_category_code": "ContractPerformance", "third_category_name": "合同履约"},
  44. {"third_category_code": "ConstructionForceConcentration", "third_category_name": "施工力量集中"},
  45. {"third_category_code": "ProcessControl", "third_category_name": "工序控制"}
  46. ]
  47. }
  48. ]
  49. # 调用 _build_parse_result
  50. result = processor._build_parse_result(
  51. file_type='docx',
  52. chunks=mock_chunks,
  53. pages_content=[],
  54. toc_info={},
  55. classified_items=[],
  56. target_level=1,
  57. total_chars=100
  58. )
  59. # 检查结果
  60. result_chunks = result.get('chunks', [])
  61. if not result_chunks:
  62. print(" [FAIL] 没有生成 chunks")
  63. return False
  64. first_chunk = result_chunks[0]
  65. metadata = first_chunk.get('metadata', {})
  66. # 验证 tertiary_classification_details 是否存在
  67. details = metadata.get('tertiary_classification_details', [])
  68. print(f" metadata keys: {list(metadata.keys())}")
  69. print(f" tertiary_classification_details 存在: {'tertiary_classification_details' in metadata}")
  70. print(f" tertiary_classification_details 数量: {len(details)}")
  71. if 'tertiary_classification_details' not in metadata:
  72. print(" [FAIL] _build_parse_result 丢失了 tertiary_classification_details")
  73. return False
  74. if len(details) != 6:
  75. print(f" [FAIL] tertiary_classification_details 数量不正确: {len(details)} != 6")
  76. return False
  77. print(" [PASS] _build_parse_result 正确保留了 tertiary_classification_details")
  78. return True
  79. def test_structure_content():
  80. """测试 structure_content 方法"""
  81. print("\n" + "=" * 60)
  82. print("测试 2: document_processor.structure_content()")
  83. print("=" * 60)
  84. from core.construction_review.component.document_processor import DocumentProcessor
  85. processor = DocumentProcessor()
  86. # 模拟 _build_parse_result 的输出
  87. mock_raw_content = {
  88. 'document_type': 'docx',
  89. 'toc_info': {'chapters': []},
  90. 'classification': {'items': [], 'target_level': 1},
  91. 'chunks': [
  92. {
  93. 'page': 1,
  94. 'content': '测试内容',
  95. 'metadata': {
  96. 'chunk_id': 'test_chunk_1',
  97. 'section_label': '第一章->四、编制原则',
  98. 'project_plan_type': 'construction_plan',
  99. 'chapter_classification': 'basis',
  100. 'secondary_category_cn': '编制原则',
  101. 'secondary_category_code': 'CompilationPrinciples',
  102. 'tertiary_category_cn': '国家方针、政策、标准和设计文件',
  103. 'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
  104. 'tertiary_classification_details': [
  105. {"third_category_code": "NationalPoliciesStandardsAndDesignDocument", "third_category_name": "国家方针、政策、标准和设计文件"},
  106. {"third_category_code": "BasicConstructionProcedures", "third_category_name": "基本建设程序"},
  107. {"third_category_code": "ProjectFunctionImplementation", "third_category_name": "工程项目功能实现"},
  108. {"third_category_code": "ContractPerformance", "third_category_name": "合同履约"},
  109. {"third_category_code": "ConstructionForceConcentration", "third_category_name": "施工力量集中"},
  110. {"third_category_code": "ProcessControl", "third_category_name": "工序控制"}
  111. ],
  112. 'element_tag': {}
  113. }
  114. }
  115. ],
  116. 'metadata': {}
  117. }
  118. # 调用 structure_content
  119. result = processor.structure_content(mock_raw_content)
  120. # 检查结果
  121. result_chunks = result.get('chunks', [])
  122. if not result_chunks:
  123. print(" [FAIL] 没有生成 chunks")
  124. return False
  125. first_chunk = result_chunks[0]
  126. print(f" chunk keys: {list(first_chunk.keys())}")
  127. print(f" tertiary_classification_details 存在: {'tertiary_classification_details' in first_chunk}")
  128. details = first_chunk.get('tertiary_classification_details', [])
  129. print(f" tertiary_classification_details 数量: {len(details)}")
  130. if 'tertiary_classification_details' not in first_chunk:
  131. print(" [FAIL] structure_content 丢失了 tertiary_classification_details")
  132. return False
  133. if len(details) != 6:
  134. print(f" [FAIL] tertiary_classification_details 数量不正确: {len(details)} != 6")
  135. return False
  136. print(" [PASS] structure_content 正确保留了 tertiary_classification_details")
  137. return True
  138. async def test_completeness_reviewer():
  139. """测试 completeness_reviewer 读取数据"""
  140. print("\n" + "=" * 60)
  141. print("测试 3: completeness_reviewer 数据读取")
  142. print("=" * 60)
  143. from core.construction_review.component.reviewers.completeness_reviewer import (
  144. LightweightCompletenessChecker
  145. )
  146. # 模拟 structure_content 的输出格式
  147. mock_chunks = [
  148. {
  149. 'chunk_id': 'test_chunk_1',
  150. 'page': 1,
  151. 'content': '测试内容',
  152. 'section_label': '第一章->四、编制原则',
  153. 'chapter_classification': 'basis',
  154. 'secondary_category_cn': '编制原则',
  155. 'secondary_category_code': 'CompilationPrinciples',
  156. 'tertiary_category_cn': '国家方针、政策、标准和设计文件',
  157. 'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
  158. # 关键:扁平结构中的 tertiary_classification_details
  159. 'tertiary_classification_details': [
  160. {"third_category_code": "NationalPoliciesStandardsAndDesignDocument", "third_category_name": "国家方针、政策、标准和设计文件"},
  161. {"third_category_code": "BasicConstructionProcedures", "third_category_name": "基本建设程序"},
  162. {"third_category_code": "ProjectFunctionImplementation", "third_category_name": "工程项目功能实现"},
  163. {"third_category_code": "ContractPerformance", "third_category_name": "合同履约"},
  164. {"third_category_code": "ConstructionForceConcentration", "third_category_name": "施工力量集中"},
  165. {"third_category_code": "ProcessControl", "third_category_name": "工序控制"}
  166. ]
  167. }
  168. ]
  169. csv_path = str(project_root / "core/construction_review/component/doc_worker/config/StandardCategoryTable.csv")
  170. checker = LightweightCompletenessChecker(csv_path)
  171. # 执行检查
  172. result = await checker.check(
  173. chunks=mock_chunks,
  174. outline=None,
  175. chapter_classification='basis'
  176. )
  177. # 检查结果
  178. tertiary_result = result.tertiary_completeness
  179. print(f" 总体状态: {result.overall_status}")
  180. print(f" 三级完整性:")
  181. print(f" 总数: {tertiary_result.get('total', 0)}")
  182. print(f" 已有: {tertiary_result.get('present', 0)}")
  183. print(f" 缺失: {tertiary_result.get('missing', 0)}")
  184. # 验证编制原则的完整性
  185. secondary_stats = tertiary_result.get('secondary_stats', [])
  186. for stat in secondary_stats:
  187. if stat.get('secondary_code') == 'CompilationPrinciples':
  188. print(f"\n 编制原则统计:")
  189. print(f" 总数: {stat.get('total_tertiary', 0)}")
  190. print(f" 已有: {stat.get('present', 0)}")
  191. print(f" 缺失: {stat.get('missing', 0)}")
  192. if stat.get('missing', 0) == 0:
  193. print(" [PASS] completeness_reviewer 正确识别了所有三级分类")
  194. return True
  195. else:
  196. print(f" [FAIL] 还有 {stat.get('missing', 0)} 个缺失")
  197. return False
  198. print(" [FAIL] 没有找到编制原则的统计")
  199. return False
  200. def test_metadata_format():
  201. """测试 metadata 嵌套格式"""
  202. print("\n" + "=" * 60)
  203. print("测试 4: metadata 嵌套格式支持")
  204. print("=" * 60)
  205. from core.construction_review.component.reviewers.completeness_reviewer import (
  206. LightweightCompletenessChecker
  207. )
  208. # 模拟 metadata 嵌套格式
  209. mock_chunks = [
  210. {
  211. 'chunk_id': 'test_chunk_1',
  212. 'page': 1,
  213. 'content': '测试内容',
  214. 'metadata': {
  215. 'chapter_classification': 'basis',
  216. 'secondary_category_code': 'CompilationPrinciples',
  217. 'tertiary_classification_details': [
  218. {"third_category_code": "NationalPoliciesStandardsAndDesignDocument", "third_category_name": "国家方针"},
  219. {"third_category_code": "BasicConstructionProcedures", "third_category_name": "基本建设程序"},
  220. ]
  221. }
  222. }
  223. ]
  224. # 测试 _extract_tertiary_from_chunks 方法
  225. csv_path = str(project_root / "core/construction_review/component/doc_worker/config/StandardCategoryTable.csv")
  226. checker = LightweightCompletenessChecker(csv_path)
  227. # 直接测试提取方法
  228. actual = checker._extract_tertiary_from_chunks(mock_chunks)
  229. print(f" 提取到的三级分类: {actual}")
  230. print(f" 数量: {len(actual)}")
  231. if len(actual) == 2:
  232. print(" [PASS] 正确从 metadata 嵌套格式提取三级分类")
  233. return True
  234. else:
  235. print(" [FAIL] 提取数量不正确")
  236. return False
  237. async def main():
  238. """运行所有测试"""
  239. print("\n" + "=" * 60)
  240. print("端到端完整性审查测试")
  241. print("验证 tertiary_classification_details 字段在数据流中不丢失")
  242. print("=" * 60)
  243. results = []
  244. # 测试 1: _build_parse_result
  245. results.append(("_build_parse_result", test_build_parse_result()))
  246. # 测试 2: structure_content
  247. results.append(("structure_content", test_structure_content()))
  248. # 测试 3: completeness_reviewer
  249. results.append(("completeness_reviewer", await test_completeness_reviewer()))
  250. # 测试 4: metadata 格式支持
  251. results.append(("metadata_format", test_metadata_format()))
  252. # 汇总
  253. print("\n" + "=" * 60)
  254. print("测试结果汇总")
  255. print("=" * 60)
  256. all_passed = True
  257. for name, passed in results:
  258. status = "[PASS]" if passed else "[FAIL]"
  259. print(f" {status} {name}")
  260. if not passed:
  261. all_passed = False
  262. print("\n" + "=" * 60)
  263. if all_passed:
  264. print("所有测试通过!")
  265. else:
  266. print("存在失败的测试,请检查!")
  267. print("=" * 60)
  268. return all_passed
  269. if __name__ == "__main__":
  270. asyncio.run(main())