#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
End-to-end completeness review test.

Data flow being verified:
1. document_processor._build_parse_result() produces chunks
2. structure_content() processes those chunks
3. completeness_reviewer reads them and computes statistics

Goal: verify that the ``tertiary_classification_details`` field is not lost
anywhere along this pipeline.

NOTE: the ``test_*`` functions report their outcome by printing and returning
a bool (aggregated by ``main``); they do not use ``assert``. Run this file as
a script; the process exit status is 0 when every check passes, 1 otherwise.
"""

import asyncio
import json
import sys
from pathlib import Path

# Add the project root (three ``.parent`` hops above this file) to sys.path
# so that the ``core.*`` imports inside the tests resolve.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

# (code, name) pairs of the six tertiary categories shared by the fixtures.
_TERTIARY_CATEGORIES = (
    ("NationalPoliciesStandardsAndDesignDocument", "国家方针、政策、标准和设计文件"),
    ("BasicConstructionProcedures", "基本建设程序"),
    ("ProjectFunctionImplementation", "工程项目功能实现"),
    ("ContractPerformance", "合同履约"),
    ("ConstructionForceConcentration", "施工力量集中"),
    ("ProcessControl", "工序控制"),
)

# Location of the standard category table, relative to the project root.
_CATEGORY_CSV = "core/construction_review/component/doc_worker/config/StandardCategoryTable.csv"


def _sample_tertiary_details():
    """Return a fresh ``tertiary_classification_details`` list (six entries).

    A new list of new dicts is built on every call so that one test cannot
    affect another through in-place mutation of a shared fixture.
    """
    return [
        {"third_category_code": code, "third_category_name": name}
        for code, name in _TERTIARY_CATEGORIES
    ]


def _category_csv_path():
    """Return the standard category CSV path as a string."""
    return str(project_root / _CATEGORY_CSV)


def test_build_parse_result():
    """Check that ``_build_parse_result`` keeps ``tertiary_classification_details``.

    Returns:
        bool: True when the first result chunk's ``metadata`` contains the
        field with the expected number of entries, False otherwise.
    """
    print("\n" + "=" * 60)
    print("测试 1: document_processor._build_parse_result()")
    print("=" * 60)

    from core.construction_review.component.document_processor import DocumentProcessor

    processor = DocumentProcessor()

    details_fixture = _sample_tertiary_details()
    # Captured before the call in case the processor mutates its input.
    expected_count = len(details_fixture)

    # Simulated chunks as they look after tertiary classification.
    mock_chunks = [
        {
            'chunk_id': 'test_chunk_1',
            'element_tag': {'page': 1},
            'review_chunk_content': '测试内容',
            'section_label': '第一章->四、编制原则',
            'project_plan_type': 'construction_plan',
            'chapter_classification': 'basis',
            'secondary_category_cn': '编制原则',
            'secondary_category_code': 'CompilationPrinciples',
            'tertiary_category_cn': '国家方针、政策、标准和设计文件',
            'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
            # Key field: the list of tertiary classification details.
            'tertiary_classification_details': details_fixture,
        }
    ]

    result = processor._build_parse_result(
        file_type='docx',
        chunks=mock_chunks,
        pages_content=[],
        toc_info={},
        classified_items=[],
        target_level=1,
        total_chars=100,
    )

    result_chunks = result.get('chunks', [])
    if not result_chunks:
        print(" [FAIL] 没有生成 chunks")
        return False

    first_chunk = result_chunks[0]
    metadata = first_chunk.get('metadata', {})

    # The field is expected inside the chunk's nested ``metadata`` dict.
    details = metadata.get('tertiary_classification_details', [])
    print(f" metadata keys: {list(metadata.keys())}")
    print(f" tertiary_classification_details 存在: {'tertiary_classification_details' in metadata}")
    print(f" tertiary_classification_details 数量: {len(details)}")

    if 'tertiary_classification_details' not in metadata:
        print(" [FAIL] _build_parse_result 丢失了 tertiary_classification_details")
        return False

    if len(details) != expected_count:
        print(f" [FAIL] tertiary_classification_details 数量不正确: {len(details)} != {expected_count}")
        return False

    print(" [PASS] _build_parse_result 正确保留了 tertiary_classification_details")
    return True


def test_structure_content():
    """Check that ``structure_content`` keeps ``tertiary_classification_details``.

    Returns:
        bool: True when the first result chunk carries the field at its top
        level with the expected number of entries, False otherwise.
    """
    print("\n" + "=" * 60)
    print("测试 2: document_processor.structure_content()")
    print("=" * 60)

    from core.construction_review.component.document_processor import DocumentProcessor

    processor = DocumentProcessor()

    details_fixture = _sample_tertiary_details()
    # Captured before the call in case the processor mutates its input.
    expected_count = len(details_fixture)

    # Simulated output of _build_parse_result.
    mock_raw_content = {
        'document_type': 'docx',
        'toc_info': {'chapters': []},
        'classification': {'items': [], 'target_level': 1},
        'chunks': [
            {
                'page': 1,
                'content': '测试内容',
                'metadata': {
                    'chunk_id': 'test_chunk_1',
                    'section_label': '第一章->四、编制原则',
                    'project_plan_type': 'construction_plan',
                    'chapter_classification': 'basis',
                    'secondary_category_cn': '编制原则',
                    'secondary_category_code': 'CompilationPrinciples',
                    'tertiary_category_cn': '国家方针、政策、标准和设计文件',
                    'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
                    'tertiary_classification_details': details_fixture,
                    'element_tag': {},
                },
            }
        ],
        'metadata': {},
    }

    result = processor.structure_content(mock_raw_content)

    result_chunks = result.get('chunks', [])
    if not result_chunks:
        print(" [FAIL] 没有生成 chunks")
        return False

    first_chunk = result_chunks[0]
    print(f" chunk keys: {list(first_chunk.keys())}")
    print(f" tertiary_classification_details 存在: {'tertiary_classification_details' in first_chunk}")

    # Here the field is looked up directly on the chunk, not under 'metadata'.
    details = first_chunk.get('tertiary_classification_details', [])
    print(f" tertiary_classification_details 数量: {len(details)}")

    if 'tertiary_classification_details' not in first_chunk:
        print(" [FAIL] structure_content 丢失了 tertiary_classification_details")
        return False

    if len(details) != expected_count:
        print(f" [FAIL] tertiary_classification_details 数量不正确: {len(details)} != {expected_count}")
        return False

    print(" [PASS] structure_content 正确保留了 tertiary_classification_details")
    return True


async def test_completeness_reviewer():
    """Check that the completeness checker reads the tertiary details.

    Runs ``LightweightCompletenessChecker.check`` on one flat chunk and
    inspects the ``CompilationPrinciples`` entry of the tertiary statistics.

    Returns:
        bool: True when that entry reports zero missing tertiary categories;
        False when some are missing or the entry is absent.
    """
    print("\n" + "=" * 60)
    print("测试 3: completeness_reviewer 数据读取")
    print("=" * 60)

    from core.construction_review.component.reviewers.completeness_reviewer import (
        LightweightCompletenessChecker
    )

    # Simulated chunk in the output format of structure_content.
    mock_chunks = [
        {
            'chunk_id': 'test_chunk_1',
            'page': 1,
            'content': '测试内容',
            'section_label': '第一章->四、编制原则',
            'chapter_classification': 'basis',
            'secondary_category_cn': '编制原则',
            'secondary_category_code': 'CompilationPrinciples',
            'tertiary_category_cn': '国家方针、政策、标准和设计文件',
            'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
            # Key field: tertiary_classification_details in the flat layout.
            'tertiary_classification_details': _sample_tertiary_details(),
        }
    ]

    checker = LightweightCompletenessChecker(_category_csv_path())

    result = await checker.check(
        chunks=mock_chunks,
        outline=None,
        chapter_classification='basis',
    )

    tertiary_result = result.tertiary_completeness
    print(f" 总体状态: {result.overall_status}")
    print(" 三级完整性:")
    print(f" 总数: {tertiary_result.get('total', 0)}")
    print(f" 已有: {tertiary_result.get('present', 0)}")
    print(f" 缺失: {tertiary_result.get('missing', 0)}")

    # Only the first statistics entry for 'CompilationPrinciples' decides the
    # outcome; other secondary categories are ignored.
    for stat in tertiary_result.get('secondary_stats', []):
        if stat.get('secondary_code') != 'CompilationPrinciples':
            continue

        print("\n 编制原则统计:")
        print(f" 总数: {stat.get('total_tertiary', 0)}")
        print(f" 已有: {stat.get('present', 0)}")
        print(f" 缺失: {stat.get('missing', 0)}")

        if stat.get('missing', 0) == 0:
            print(" [PASS] completeness_reviewer 正确识别了所有三级分类")
            return True

        print(f" [FAIL] 还有 {stat.get('missing', 0)} 个缺失")
        return False

    print(" [FAIL] 没有找到编制原则的统计")
    return False


def test_metadata_format():
    """Check tertiary extraction from chunks that nest fields in ``metadata``.

    Returns:
        bool: True when ``_extract_tertiary_from_chunks`` yields exactly the
        two tertiary categories present in the fixture, False otherwise.
    """
    print("\n" + "=" * 60)
    print("测试 4: metadata 嵌套格式支持")
    print("=" * 60)

    from core.construction_review.component.reviewers.completeness_reviewer import (
        LightweightCompletenessChecker
    )

    # Simulated chunk using the nested-metadata layout.
    mock_chunks = [
        {
            'chunk_id': 'test_chunk_1',
            'page': 1,
            'content': '测试内容',
            'metadata': {
                'chapter_classification': 'basis',
                'secondary_category_code': 'CompilationPrinciples',
                'tertiary_classification_details': [
                    {"third_category_code": "NationalPoliciesStandardsAndDesignDocument", "third_category_name": "国家方针"},
                    {"third_category_code": "BasicConstructionProcedures", "third_category_name": "基本建设程序"},
                ],
            },
        }
    ]

    checker = LightweightCompletenessChecker(_category_csv_path())

    # Exercise the extraction method directly.
    actual = checker._extract_tertiary_from_chunks(mock_chunks)
    print(f" 提取到的三级分类: {actual}")
    print(f" 数量: {len(actual)}")

    if len(actual) == 2:
        print(" [PASS] 正确从 metadata 嵌套格式提取三级分类")
        return True

    print(" [FAIL] 提取数量不正确")
    return False


async def main():
    """Run all four checks in order and print a summary.

    Returns:
        bool: True when every check returned a truthy result.
    """
    print("\n" + "=" * 60)
    print("端到端完整性审查测试")
    print("验证 tertiary_classification_details 字段在数据流中不丢失")
    print("=" * 60)

    results = []

    # Test 1: _build_parse_result
    results.append(("_build_parse_result", test_build_parse_result()))

    # Test 2: structure_content
    results.append(("structure_content", test_structure_content()))

    # Test 3: completeness_reviewer (the only coroutine among the checks)
    results.append(("completeness_reviewer", await test_completeness_reviewer()))

    # Test 4: nested metadata format support
    results.append(("metadata_format", test_metadata_format()))

    # Summary
    print("\n" + "=" * 60)
    print("测试结果汇总")
    print("=" * 60)

    for name, passed in results:
        status = "[PASS]" if passed else "[FAIL]"
        print(f" {status} {name}")

    all_passed = all(passed for _, passed in results)

    print("\n" + "=" * 60)
    if all_passed:
        print("所有测试通过!")
    else:
        print("存在失败的测试,请检查!")
    print("=" * 60)

    return all_passed


if __name__ == "__main__":
    # Propagate the overall outcome as the process exit status so that shell
    # and CI callers can detect a failed run (previously always exited 0).
    sys.exit(0 if asyncio.run(main()) else 1)