| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- 端到端完整性审查测试
- 验证数据流程:
- 1. document_processor._build_parse_result() 生成 chunks
- 2. structure_content() 处理 chunks
- 3. completeness_reviewer 读取并统计
- 测试目标:验证 tertiary_classification_details 字段在整个流程中不丢失
- """
- import asyncio
- import json
- import sys
- from pathlib import Path
# Put the project root on sys.path so the ``core`` package imports used by the
# checks below succeed when this file is executed directly.
# The path is resolved first: when ``__file__`` is a relative path, walking up
# three parents would otherwise collapse to "." and register the wrong folder.
project_root = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(project_root))
def test_build_parse_result():
    """Check that ``_build_parse_result`` keeps ``tertiary_classification_details``.

    Feeds one mock chunk carrying six tertiary classification entries to
    ``DocumentProcessor._build_parse_result`` and inspects the ``metadata``
    of the first chunk in the result.

    Returns:
        bool: True when the field is present in the metadata with exactly
        six entries, False otherwise.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试 1: document_processor._build_parse_result()")
    print(separator)

    from core.construction_review.component.document_processor import DocumentProcessor

    doc_processor = DocumentProcessor()
    details_key = 'tertiary_classification_details'

    # (code, name) pairs for the tertiary classification details under test.
    tertiary_pairs = [
        ("NationalPoliciesStandardsAndDesignDocument", "国家方针、政策、标准和设计文件"),
        ("BasicConstructionProcedures", "基本建设程序"),
        ("ProjectFunctionImplementation", "工程项目功能实现"),
        ("ContractPerformance", "合同履约"),
        ("ConstructionForceConcentration", "施工力量集中"),
        ("ProcessControl", "工序控制"),
    ]

    # A single chunk shaped like the output of the tertiary classification step.
    classified_chunk = {
        'chunk_id': 'test_chunk_1',
        'element_tag': {'page': 1},
        'review_chunk_content': '测试内容',
        'section_label': '第一章->四、编制原则',
        'project_plan_type': 'construction_plan',
        'chapter_classification': 'basis',
        'secondary_category_cn': '编制原则',
        'secondary_category_code': 'CompilationPrinciples',
        'tertiary_category_cn': '国家方针、政策、标准和设计文件',
        'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
        # Key field: the list of tertiary classification details.
        details_key: [
            {"third_category_code": code, "third_category_name": name}
            for code, name in tertiary_pairs
        ],
    }

    parse_result = doc_processor._build_parse_result(
        file_type='docx',
        chunks=[classified_chunk],
        pages_content=[],
        toc_info={},
        classified_items=[],
        target_level=1,
        total_chars=100
    )

    produced_chunks = parse_result.get('chunks', [])
    if not produced_chunks:
        print(" [FAIL] 没有生成 chunks")
        return False

    metadata = produced_chunks[0].get('metadata', {})
    details = metadata.get(details_key, [])
    has_details = details_key in metadata

    print(f" metadata keys: {list(metadata.keys())}")
    print(f" tertiary_classification_details 存在: {has_details}")
    detail_count = len(details)
    print(f" tertiary_classification_details 数量: {detail_count}")

    if not has_details:
        print(" [FAIL] _build_parse_result 丢失了 tertiary_classification_details")
        return False

    if detail_count != 6:
        print(f" [FAIL] tertiary_classification_details 数量不正确: {detail_count} != 6")
        return False

    print(" [PASS] _build_parse_result 正确保留了 tertiary_classification_details")
    return True
def test_structure_content():
    """Check that ``structure_content`` keeps ``tertiary_classification_details``.

    Passes a mock raw-content dict (one chunk whose ``metadata`` holds six
    tertiary classification entries) to ``DocumentProcessor.structure_content``
    and inspects the first chunk of the result.

    Returns:
        bool: True when the field is a key of the first result chunk and has
        exactly six entries, False otherwise.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试 2: document_processor.structure_content()")
    print(separator)

    from core.construction_review.component.document_processor import DocumentProcessor

    doc_processor = DocumentProcessor()
    details_key = 'tertiary_classification_details'

    # (code, name) pairs for the tertiary classification details under test.
    tertiary_pairs = [
        ("NationalPoliciesStandardsAndDesignDocument", "国家方针、政策、标准和设计文件"),
        ("BasicConstructionProcedures", "基本建设程序"),
        ("ProjectFunctionImplementation", "工程项目功能实现"),
        ("ContractPerformance", "合同履约"),
        ("ConstructionForceConcentration", "施工力量集中"),
        ("ProcessControl", "工序控制"),
    ]

    chunk_metadata = {
        'chunk_id': 'test_chunk_1',
        'section_label': '第一章->四、编制原则',
        'project_plan_type': 'construction_plan',
        'chapter_classification': 'basis',
        'secondary_category_cn': '编制原则',
        'secondary_category_code': 'CompilationPrinciples',
        'tertiary_category_cn': '国家方针、政策、标准和设计文件',
        'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
        details_key: [
            {"third_category_code": code, "third_category_name": name}
            for code, name in tertiary_pairs
        ],
        'element_tag': {},
    }

    # Input shaped like the dict the parse-result test receives back.
    raw_content = {
        'document_type': 'docx',
        'toc_info': {'chapters': []},
        'classification': {'items': [], 'target_level': 1},
        'chunks': [
            {
                'page': 1,
                'content': '测试内容',
                'metadata': chunk_metadata,
            }
        ],
        'metadata': {},
    }

    structured = doc_processor.structure_content(raw_content)

    produced_chunks = structured.get('chunks', [])
    if not produced_chunks:
        print(" [FAIL] 没有生成 chunks")
        return False

    head_chunk = produced_chunks[0]
    print(f" chunk keys: {list(head_chunk.keys())}")
    has_details = details_key in head_chunk
    print(f" tertiary_classification_details 存在: {has_details}")

    details = head_chunk.get(details_key, [])
    detail_count = len(details)
    print(f" tertiary_classification_details 数量: {detail_count}")

    if not has_details:
        print(" [FAIL] structure_content 丢失了 tertiary_classification_details")
        return False

    if detail_count != 6:
        print(f" [FAIL] tertiary_classification_details 数量不正确: {detail_count} != 6")
        return False

    print(" [PASS] structure_content 正确保留了 tertiary_classification_details")
    return True
async def test_completeness_reviewer():
    """Check that the completeness checker counts every tertiary category.

    Builds one flat mock chunk with six tertiary classification entries, runs
    ``LightweightCompletenessChecker.check`` for the ``basis`` chapter and
    looks up the statistics entry whose ``secondary_code`` is
    ``CompilationPrinciples``.

    Returns:
        bool: True when that entry reports zero missing items; False when it
        reports missing items or when no such entry exists.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试 3: completeness_reviewer 数据读取")
    print(separator)

    from core.construction_review.component.reviewers.completeness_reviewer import (
        LightweightCompletenessChecker
    )

    # (code, name) pairs for the tertiary classification details under test.
    tertiary_pairs = [
        ("NationalPoliciesStandardsAndDesignDocument", "国家方针、政策、标准和设计文件"),
        ("BasicConstructionProcedures", "基本建设程序"),
        ("ProjectFunctionImplementation", "工程项目功能实现"),
        ("ContractPerformance", "合同履约"),
        ("ConstructionForceConcentration", "施工力量集中"),
        ("ProcessControl", "工序控制"),
    ]

    # Flat chunk layout: the details list sits directly on the chunk.
    flat_chunk = {
        'chunk_id': 'test_chunk_1',
        'page': 1,
        'content': '测试内容',
        'section_label': '第一章->四、编制原则',
        'chapter_classification': 'basis',
        'secondary_category_cn': '编制原则',
        'secondary_category_code': 'CompilationPrinciples',
        'tertiary_category_cn': '国家方针、政策、标准和设计文件',
        'tertiary_category_code': 'NationalPoliciesStandardsAndDesignDocument',
        'tertiary_classification_details': [
            {"third_category_code": code, "third_category_name": name}
            for code, name in tertiary_pairs
        ],
    }

    csv_path = str(project_root / "core/construction_review/component/doc_worker/config/StandardCategoryTable.csv")
    checker = LightweightCompletenessChecker(csv_path)

    review = await checker.check(
        chunks=[flat_chunk],
        outline=None,
        chapter_classification='basis'
    )

    tertiary_summary = review.tertiary_completeness
    print(f" 总体状态: {review.overall_status}")
    print(" 三级完整性:")
    print(f" 总数: {tertiary_summary.get('total', 0)}")
    print(f" 已有: {tertiary_summary.get('present', 0)}")
    print(f" 缺失: {tertiary_summary.get('missing', 0)}")

    # Only the first statistics entry for CompilationPrinciples is judged.
    principle_stat = next(
        (
            entry
            for entry in tertiary_summary.get('secondary_stats', [])
            if entry.get('secondary_code') == 'CompilationPrinciples'
        ),
        None,
    )
    if principle_stat is None:
        print(" [FAIL] 没有找到编制原则的统计")
        return False

    print("\n 编制原则统计:")
    print(f" 总数: {principle_stat.get('total_tertiary', 0)}")
    print(f" 已有: {principle_stat.get('present', 0)}")
    missing_count = principle_stat.get('missing', 0)
    print(f" 缺失: {missing_count}")

    if missing_count != 0:
        print(f" [FAIL] 还有 {missing_count} 个缺失")
        return False

    print(" [PASS] completeness_reviewer 正确识别了所有三级分类")
    return True
def test_metadata_format():
    """Check extraction of tertiary categories from the nested ``metadata`` layout.

    Builds one mock chunk whose classification fields live under a
    ``metadata`` sub-dict (two tertiary entries) and calls
    ``LightweightCompletenessChecker._extract_tertiary_from_chunks`` on it.

    Returns:
        bool: True when exactly two items are extracted, False otherwise.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("测试 4: metadata 嵌套格式支持")
    print(separator)

    from core.construction_review.component.reviewers.completeness_reviewer import (
        LightweightCompletenessChecker
    )

    # Nested layout: classification data is stored under 'metadata'.
    nested_metadata = {
        'chapter_classification': 'basis',
        'secondary_category_code': 'CompilationPrinciples',
        'tertiary_classification_details': [
            {"third_category_code": "NationalPoliciesStandardsAndDesignDocument", "third_category_name": "国家方针"},
            {"third_category_code": "BasicConstructionProcedures", "third_category_name": "基本建设程序"},
        ],
    }
    nested_chunks = [
        {
            'chunk_id': 'test_chunk_1',
            'page': 1,
            'content': '测试内容',
            'metadata': nested_metadata,
        }
    ]

    csv_path = str(project_root / "core/construction_review/component/doc_worker/config/StandardCategoryTable.csv")
    checker = LightweightCompletenessChecker(csv_path)

    # Exercise the extraction helper directly rather than the full check.
    extracted = checker._extract_tertiary_from_chunks(nested_chunks)
    print(f" 提取到的三级分类: {extracted}")
    print(f" 数量: {len(extracted)}")

    if len(extracted) != 2:
        print(" [FAIL] 提取数量不正确")
        return False

    print(" [PASS] 正确从 metadata 嵌套格式提取三级分类")
    return True
async def main():
    """Run the four checks in order and print a pass/fail summary.

    Returns:
        bool: True when every check returned a truthy value, False otherwise.
    """
    separator = "=" * 60

    print("\n" + separator)
    print("端到端完整性审查测试")
    print("验证 tertiary_classification_details 字段在数据流中不丢失")
    print(separator)

    # Each entry is (label, outcome); the checks run in this exact order.
    outcomes = [
        ("_build_parse_result", test_build_parse_result()),
        ("structure_content", test_structure_content()),
        ("completeness_reviewer", await test_completeness_reviewer()),
        ("metadata_format", test_metadata_format()),
    ]

    print("\n" + separator)
    print("测试结果汇总")
    print(separator)

    for label, ok in outcomes:
        if ok:
            marker = "[PASS]"
        else:
            marker = "[FAIL]"
        print(f" {marker} {label}")

    everything_ok = all(ok for _, ok in outcomes)

    print("\n" + separator)
    print("所有测试通过!" if everything_ok else "存在失败的测试,请检查!")
    print(separator)

    return everything_ok
if __name__ == "__main__":
    # main() returns True only when every check passed. Turn that into the
    # process exit status (0 = success, 1 = failure) so shells and CI jobs can
    # detect a failed run instead of always seeing exit code 0.
    sys.exit(0 if asyncio.run(main()) else 1)
|