"""
Agent-driven RAG pipeline test runner.

Runs every test sample through the pipeline, evaluates the results, and
generates a report.
"""
- import sys
- import os
# Put the project root (two directories above the one containing this file)
# at the front of sys.path unless it is already listed there.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
)
if project_root not in sys.path:
    sys.path.insert(0, project_root)
- from utils_test.RAG_Pipeline_Test.test_data import TEST_SAMPLES
- from utils_test.RAG_Pipeline_Test.rag_pipeline_runner import RAGPipelineRunner
- from utils_test.RAG_Pipeline_Test.rag_evaluator import RAGEvaluator
def _console_safe(text):
    """Return *text* in a form that the current stdout can always encode.

    The status emoji used in the report are mapped to ASCII tags first, so
    the common case stays readable. Any character the stream's encoding
    still cannot represent is then substituted with ``?`` instead of
    raising a second ``UnicodeEncodeError``.
    """
    readable = (
        text.replace("✅", "[PASS]")
        # Warning sign followed by the emoji variation selector (U+FE0F) ...
        .replace("\u26a0\ufe0f", "[WARN]")
        # ... and the bare warning sign, in case the selector is absent.
        .replace("\u26a0", "[WARN]")
        .replace("❌", "[FAIL]")
    )
    # stdout may be replaced by an object without a usable ``encoding``.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    return readable.encode(encoding, errors="replace").decode(encoding)


def main():
    """Run all test samples through the RAG pipeline, evaluate, and report.

    The Markdown report is written to ``reports/rag_pipeline_test_report.md``
    next to this file *before* it is echoed to stdout, so a console encoding
    problem cannot cause the report to be lost.

    Returns:
        list: The objects returned by ``evaluator.evaluate_sample``, one per
        evaluated (result, sample) pair, in evaluation order.
    """
    print("=" * 70)
    print("RAG 管线 Agent 驱动测试")
    print("=" * 70)
    print(f"测试样本数: {len(TEST_SAMPLES)}")
    print()

    # Phase 1: build the pipeline runner and the evaluator.
    print("[1/3] 初始化管线执行器和评估器...")
    runner = RAGPipelineRunner()
    evaluator = RAGEvaluator()
    print(" 初始化完成")
    print()

    # Phase 2: run the pipeline over every sample.
    print("[2/3] 执行 RAG 管线...")
    results = runner.run_batch(TEST_SAMPLES)
    print()

    # Phase 3: score each result against the content of its source sample.
    # NOTE(review): zip() pairs results with samples by position and stops at
    # the shorter sequence — confirm run_batch returns one result per sample,
    # in order.
    print("[3/3] 评估结果...")
    evaluations = []
    for i, (result, sample) in enumerate(zip(results, TEST_SAMPLES)):
        print(f" 评估样本 {i+1}/{len(results)}: {result.chunk_id}")
        ev = evaluator.evaluate_sample(result, sample["content"])
        evaluations.append(ev)
        print(f" 总分: {ev.overall_score:.1f}/5.0 [{ev.overall_status}]")
        print(f" {ev.analysis}")
    print()

    report = evaluator.generate_report(evaluations)

    # Persist the report first; printing it may fail on consoles whose
    # encoding cannot represent every character.
    report_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "reports")
    os.makedirs(report_dir, exist_ok=True)
    report_path = os.path.join(report_dir, "rag_pipeline_test_report.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"\n报告已保存: {report_path}")

    # Echo the report, falling back to a lossy but never-failing rendering.
    print("=" * 70)
    try:
        print(report)
    except UnicodeEncodeError:
        print(_console_safe(report))
    print("=" * 70)

    # Summary line with the number of samples per overall status.
    pass_count = sum(1 for ev in evaluations if ev.overall_status == "PASS")
    warn_count = sum(1 for ev in evaluations if ev.overall_status == "WARN")
    fail_count = sum(1 for ev in evaluations if ev.overall_status == "FAIL")
    print(f"\n汇总: {pass_count} PASS / {warn_count} WARN / {fail_count} FAIL")
    return evaluations
# Run the test pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|