File read succeeded: lines 1-165 of 165 (complete file, no further content).

"""
Main program for the document key-point review module.
"""
import asyncio
import time
from pathlib import Path

from components.data_loader import CSVDataLoader
from components.prompt_builder import PromptBuilder
from components.llm_client import LLMClient
from components.result_processor import ResultProcessor
from components.review_pipeline import ReviewPipeline
from components.result_saver import ResultSaver
from components.result_analyzer import ResultAnalyzer
from components.keyword_checker import KeywordChecker
from utils.file_utils import write_json


async def main():
    """Main entry point."""
    # Configuration file paths
    base_dir = Path(__file__).parent
    csv_path = base_dir / 'config' / 'Construction_Plan_Content_Specification.csv'
    json_path = base_dir / 'data' / '文档切分预处理结果.json'
    prompt_config_path = base_dir / 'config' / 'prompt.yaml'
    api_config_path = base_dir / 'config' / 'llm_api.yaml'

    # Output paths
    output_path = base_dir / 'output' / 'review_results.json'
    output_path.parent.mkdir(exist_ok=True)

    print("=" * 60)
    print("Document Key-Point Review Module")
    print("=" * 60)

    # 1. Load data
    print("\n[1/6] Loading specification file...")
    data_loader = CSVDataLoader()
    specification = data_loader.load_specification(str(csv_path))
    print(f"  Loaded {len(specification)} label categories")

    print("\n[2/6] Loading document data...")
    documents = data_loader.load_documents(str(json_path))
    print(f"  Loaded {len(documents)} document chunks")

    # 2. Initialize components
    print("\n[3/6] Initializing components...")
    prompt_builder = PromptBuilder(str(prompt_config_path))
    llm_client = LLMClient(str(api_config_path))
    result_processor = ResultProcessor()

    # Read concurrency and keyword-check settings from the API config
    api_config = llm_client.config
    concurrent_workers = api_config.get('keywords', {}).get('concurrent_workers', 20)
    keyword_check_config = api_config.get('keywords', {}).get('keyword_check', {})

    # Initialize the keyword checker (if enabled)
    keyword_checker = None
    if keyword_check_config.get('enabled', True):
        match_mode = keyword_check_config.get('match_mode', 'fuzzy')
        case_sensitive = keyword_check_config.get('case_sensitive', False)
        min_keyword_length = keyword_check_config.get('min_keyword_length', 2)
        keyword_checker = KeywordChecker(
            match_mode=match_mode,
            case_sensitive=case_sensitive,
            min_keyword_length=min_keyword_length
        )
        print(f"  Keyword check enabled, match mode: {match_mode}")
    else:
        print("  Keyword check disabled")

    review_pipeline = ReviewPipeline(
        prompt_builder=prompt_builder,
        llm_client=llm_client,
        result_processor=result_processor,
        max_concurrent=concurrent_workers,
        keyword_checker=keyword_checker
    )
    print("  Components initialized")

    start_time = time.time()

    # 3. Run the review
    print("\n[4/6] Running review...")
    print(f"  Model: {llm_client.model_type}")
    print(f"  Max concurrency: {concurrent_workers}")

    results = await review_pipeline.review(documents, specification)

    # Tally successes and failures
    success_count = sum(
        1 for r in results
        if isinstance(r.get('review_result', {}), dict) and 'error' not in r.get('review_result', {})
    )
    error_count = len(results) - success_count
    print(f"\n  Review finished: {success_count} succeeded, {error_count} failed")

    # 4. Save results
    print("\n[5/6] Saving review results...")

    # Save results as JSON
    output_data = {
        'total_chunks': len(results),
        'success_count': success_count,
        'error_count': error_count,
        'results': results
    }
    write_json(output_data, str(output_path))
    print(f"  JSON results saved to: {output_path}")

    # Save results as CSV
    csv_output_path = base_dir / 'output' / 'review_results.csv'
    ResultSaver.save_to_csv(results, specification, str(csv_output_path))
    print(f"  CSV results saved to: {csv_output_path}")

    # Save statistics
    stats_output_path = base_dir / 'output' / 'review_statistics.txt'
    ResultSaver.save_statistics(results, specification, str(stats_output_path))
    print(f"  Statistics saved to: {stats_output_path}")

    # 5. Use the result analyzer to build the specification coverage summary
    print("\n[6/6] Building specification key-point coverage summary...")
    analyzer = ResultAnalyzer(str(csv_path), keyword_checker=keyword_checker)
    processed_results = analyzer.process_results(results, specification)
    spec_summary_csv_path = base_dir / 'output' / 'spec_review_summary.csv'
    summary_rows = analyzer.build_spec_summary(processed_results, str(spec_summary_csv_path))
    print(f"  Specification coverage summary saved to: {spec_summary_csv_path}")

    # Build the missing key-point JSON list for frontend consumption
    missing_issue_json_path = base_dir / 'output' / 'spec_review_missing_issues.json'
    missing_issue_list = await analyzer.build_missing_issue_list(summary_rows)
    write_json(missing_issue_list, str(missing_issue_json_path))
    print(f"  Missing key-point JSON saved to: {missing_issue_json_path}")

    cost_time = time.time() - start_time

    # 6. Show a few sample results
    print("\n" + "=" * 60)
    print("Sample review results (first 3):")
    print("=" * 60)
    for i, result in enumerate(results[:3]):
        print(f"\nDocument chunk {i + 1}:")
        print(f"  chunk_id: {result.get('chunk_id', 'N/A')}")
        print(f"  chapter_classification: {result.get('chapter_classification', 'N/A')}")
        review_result = result.get('review_result', {})
        # Error case
        if isinstance(review_result, dict) and 'error' in review_result:
            print(f"  Error: {review_result['error']}")
        # Normal case (dict: level-2 section name -> list of key-point numbers)
        elif isinstance(review_result, dict):
            if not review_result:
                print("  Review result: no data")
            else:
                print("  Review result (covered key points):")
                for level2_name, point_numbers in review_result.items():
                    if isinstance(point_numbers, list) and len(point_numbers) > 0:
                        numbers_str = ', '.join(map(str, point_numbers))
                        print(f"    - {level2_name}: key-point numbers [{numbers_str}]")
                    elif isinstance(point_numbers, list):
                        print(f"    - {level2_name}: no key points")
                    else:
                        print(f"    - {level2_name}: {point_numbers}")
        else:
            print("    Unrecognized review result format; details not shown.")

    print(f"\n  Review finished in {cost_time:.2f}s")
    print("\n" + "=" * 60)
    print("Review complete!")
    print("=" * 60)


if __name__ == '__main__':
    asyncio.run(main())
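For reference, main() pulls all of its tunables from llm_client.config, which LLMClient presumably loads from config/llm_api.yaml (the loader itself is not shown here). A minimal sketch of the structure those .get() calls expect, with key names and fallback defaults taken from the code above; the concrete values are illustrative, not the project's real configuration:

# Shape of the config that main() reads; values shown are the code's fallback defaults.
example_api_config = {
    'keywords': {
        'concurrent_workers': 20,        # passed to ReviewPipeline as max_concurrent
        'keyword_check': {
            'enabled': True,             # False skips building a KeywordChecker entirely
            'match_mode': 'fuzzy',
            'case_sensitive': False,
            'min_keyword_length': 2,
        },
    },
}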
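The tallying and sample-display logic also implies a per-chunk result schema: each entry in results carries chunk_id, chapter_classification, and a review_result dict that maps a level-2 section name to the list of covered key-point numbers, or holds an 'error' key when the LLM call failed. A hypothetical entry, inferred only from those field accesses (all values invented):

# Hypothetical entry in `results`, reconstructed from how main() reads it; values are made up.
example_result = {
    'chunk_id': 'chunk_0001',                     # invented id
    'chapter_classification': 'Safety Measures',  # invented chapter label
    'review_result': {
        'Safety management organization': [1, 2, 5],  # covered key-point numbers
        'Emergency response plan': [],                # empty list: nothing covered
    },
}

# A failed call would instead carry an error payload:
example_failed_result = {
    'chunk_id': 'chunk_0002',
    'chapter_classification': 'N/A',
    'review_result': {'error': 'timeout while calling the LLM API'},
}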