Explorar el Código

dev:添加了大纲审查的临时文件初版 (dev: add initial draft of the temporary files for outline review);

ChenJiSheng hace 1 mes
padre
commit
274c9aa8e2

+ 86 - 72
core/construction_review/component/ai_review_engine.py

@@ -54,7 +54,9 @@ import time
 from dataclasses import dataclass
 from dataclasses import dataclass
 from enum import Enum
 from enum import Enum
 from typing import Any, Dict, List, Optional, Sequence
 from typing import Any, Dict, List, Optional, Sequence
-
+import pandas as pd
+import json
+import ast  # 用于安全解析字符串为Python对象
 import pandas as pd
 import pandas as pd
 
 
 from core.base.task_models import TaskFileInfo
 from core.base.task_models import TaskFileInfo
@@ -88,7 +90,7 @@ from .reviewers.check_completeness.components.result_saver import ResultSaver
 from .reviewers.check_completeness.components.result_analyzer import ResultAnalyzer
 from .reviewers.check_completeness.components.result_analyzer import ResultAnalyzer
 from .reviewers.check_completeness.utils.file_utils import write_json
 from .reviewers.check_completeness.utils.file_utils import write_json
 from core.construction_review.component.reviewers.base_reviewer import ReviewResult
 from core.construction_review.component.reviewers.base_reviewer import ReviewResult
-
+from .reviewers.outline_check import outline_review_results_df
 @dataclass
 @dataclass
 class ReviewResult:
 class ReviewResult:
     """审查结果"""
     """审查结果"""
@@ -668,7 +670,6 @@ class AIReviewEngine(BaseReviewer):
         #     json.dump(review_content, f, ensure_ascii=False, indent=4)
         #     json.dump(review_content, f, ensure_ascii=False, indent=4)
         name = "completeness_check"
         name = "completeness_check"
         start_time = time.time()
         start_time = time.time()
-        
         try:
         try:
             # 验证review_content格式
             # 验证review_content格式
             if not isinstance(review_content, list):
             if not isinstance(review_content, list):
@@ -741,10 +742,17 @@ class AIReviewEngine(BaseReviewer):
 
 
             review_results = await review_pipeline.review(documents, specification)
             review_results = await review_pipeline.review(documents, specification)
             review_results_df = pd.DataFrame(review_results)
             review_results_df = pd.DataFrame(review_results)
-            df_section_label = review_results_df['section_label'].str.split('->').str[0]
-            review_results_df['title'] = df_section_label
+            chapter_labels = review_results_df['section_label'].str.split('->').str[0]
+            review_results_df['title'] = chapter_labels
             review_results_df.to_csv(Path('temp') / 'document_temp' / '2_spec_review_results.csv', encoding='utf-8-sig', index=False)
             review_results_df.to_csv(Path('temp') / 'document_temp' / '2_spec_review_results.csv', encoding='utf-8-sig', index=False)
-            review_results_flag = review_results_df["chapter_classification"].unique().tolist()
+            csv_file = rf'temp\document_temp\2_spec_review_results.csv'
+            path2 = rf'temp\document_temp\outlines_review_results.csv'
+            data_df = pd.read_csv(csv_file, encoding='utf-8-sig')
+            outline_review_results_df(data=data_df, path=path2)
+            df_filtered = review_results_df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
+            unique_chapter_labels = chapter_labels.unique().tolist()
+            chapter_classifications = df_filtered['chapter_classification']
+            review_results_flag = chapter_classifications.unique().tolist()
 
 
             # with open(r'temp\document_temp\1_spec_review_results.json', 'w', encoding='utf-8') as f:
             # with open(r'temp\document_temp\1_spec_review_results.json', 'w', encoding='utf-8') as f:
             #     json.dump(review_results, f, ensure_ascii=False, indent=4)
             #     json.dump(review_results, f, ensure_ascii=False, indent=4)
@@ -762,7 +770,7 @@ class AIReviewEngine(BaseReviewer):
             # logger.info(f"  规范覆盖汇总结果已保存至: {spec_summary_csv_path}")
             # logger.info(f"  规范覆盖汇总结果已保存至: {spec_summary_csv_path}")
             summary_rows = pd.DataFrame(summary_rows)
             summary_rows = pd.DataFrame(summary_rows)
             summary_rows = summary_rows[summary_rows['标签'].isin(review_results_flag)]
             summary_rows = summary_rows[summary_rows['标签'].isin(review_results_flag)]
-            # summary_rows.to_csv(str(spec_summary_csv_path), encoding='utf-8-sig', index=False)
+            summary_rows.to_csv(str(spec_summary_csv_path), encoding='utf-8-sig', index=False)
             summary_rows = summary_rows.to_dict('records')
             summary_rows = summary_rows.to_dict('records')
             # 生成缺失要点 JSON 列表,便于前端消费
             # 生成缺失要点 JSON 列表,便于前端消费
 
 
@@ -954,74 +962,80 @@ class AIReviewEngine(BaseReviewer):
         """
         """
         logger.info(f"开始大纲审查,trace_id: {trace_id_idx}")
         logger.info(f"开始大纲审查,trace_id: {trace_id_idx}")
 
 
-        # 1. 获取整体大纲(1级大纲目录)
-        overall_outline = ""
-
-        # 添加调试信息
-        logger.debug(f"outline_content结构: {list(outline_content.keys()) if outline_content else 'None'}")
-        outline_data = outline_content.get('outline', {})
-        logger.debug(f"outline_data结构: {list(outline_data.keys()) if outline_data else 'None'}")
-        chapters = outline_data.get('chapters', [])
-        logger.info(f"chapters数量: {len(chapters)}")
-
-        for i, chapter in enumerate(chapters):
-            chapter_title = chapter.get('title', 'N/A')
-            chapter_page = chapter.get('page', 'N/A')
-            logger.info(f"章节{i+1}: {chapter_title} (页码: {chapter_page})")
-            overall_outline += f"{chapter_title} (页码: {chapter_page})\n"
-
-        logger.info(f"生成的overall_outline长度: {len(overall_outline)}")
-        if overall_outline:
-            logger.info(f"overall_outline内容: {overall_outline[:200]}...")
-
-        # 2. 获取大纲各章节及其子目录的详细信息
-        detailed_outline = []
-
-        for chapter in chapters:
-            # 将每个章节作为整体项,包含标题、页码和子目录
-            chapter_content = f"\n{chapter['title']} (页码: {chapter['page']})\n"
-
-            # 添加子目录(如果有)
-            subsections = chapter.get('subsections', [])
-            if subsections:
-                chapter_content += "包含子目录:\n"
-                for subsection in subsections:
-                    indent = "  " * (subsection['level'] - 1)
-                    chapter_content += f"{indent}- {subsection['title']} (页码: {subsection['page']})\n"
-
-            # 将完整章节内容作为一个项添加到列表
-            detailed_outline.append(chapter_content)
-
-
-
-        logger.info(f"提取整体大纲完成{overall_outline}")
-        logger.info(f"提取详细大纲完成{detailed_outline}")
-
-        # 准备审查数据
-        review_data = {
-            'outline_content': outline_content,
-            'overall_outline': overall_outline,
-            'detailed_outline': detailed_outline,
-            'state': state,
-            'stage_name': stage_name
-        }
-
-        # 调用outline_reviewer进行审查
+        # CSV文件路径
+        csv_path = Path('temp') / 'document_temp' / 'outlines_review_results.csv'
+        
+        # 存储所有缺失项
+        missing_items = []
+        
         try:
         try:
-            outline_review_result = await self.outline_reviewer.outline_review(review_data, trace_id_idx, state,stage_name)
+            # 读取CSV文件
+            df = pd.read_csv(csv_path, encoding='utf-8-sig')
+            logger.info(f"成功读取CSV文件: {csv_path}, 共 {len(df)} 行")
+            
+            # 兼容新旧字段名
+            chapter_label_col = 'chapter_label' if 'chapter_label' in df.columns else 'section_label_first'
+            review_results_col = 'review_results_summary' if 'review_results_summary' in df.columns else 'merged_review_results'
+            
+            # 遍历每一行
+            for index, row in df.iterrows():
+                chapter_label = row.get(chapter_label_col, '')
+                merged_results_str = row.get(review_results_col, '')
+                
+                # 解析review_results_summary字典字符串
+                try:
+                    if pd.isna(merged_results_str) or merged_results_str == '':
+                        merged_results = {}
+                    else:
+                        # 尝试使用ast.literal_eval解析
+                        merged_results = ast.literal_eval(merged_results_str)
+                except (ValueError, SyntaxError):
+                    try:
+                        # 尝试使用json.loads解析
+                        merged_results = json.loads(merged_results_str)
+                    except (json.JSONDecodeError, TypeError):
+                        logger.warning(f"第 {index} 行无法解析review_results_summary: {merged_results_str}")
+                        merged_results = {}
+                
+                # 检查字典中的每个字段
+                if isinstance(merged_results, dict):
+                    for field_name, field_value in merged_results.items():
+                        # 检查列表是否为空
+                        if isinstance(field_value, list) and len(field_value) == 0:
+                            # 生成缺失项
+                            missing_item = {
+                                "check_item_code": "catalogue_completeness_check",
+                                "check_result": {
+                                    "issue_point": f"{field_name}缺失",
+                                    "location": "",
+                                    "suggestion": "",
+                                    "reason": "",
+                                    "risk_level": ""
+                                }
+                            }
+                            missing_items.append(missing_item)
+                            logger.info(f"发现缺失项: 章节[{chapter_label}] 字段[{field_name}]")
+            
+            logger.info(f"大纲审查完成,共发现 {len(missing_items)} 个缺失项")
+            
+        except FileNotFoundError:
+            logger.error(f"CSV文件不存在: {csv_path}")
+            return {
+                'outline_review_result': [],
+                'error': f'CSV文件不存在: {csv_path}'
+            }
         except Exception as e:
         except Exception as e:
-            logger.warning(f"大纲审查失败,但返回提取结果: {str(e)}")
-            outline_review_result = None
-
-        # 确保目录存在
-        # import os
-        # os.makedirs("temp/outline_result_temp", exist_ok=True)
-
-        # # with open("temp/outline_result_temp/outline_result.json","w",encoding="utf-8") as f:
-        # #     json.dump(outline_review_result,f,ensure_ascii=False,indent=4)
-        # 返回提取的大纲结果和审查结果
+            logger.error(f"大纲审查失败: {str(e)}", exc_info=True)
+            return {
+                'outline_review_result': [],
+                'error': f'大纲审查失败: {str(e)}'
+            }
+        
         return {
         return {
-            'outline_review_result': outline_review_result
+            'outline_review_result':
+                {
+                    "response": missing_items,
+                }
         }
         }
     
     
     async def reference_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,
     async def reference_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,

+ 127 - 0
core/construction_review/component/reviewers/outline_check.py

@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+使用pandas读取CSV文件
+"""
+import os
+import pandas as pd
+import json
+import ast  # 用于安全解析字符串为Python对象
+
+
def parse_review_result(review_result_str):
    """Parse a ``review_result`` cell into a Python object (normally a dict).

    Cells round-tripped through ``DataFrame.to_csv`` may be JSON text or a
    Python-literal string (single quotes); cells from an in-memory DataFrame
    may already be dicts; missing cells arrive as NaN/None.

    Args:
        review_result_str: Raw cell value from the CSV or DataFrame.

    Returns:
        The parsed object (usually ``dict``); ``{}`` when the value is
        missing, empty, or cannot be parsed by either parser.
    """
    # Already parsed — e.g. the DataFrame was built in memory, never CSV'd.
    if isinstance(review_result_str, dict):
        return review_result_str
    # NaN, None, and any other non-string value have nothing to parse.
    if not isinstance(review_result_str, str):
        return {}
    try:
        # JSON first: the common case for values written via json.dumps.
        return json.loads(review_result_str)
    except (json.JSONDecodeError, TypeError):
        try:
            # Fall back to Python-literal syntax (single-quoted dict reprs).
            return ast.literal_eval(review_result_str)
        except (ValueError, SyntaxError):
            return {}
+
def _dedupe_preserve_order(values):
    """Drop duplicates keeping first-seen order; tolerates unhashable items."""
    seen = set()
    result = []
    for value in values:
        try:
            if value in seen:
                continue
            seen.add(value)
        except TypeError:
            # Unhashable item (list/dict): fall back to linear equality scan.
            if value in result:
                continue
        result.append(value)
    return result


def merge_dict_fields_and_deduplicate(group):
    """Merge a sequence of dicts field-wise and deduplicate each field's values.

    For every key seen across the dicts in *group*, list values are
    concatenated and scalar values appended; duplicates are then removed
    while preserving first-seen order. Non-dict entries (e.g. ``{}`` from
    parse failures, or stray strings) are skipped.

    Args:
        group: Iterable of parsed review-result dicts.

    Returns:
        dict: ``{field_name: deduplicated list of values}``.
    """
    merged = {}
    for item in group:
        if not isinstance(item, dict):
            continue
        for key, value in item.items():
            bucket = merged.setdefault(key, [])
            if isinstance(value, list):
                bucket.extend(value)
            else:
                bucket.append(value)
    # Deduplicate per field; must not crash on unhashable elements,
    # which dict.fromkeys would.
    for key, values in merged.items():
        merged[key] = _dedupe_preserve_order(values)
    return merged
+
+
def outline_review_results_df(data, path=None):
    """Aggregate per-section review rows into one summary row per chapter.

    Chapters are identified by the first ``'->'``-separated segment of
    ``section_label``. All ``review_result`` dicts belonging to a chapter are
    merged field-wise with duplicates removed; each chapter keeps the
    ``chapter_classification`` of its first row.

    Args:
        data: Input DataFrame with at least ``section_label`` and
            ``chapter_classification`` columns; ``review_result`` is optional.
        path: Optional output CSV path. If the file already exists, rows are
            appended without a header; otherwise it is created with a header.

    Returns:
        DataFrame with columns ``chapter_label``, ``review_results_summary``
        and ``chapter_classification``, or ``None`` if processing failed.
    """
    try:
        # Work on a copy so the caller's DataFrame is not mutated with the
        # temporary 'title' / 'parsed_review_result' columns.
        df = data.copy()
        chapter_labels = df['section_label'].str.split('->').str[0]
        df['title'] = chapter_labels
        # First occurrence of each chapter carries its classification.
        df_filtered = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
        unique_chapter_labels = chapter_labels.unique().tolist()

        new_df = pd.DataFrame()
        new_df['chapter_label'] = unique_chapter_labels

        if 'review_result' in df.columns:
            df['parsed_review_result'] = df['review_result'].apply(parse_review_result)
            # Merge every chapter's parsed dicts field-wise, deduplicated.
            merged_by_title = {
                title: merge_dict_fields_and_deduplicate(group)
                for title, group in df.groupby('title')['parsed_review_result']
            }
            # Align summaries with the chapters' order of first appearance.
            new_df['review_results_summary'] = [
                merged_by_title.get(title, {}) for title in unique_chapter_labels
            ]
        else:
            # No per-section results available: keep the column shape stable.
            new_df['review_results_summary'] = [''] * len(unique_chapter_labels)

        new_df['chapter_classification'] = df_filtered['chapter_classification'].values

        if path:
            # Make sure the target directory exists before writing.
            out_dir = os.path.dirname(path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            if os.path.exists(path):
                # Existing file: append rows without repeating the header.
                new_df.to_csv(path, mode='a', index=False, encoding='utf-8-sig', header=False)
            else:
                # First write: create the file with a header row.
                new_df.to_csv(path, mode='w', index=False, encoding='utf-8-sig')

        return new_df

    except KeyError as e:
        print(f"错误: 输入数据缺少必需的列 {e}!")
    except Exception as e:
        print(f"处理大纲审查结果时发生错误: {e}")
+
if __name__ == '__main__':
    # Ad-hoc driver: aggregate the latest spec-review CSV into per-chapter
    # outline summaries and append them to the outlines results file.
    source_csv = rf'temp\document_temp\2_spec_review_results.csv'
    target_csv = rf'temp\document_temp\outlines_review_results.csv'
    source_df = pd.read_csv(source_csv, encoding='utf-8-sig')
    outline_review_results_df(data=source_df, path=target_csv)

+ 3 - 3
core/construction_review/workflows/ai_review_workflow.py

@@ -498,9 +498,9 @@ class AIReviewWorkflow:
                 logger.info(f"开始执行大纲审查")
                 logger.info(f"开始执行大纲审查")
 
 
 
 
-                # outline_review_result = await self.ai_review_engine.outline_check(state["callback_task_id"], state["structured_content"],
-                #                                     state, state.get("stage_name", "大纲审查"))
-                outline_review_result = {} 
+                outline_review_result = await self.ai_review_engine.outline_check(state["callback_task_id"], state["structured_content"],
+                                                    state, state.get("stage_name", "大纲审查"))
+                # outline_review_result = {} 
                 check_completeness_result = await self.ai_review_engine.check_completeness(
                 check_completeness_result = await self.ai_review_engine.check_completeness(
                     trace_id_idx = state["callback_task_id"],
                     trace_id_idx = state["callback_task_id"],
                     review_content = state["structured_content"]["chunks"],
                     review_content = state["structured_content"]["chunks"],