Quellcode durchsuchen

dev:添加了大纲审查的临时文件初版;

ChenJiSheng vor 1 Monat
Ursprung
Commit
274c9aa8e2

+ 86 - 72
core/construction_review/component/ai_review_engine.py

@@ -54,7 +54,9 @@ import time
 from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, List, Optional, Sequence
-
+import pandas as pd
+import json
+import ast  # 用于安全解析字符串为Python对象
 import pandas as pd
 
 from core.base.task_models import TaskFileInfo
@@ -88,7 +90,7 @@ from .reviewers.check_completeness.components.result_saver import ResultSaver
 from .reviewers.check_completeness.components.result_analyzer import ResultAnalyzer
 from .reviewers.check_completeness.utils.file_utils import write_json
 from core.construction_review.component.reviewers.base_reviewer import ReviewResult
-
+from .reviewers.outline_check import outline_review_results_df
 @dataclass
 class ReviewResult:
     """审查结果"""
@@ -668,7 +670,6 @@ class AIReviewEngine(BaseReviewer):
         #     json.dump(review_content, f, ensure_ascii=False, indent=4)
         name = "completeness_check"
         start_time = time.time()
-        
         try:
             # 验证review_content格式
             if not isinstance(review_content, list):
@@ -741,10 +742,17 @@ class AIReviewEngine(BaseReviewer):
 
             review_results = await review_pipeline.review(documents, specification)
             review_results_df = pd.DataFrame(review_results)
-            df_section_label = review_results_df['section_label'].str.split('->').str[0]
-            review_results_df['title'] = df_section_label
+            chapter_labels = review_results_df['section_label'].str.split('->').str[0]
+            review_results_df['title'] = chapter_labels
             review_results_df.to_csv(Path('temp') / 'document_temp' / '2_spec_review_results.csv', encoding='utf-8-sig', index=False)
-            review_results_flag = review_results_df["chapter_classification"].unique().tolist()
+            csv_file = rf'temp\document_temp\2_spec_review_results.csv'
+            path2 = rf'temp\document_temp\outlines_review_results.csv'
+            data_df = pd.read_csv(csv_file, encoding='utf-8-sig')
+            outline_review_results_df(data=data_df, path=path2)
+            df_filtered = review_results_df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
+            unique_chapter_labels = chapter_labels.unique().tolist()
+            chapter_classifications = df_filtered['chapter_classification']
+            review_results_flag = chapter_classifications.unique().tolist()
 
             # with open(r'temp\document_temp\1_spec_review_results.json', 'w', encoding='utf-8') as f:
             #     json.dump(review_results, f, ensure_ascii=False, indent=4)
@@ -762,7 +770,7 @@ class AIReviewEngine(BaseReviewer):
             # logger.info(f"  规范覆盖汇总结果已保存至: {spec_summary_csv_path}")
             summary_rows = pd.DataFrame(summary_rows)
             summary_rows = summary_rows[summary_rows['标签'].isin(review_results_flag)]
-            # summary_rows.to_csv(str(spec_summary_csv_path), encoding='utf-8-sig', index=False)
+            summary_rows.to_csv(str(spec_summary_csv_path), encoding='utf-8-sig', index=False)
             summary_rows = summary_rows.to_dict('records')
             # 生成缺失要点 JSON 列表,便于前端消费
 
@@ -954,74 +962,80 @@ class AIReviewEngine(BaseReviewer):
         """
         logger.info(f"开始大纲审查,trace_id: {trace_id_idx}")
 
-        # 1. 获取整体大纲(1级大纲目录)
-        overall_outline = ""
-
-        # 添加调试信息
-        logger.debug(f"outline_content结构: {list(outline_content.keys()) if outline_content else 'None'}")
-        outline_data = outline_content.get('outline', {})
-        logger.debug(f"outline_data结构: {list(outline_data.keys()) if outline_data else 'None'}")
-        chapters = outline_data.get('chapters', [])
-        logger.info(f"chapters数量: {len(chapters)}")
-
-        for i, chapter in enumerate(chapters):
-            chapter_title = chapter.get('title', 'N/A')
-            chapter_page = chapter.get('page', 'N/A')
-            logger.info(f"章节{i+1}: {chapter_title} (页码: {chapter_page})")
-            overall_outline += f"{chapter_title} (页码: {chapter_page})\n"
-
-        logger.info(f"生成的overall_outline长度: {len(overall_outline)}")
-        if overall_outline:
-            logger.info(f"overall_outline内容: {overall_outline[:200]}...")
-
-        # 2. 获取大纲各章节及其子目录的详细信息
-        detailed_outline = []
-
-        for chapter in chapters:
-            # 将每个章节作为整体项,包含标题、页码和子目录
-            chapter_content = f"\n{chapter['title']} (页码: {chapter['page']})\n"
-
-            # 添加子目录(如果有)
-            subsections = chapter.get('subsections', [])
-            if subsections:
-                chapter_content += "包含子目录:\n"
-                for subsection in subsections:
-                    indent = "  " * (subsection['level'] - 1)
-                    chapter_content += f"{indent}- {subsection['title']} (页码: {subsection['page']})\n"
-
-            # 将完整章节内容作为一个项添加到列表
-            detailed_outline.append(chapter_content)
-
-
-
-        logger.info(f"提取整体大纲完成{overall_outline}")
-        logger.info(f"提取详细大纲完成{detailed_outline}")
-
-        # 准备审查数据
-        review_data = {
-            'outline_content': outline_content,
-            'overall_outline': overall_outline,
-            'detailed_outline': detailed_outline,
-            'state': state,
-            'stage_name': stage_name
-        }
-
-        # 调用outline_reviewer进行审查
+        # CSV文件路径
+        csv_path = Path('temp') / 'document_temp' / 'outlines_review_results.csv'
+        
+        # 存储所有缺失项
+        missing_items = []
+        
         try:
-            outline_review_result = await self.outline_reviewer.outline_review(review_data, trace_id_idx, state,stage_name)
+            # 读取CSV文件
+            df = pd.read_csv(csv_path, encoding='utf-8-sig')
+            logger.info(f"成功读取CSV文件: {csv_path}, 共 {len(df)} 行")
+            
+            # 兼容新旧字段名
+            chapter_label_col = 'chapter_label' if 'chapter_label' in df.columns else 'section_label_first'
+            review_results_col = 'review_results_summary' if 'review_results_summary' in df.columns else 'merged_review_results'
+            
+            # 遍历每一行
+            for index, row in df.iterrows():
+                chapter_label = row.get(chapter_label_col, '')
+                merged_results_str = row.get(review_results_col, '')
+                
+                # 解析review_results_summary字典字符串
+                try:
+                    if pd.isna(merged_results_str) or merged_results_str == '':
+                        merged_results = {}
+                    else:
+                        # 尝试使用ast.literal_eval解析
+                        merged_results = ast.literal_eval(merged_results_str)
+                except (ValueError, SyntaxError):
+                    try:
+                        # 尝试使用json.loads解析
+                        merged_results = json.loads(merged_results_str)
+                    except (json.JSONDecodeError, TypeError):
+                        logger.warning(f"第 {index} 行无法解析review_results_summary: {merged_results_str}")
+                        merged_results = {}
+                
+                # 检查字典中的每个字段
+                if isinstance(merged_results, dict):
+                    for field_name, field_value in merged_results.items():
+                        # 检查列表是否为空
+                        if isinstance(field_value, list) and len(field_value) == 0:
+                            # 生成缺失项
+                            missing_item = {
+                                "check_item_code": "catalogue_completeness_check",
+                                "check_result": {
+                                    "issue_point": f"{field_name}缺失",
+                                    "location": "",
+                                    "suggestion": "",
+                                    "reason": "",
+                                    "risk_level": ""
+                                }
+                            }
+                            missing_items.append(missing_item)
+                            logger.info(f"发现缺失项: 章节[{chapter_label}] 字段[{field_name}]")
+            
+            logger.info(f"大纲审查完成,共发现 {len(missing_items)} 个缺失项")
+            
+        except FileNotFoundError:
+            logger.error(f"CSV文件不存在: {csv_path}")
+            return {
+                'outline_review_result': [],
+                'error': f'CSV文件不存在: {csv_path}'
+            }
         except Exception as e:
-            logger.warning(f"大纲审查失败,但返回提取结果: {str(e)}")
-            outline_review_result = None
-
-        # 确保目录存在
-        # import os
-        # os.makedirs("temp/outline_result_temp", exist_ok=True)
-
-        # # with open("temp/outline_result_temp/outline_result.json","w",encoding="utf-8") as f:
-        # #     json.dump(outline_review_result,f,ensure_ascii=False,indent=4)
-        # 返回提取的大纲结果和审查结果
+            logger.error(f"大纲审查失败: {str(e)}", exc_info=True)
+            return {
+                'outline_review_result': [],
+                'error': f'大纲审查失败: {str(e)}'
+            }
+        
         return {
-            'outline_review_result': outline_review_result
+            'outline_review_result':
+                {
+                    "response": missing_items,
+                }
         }
     
     async def reference_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,

+ 127 - 0
core/construction_review/component/reviewers/outline_check.py

@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+使用pandas读取CSV文件
+"""
+import os
+import pandas as pd
+import json
+import ast  # 用于安全解析字符串为Python对象
+
+
def parse_review_result(review_result_str):
    """Parse a serialized review-result cell into a dict.

    Tries JSON first, then falls back to a Python-literal parse via
    ``ast.literal_eval``. NaN, empty, or unparseable input yields ``{}``.
    """
    if pd.isna(review_result_str) or review_result_str == '':
        return {}
    try:
        return json.loads(review_result_str)
    except (json.JSONDecodeError, TypeError):
        pass
    try:
        return ast.literal_eval(review_result_str)
    except (ValueError, SyntaxError):
        return {}
+
def merge_dict_fields_and_deduplicate(group):
    """Merge a sequence of dicts field-by-field and de-duplicate each field.

    For every key across all dicts in *group*, list values are concatenated
    and scalar values appended; the resulting list is then de-duplicated
    while preserving first-seen order. Non-dict items in *group* are ignored.

    Fix over the original: ``list(dict.fromkeys(...))`` raised ``TypeError``
    whenever a merged list contained unhashable items (nested dicts/lists),
    which parsed review results commonly do. Dedup now falls back to an
    equality scan for unhashable values.

    Args:
        group: iterable of dicts (non-dicts are skipped).

    Returns:
        dict mapping each key to its merged, order-preserving de-duplicated
        list of values.
    """
    merged = {}
    for item in group:
        if not isinstance(item, dict):
            continue
        for key, value in item.items():
            bucket = merged.setdefault(key, [])
            if isinstance(value, list):
                bucket.extend(value)
            else:
                bucket.append(value)

    for key, values in merged.items():
        deduped = []
        seen = set()  # fast path for hashable values
        for value in values:
            try:
                is_new = value not in seen
                if is_new:
                    seen.add(value)
            except TypeError:
                # unhashable (dict/list) — linear scan against kept values
                is_new = value not in deduped
            if is_new:
                deduped.append(value)
        merged[key] = deduped
    return merged
+
+
def outline_review_results_df(data, path=None):
    """Summarise section-level review results into one row per top chapter.

    The top chapter is the first ``'->'``-separated segment of
    ``section_label``. When a ``review_result`` column is present, each
    row's serialized result is parsed and all rows of a chapter are merged
    and de-duplicated field-by-field.

    Fixes over the original: the input DataFrame is no longer mutated in
    place (it used to gain ``title``/``parsed_review_result`` columns on the
    caller's frame); the unreachable ``except FileNotFoundError`` (no file
    is opened here) is gone; debug ``print``s were removed from the success
    path; missing chapters map to ``{}`` instead of NaN.

    Args:
        data: DataFrame with at least ``section_label`` and
            ``chapter_classification`` columns; optionally ``review_result``
            holding a serialized dict per row.
        path: optional CSV output path; appends without a header when the
            file already exists, otherwise writes a new file with a header.

    Returns:
        DataFrame with columns ``chapter_label``, ``review_results_summary``
        (``''`` per row when no ``review_result`` column exists) and
        ``chapter_classification`` (taken from the chapter's first row),
        or ``None`` when processing fails (errors reported, not raised,
        to keep the review pipeline best-effort).
    """
    try:
        # Work on a copy so the caller's DataFrame is not mutated.
        df = data.copy()
        chapter_labels = df['section_label'].str.split('->').str[0]
        df['title'] = chapter_labels
        first_rows = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
        unique_chapter_labels = chapter_labels.unique().tolist()

        new_df = pd.DataFrame({'chapter_label': unique_chapter_labels})

        if 'review_result' in df.columns:
            df['parsed_review_result'] = df['review_result'].apply(parse_review_result)
            # Merge every chapter's parsed dicts, de-duplicating field values.
            merged_by_title = {
                title: merge_dict_fields_and_deduplicate(group)
                for title, group in df.groupby('title')['parsed_review_result']
            }
            # Keep the chapter order identical to unique_chapter_labels.
            new_df['review_results_summary'] = [
                merged_by_title.get(title, {}) for title in unique_chapter_labels
            ]
        else:
            new_df['review_results_summary'] = [''] * len(unique_chapter_labels)

        # first_rows follows first-occurrence order, same as unique_chapter_labels.
        new_df['chapter_classification'] = first_rows['chapter_classification'].values

        if path:
            # First write carries the header; subsequent runs append rows only.
            first_write = not os.path.exists(path)
            new_df.to_csv(
                path,
                mode='w' if first_write else 'a',
                index=False,
                encoding='utf-8-sig',
                header=first_write,
            )

        return new_df

    except Exception as e:
        # Best-effort: report and return None rather than aborting the pipeline.
        print(f"读取CSV文件时发生错误: {e}")
        return None
+
if __name__ == '__main__':
    # Standalone run: summarise the spec-review CSV into the outline CSV.
    source_csv = rf'temp\document_temp\2_spec_review_results.csv'
    target_csv = rf'temp\document_temp\outlines_review_results.csv'
    outline_review_results_df(
        data=pd.read_csv(source_csv, encoding='utf-8-sig'),
        path=target_csv,
    )

+ 3 - 3
core/construction_review/workflows/ai_review_workflow.py

@@ -498,9 +498,9 @@ class AIReviewWorkflow:
                 logger.info(f"开始执行大纲审查")
 
 
-                # outline_review_result = await self.ai_review_engine.outline_check(state["callback_task_id"], state["structured_content"],
-                #                                     state, state.get("stage_name", "大纲审查"))
-                outline_review_result = {} 
+                outline_review_result = await self.ai_review_engine.outline_check(state["callback_task_id"], state["structured_content"],
+                                                    state, state.get("stage_name", "大纲审查"))
+                # outline_review_result = {} 
                 check_completeness_result = await self.ai_review_engine.check_completeness(
                     trace_id_idx = state["callback_task_id"],
                     review_content = state["structured_content"]["chunks"],