#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Read outline-review result CSVs with pandas and aggregate them per chapter
and per chapter classification.
"""
import os
import pandas as pd
import json
import ast  # safe parsing of Python-literal strings into objects

from foundation.observability.logger.loggering import review_logger as logger


def parse_review_result(review_result_str):
    """
    Parse a review_result value into a dict.

    Accepts a dict (returned unchanged), NaN or the empty string (returns {}),
    a JSON string, or a Python-literal string (e.g. single-quoted dicts).
    Returns {} when the value cannot be parsed at all.
    """
    try:
        if isinstance(review_result_str, dict):
            return review_result_str
        if pd.isna(review_result_str) or review_result_str == '':
            return {}
        return json.loads(review_result_str)
    except (json.JSONDecodeError, TypeError):
        try:
            # Fall back to Python-literal syntax (single quotes, tuples, ...).
            return ast.literal_eval(review_result_str)
        except (ValueError, SyntaxError):
            return {}


def outline_review_results_df(data, path=None):
    """
    Aggregate outline review results per top-level chapter.

    Args:
        data: input DataFrame; must contain 'section_label' and
            'chapter_classification' columns, and optionally 'review_result'.
            The input is not mutated.
        path: optional CSV output path; appended to (without header) if the
            file already exists, created with header otherwise.

    Returns:
        DataFrame with columns:
        - chapter_label: top-level chapter label (e.g. "第一章编制依据")
        - review_results_summary: merged review-result dict per chapter
        - chapter_classification: chapter category (e.g. "basis", "overview")
        or None when processing fails (errors are logged, not raised).
    """
    logger.info(f"开始处理大纲审查结果,数据行数: {len(data)}")
    try:
        # Work on a copy so the caller's DataFrame is not mutated.
        df = data.copy()
        # The first segment of "A->B->C" is the top-level chapter label.
        chapter_labels = df['section_label'].str.split('->').str[0]
        df['title'] = chapter_labels
        # One row per chapter, keeping the first occurrence. This order
        # matches chapter_labels.unique() (both are first-occurrence order),
        # so classifications below align with unique_chapter_labels.
        df_filtered = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)
        unique_chapter_labels = chapter_labels.unique().tolist()
        chapter_classifications = df_filtered['chapter_classification']

        new_df = pd.DataFrame()
        new_df['chapter_label'] = unique_chapter_labels

        if 'review_result' in df.columns:
            df['parsed_review_result'] = df['review_result'].apply(parse_review_result)
            grouped_data = df.groupby('title')['parsed_review_result']
            # Merge the per-row dicts of each chapter, deduplicating the
            # list values while preserving their order.
            merged_dict = {
                title: merge_dict_lists_and_deduplicate(group.tolist())
                for title, group in grouped_data
            }
            merged_data = pd.DataFrame(
                list(merged_dict.items()),
                columns=['title', 'review_results_summary'],
            )
            # Reindex so summaries follow the original chapter order.
            new_df['review_results_summary'] = (
                merged_data.set_index('title')
                .reindex(unique_chapter_labels)['review_results_summary']
                .tolist()
            )
        else:
            # No review_result column: fill with empty strings.
            new_df['review_results_summary'] = [''] * len(unique_chapter_labels)

        new_df['chapter_classification'] = chapter_classifications.values

        if path:
            if os.path.exists(path):
                # File exists: append rows without rewriting the header.
                new_df.to_csv(path, mode='a', index=False, encoding='utf-8-sig', header=False)
            else:
                # First write: include the header.
                new_df.to_csv(path, mode='w', index=False, encoding='utf-8-sig')

        logger.info(f"大纲审查结果处理完成,输出 {len(new_df)} 条记录")
        return new_df

    except FileNotFoundError:
        # to_csv raises this when the target directory does not exist.
        logger.error(f"文件不存在: {path}")
        return None
    except Exception as e:
        # Best-effort contract: log and return None instead of propagating.
        logger.error(f"处理大纲审查结果时发生错误: {e}")
        return None


def merge_dict_lists_and_deduplicate(dict_list):
    """
    Merge a list of dicts whose values are lists (or scalars).

    Values under the same key are concatenated (scalars are appended) and
    then deduplicated while preserving first-seen order. Non-dict entries
    in dict_list are ignored.

    Args:
        dict_list: list of dicts to merge.

    Returns:
        A single dict mapping each key to a deduplicated list of values.
    """
    merged = {}
    for d in dict_list:
        if isinstance(d, dict):
            for key, value in d.items():
                if key not in merged:
                    merged[key] = []
                if isinstance(value, list):
                    merged[key].extend(value)
                else:
                    merged[key].append(value)
    # dict.fromkeys deduplicates while keeping insertion order.
    for key in merged:
        merged[key] = list(dict.fromkeys(merged[key]))
    return merged


def merge_results_by_classification(df):
    """
    Merge rows sharing a chapter_classification label.

    For each classification, chapter_label values are collected into a list
    and the review_results_summary dicts are merged via
    merge_dict_lists_and_deduplicate. The input DataFrame is not mutated.

    Args:
        df: DataFrame as returned by outline_review_results_df, with columns
            'chapter_label', 'review_results_summary' (str or dict) and
            'chapter_classification'.

    Returns:
        DataFrame with one row per unique chapter_classification.

    Raises:
        Exception: re-raised after logging when processing fails.
    """
    logger.info(f"开始按chapter_classification合并结果,数据行数: {len(df)}")
    try:
        # assign() returns a copy, so the caller's DataFrame stays untouched.
        work = df.assign(
            parsed_summary=df['review_results_summary'].apply(parse_review_result)
        )

        result_data = []
        for classification, group in work.groupby('chapter_classification'):
            chapter_labels = group['chapter_label'].tolist()
            merged_dict = merge_dict_lists_and_deduplicate(group['parsed_summary'].tolist())
            result_data.append({
                'chapter_classification': classification,
                'chapter_label': chapter_labels,
                'review_results_summary': merged_dict,
            })

        result_df = pd.DataFrame(result_data)

        logger.info(f"合并完成,输出 {len(result_df)} 条记录")
        return result_df

    except Exception as e:
        logger.error(f"按chapter_classification合并结果时发生错误: {e}")
        raise


def get_empty_list_keys(dict_data):
    """
    Return the keys of dict_data whose values are empty lists.

    Args:
        dict_data: input dict; non-dict inputs yield [].

    Returns:
        list: keys whose value is an empty list.
    """
    # Guard against non-dict input instead of raising.
    if not isinstance(dict_data, dict):
        return []
    return [
        key
        for key, value in dict_data.items()
        if isinstance(value, list) and len(value) == 0
    ]


if __name__ == '__main__':
    csv_file = r'temp\construction_review\document_temp\2_spec_review_results.csv'
    path2 = r'temp\construction_review\document_temp\outlines_review_results.csv'
    df = pd.read_csv(csv_file, encoding='utf-8-sig')
    outline_review_results_df(data=df, path=path2)