|
@@ -0,0 +1,105 @@
|
|
|
|
|
+import csv
|
|
|
|
|
+import os
|
|
|
|
|
+
|
|
|
|
|
def _clean_cell(value):
    """Return ``value`` without surrounding whitespace; ``None``/empty becomes ``""``."""
    return value.strip() if value else ""


def _write_csv(path, fieldnames, rows):
    """Write ``rows`` (dicts) to ``path`` as UTF-8-with-BOM CSV, header line first."""
    with open(path, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_standard_csv(input_file: str, output_file: str, delete_summary_file: str) -> None:
    """
    De-duplicate a CSV file, keeping the most complete row of each duplicate group.

    Rows are grouped by the stripped values of the three key columns
    (标准编号, 标准名称, 状态). Within a group, the row with the fewest empty
    cells among the six target columns is kept; on a tie the row that appeared
    first wins. Every other row is written to a separate "deleted" summary file.
    Both output files reuse the input header and are written as ``utf-8-sig``.

    Problems (missing file, missing required columns, malformed rows, ...) are
    reported on stdout and are not re-raised.

    :param input_file: path of the source CSV file
    :param output_file: path of the de-duplicated output file
    :param delete_summary_file: path of the file collecting the removed rows
    """
    # The six columns inspected when counting empty cells.
    target_fields = [
        "标准编号", "标准名称", "状态",
        "发布日期", "实施日期", "发布部门"
    ]
    # The columns that identify a duplicate group.
    key_fields = ("标准编号", "标准名称", "状态")

    # key tuple -> {"data": row, "empty_count": int}; dict order is first-seen order.
    unique_data = {}
    # Every row that lost against another row of the same group.
    deleted_data = []

    try:
        # 1. Read the source file ('utf-8-sig' drops a leading BOM from the header).
        with open(input_file, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.DictReader(f)
            # An empty file yields no header (None); normalise that to an empty
            # list so the required-column check below reports it clearly.
            fieldnames = list(reader.fieldnames or [])

            print("📋 读取到CSV表头:", fieldnames)

            missing_fields = [field for field in target_fields if field not in fieldnames]
            if missing_fields:
                raise ValueError(f"CSV文件缺少必填字段:{missing_fields}")

            print(f"✅ 成功读取原始文件,共{len(fieldnames)}列,开始处理数据...")

            for row in reader:
                # DictReader stores surplus cells under the key None. Such a row
                # cannot be written back with the original header, so stop here,
                # before any output file has been created.
                if None in row:
                    raise ValueError(f"第{reader.line_num}行的列数超过表头列数,无法处理")

                # A tuple key cannot collide when a cell itself contains the
                # separator character, unlike a joined string.
                key = tuple(_clean_cell(row[name]) for name in key_fields)

                # None, "" and whitespace-only cells all count as empty.
                empty_count = sum(
                    1 for field in target_fields if not _clean_cell(row[field])
                )

                # 2. Keep only the row with the fewest empty cells per key.
                existing = unique_data.get(key)
                if existing is None:
                    unique_data[key] = {"data": row, "empty_count": empty_count}
                elif empty_count < existing["empty_count"]:
                    # The new row is more complete: it replaces the stored one.
                    deleted_data.append(existing["data"])
                    unique_data[key] = {"data": row, "empty_count": empty_count}
                else:
                    # Equal or worse: the earlier row stays, this one is dropped.
                    deleted_data.append(row)

        # 3. Export the de-duplicated rows.
        _write_csv(output_file, fieldnames, (item["data"] for item in unique_data.values()))

        # 4. Export the removed rows.
        _write_csv(delete_summary_file, fieldnames, deleted_data)

        # Summary of the run.
        print("=" * 50)
        print("📊 处理完成!")
        print(f"原始数据总数:{len(unique_data) + len(deleted_data)} 条")
        print(f"去重后保留:{len(unique_data)} 条")
        print(f"删除并汇总:{len(deleted_data)} 条")
        print(f"✅ 去重文件:{output_file}")
        print(f"✅ 删除汇总文件:{delete_summary_file}")
        print("=" * 50)

    except FileNotFoundError as e:
        # The missing path may be an output location, not only the input file.
        print(f"❌ 错误:未找到文件 {e.filename or input_file}")
    except Exception as e:
        print(f"❌ 处理失败:{str(e)}")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    import argparse

    # Default locations; each one is used only when the corresponding path is
    # not supplied on the command line.
    INPUT_CSV = r"F:\时效性相关文档\新增标准原材料\原文档重新查询.csv"
    OUTPUT_CSV = r"F:\时效性相关文档\新增标准原材料\原文档重新查询_去重后数据.csv"
    DELETE_SUMMARY = r"F:\时效性相关文档\新增标准原材料\原文档重新查询_删除数据汇总.csv"

    # All three positionals are optional, so running the script without
    # arguments behaves exactly as it did with the hard-coded paths.
    parser = argparse.ArgumentParser(description="CSV去重:保留空字段最少的数据,删除数据单独汇总")
    parser.add_argument("input_file", nargs="?", default=INPUT_CSV, help="原始CSV文件路径")
    parser.add_argument("output_file", nargs="?", default=OUTPUT_CSV, help="去重后输出文件路径")
    parser.add_argument("delete_summary_file", nargs="?", default=DELETE_SUMMARY, help="删除数据汇总文件路径")
    args = parser.parse_args()

    # Run the de-duplication.
    process_standard_csv(args.input_file, args.output_file, args.delete_summary_file)
|