|
|
@@ -0,0 +1,209 @@
|
|
|
+import json
|
|
|
+import os
|
|
|
+import re
|
|
|
+import traceback
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
+
|
|
|
def _iter_document_items(obj):
    """Yield every dict that has both a ``title`` and an ``id`` key.

    Lists are walked element by element. A dict missing either key is walked
    through its values; a dict that has both is yielded as-is and is not
    searched any deeper. Any other value is ignored.
    """
    if isinstance(obj, list):
        for element in obj:
            yield from _iter_document_items(element)
    elif isinstance(obj, dict):
        if 'title' in obj and 'id' in obj:
            yield obj
        else:
            for value in obj.values():
                yield from _iter_document_items(value)


def load_document_mapping(document_json_path):
    """Build a ``title -> {"id", "file_url"}`` lookup from a JSON file.

    The JSON may be arbitrarily nested: every dict that carries both a
    ``title`` and an ``id`` key is treated as a document entry, wherever it
    sits in the structure.

    Args:
        document_json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A dict keyed by the whitespace-normalized title. Each value is a dict
        with the entry's ``id`` and ``file_url`` (``''`` when ``file_url`` is
        absent). When several entries share a normalized title, the one
        encountered last wins. If the file cannot be read or parsed, the
        error is printed with its traceback and whatever was collected so far
        (possibly an empty dict) is returned.
    """
    mapping = {}
    try:
        with open(document_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Collect every entry that carries both title and id.
        valid_items = list(_iter_document_items(data))
        print(f"找到 {len(valid_items)} 个包含 title/id 的条目")

        for item in valid_items:
            title = item.get('title', '')
            if not isinstance(title, str):
                # A null/numeric title cannot be matched against a file name.
                # Skip it instead of letting the exception handler below
                # abandon all remaining entries.
                continue
            # Normalize so that titles differing only in spacing still match.
            title = normalize_whitespace(title)
            if title:
                mapping[title] = {
                    "id": item.get('id', ''),
                    "file_url": item.get('file_url', ''),
                }

    except Exception as e:
        # Best effort: report the problem and return what has been loaded.
        print(f"加载失败详情:{e}")
        traceback.print_exc()

    return mapping
|
|
|
+
|
|
|
+
|
|
|
def normalize_whitespace(text):
    """Collapse each run of whitespace into one space and trim both ends.

    Newlines, carriage returns, tabs and every other character Python treats
    as whitespace are handled alike.

    Args:
        text: The string to clean. ``None`` is accepted and treated as empty.

    Returns:
        The cleaned string; ``''`` when ``text`` is ``None``, empty, or
        consists only of whitespace.
    """
    if text is None:
        return ''
    # split() with no separator cuts on runs of whitespace and discards
    # leading/trailing empties, so joining the pieces with a single space
    # does the replace + collapse + strip in one pass.
    return ' '.join(text.split())
|
|
|
+
|
|
|
+
|
|
|
def extract_filename_from_path(file_path):
    """Return the file name of *file_path* without directory or extension.

    The result is passed through ``normalize_whitespace`` so it can be
    compared with titles cleaned the same way.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return normalize_whitespace(stem)
|
|
|
+
|
|
|
+
|
|
|
def process_json_file(file_path, document_mapping, output_dir, failed_matches_list):
    """Annotate one JSON file with document ids and write it to *output_dir*.

    The file's top level must be a JSON object. For each of its values that
    is a dict holding a ``matedata`` dict with a ``source_file`` key (the key
    is spelled ``matedata`` in the data), the source file's base name is
    looked up in *document_mapping*. On a hit, ``document_id`` and
    ``file_url`` are copied into ``matedata``; on a miss both are set to
    ``''`` and a record describing the entry is appended to
    *failed_matches_list*. Values that do not have this shape are left
    untouched and skipped.

    Args:
        file_path: Path of the input JSON file.
        document_mapping: Lookup of normalized title -> dict with ``id`` and
            ``file_url``.
        output_dir: Directory for the result; created if missing. The output
            keeps the input's base name.
        failed_matches_list: List that receives one dict per unmatched entry
            (mutated in place).

    Returns:
        None. Any exception (unreadable file, invalid JSON, non-object top
        level, write failure) is printed with its traceback and swallowed, so
        processing of other files can continue.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Counters for the per-file summary line.
        match_count = 0
        no_match_count = 0

        for item in data.values():
            # Skip malformed entries rather than letting one bad value abort
            # the whole file (which would leave it without any output).
            if not isinstance(item, dict):
                continue
            metadata = item.get('matedata')
            if not isinstance(metadata, dict) or 'source_file' not in metadata:
                continue

            source_file = metadata['source_file']
            if isinstance(source_file, str):
                filename = extract_filename_from_path(source_file)
            else:
                # A non-string path cannot be parsed; the empty name can
                # never match, so the entry is reported as a failure below.
                filename = ''

            # Debug aid: show the name that is about to be looked up.
            print(f"\n待匹配文件名:{filename}")

            if filename in document_mapping:
                doc_info = document_mapping[filename]
                metadata['document_id'] = doc_info['id']
                metadata['file_url'] = doc_info['file_url']
                match_count += 1
                print(f"✅ 匹配成功:{filename} -> ID: {doc_info['id']}")
            else:
                metadata['document_id'] = ''
                metadata['file_url'] = ''
                no_match_count += 1
                print(f"❌ 匹配失败:{filename}(未在映射中找到)")

                # Keep enough context to trace the failure back to its chunk.
                failed_matches_list.append({
                    "source_file": source_file,
                    "extracted_filename": filename,
                    "chunk_id": metadata.get('chunk_id', ''),
                    "section_label": metadata.get('section_label', ''),
                    "category_info": item.get('category_info', {}),
                    "tags": item.get('tags', {}),
                    "input_file": os.path.basename(file_path),
                    "timestamp": datetime.now().isoformat(),
                })

        # Write the annotated data under the same file name.
        os.makedirs(output_dir, exist_ok=True)
        output_file_path = os.path.join(output_dir, os.path.basename(file_path))
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"\n文件处理结果:匹配成功 {match_count} 条,失败 {no_match_count} 条")

    except Exception as e:
        # Best effort per file: report and let the caller move on.
        print(f"处理文件失败:{e}")
        traceback.print_exc()
|
|
|
+
|
|
|
+
|
|
|
def save_failed_matches_report(failed_matches, output_path):
    """Write the collected match failures to a JSON report.

    Nothing is written when *failed_matches* is empty; a success message is
    printed instead.

    Args:
        failed_matches: List of failure records. Each record must have an
            ``extracted_filename`` key, which is used to count distinct files.
        output_path: Destination of the report. Missing parent directories
            are created.

    Returns:
        None. Errors while creating the directory or writing the file are
        printed and swallowed.
    """
    if not failed_matches:
        print("\n✅ 所有小节都匹配成功,无失败记录")
        return

    distinct_files = {entry["extracted_filename"] for entry in failed_matches}
    report = {
        "generated_at": datetime.now().isoformat(),
        "summary": {
            "total_failed": len(failed_matches),
            "unique_files": len(distinct_files),
        },
        "failed_matches": failed_matches,
    }

    try:
        # The target directory may not exist yet (e.g. when no input file
        # was written successfully), so create it before opening the file.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"\n📄 匹配失败报告已保存:{output_path}")
        print(f"   共 {len(failed_matches)} 条失败记录,涉及 {report['summary']['unique_files']} 个不同文件")
    except Exception as e:
        print(f"\n❌ 保存失败报告时出错:{e}")
|
|
|
+
|
|
|
+
|
|
|
# Default locations used when main() is called without arguments
# (please confirm the paths are correct for your machine).
_DEFAULT_INPUT_FOLDER = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\1\sgfa_classification_db"
_DEFAULT_OUTPUT_FOLDER = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\1\your_output_folder"
_DEFAULT_DOCUMENT_JSON_PATH = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\t_samp_document_main.json"


def main(input_folder=_DEFAULT_INPUT_FOLDER,
         output_folder=_DEFAULT_OUTPUT_FOLDER,
         document_json_path=_DEFAULT_DOCUMENT_JSON_PATH):
    """Run the whole pipeline: load the mapping, process files, save report.

    Every ``*.json`` file directly inside *input_folder* is processed with
    ``process_json_file``; the failures gathered across all files are then
    written to ``failed_matches_report.json`` inside *output_folder*.

    Args:
        input_folder: Directory containing the JSON files to annotate.
        output_folder: Directory that receives the annotated files and the
            failure report.
        document_json_path: JSON file from which the title mapping is built.
    """
    # Where the report of unmatched entries is saved.
    failed_matches_path = os.path.join(output_folder, "failed_matches_report.json")

    print("="*50)
    print("开始加载文档映射...")
    document_mapping = load_document_mapping(document_json_path)
    print(f"最终加载映射数量:{len(document_mapping)}")

    # Shared accumulator for failures from all input files.
    all_failed_matches = []

    print("\n" + "="*50)
    print("开始处理JSON文件...")
    # os.listdir returns names in arbitrary order; sort so that the log and
    # the failure report are reproducible between runs.
    for filename in sorted(os.listdir(input_folder)):
        if filename.endswith('.json'):
            file_path = os.path.join(input_folder, filename)
            print(f"\n处理文件:{filename}")
            process_json_file(file_path, document_mapping, output_folder, all_failed_matches)

    print("\n" + "="*50)
    print(f"所有文件处理完成!输出路径:{output_folder}")

    print("\n" + "="*50)
    print("保存匹配失败报告...")
    save_failed_matches_report(all_failed_matches, failed_matches_path)


if __name__ == "__main__":
    main()
|