فهرست منبع

feat:片段查询原文件的url并生成新文件

ai02 4 هفته پیش
والد
کامیت
53ddf2997c
1 فایلهای تغییر یافته به همراه 209 افزوده شده و 0 حذف شده
  1. 209 0
      src/app/scripts/select_url.py

+ 209 - 0
src/app/scripts/select_url.py

@@ -0,0 +1,209 @@
+import json
+import os
+import re
+import traceback
+from datetime import datetime
+
+
def load_document_mapping(document_json_path):
    """Build a ``title -> {"id", "file_url"}`` lookup from a document JSON file.

    The JSON may nest lists and dicts arbitrarily. Every dict that has both a
    ``title`` and an ``id`` key is treated as one document record, and such a
    dict is not searched any further. Titles are passed through
    ``normalize_whitespace`` before being used as keys; when two records share
    a normalized title, the later one overwrites the earlier one.

    Records whose ``title`` is not a string (for example JSON ``null``) are
    skipped and counted, so one bad record no longer aborts the whole load.

    Args:
        document_json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        A dict mapping each normalized, non-empty title to
        ``{"id": ..., "file_url": ...}`` (``file_url`` defaults to ``''``).
        If loading fails, the error and its traceback are printed and whatever
        was collected up to that point (normally nothing) is returned.
    """
    mapping = {}
    try:
        with open(document_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        def extract_items(obj):
            """Yield every dict holding both 'title' and 'id', in document order."""
            if isinstance(obj, list):
                for child in obj:
                    yield from extract_items(child)
            elif isinstance(obj, dict):
                if 'title' in obj and 'id' in obj:
                    yield obj
                else:
                    # Not a record itself: keep looking inside its values.
                    for value in obj.values():
                        yield from extract_items(value)

        valid_items = list(extract_items(data))
        print(f"找到 {len(valid_items)} 个包含 title/id 的条目")

        skipped = 0
        for item in valid_items:
            raw_title = item['title']
            if not isinstance(raw_title, str):
                # normalize_whitespace only works on strings; skipping here
                # keeps the remaining records instead of losing all of them.
                skipped += 1
                continue
            title = normalize_whitespace(raw_title)
            if title:
                mapping[title] = {
                    "id": item['id'],
                    "file_url": item.get('file_url', ''),
                }

        if skipped:
            print(f"跳过 {skipped} 个 title 不是字符串的条目")

    except Exception as e:
        # Best-effort loader: callers expect a dict back, never an exception.
        print(f"加载失败详情:{e}")
        traceback.print_exc()

    return mapping
+
+
def normalize_whitespace(text):
    """Collapse every run of whitespace in *text* into one space and trim the ends.

    Newlines, carriage returns and tabs count as whitespace, so they end up as
    single spaces like any other whitespace run.
    """
    # `\s+` already matches newlines, carriage returns and tabs, so a single
    # substitution covers both the replacement and the collapsing step.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
+
+
def extract_filename_from_path(file_path):
    """Return the base name of *file_path* without its extension.

    The result is passed through ``normalize_whitespace`` so that it can be
    compared against the normalized titles of the document mapping.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return normalize_whitespace(stem)
+
+
def process_json_file(file_path, document_mapping, output_dir, failed_matches_list):
    """Annotate the entries of one JSON file with document id and file URL.

    The file is read as a JSON object and its values are visited one by one.
    For every entry that has a ``matedata`` dict (the key is spelled that way
    in the data) with a string ``source_file``, the file name is extracted from
    that path and looked up in ``document_mapping``:

    - on a hit, ``document_id`` and ``file_url`` are copied into ``matedata``;
    - on a miss, both are set to ``''`` and a record describing the entry is
      appended to ``failed_matches_list``.

    Entries that are not dicts, whose ``matedata`` is not a dict, or whose
    ``source_file`` is missing or not a string are left untouched, so a single
    malformed entry does not prevent the rest of the file from being written.
    The data is then saved to ``output_dir`` under the input file's base name.

    Args:
        file_path: Path of the input JSON file.
        document_mapping: Dict from normalized title to ``{"id", "file_url"}``.
        output_dir: Directory for the annotated copy; created if missing.
        failed_matches_list: List that receives one dict per unmatched entry
            (mutated in place).

    Returns:
        None. Any error is printed with its traceback instead of being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        match_count = 0
        no_match_count = 0

        for item in data.values():
            metadata = item.get('matedata') if isinstance(item, dict) else None
            if not isinstance(metadata, dict):
                continue
            source_file = metadata.get('source_file')
            if not isinstance(source_file, str):
                # Nothing to derive a file name from; leave the entry as is.
                continue

            filename = extract_filename_from_path(source_file)
            print(f"\n待匹配文件名:{filename}")

            doc_info = document_mapping.get(filename)
            if doc_info is not None:
                metadata['document_id'] = doc_info['id']
                metadata['file_url'] = doc_info['file_url']
                match_count += 1
                print(f"✅ 匹配成功:{filename} -> ID: {doc_info['id']}")
                continue

            # No mapping entry: blank the fields and record the miss.
            metadata['document_id'] = ''
            metadata['file_url'] = ''
            no_match_count += 1
            print(f"❌ 匹配失败:{filename}(未在映射中找到)")
            failed_matches_list.append({
                "source_file": source_file,
                "extracted_filename": filename,
                "chunk_id": metadata.get('chunk_id', ''),
                "section_label": metadata.get('section_label', ''),
                "category_info": item.get('category_info', {}),
                "tags": item.get('tags', {}),
                "input_file": os.path.basename(file_path),
                "timestamp": datetime.now().isoformat(),
            })

        os.makedirs(output_dir, exist_ok=True)
        output_file_path = os.path.join(output_dir, os.path.basename(file_path))
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"\n文件处理结果:匹配成功 {match_count} 条,失败 {no_match_count} 条")

    except Exception as e:
        # Per-file boundary: report the problem and let the caller continue
        # with the next file.
        print(f"处理文件失败:{e}")
        traceback.print_exc()
+
+
def save_failed_matches_report(failed_matches, output_path):
    """Write the list of unmatched entries to a JSON report file.

    Nothing is written when ``failed_matches`` is empty. Otherwise the report
    contains a generation timestamp, a summary (total number of failures and
    number of distinct ``extracted_filename`` values) and the failure records
    themselves. The parent directory of ``output_path`` is created if needed,
    so the report is not lost when no output file was written beforehand.

    Args:
        failed_matches: List of dicts, each with an ``extracted_filename`` key.
        output_path: Path of the JSON report to write.

    Returns:
        None. Write errors are printed instead of being raised.
    """
    if not failed_matches:
        print("\n✅ 所有小节都匹配成功,无失败记录")
        return

    unique_files = {item["extracted_filename"] for item in failed_matches}
    report = {
        "generated_at": datetime.now().isoformat(),
        "summary": {
            "total_failed": len(failed_matches),
            "unique_files": len(unique_files),
        },
        "failed_matches": failed_matches,
    }

    try:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            # A bare file name has no parent component to create.
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, ensure_ascii=False, indent=2)
        print(f"\n📄 匹配失败报告已保存:{output_path}")
        print(f"   共 {len(failed_matches)} 条失败记录,涉及 {report['summary']['unique_files']} 个不同文件")
    except Exception as e:
        # Reporting is best-effort; a failed write must not crash the run.
        print(f"\n❌ 保存失败报告时出错:{e}")
+
+
def main(input_folder=None, output_folder=None, document_json_path=None):
    """Annotate every ``.json`` file of a folder and write a failure report.

    Loads the document mapping, runs ``process_json_file`` on each ``.json``
    file found directly in the input folder (in sorted name order), and then
    saves the collected unmatched entries as ``failed_matches_report.json``
    inside the output folder.

    Args:
        input_folder: Folder holding the JSON files to annotate. Defaults to
            the original hard-coded location.
        output_folder: Folder receiving the annotated files and the report.
            Defaults to the original hard-coded location.
        document_json_path: Path of the document JSON used to build the
            mapping. Defaults to the original hard-coded location.

    Returns:
        None.
    """
    # Fallback locations; these are specific to the original author's machine.
    DEFAULT_INPUT_FOLDER = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\1\sgfa_classification_db"
    DEFAULT_OUTPUT_FOLDER = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\1\your_output_folder"
    DEFAULT_DOCUMENT_JSON_PATH = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\t_samp_document_main.json"

    if input_folder is None:
        input_folder = DEFAULT_INPUT_FOLDER
    if output_folder is None:
        output_folder = DEFAULT_OUTPUT_FOLDER
    if document_json_path is None:
        document_json_path = DEFAULT_DOCUMENT_JSON_PATH

    failed_matches_path = os.path.join(output_folder, "failed_matches_report.json")

    print("="*50)
    print("开始加载文档映射...")
    document_mapping = load_document_mapping(document_json_path)
    print(f"最终加载映射数量:{len(document_mapping)}")

    # Shared across all files so the report covers the whole run.
    all_failed_matches = []

    print("\n" + "="*50)
    print("开始处理JSON文件...")
    if not os.path.isdir(input_folder):
        # os.listdir would raise here; stop with a readable message instead.
        print(f"输入目录不存在:{input_folder}")
        return

    # Sorted so the processing order (and the report order) is reproducible.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_folder, filename)
        print(f"\n处理文件:{filename}")
        process_json_file(file_path, document_mapping, output_folder, all_failed_matches)

    print("\n" + "="*50)
    print(f"所有文件处理完成!输出路径:{output_folder}")

    print("\n" + "="*50)
    print("保存匹配失败报告...")
    save_failed_matches_report(all_failed_matches, failed_matches_path)


if __name__ == "__main__":
    import sys

    # Optional positional overrides: input folder, output folder, document JSON.
    # With no arguments the built-in default locations are used.
    main(*sys.argv[1:4])