4 tygodni temu · 53ddf2997c
--- a/src/app/scripts/select_url.py
+++ b/src/app/scripts/select_url.py
@@ -0,0 +1,209 @@
 
				+import json
			
 
				+import os
			
 
				+import re
			
 
				+import traceback
			
 
				+from datetime import datetime
			
 
				+
			
 
				+
			
 
				+def load_document_mapping(document_json_path):
			
 
				+    """
			
 
				+    万能加载函数：适配任意格式的 t_samp_document_main.json，提取 title/id/file_url 映射
			
 
				+    """
			
 
				+    mapping = {}
			
 
				+    try:
			
 
				+        with open(document_json_path, 'r', encoding='utf-8') as f:
			
 
				+            data = json.load(f)
			
 
				+        
			
 
				+        # 定义递归提取函数（处理嵌套结构）
			
 
				+        def extract_items(obj):
			
 
				+            items = []
			
 
				+            if isinstance(obj, list):
			
 
				+                for item in obj:
			
 
				+                    items.extend(extract_items(item))
			
 
				+            elif isinstance(obj, dict):
			
 
				+                # 如果包含 title 字段，视为有效条目
			
 
				+                if 'title' in obj and 'id' in obj:
			
 
				+                    items.append(obj)
			
 
				+                # 否则递归遍历所有值
			
 
				+                else:
			
 
				+                    for v in obj.values():
			
 
				+                        items.extend(extract_items(v))
			
 
				+            return items
			
 
				+        
			
 
				+        # 提取所有包含 title/id 的条目
			
 
				+        valid_items = extract_items(data)
			
 
				+        print(f"找到 {len(valid_items)} 个包含 title/id 的条目")
			
 
				+        
			
 
				+        # 构建映射（规范化title中的空格）
			
 
				+        for item in valid_items:
			
 
				+            title = item.get('title', '')
			
 
				+            # 规范化空格，处理多个连续空格的情况
			
 
				+            title = normalize_whitespace(title)
			
 
				+            if title:
			
 
				+                mapping[title] = {
			
 
				+                    "id": item.get('id', ''),
			
 
				+                    "file_url": item.get('file_url', '')
			
 
				+                }
			
 
				+                # 打印匹配的标题（方便调试）
			
 
				+                # print(f"映射标题：{title} -> ID: {item.get('id', '空')}")
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        print(f"加载失败详情：{e}")
			
 
				+        traceback.print_exc()  # 打印完整错误栈
			
 
				+    
			
 
				+    return mapping
			
 
				+
			
 
				+
			
 
				+def normalize_whitespace(text):
			
 
				+    """
			
 
				+    规范化空白字符：
			
 
				+    - 替换所有换行符、回车符、制表符为空格
			
 
				+    - 将多个连续空格合并为单个空格
			
 
				+    """
			
 
				+    # 首先将各种空白字符（换行、回车、制表符）替换为空格
			
 
				+    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
			
 
				+    # 使用正则表达式将多个连续空格合并为单个空格
			
 
				+    text = re.sub(r'\s+', ' ', text)
			
 
				+    return text.strip()
			
 
				+
			
 
				+
			
 
				+def extract_filename_from_path(file_path):
			
 
				+    """提取文件名（去掉路径和后缀），并清洗空格/换行符"""
			
 
				+    filename_with_suffix = os.path.basename(file_path)
			
 
				+    filename = os.path.splitext(filename_with_suffix)[0]
			
 
				+    # 规范化空白字符（处理多个连续空格的情况）
			
 
				+    filename = normalize_whitespace(filename)
			
 
				+    return filename
			
 
				+
			
 
				+
			
 
				+def process_json_file(file_path, document_mapping, output_dir, failed_matches_list):
			
 
				+    """
			
 
				+    处理单个JSON文件
			
 
				+    
			
 
				+    Args:
			
 
				+        file_path: 输入文件路径
			
 
				+        document_mapping: 文档映射字典
			
 
				+        output_dir: 输出目录
			
 
				+        failed_matches_list: 用于收集匹配失败信息的列表
			
 
				+    """
			
 
				+    try:
			
 
				+        with open(file_path, 'r', encoding='utf-8') as f:
			
 
				+            data = json.load(f)
			
 
				+        
			
 
				+        # 统计匹配成功/失败的数量
			
 
				+        match_count = 0
			
 
				+        no_match_count = 0
			
 
				+        
			
 
				+        for key, item in data.items():
			
 
				+            if 'matedata' in item and 'source_file' in item['matedata']:
			
 
				+                source_file = item['matedata']['source_file']
			
 
				+                filename = extract_filename_from_path(source_file)
			
 
				+                
			
 
				+                # 调试：打印待匹配的文件名
			
 
				+                print(f"\n待匹配文件名：{filename}")
			
 
				+                
			
 
				+                if filename in document_mapping:
			
 
				+                    doc_info = document_mapping[filename]
			
 
				+                    item['matedata']['document_id'] = doc_info['id']
			
 
				+                    item['matedata']['file_url'] = doc_info['file_url']
			
 
				+                    match_count += 1
			
 
				+                    print(f"✅ 匹配成功：{filename} -> ID: {doc_info['id']}")
			
 
				+                else:
			
 
				+                    item['matedata']['document_id'] = ''
			
 
				+                    item['matedata']['file_url'] = ''
			
 
				+                    no_match_count += 1
			
 
				+                    print(f"❌ 匹配失败：{filename}（未在映射中找到）")
			
 
				+                    
			
 
				+                    # 收集匹配失败的信息
			
 
				+                    failed_match_info = {
			
 
				+                        "source_file": source_file,
			
 
				+                        "extracted_filename": filename,
			
 
				+                        "chunk_id": item.get('matedata', {}).get('chunk_id', ''),
			
 
				+                        "section_label": item.get('matedata', {}).get('section_label', ''),
			
 
				+                        "category_info": item.get('category_info', {}),
			
 
				+                        "tags": item.get('tags', {}),
			
 
				+                        "input_file": os.path.basename(file_path),
			
 
				+                        "timestamp": datetime.now().isoformat()
			
 
				+                    }
			
 
				+                    failed_matches_list.append(failed_match_info)
			
 
				+        
			
 
				+        # 保存文件
			
 
				+        os.makedirs(output_dir, exist_ok=True)
			
 
				+        output_file_path = os.path.join(output_dir, os.path.basename(file_path))
			
 
				+        with open(output_file_path, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(data, f, ensure_ascii=False, indent=2)
			
 
				+        
			
 
				+        print(f"\n文件处理结果：匹配成功 {match_count} 条，失败 {no_match_count} 条")
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        print(f"处理文件失败：{e}")
			
 
				+        traceback.print_exc()
			
 
				+
			
 
				+
			
 
				+def save_failed_matches_report(failed_matches, output_path):
			
 
				+    """
			
 
				+    保存匹配失败的报告到JSON文件
			
 
				+    
			
 
				+    Args:
			
 
				+        failed_matches: 匹配失败的列表
			
 
				+        output_path: 输出文件路径
			
 
				+    """
			
 
				+    if not failed_matches:
			
 
				+        print("\n✅ 所有小节都匹配成功，无失败记录")
			
 
				+        return
			
 
				+    
			
 
				+    report = {
			
 
				+        "generated_at": datetime.now().isoformat(),
			
 
				+        "summary": {
			
 
				+            "total_failed": len(failed_matches),
			
 
				+            "unique_files": len(set(item["extracted_filename"] for item in failed_matches))
			
 
				+        },
			
 
				+        "failed_matches": failed_matches
			
 
				+    }
			
 
				+    
			
 
				+    try:
			
 
				+        with open(output_path, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(report, f, ensure_ascii=False, indent=2)
			
 
				+        print(f"\n📄 匹配失败报告已保存：{output_path}")
			
 
				+        print(f"   共 {len(failed_matches)} 条失败记录，涉及 {report['summary']['unique_files']} 个不同文件")
			
 
				+    except Exception as e:
			
 
				+        print(f"\n❌ 保存失败报告时出错：{e}")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    # ===================== 配置参数（请确认路径正确）=====================
			
 
				+    INPUT_FOLDER = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\1\sgfa_classification_db"
			
 
				+    OUTPUT_FOLDER = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\1\your_output_folder"
			
 
				+    DOCUMENT_JSON_PATH = r"E:\WeChat Files\wxid_rymkhe638gt022\FileStorage\File\2026-03\t_samp_document_main.json"
			
 
				+    
			
 
				+    # 匹配失败报告保存路径
			
 
				+    FAILED_MATCHES_PATH = os.path.join(OUTPUT_FOLDER, "failed_matches_report.json")
			
 
				+    
			
 
				+    # ===================== 执行流程 =====================
			
 
				+    print("="*50)
			
 
				+    print("开始加载文档映射...")
			
 
				+    document_mapping = load_document_mapping(DOCUMENT_JSON_PATH)
			
 
				+    print(f"最终加载映射数量：{len(document_mapping)}")
			
 
				+    
			
 
				+    # 用于收集所有匹配失败的信息
			
 
				+    all_failed_matches = []
			
 
				+    
			
 
				+    print("\n" + "="*50)
			
 
				+    print("开始处理JSON文件...")
			
 
				+    for filename in os.listdir(INPUT_FOLDER):
			
 
				+        if filename.endswith('.json'):
			
 
				+            file_path = os.path.join(INPUT_FOLDER, filename)
			
 
				+            print(f"\n处理文件：{filename}")
			
 
				+            process_json_file(file_path, document_mapping, OUTPUT_FOLDER, all_failed_matches)
			
 
				+    
			
 
				+    print(f"\n" + "="*50)
			
 
				+    print(f"所有文件处理完成！输出路径：{OUTPUT_FOLDER}")
			
 
				+    
			
 
				+    # 保存匹配失败报告
			
 
				+    print("\n" + "="*50)
			
 
				+    print("保存匹配失败报告...")
			
 
				+    save_failed_matches_report(all_failed_matches, FAILED_MATCHES_PATH)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    main()