#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Debug metadata-mapping problems in the RAG multi-stage retrieval pipeline."""

import json
import os
import sys
import traceback

# Make the project root importable when this file is run directly as a script;
# this must happen before the project-local imports below.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from foundation.ai.rag.retrieval.retrieval import retrieval_manager
from foundation.ai.models.rerank_model import rerank_model
from foundation.observability.logger.loggering import server_logger as logger


def _verify_mapping(result, hybrid_results):
    """Print whether a reranked result maps back to the right hybrid result.

    The check compares the first 100 characters of the reranked result's
    ``text_content`` with those of ``hybrid_results[original_index]``.

    Args:
        result: One entry returned by ``retrieval_manager.rerank_recall``;
            read via ``result.get('original_index')`` and
            ``result['text_content']``.
        hybrid_results: The list returned by the hybrid search stage.
    """
    orig_idx = result.get('original_index')

    # A missing index must not silently fall back to 0: that would compare
    # against the first hybrid result and report a misleading verdict.
    if orig_idx is None:
        print("无法验证映射: 结果中缺少 original_index")
        return

    # Negative values would index from the end of the list and pass a plain
    # "< len" bounds check, so reject them explicitly.
    if orig_idx < 0:
        print(f"索引无效: original_index={orig_idx} 为负数")
        return

    if orig_idx >= len(hybrid_results):
        print(f"索引越界: original_index={orig_idx} >= hybrid_results长度={len(hybrid_results)}")
        return

    expected_content = hybrid_results[orig_idx]['text_content'][:100]
    actual_content = result['text_content'][:100]
    is_match = expected_content == actual_content
    print(f"内容匹配验证: {'正确' if is_match else '错误'}")
    if not is_match:
        print(f" 期望内容: {expected_content}...")
        print(f" 实际内容: {actual_content}...")


def debug_multi_stage_recall(collection_name, query):
    """Trace each stage of multi-stage recall and print its intermediate data.

    Runs hybrid search, the raw BGE rerank call, and
    ``retrieval_manager.rerank_recall`` in turn, printing every result and
    checking that each reranked item's ``original_index`` points at the hybrid
    result with the same text.

    Any exception is caught, reported with a traceback, and not re-raised,
    since this is a top-level debugging entry point.

    Args:
        collection_name: Name of the collection passed to the hybrid search.
        query: Query text used for both the search and the rerank calls.
    """
    try:
        print("=" * 80)
        print("调试多阶段召回元数据映射问题")
        print("=" * 80)

        # Stage 1: fetch the hybrid search results.
        print("\n第一阶段:混合搜索召回")
        hybrid_results = retrieval_manager.hybrid_search_recall(
            collection_name=collection_name,
            query_text=query,
            top_k=5,
            ranker_type="weighted",
        )

        print(f"混合搜索返回 {len(hybrid_results)} 个结果:")
        for i, result in enumerate(hybrid_results):
            print(f"\n--- 混合搜索结果 {i} ---")
            print(f"文本内容(前100字符): {result['text_content'][:100]}...")
            print(f"元数据: {result.get('metadata', {})}")

        # Nothing to rerank or map: stop here rather than sending an empty
        # candidate list to the rerank service.
        if not hybrid_results:
            print("混合搜索未返回任何结果,跳过重排序与映射验证")
            return

        # Extract the candidate document texts.
        candidates = [result['text_content'] for result in hybrid_results]
        print(f"\n提取的候选文档数量: {len(candidates)}")

        # Stage 2: fetch the raw rerank results.
        print("\n第二阶段:重排序召回")
        rerank_api_results = rerank_model.bge_rerank(query, candidates, top_k=3)

        print(f"\nBGE重排序API原始返回 {len(rerank_api_results)} 个结果:")
        for i, result in enumerate(rerank_api_results):
            print(f"\n--- BGE API结果 {i} ---")
            print(f"文本内容(前100字符): {result['text'][:100]}...")
            print(f"重排序分数: {result.get('score', 'N/A')}")
            print(f"API返回的索引: {result.get('index', 'N/A')}")

        # Stage 3: metadata mapping.
        print("\n第三阶段:元数据映射")
        rerank_results = retrieval_manager.rerank_recall(
            candidates=candidates,
            query_text=query,
            top_k=3,
        )

        print("\n重排序处理后的结果:")
        for i, result in enumerate(rerank_results):
            print(f"\n--- 重排序处理结果 {i} ---")
            print(f"文本内容(前100字符): {result['text_content'][:100]}...")
            print(f"重排序分数: {result.get('rerank_score', 'N/A')}")
            print(f"原始索引: {result.get('original_index', 'N/A')}")
            print(f"重排序排名: {result.get('rerank_rank', 'N/A')}")
            print(f"映射的元数据: {result.get('metadata', {})}")

            # Verify that the mapping is correct.
            _verify_mapping(result, hybrid_results)

    except Exception as e:
        # Top-level boundary of a debug script: report and keep the traceback.
        print(f"[ERROR] 调试失败: {str(e)}")
        traceback.print_exc()


def main():
    """Run the metadata-mapping debug session with a fixed query."""
    print("开始RAG元数据映射调试")

    # A short query makes the mapping relationship easier to observe.
    query = "水泥混凝土路面"
    collection_name = "first_bfp_collection"

    debug_multi_stage_recall(collection_name, query)


if __name__ == "__main__":
    main()