| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- 调试RAG检索元数据映射问题
- """
- import sys
- import os
- import json
- sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
- from foundation.ai.rag.retrieval.retrieval import retrieval_manager
- from foundation.ai.models.rerank_model import rerank_model
- from foundation.observability.logger.loggering import server_logger as logger
def _print_mapping_check(result, hybrid_results):
    """Print whether a reranked result matches the hybrid result it points at.

    The first 100 characters of ``result['text_content']`` are compared with
    the same prefix of ``hybrid_results[original_index]['text_content']``.

    Args:
        result: One item returned by ``retrieval_manager.rerank_recall``;
            it is read via ``.get('original_index')`` and ``['text_content']``.
        hybrid_results: The stage-one results the index is supposed to refer to.
    """
    preview_chars = 100
    orig_idx = result.get('original_index')

    # A missing index must not silently fall back to 0 (that would validate
    # against the first hybrid result), and a negative index must not wrap
    # around to the end of the list.
    if orig_idx is None or orig_idx < 0:
        print(f"索引无效: original_index={orig_idx!r}, 跳过内容匹配验证")
        return

    if orig_idx >= len(hybrid_results):
        print(f"索引越界: original_index={orig_idx} >= hybrid_results长度={len(hybrid_results)}")
        return

    expected_content = hybrid_results[orig_idx]['text_content'][:preview_chars]
    actual_content = result['text_content'][:preview_chars]
    is_match = expected_content == actual_content
    print(f"内容匹配验证: {'正确' if is_match else '错误'}")
    if not is_match:
        print(f" 期望内容: {expected_content}...")
        print(f" 实际内容: {actual_content}...")


def debug_multi_stage_recall(collection_name, query, hybrid_top_k=5, rerank_top_k=3):
    """Trace the multi-stage recall pipeline and print every stage's output.

    Three stages are run and dumped to stdout:

    1. ``retrieval_manager.hybrid_search_recall`` with the "weighted" ranker.
    2. ``rerank_model.bge_rerank`` on the text of the stage-one results, to
       show the raw reranker output.
    3. ``retrieval_manager.rerank_recall`` on the same texts, followed by a
       check that each result's ``original_index`` points back at a hybrid
       result with the same text.

    Args:
        collection_name: Collection passed to the hybrid search.
        query: Query text used for the search and for both rerank calls.
        hybrid_top_k: ``top_k`` for the hybrid search (default 5, the value
            that used to be hard-coded).
        rerank_top_k: ``top_k`` for both rerank calls (default 3, the value
            that used to be hard-coded).

    Returns:
        None. Any exception is caught, reported on stdout and its traceback
        printed, so this debugging entry point never raises.
    """
    try:
        print("=" * 80)
        print("调试多阶段召回元数据映射问题")
        print("=" * 80)

        # Stage 1: hybrid search recall.
        print("\n第一阶段:混合搜索召回")
        hybrid_results = retrieval_manager.hybrid_search_recall(
            collection_name=collection_name,
            query_text=query,
            top_k=hybrid_top_k,
            ranker_type="weighted",
        )
        print(f"混合搜索返回 {len(hybrid_results)} 个结果:")
        for i, result in enumerate(hybrid_results):
            print(f"\n--- 混合搜索结果 {i} ---")
            print(f"文本内容(前100字符): {result['text_content'][:100]}...")
            print(f"元数据: {result.get('metadata', {})}")

        # Nothing to rerank or map back: stop instead of calling the
        # reranker with an empty candidate list.
        if not hybrid_results:
            print("\n混合搜索未返回任何结果, 跳过重排序与元数据映射阶段")
            return

        # Only the text of each hybrid result is handed to the rerankers.
        # NOTE(review): metadata is not passed along here, so whatever
        # metadata stage three reports is produced inside rerank_recall.
        candidates = [result['text_content'] for result in hybrid_results]
        print(f"\n提取的候选文档数量: {len(candidates)}")

        # Stage 2: raw reranker output.
        print("\n第二阶段:重排序召回")
        rerank_api_results = rerank_model.bge_rerank(query, candidates, top_k=rerank_top_k)
        print(f"\nBGE重排序API原始返回 {len(rerank_api_results)} 个结果:")
        for i, result in enumerate(rerank_api_results):
            print(f"\n--- BGE API结果 {i} ---")
            print(f"文本内容(前100字符): {result['text'][:100]}...")
            print(f"重排序分数: {result.get('score', 'N/A')}")
            print(f"API返回的索引: {result.get('index', 'N/A')}")

        # Stage 3: reranked results as post-processed by the retrieval manager.
        print("\n第三阶段:元数据映射")
        rerank_results = retrieval_manager.rerank_recall(
            candidates=candidates,
            query_text=query,
            top_k=rerank_top_k,
        )
        print("\n重排序处理后的结果:")
        for i, result in enumerate(rerank_results):
            print(f"\n--- 重排序处理结果 {i} ---")
            print(f"文本内容(前100字符): {result['text_content'][:100]}...")
            print(f"重排序分数: {result.get('rerank_score', 'N/A')}")
            print(f"原始索引: {result.get('original_index', 'N/A')}")
            print(f"重排序排名: {result.get('rerank_rank', 'N/A')}")
            print(f"映射的元数据: {result.get('metadata', {})}")
            # Verify that the reported index really points at the same text.
            _print_mapping_check(result, hybrid_results)
    except Exception as e:
        # Top-level boundary of a debugging script: report and keep the
        # traceback visible rather than propagating.
        print(f"[ERROR] 调试失败: {str(e)}")
        import traceback
        traceback.print_exc()
def main(query="水泥混凝土路面", collection_name="first_bfp_collection"):
    """Run the metadata-mapping debug session.

    Args:
        query: Query text to debug. The default is the short query that used
            to be hard-coded, chosen to keep the mapping easy to follow.
        collection_name: Collection to search. The default is the
            previously hard-coded collection.

    Returns:
        None. All output is printed by ``debug_multi_stage_recall``.
    """
    print("开始RAG元数据映射调试")
    debug_multi_stage_recall(collection_name, query)


if __name__ == "__main__":
    main()
|