#!/usr/bin/env python3
# CRBC-MaaS-Platform-Project / LQAgentPlatform
"""
测试修复后的 Milvus 向量实现
"""

import sys
import os
import json

# Put the project root (two directory levels above this script's folder) on
# sys.path so that project modules can be imported when the script is run
# directly.
current_script_path = os.path.abspath(__file__)
script_dir = os.path.split(current_script_path)[0]
project_root = os.path.abspath(
    os.path.join(script_dir, os.pardir, os.pardir)
)
# Prepend only once so repeated imports of this module do not stack entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

def load_documents_from_file(json_path: str, *, include_chunk_index: bool = False) -> list:
    """Read one JSON file and return its chunks as a list of documents.

    The file is expected to contain a JSON object with a ``chunks`` list.
    Every chunk that is a dict with non-blank ``content`` becomes
    ``{"content": ..., "metadata": {...}}``. Only the chunk's own content and
    metadata are kept; file-level fields are not merged in.

    Args:
        json_path: Path of the JSON file to read.
        include_chunk_index: When true, the chunk's position in the
            ``chunks`` list is added to its metadata under ``"chunk_index"``.

    Returns:
        A list of document dicts. The list is empty when the file does not
        exist, cannot be read or parsed, has a top level that is not a JSON
        object, or has a ``chunks`` value that is not a list.
    """
    documents = []

    if not os.path.isfile(json_path):
        print(f"[WARN] JSON 文件不存在: {json_path}")
        return documents

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers I/O failures; ValueError covers both invalid JSON
        # (json.JSONDecodeError) and invalid UTF-8 (UnicodeDecodeError).
        print(f"[ERROR] 读取 JSON 文件失败: {json_path}, error: {e}")
        return documents

    # A valid JSON document may be a list, string, number, etc.; only an
    # object has a "chunks" field, so anything else is skipped instead of
    # failing on ``data.get``.
    if not isinstance(data, dict):
        print(f"[WARN] 文件 {json_path} 的顶层结构不是 dict，跳过")
        return documents

    chunks = data.get("chunks", [])
    if not isinstance(chunks, list):
        print(f"[WARN] 文件 {json_path} 中的 chunks 字段不是 list，跳过")
        return documents

    for idx, chunk in enumerate(chunks):
        if not isinstance(chunk, dict):
            continue

        content = chunk.get("content", "")
        if not content or not str(content).strip():
            # Blank or missing content is not stored.
            continue

        # Use only the chunk's own metadata; a missing, null or non-dict
        # value is replaced by an empty dict so callers always get a dict.
        metadata = chunk.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        if include_chunk_index:
            # Build a new dict rather than mutating the parsed chunk.
            metadata = {**metadata, "chunk_index": idx}

        documents.append({
            "content": content,
            "metadata": metadata,
        })

    print(f"[INFO] 文件 {os.path.basename(json_path)} 提取出 {len(documents)} 条 chunk 文档")
    return documents


def _collect_documents_from_dir(json_dir):
    """Load chunk documents from every ``.json`` file directly in *json_dir*."""
    collected = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # ingestion order the same from run to run.
    for filename in sorted(os.listdir(json_dir)):
        # Only .json files are processed (change the suffix here if needed).
        if not filename.lower().endswith(".json"):
            continue
        collected.extend(load_documents_from_file(os.path.join(json_dir, filename)))
    return collected


def test_basic_functionality():
    """Run a smoke test against ``MilvusVectorManager``.

    Steps: import and instantiate the manager, call ``text_to_vector`` on a
    sample string, load all chunk documents from the JSON files under
    ``data_pipeline/test_rawdata`` (resolved against the current working
    directory), and pass them to ``create_hybrid_collection``.

    Returns:
        True when every step finishes without raising; False when the data
        directory is missing, no documents were parsed, or any step raises
        (the traceback is printed in that case).
    """
    try:
        # Imported here so that an import failure is reported as a test
        # failure instead of breaking the module at load time.
        from foundation.database.base.vector.milvus_vector import MilvusVectorManager
        print("成功导入 MilvusVectorManager")

        manager = MilvusVectorManager()
        print("MilvusVectorManager 初始化成功")

        # Exercise text_to_vector with a short sample string.
        test_text = "桥梁建设技术"
        vector = manager.text_to_vector(test_text)
        print(f"text_to_vector 测试成功，向量维度: {len(vector)}")

        # Build the document list from every JSON file in the data folder.
        json_dir = "data_pipeline/test_rawdata"

        if not os.path.isdir(json_dir):
            print(f"[ERROR] 目录不存在: {json_dir}")
            return False

        all_documents = _collect_documents_from_dir(json_dir)

        print(f"[INFO] 总共从目录 {json_dir} 中解析出 {len(all_documents)} 条文档")

        if not all_documents:
            print("[ERROR] 未从任何文件中解析到文档，停止测试")
            return False

        collection_name = "first_bfp_collection_test"

        print("\n测试 create_hybrid_collection 方法...")
        vectorstore = manager.create_hybrid_collection(
            collection_name=collection_name,
            documents=all_documents,
        )
        print("create_hybrid_collection 执行成功!")
        print(f"返回的 vectorstore 类型: {type(vectorstore)}")

        return True

    except Exception as e:
        # Top-level boundary of the smoke test: report any failure and
        # signal it through the return value rather than propagating.
        print(f"测试失败: {e}")
        import traceback
        traceback.print_exc()
        return False

if __name__ == "__main__":
    # Run the smoke test and print a summary of what was actually exercised.
    success = test_basic_functionality()
    print("\n" + "=" * 50)
    print(f"测试结果: {'成功' if success else '失败'}")

    if success:
        # Only the methods that test_basic_functionality really calls are
        # listed; hybrid_search is not invoked anywhere in this script.
        print("修复验证成功！")
        print("- text_to_vector 方法正常工作")
        print("- create_hybrid_collection 方法正常工作")

    # Report the outcome through the exit status so shells and CI jobs can
    # detect a failed run.
    sys.exit(0 if success else 1)