|
|
@@ -5,12 +5,64 @@
|
|
|
|
|
|
import sys
|
|
|
import os
|
|
|
+import json
|
|
|
|
|
|
# Add the project root directory to the path
|
|
|
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
+# Fix module import problems: add the project root directory to the Python path
|
|
|
+current_script_path = os.path.abspath(__file__)  # absolute path of this script file
|
|
|
+script_dir = os.path.dirname(current_script_path)  # directory that contains this script
|
|
|
+project_root = os.path.abspath(os.path.join(script_dir, "../../"))  # resolved two levels above script_dir
|
|
|
+if project_root not in sys.path:  # skip when already present, so no duplicate entry is added
|
|
|
+ sys.path.insert(0, project_root)  # placed at index 0 of sys.path
|
|
|
+
|
|
|
def load_documents_from_file(json_path: str, *, include_chunk_index: bool = False):
    """Load every chunk of one JSON file as a list of document dicts.

    The file is expected to contain a JSON object with a ``chunks`` list whose
    items are objects carrying ``content`` and, optionally, ``metadata``.
    Only the chunk's own ``content`` and ``metadata`` are kept; file-level
    fields are never merged into the result.

    The loader is best-effort: every problem is reported with ``print`` and
    results in an empty list (or a skipped chunk) instead of an exception.

    Args:
        json_path: Path of the JSON file to read (decoded as UTF-8).
        include_chunk_index: When true, each returned metadata dict also
            carries the chunk's position in the ``chunks`` list under the
            ``"chunk_index"`` key. The parsed chunk data is not mutated.

    Returns:
        A list of ``{"content": ..., "metadata": dict}`` entries, one per
        chunk whose content is not blank, in file order. A missing, null or
        non-dict ``metadata`` value is replaced by an empty dict.
    """
    documents = []

    if not os.path.isfile(json_path):
        print(f"[WARN] JSON 文件不存在: {json_path}")
        return documents

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # Deliberately broad: one unreadable or malformed file must not
        # abort the caller's loop over a whole directory.
        print(f"[ERROR] 读取 JSON 文件失败: {json_path}, error: {e}")
        return documents

    # Valid JSON can also be a list, string, number or null at the top level;
    # only an object (dict) supports the ``.get`` lookup below.
    if not isinstance(data, dict):
        print(f"[WARN] 文件 {json_path} 的顶层结构不是 JSON 对象,跳过")
        return documents

    chunks = data.get("chunks", [])
    if not isinstance(chunks, list):
        print(f"[WARN] 文件 {json_path} 中的 chunks 字段不是 list,跳过")
        return documents

    for idx, chunk in enumerate(chunks):
        if not isinstance(chunk, dict):
            continue

        content = chunk.get("content", "")
        if not content or not str(content).strip():
            # Blank content is never stored.
            continue

        # Use only the chunk's own metadata and guarantee the documented
        # dict type, even if the file holds null or some other JSON value.
        metadata = chunk.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        if include_chunk_index:
            # Build a new dict so the parsed chunk data stays untouched.
            metadata = {**metadata, "chunk_index": idx}

        documents.append({
            "content": content,
            "metadata": metadata,
        })

    print(f"[INFO] 文件 {os.path.basename(json_path)} 提取出 {len(documents)} 条 chunk 文档")
    return documents
|
|
|
|
|
|
-print("测试修复后的 Milvus 向量实现")
|
|
|
-print("=" * 50)
|
|
|
|
|
|
def test_basic_functionality():
|
|
|
"""测试基本功能"""
|
|
|
@@ -28,61 +80,44 @@ def test_basic_functionality():
|
|
|
vector = manager.text_to_vector(test_text)
|
|
|
print(f"text_to_vector 测试成功,向量维度: {len(vector)}")
|
|
|
|
|
|
- # 简单测试文档
|
|
|
- test_documents = [
|
|
|
- {
|
|
|
- 'content': '四川路桥建设集团专注于桥梁和隧道工程建设',
|
|
|
- 'metadata': {"source":""}
|
|
|
-
|
|
|
- },
|
|
|
- {
|
|
|
- 'content': '高速公路桥梁建设技术包括预应力混凝土和钢结构',
|
|
|
- 'metadata': {'category': 'technology', 'type': 'highway'}
|
|
|
- }
|
|
|
- ]
|
|
|
-
|
|
|
- collection_name = "first_bfp_collection"
|
|
|
+ # ====== 关键改动:从文件夹读取所有 JSON 文件,生成 documents ======
|
|
|
+ # 指定你的 JSON 文件夹路径
|
|
|
+ json_dir = "data_pipeline/test_rawdata"
|
|
|
+
|
|
|
+ all_documents = []
|
|
|
+
|
|
|
+ if not os.path.isdir(json_dir):
|
|
|
+ print(f"[ERROR] 目录不存在: {json_dir}")
|
|
|
+ return False
|
|
|
+
|
|
|
+ # 遍历文件夹下所有文件
|
|
|
+ for filename in os.listdir(json_dir):
|
|
|
+ # 只处理 .json 文件(如果你是其它后缀,改这里)
|
|
|
+ if not filename.lower().endswith(".json"):
|
|
|
+ continue
|
|
|
+
|
|
|
+ json_path = os.path.join(json_dir, filename)
|
|
|
+ docs = load_documents_from_file(json_path)
|
|
|
+ if docs:
|
|
|
+ all_documents.extend(docs)
|
|
|
+
|
|
|
+ print(f"[INFO] 总共从目录 {json_dir} 中解析出 {len(all_documents)} 条文档")
|
|
|
+
|
|
|
+ if not all_documents:
|
|
|
+ print("[ERROR] 未从任何文件中解析到文档,停止测试")
|
|
|
+ return False
|
|
|
+ # ====== 关键改动结束 ======
|
|
|
+
|
|
|
+ collection_name = "first_bfp_collection_test"
|
|
|
|
|
|
print(f"\n测试 create_hybrid_collection 方法...")
|
|
|
vectorstore = manager.create_hybrid_collection(
|
|
|
collection_name=collection_name,
|
|
|
- documents=test_documents
|
|
|
+ documents=all_documents # ← 用目录里解析出的所有 documents
|
|
|
)
|
|
|
print("create_hybrid_collection 执行成功!")
|
|
|
print(f"返回的 vectorstore 类型: {type(vectorstore)}")
|
|
|
|
|
|
- # 等待索引创建完成
|
|
|
- import time
|
|
|
- time.sleep(5)
|
|
|
-
|
|
|
- print(f"\n测试 hybrid_search 方法...")
|
|
|
- param = {'collection_name': collection_name}
|
|
|
-
|
|
|
- # 测试加权搜索
|
|
|
- results = manager.hybrid_search(
|
|
|
- param=param,
|
|
|
- query_text="桥梁建设",
|
|
|
- top_k=2,
|
|
|
- ranker_type="weighted",
|
|
|
- dense_weight=0.7,
|
|
|
- sparse_weight=0.3
|
|
|
- )
|
|
|
- print(f"Hybrid search 执行成功,返回 {len(results)} 个结果")
|
|
|
-
|
|
|
- for i, result in enumerate(results):
|
|
|
- content = result.get('text_content', '')[:50]
|
|
|
- print(f" {i+1}. {content}...")
|
|
|
-
|
|
|
- # 清理测试集合
|
|
|
- print(f"\n清理测试集合...")
|
|
|
- try:
|
|
|
- from pymilvus import utility
|
|
|
- if utility.has_collection(collection_name):
|
|
|
- utility.drop_collection(collection_name)
|
|
|
- print(f"成功清理集合: {collection_name}")
|
|
|
- except Exception as e:
|
|
|
- print(f"清理集合失败: {e}")
|
|
|
-
|
|
|
return True
|
|
|
|
|
|
except Exception as e:
|