|
@@ -0,0 +1,169 @@
|
|
|
|
|
+"""
|
|
|
|
|
+创建编制依据状态查询 Collection 脚本
|
|
|
|
|
+Collection: first_bfp_collection_status
|
|
|
|
|
+用途: 存储编制依据标准状态信息,用于快速查询标准当前状态
|
|
|
|
|
+"""
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+from pymilvus import DataType, Function, FunctionType
|
|
|
|
|
+
|
|
|
|
|
+from app.config.milvus_client import get_milvusclient
|
|
|
|
|
+
|
|
|
|
|
# Name of the Milvus collection that stores standard-status records.
COLLECTION_NAME: str = "first_bfp_collection_status"

# Dimensionality of the dense vector field; adjust it to match the
# embedding model in use.
DENSE_DIM: int = 4096
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def create_schema():
    """Build the Milvus schema for the standard-status collection.

    Fields added, in order:
      * ``pk`` - INT64 primary key with auto-generated ids.
      * ``text`` - VARCHAR (max 65535) with analyzer and text match enabled;
        it is the input of the BM25 function.
      * ``dense`` - FLOAT_VECTOR with ``DENSE_DIM`` dimensions.
      * ``sparse`` - SPARSE_FLOAT_VECTOR, the output of the BM25 function.
      * ``issuing_authority`` - VARCHAR (max 65535) business field.

    Returns:
        The schema object returned by ``client.create_schema`` with all
        fields and the BM25 function attached.
    """
    client = get_milvusclient()

    # pymilvus documents this option as ``enable_dynamic_field`` (singular).
    # The earlier spelling ``enable_dynamic_fields`` is not the documented
    # keyword, so the intent "no dynamic fields" is now passed explicitly.
    schema = client.create_schema(auto_id=True, enable_dynamic_field=False)

    # 1. Primary key and core fields.
    schema.add_field(
        "pk",
        DataType.INT64,
        is_primary=True,
        auto_id=True,
        description="主键ID,自增列",
    )

    # text: source for full-text (BM25) retrieval and text match.
    # NOTE(review): no analyzer_params are given, so the server-side default
    # analyzer applies; the sample content looks Chinese - confirm whether a
    # Chinese analyzer should be configured here.
    schema.add_field(
        "text",
        DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,
        enable_match=True,
        description="状态文本内容,如:《XXX标准》(GB/T XXX)状态为现行",
    )

    # dense: dense embedding vector used for semantic retrieval.
    schema.add_field(
        "dense",
        DataType.FLOAT_VECTOR,
        dim=DENSE_DIM,
        description="密集向量,用于语义检索",
    )

    # sparse: sparse vector produced by the BM25 function below.
    schema.add_field(
        "sparse",
        DataType.SPARSE_FLOAT_VECTOR,
        description="稀疏向量,BM25关键字检索",
    )

    # 2. Business fields.
    schema.add_field(
        "issuing_authority",
        DataType.VARCHAR,
        max_length=65535,
        description="发布单位",
    )

    # 3. BM25 function: derives ``sparse`` from ``text``.
    bm25_function = Function(
        name="bm25_function",
        function_type=FunctionType.BM25,
        input_field_names=["text"],
        output_field_names=["sparse"],
    )
    schema.add_function(bm25_function)

    return schema
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def create_index(client, collection_name: str):
    """Create the vector indexes for ``collection_name``.

    Two AUTOINDEX entries are registered, each named after its field:
    ``dense`` with metric ``L2`` and ``sparse`` with metric ``BM25``.
    Both are submitted in a single ``client.create_index`` call.

    Args:
        client: Object exposing ``prepare_index_params`` and ``create_index``.
        collection_name: Name of the collection to index.
    """
    params = client.prepare_index_params()

    # (field, metric) pairs; the index name mirrors the field name.
    for vector_field, metric in (("dense", "L2"), ("sparse", "BM25")):
        params.add_index(
            field_name=vector_field,
            index_name=vector_field,
            index_type="AUTOINDEX",
            metric_type=metric,
        )

    client.create_index(collection_name=collection_name, index_params=params)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def ensure_collection(collection_name: str = COLLECTION_NAME, create_idx: bool = True, auto_load: bool = True):
    """Make sure the collection exists, creating it when it is missing.

    Args:
        collection_name: Name of the collection to check or create.
        create_idx: When creating, also build the indexes via ``create_index``.
        auto_load: Call ``load_collection`` afterwards (on both paths).

    Returns:
        ``True`` if the collection was created by this call, ``False`` if it
        already existed. A failed load is reported with a printed warning and
        does not change the return value or raise.
    """
    milvus = get_milvusclient()
    already_exists = milvus.has_collection(collection_name=collection_name)

    if not already_exists:
        print(f"📝 创建Schema: {collection_name}")
        new_schema = create_schema()

        print(f"🏗️  创建Collection: {collection_name}")
        milvus.create_collection(
            collection_name=collection_name,
            schema=new_schema,
            consistency_level="Bounded",
        )

        if create_idx:
            print(f"🔍 创建索引: {collection_name}")
            create_index(milvus, collection_name)

        if auto_load:
            print(f"📂 加载Collection到内存: {collection_name}")
            # Best effort: a load failure is only reported, never propagated.
            try:
                milvus.load_collection(collection_name=collection_name)
                print(f"✅ Collection加载完成: {collection_name}")
            except Exception as load_error:
                print(f"⚠️  Collection加载失败: {load_error}")

        print(f"✅ Collection创建完成: {collection_name}")
        return True

    # Existing collection: nothing to create, optionally load it.
    print(f"ℹ️  Collection已存在: {collection_name}")
    if not auto_load:
        return False

    try:
        milvus.load_collection(collection_name=collection_name)
        print(f"✅ Collection已加载到内存: {collection_name}")
    except Exception as load_error:
        print(f"⚠️  Collection加载失败(可能已加载): {load_error}")
    return False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def main():
    """Command-line entry point: ensure the collection exists and report.

    Prints a banner, calls ``ensure_collection`` for ``COLLECTION_NAME`` with
    loading enabled, then prints whether the collection was newly created or
    already present. Any exception is reported and re-raised.
    """
    banner = "=" * 60

    print(banner)
    print("编制依据状态查询 - Milvus Collection 创建工具")
    print(banner)

    try:
        created = ensure_collection(COLLECTION_NAME, auto_load=True)

        print("\n" + banner)
        summary = "✅ Collection创建并加载完成!" if created else "ℹ️  Collection已存在并加载"
        print(summary)
        print(banner)
    except Exception as exc:
        # Report the failure, then let the original exception propagate.
        print(f"\n❌ 创建失败: {exc!s}")
        raise


if __name__ == "__main__":
    main()
|