Explorar el Código

feat: 存储编制依据标准状态信息,用于快速查询标准当前状态

ai02 hace 4 semanas
padre
commit
93a2257224
Se ha modificado 1 fichero con 169 adiciones y 0 borrados
  1. 169 0
      src/app/scripts/first_bfp_collection_status_create.py

+ 169 - 0
src/app/scripts/first_bfp_collection_status_create.py

@@ -0,0 +1,169 @@
+"""
+创建编制依据状态查询 Collection 脚本
+Collection: first_bfp_collection_status
+用途: 存储编制依据标准状态信息,用于快速查询标准当前状态
+"""
+from __future__ import annotations
+
+from pymilvus import DataType, Function, FunctionType
+
+from app.config.milvus_client import get_milvusclient
+
+# Name of the Milvus collection this script creates
+COLLECTION_NAME = "first_bfp_collection_status"
+
+# Dimension of the dense vector field (adjust to match the embedding model in use)
+DENSE_DIM = 4096
+
+
+def create_schema():
+    """Build the schema definition for the status collection.
+
+    The schema has an auto-generated INT64 primary key (``pk``), a ``text``
+    VARCHAR field with analyzer and match enabled, a ``dense`` float vector of
+    ``DENSE_DIM`` dimensions, a ``sparse`` float vector, and the business
+    field ``issuing_authority``. A BM25 function is attached that takes
+    ``text`` as input and writes to ``sparse``.
+
+    Returns:
+        The schema object produced by ``client.create_schema`` with all
+        fields and the BM25 function added.
+    """
+    client = get_milvusclient()
+
+    # The keyword is ``enable_dynamic_field`` (singular); the previous plural
+    # spelling was not the documented parameter name.
+    schema = client.create_schema(auto_id=True, enable_dynamic_field=False)
+
+    # 1. Primary key and core fields
+    schema.add_field(
+        "pk",
+        DataType.INT64,
+        is_primary=True,
+        auto_id=True,
+        description="主键ID,自增列",
+    )
+
+    # text field: used for full-text search and as the BM25 input
+    schema.add_field(
+        "text",
+        DataType.VARCHAR,
+        max_length=65535,
+        enable_analyzer=True,
+        enable_match=True,
+        description="状态文本内容,如:《XXX标准》(GB/T XXX)状态为现行",
+    )
+
+    # dense field: dense vector for semantic search
+    schema.add_field(
+        "dense",
+        DataType.FLOAT_VECTOR,
+        dim=DENSE_DIM,
+        description="密集向量,用于语义检索",
+    )
+
+    # sparse field: sparse vector, filled by the BM25 function below
+    schema.add_field(
+        "sparse",
+        DataType.SPARSE_FLOAT_VECTOR,
+        description="稀疏向量,BM25关键字检索",
+    )
+
+    # 2. Business fields
+    schema.add_field(
+        "issuing_authority",
+        DataType.VARCHAR,
+        max_length=65535,
+        description="发布单位",
+    )
+
+    # 3. BM25 function: generates the sparse vector from text
+    bm25_function = Function(
+        name="bm25_function",
+        function_type=FunctionType.BM25,
+        input_field_names=["text"],
+        output_field_names=["sparse"],
+    )
+    schema.add_function(bm25_function)
+
+    return schema
+
+
+def create_index(client, collection_name: str):
+    """Create the vector indexes on ``collection_name``.
+
+    Two AUTOINDEX indexes are registered, each named after the field it
+    covers: ``dense`` with metric ``L2`` and ``sparse`` with metric ``BM25``.
+
+    Args:
+        client: Milvus client used to prepare and create the indexes.
+        collection_name: Name of the collection to index.
+    """
+    # (field, metric) pairs, in the order the indexes are registered.
+    vector_metrics = (("dense", "L2"), ("sparse", "BM25"))
+
+    params = client.prepare_index_params()
+    for field, metric in vector_metrics:
+        params.add_index(
+            field_name=field,
+            index_name=field,
+            index_type="AUTOINDEX",
+            metric_type=metric,
+        )
+
+    client.create_index(collection_name=collection_name, index_params=params)
+
+
+def ensure_collection(collection_name: str = COLLECTION_NAME, create_idx: bool = True, auto_load: bool = True):
+    """
+    Make sure the collection exists, creating it when it is missing.
+
+    Args:
+        collection_name: Collection to look up or create.
+        create_idx: When a new collection is created, also build its indexes
+            through ``create_index``.
+        auto_load: Call ``load_collection`` afterwards. A load failure is
+            printed as a warning and is not re-raised.
+
+    Returns:
+        ``True`` if this call created the collection, ``False`` if it
+        already existed.
+    """
+    milvus = get_milvusclient()
+
+    # Existing collection: optionally load it, never recreate it.
+    if milvus.has_collection(collection_name=collection_name):
+        print(f"ℹ️ Collection已存在: {collection_name}")
+        if not auto_load:
+            return False
+        try:
+            milvus.load_collection(collection_name=collection_name)
+            print(f"✅ Collection已加载到内存: {collection_name}")
+        except Exception as err:
+            print(f"⚠️ Collection加载失败(可能已加载): {err}")
+        return False
+
+    # New collection: schema first, then the collection, then index and load.
+    print(f"📝 创建Schema: {collection_name}")
+    new_schema = create_schema()
+
+    print(f"🏗️ 创建Collection: {collection_name}")
+    milvus.create_collection(
+        collection_name=collection_name,
+        schema=new_schema,
+        consistency_level="Bounded",
+    )
+
+    if create_idx:
+        print(f"🔍 创建索引: {collection_name}")
+        create_index(milvus, collection_name)
+
+    if auto_load:
+        print(f"📂 加载Collection到内存: {collection_name}")
+        try:
+            milvus.load_collection(collection_name=collection_name)
+            print(f"✅ Collection加载完成: {collection_name}")
+        except Exception as err:
+            # Best effort: creation still counts as successful.
+            print(f"⚠️ Collection加载失败: {err}")
+
+    print(f"✅ Collection创建完成: {collection_name}")
+    return True
+
+
+def main():
+    """Script entry point: ensure the collection exists and report the result.
+
+    Any exception from ``ensure_collection`` is printed and then re-raised.
+    """
+    banner = "=" * 60
+
+    print(banner)
+    print("编制依据状态查询 - Milvus Collection 创建工具")
+    print(banner)
+
+    try:
+        was_created = ensure_collection(COLLECTION_NAME, auto_load=True)
+
+        outcome = "✅ Collection创建并加载完成!" if was_created else "ℹ️ Collection已存在并加载"
+        print("\n" + banner)
+        print(outcome)
+        print(banner)
+
+    except Exception as err:
+        print(f"\n❌ 创建失败: {err!s}")
+        raise
+
+
+if __name__ == "__main__":
+    main()