Browse Source

dev:t_kngs_construction_plan_parent 的collection创建

ai02 1 tháng trước cách đây
mục cha
commit
2868ce8fbf
1 tập tin đã thay đổi với 286 bổ sung và 0 xóa
  1. 286 0
      src/app/scripts/t_kngs_construction_plan_collections_create.py

+ 286 - 0
src/app/scripts/t_kngs_construction_plan_collections_create.py

@@ -0,0 +1,286 @@
+"""
+创建施工方案知识结构体系Milvus Collection脚本
+独立运行,用于创建父子表结构
+t_kngs_construction_plan_parent
+t_kngs_construction_plan_child
+"""
+from __future__ import annotations
+
+from pymilvus import DataType, Function, FunctionType
+
+from app.config.milvus_client import get_milvusclient
+
+# Collection names
+PARENT_COLLECTION_NAME = "t_kngs_construction_plan_parent"
+CHILD_COLLECTION_NAME = "t_kngs_construction_plan_child"
+
+# Dense vector dimension (adjust to match the embedding model in use)
+DENSE_DIM = 4096
+
+
+def create_schema():
+    """创建Schema定义"""
+    client = get_milvusclient()
+    
+    schema = client.create_schema(auto_id=True, enable_dynamic_fields=False)
+    
+    # 1. 主键和核心字段
+    schema.add_field(
+        "pk", 
+        DataType.INT64, 
+        is_primary=True, 
+        auto_id=True, 
+        description="主键ID,自增列"
+    )
+    
+    # text字段:用于全文检索和BM25
+    schema.add_field(
+        "text", 
+        DataType.VARCHAR, 
+        max_length=65535, 
+        enable_analyzer=True, 
+        description="内容文本"
+    )
+    
+    # dense字段:密集向量,用于语义检索
+    schema.add_field(
+        "dense", 
+        DataType.FLOAT_VECTOR, 
+        dim=DENSE_DIM, 
+        description="密集向量,用于语义检索"
+    )
+    
+    # sparse字段:稀疏向量,由BM25函数生成
+    schema.add_field(
+        "sparse", 
+        DataType.SPARSE_FLOAT_VECTOR, 
+        description="稀疏向量,BM25关键字检索"
+    )
+    
+    # 2. 业务字段
+    schema.add_field(
+        "document_id", 
+        DataType.VARCHAR, 
+        max_length=256, 
+        description="文档ID,样本中心上传文档ID"
+    )
+    
+    schema.add_field(
+        "parent_id", 
+        DataType.VARCHAR, 
+        max_length=256, 
+        description="父段ID"
+    )
+    
+    schema.add_field(
+        "index", 
+        DataType.INT64, 
+        description="索引序号"
+    )
+    
+    schema.add_field(
+        "tag_list", 
+        DataType.VARCHAR, 
+        max_length=4096, 
+        description="标签列表,逗号分隔"
+    )
+    
+    schema.add_field(
+        "permission", 
+        DataType.JSON, 
+        description="权限信息JSON,后期扩展"
+    )
+    
+    schema.add_field(
+        "metadata", 
+        DataType.JSON, 
+        description="元数据JSON"
+    )
+    
+    schema.add_field(
+        "file_name", 
+        DataType.VARCHAR, 
+        max_length=512, 
+        description="文件名称"
+    )
+    
+    schema.add_field(
+        "plan_type", 
+        DataType.VARCHAR, 
+        max_length=256, 
+        description="施工方案工艺类型,如:简支梁(T型梁或小箱梁)预制、运输及架桥机安装"
+    )
+    
+    schema.add_field(
+        "file_url", 
+        DataType.VARCHAR, 
+        max_length=1024, 
+        description="文件URL地址,上传OSS文件URL地址"
+    )
+    
+    schema.add_field(
+        "chapter_title", 
+        DataType.VARCHAR, 
+        max_length=512, 
+        description="章节标题"
+    )
+    
+    schema.add_field(
+        "chapter_level_1", 
+        DataType.VARCHAR, 
+        max_length=128, 
+        description="一级章节类型"
+    )
+    
+    schema.add_field(
+        "chapter_level_2", 
+        DataType.VARCHAR, 
+        max_length=128, 
+        description="二级章节类型"
+    )
+    
+    schema.add_field(
+        "chapter_level_3", 
+        DataType.VARCHAR, 
+        max_length=128, 
+        description="三级章节类型"
+    )
+    
+    schema.add_field(
+        "is_deleted", 
+        DataType.BOOL, 
+        description="删除标志"
+    )
+    
+    # 3. 审计字段
+    schema.add_field(
+        "created_by", 
+        DataType.VARCHAR, 
+        max_length=256, 
+        description="创建人"
+    )
+    
+    schema.add_field(
+        "created_time", 
+        DataType.INT64, 
+        description="创建时间戳"
+    )
+    
+    schema.add_field(
+        "updated_by", 
+        DataType.VARCHAR, 
+        max_length=256, 
+        description="修改人"
+    )
+    
+    schema.add_field(
+        "updated_time", 
+        DataType.INT64, 
+        description="修改时间戳"
+    )
+    
+    # 4. 添加BM25函数
+    schema.add_function(
+        Function(
+            name="bm25_fn",
+            input_field_names=["text"],
+            output_field_names=["sparse"],
+            function_type=FunctionType.BM25,
+        )
+    )
+    
+    return schema
+
+
+def create_index(client, collection_name: str):
+    """为Collection创建索引"""
+    # 为dense向量创建索引(使用IVF_FLAT或HNSW)
+    index_params = client.prepare_index_params()
+    
+    # dense向量索引 - 使用HNSW适合高维向量检索
+    index_params.add_index(
+        field_name="dense",
+        index_name="dense_vector_idx",
+        index_type="HNSW",
+        metric_type="COSINE",
+        params={
+            "M": 16,        # 图中每个节点的最大连接数
+            "efConstruction": 200  # 构建时的搜索范围
+        }
+    )
+    
+    # sparse向量索引 - 使用SPARSE_INVERTED_INDEX
+    index_params.add_index(
+        field_name="sparse",
+        index_name="sparse_vector_idx",
+        index_type="SPARSE_INVERTED_INDEX",
+        metric_type="BM25"
+    )
+    
+    client.create_index(
+        collection_name=collection_name,
+        index_params=index_params
+    )
+
+
+def ensure_collection(collection_name: str, create_idx: bool = True):
+    """
+    确保Collection存在,不存在则创建
+    
+    Args:
+        collection_name: Collection名称
+        create_idx: 是否创建索引
+    """
+    client = get_milvusclient()
+    
+    if client.has_collection(collection_name=collection_name):
+        print(f"ℹ️ Collection已存在: {collection_name}")
+        return False
+    
+    print(f"📝 创建Schema: {collection_name}")
+    schema = create_schema()
+    
+    print(f"🏗️ 创建Collection: {collection_name}")
+    client.create_collection(
+        collection_name=collection_name, 
+        schema=schema,
+        consistency_level="Strong"  # 强一致性
+    )
+    
+    if create_idx:
+        print(f"🔍 创建索引: {collection_name}")
+        create_index(client, collection_name)
+    
+    print(f"✅ Collection创建完成: {collection_name}")
+    return True
+
+
+def main():
+    """主函数"""
+    print("=" * 60)
+    print("施工方案知识结构体系 - Milvus Collection 创建工具")
+    print("=" * 60)
+    
+    try:
+        # 创建父表
+        print("\n📋 创建父表...")
+        parent_created = ensure_collection(PARENT_COLLECTION_NAME)
+        
+        # 创建子表
+        print("\n📋 创建子表...")
+        child_created = ensure_collection(CHILD_COLLECTION_NAME)
+        
+        print("\n" + "=" * 60)
+        if parent_created or child_created:
+            print("✅ 所有Collection创建完成!")
+        else:
+            print("ℹ️ 所有Collection已存在,无需创建")
+        print("=" * 60)
+        
+    except Exception as e:
+        print(f"\n❌ 创建失败: {str(e)}")
+        raise
+
+
+# Script entry point: run main() only on direct execution, not on import.
+if __name__ == "__main__":
+    main()