|
|
@@ -0,0 +1,286 @@
|
|
|
+"""
|
|
|
+创建施工方案知识结构体系Milvus Collection脚本
|
|
|
+独立运行,用于创建父子表结构
|
|
|
+t_kngs_construction_plan_parent
|
|
|
+t_kngs_construction_plan_child
|
|
|
+"""
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+from pymilvus import DataType, Function, FunctionType
|
|
|
+
|
|
|
+from app.config.milvus_client import get_milvusclient
|
|
|
+
|
|
|
+# Names of the two Milvus collections (parent table and child table).
|
|
|
+PARENT_COLLECTION_NAME = "t_kngs_construction_plan_parent"
|
|
|
+CHILD_COLLECTION_NAME = "t_kngs_construction_plan_child"
|
|
|
+
|
|
|
+# Dense vector dimension (adjust to match the embedding model in use).
|
|
|
+DENSE_DIM = 4096
|
|
|
+
|
|
|
+
|
|
|
def create_schema():
    """Build the Milvus schema used for the construction-plan collections.

    The schema consists of:

    * ``pk``     - auto-generated INT64 primary key,
    * ``text``   - VARCHAR content with the analyzer enabled,
    * ``dense``  - FLOAT_VECTOR of ``DENSE_DIM`` dimensions,
    * ``sparse`` - SPARSE_FLOAT_VECTOR produced by a BM25 function that
      takes ``text`` as its input,
    * the business and audit scalar fields listed in ``scalar_fields``.

    Returns:
        The schema object returned by ``client.create_schema`` with every
        field and the BM25 function attached.
    """
    client = get_milvusclient()

    # pymilvus documents this option as ``enable_dynamic_field`` (singular);
    # the previous plural spelling was not the documented keyword.
    schema = client.create_schema(auto_id=True, enable_dynamic_field=False)

    # 1. Primary key and core retrieval fields.
    schema.add_field(
        "pk",
        DataType.INT64,
        is_primary=True,
        auto_id=True,
        description="主键ID,自增列",
    )

    # text: raw content; analyzer enabled, this field is the input of the
    # BM25 function registered at the end of this function.
    schema.add_field(
        "text",
        DataType.VARCHAR,
        max_length=65535,
        enable_analyzer=True,
        description="内容文本",
    )

    # dense: dense embedding used for semantic retrieval.
    schema.add_field(
        "dense",
        DataType.FLOAT_VECTOR,
        dim=DENSE_DIM,
        description="密集向量,用于语义检索",
    )

    # sparse: sparse vector, output of the BM25 function.
    schema.add_field(
        "sparse",
        DataType.SPARSE_FLOAT_VECTOR,
        description="稀疏向量,BM25关键字检索",
    )

    # Remaining scalar fields as (name, data type, keyword options).
    # Declaration order is preserved because it defines the field order of
    # the resulting schema.
    scalar_fields = [
        # 2. Business fields.
        ("document_id", DataType.VARCHAR,
         {"max_length": 256, "description": "文档ID,样本中心上传文档ID"}),
        ("parent_id", DataType.VARCHAR,
         {"max_length": 256, "description": "父段ID"}),
        ("index", DataType.INT64,
         {"description": "索引序号"}),
        ("tag_list", DataType.VARCHAR,
         {"max_length": 4096, "description": "标签列表,逗号分隔"}),
        ("permission", DataType.JSON,
         {"description": "权限信息JSON,后期扩展"}),
        ("metadata", DataType.JSON,
         {"description": "元数据JSON"}),
        ("file_name", DataType.VARCHAR,
         {"max_length": 512, "description": "文件名称"}),
        ("plan_type", DataType.VARCHAR,
         {"max_length": 256,
          "description": "施工方案工艺类型,如:简支梁(T型梁或小箱梁)预制、运输及架桥机安装"}),
        ("file_url", DataType.VARCHAR,
         {"max_length": 1024, "description": "文件URL地址,上传OSS文件URL地址"}),
        ("chapter_title", DataType.VARCHAR,
         {"max_length": 512, "description": "章节标题"}),
        ("chapter_level_1", DataType.VARCHAR,
         {"max_length": 128, "description": "一级章节类型"}),
        ("chapter_level_2", DataType.VARCHAR,
         {"max_length": 128, "description": "二级章节类型"}),
        ("chapter_level_3", DataType.VARCHAR,
         {"max_length": 128, "description": "三级章节类型"}),
        ("is_deleted", DataType.BOOL,
         {"description": "删除标志"}),
        # 3. Audit fields.
        ("created_by", DataType.VARCHAR,
         {"max_length": 256, "description": "创建人"}),
        ("created_time", DataType.INT64,
         {"description": "创建时间戳"}),
        ("updated_by", DataType.VARCHAR,
         {"max_length": 256, "description": "修改人"}),
        ("updated_time", DataType.INT64,
         {"description": "修改时间戳"}),
    ]
    for field_name, data_type, options in scalar_fields:
        schema.add_field(field_name, data_type, **options)

    # 4. BM25 function: derives the ``sparse`` field from ``text``.
    schema.add_function(
        Function(
            name="bm25_fn",
            input_field_names=["text"],
            output_field_names=["sparse"],
            function_type=FunctionType.BM25,
        )
    )

    return schema
|
|
|
+
|
|
|
+
|
|
|
def create_index(client, collection_name: str):
    """Create the dense and sparse vector indexes on a collection.

    Args:
        client: Milvus client object; it must provide
            ``prepare_index_params`` and ``create_index``.
        collection_name: Name of the collection to index.
    """
    hnsw_build_params = {
        "M": 16,  # maximum number of connections per graph node
        "efConstruction": 200,  # search range used while building the graph
    }

    index_specs = (
        # Dense vector: HNSW graph index with cosine similarity.
        {
            "field_name": "dense",
            "index_name": "dense_vector_idx",
            "index_type": "HNSW",
            "metric_type": "COSINE",
            "params": hnsw_build_params,
        },
        # Sparse vector: inverted index scored with BM25.
        {
            "field_name": "sparse",
            "index_name": "sparse_vector_idx",
            "index_type": "SPARSE_INVERTED_INDEX",
            "metric_type": "BM25",
        },
    )

    pending = client.prepare_index_params()
    for spec in index_specs:
        pending.add_index(**spec)

    client.create_index(collection_name=collection_name, index_params=pending)
|
|
|
+
|
|
|
+
|
|
|
def ensure_collection(collection_name: str, create_idx: bool = True):
    """Make sure the collection exists, creating it when it does not.

    An existing collection is never recreated. However, if ``create_idx`` is
    true and the existing collection reports no index at all (for example
    because an earlier run failed after creating the collection but before
    building its indexes), the indexes are built so that re-running the
    script repairs that state.

    Args:
        collection_name: Name of the collection.
        create_idx: Whether to create the vector indexes.

    Returns:
        True if the collection was created by this call, False if it
        already existed.
    """
    client = get_milvusclient()

    if client.has_collection(collection_name=collection_name):
        print(f"ℹ️ Collection已存在: {collection_name}")
        # Repair path: without this, a collection left unindexed by a
        # failed earlier run would be skipped forever by the early return.
        if create_idx and not client.list_indexes(collection_name=collection_name):
            print(f"🔍 创建索引: {collection_name}")
            create_index(client, collection_name)
        return False

    print(f"📝 创建Schema: {collection_name}")
    schema = create_schema()

    print(f"🏗️ 创建Collection: {collection_name}")
    client.create_collection(
        collection_name=collection_name,
        schema=schema,
        consistency_level="Strong",  # strong consistency
    )

    if create_idx:
        print(f"🔍 创建索引: {collection_name}")
        create_index(client, collection_name)

    print(f"✅ Collection创建完成: {collection_name}")
    return True
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Script entry point: ensure the parent and child collections exist.

    Prints a banner, calls ``ensure_collection`` for the parent collection
    and then the child collection, and prints a summary. Any exception is
    reported on stdout and re-raised.
    """
    banner = "=" * 60
    print(banner)
    print("施工方案知识结构体系 - Milvus Collection 创建工具")
    print(banner)

    # (progress message, collection name), handled in this order:
    # parent table first, then child table.
    targets = (
        ("\n📋 创建父表...", PARENT_COLLECTION_NAME),
        ("\n📋 创建子表...", CHILD_COLLECTION_NAME),
    )

    try:
        created_flags = []
        for progress_message, name in targets:
            print(progress_message)
            created_flags.append(ensure_collection(name))

        print("\n" + banner)
        if any(created_flags):
            print("✅ 所有Collection创建完成!")
        else:
            print("ℹ️ 所有Collection已存在,无需创建")
        print(banner)

    except Exception as e:
        # Top-level boundary: report the failure, then propagate it.
        print(f"\n❌ 创建失败: {e!s}")
        raise
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":  # Run main() only when executed as a script, not on import.
|
|
|
+ main()
|