Преглед изворни кода

fix: 使用chinese分词器

ai02 пре 4 недеља
родитељ
комит
7c0e92fc55
1 измењених фајлова са 69 додато и 28 уклоњено
  1. 69 28
      src/app/scripts/t_kngs_construction_plan_collections_create.py

+ 69 - 28
src/app/scripts/t_kngs_construction_plan_collections_create.py

@@ -39,6 +39,7 @@ def create_schema():
         DataType.VARCHAR, 
         DataType.VARCHAR, 
         max_length=65535, 
         max_length=65535, 
         enable_analyzer=True, 
         enable_analyzer=True, 
+        analyzer_params={"type": "chinese"},
         description="内容文本"
         description="内容文本"
     )
     )
     
     
@@ -194,33 +195,70 @@ def create_schema():
 
 
 def create_index(client, collection_name: str):
     """Create the scalar and vector indexes of a collection, one at a time.

     Each index is submitted in its own ``create_index`` call so that an index
     that already exists does not prevent the remaining ones from being built.
     The function is meant to be safe to re-run against an existing collection.

     Args:
         client: Object exposing ``prepare_index_params()`` and
             ``create_index(collection_name=..., index_params=...)``
             (presumably a pymilvus ``MilvusClient`` -- confirm against caller).
             If it also exposes ``list_indexes(collection_name=...)``, the
             returned index names are used to skip indexes that already exist.
         collection_name: Name of the collection to index.

     Raises:
         Exception: Any error raised by ``client.create_index`` whose message
             does not indicate that the index already exists is re-raised.
     """

     def inverted(field_name, **params):
         """Build an INVERTED index spec named ``<field_name>_idx``."""
         spec = {
             "field_name": field_name,
             "index_name": f"{field_name}_idx",
             "index_type": "INVERTED",
         }
         if params:
             spec["params"] = params
         return spec

     index_specs = [
         # Inverted index on the text field.
         # NOTE(review): the original comment says this index supports full-text
         # search together with the Chinese analyzer; an INVERTED index looks
         # like a scalar-filter index, while full-text search is presumably
         # served by the BM25 sparse index below -- confirm.
         inverted("text"),
         # Dense vector index: HNSW suits high-dimensional vector retrieval.
         {
             "field_name": "dense",
             "index_name": "dense_vector_idx",
             "index_type": "HNSW",
             "metric_type": "COSINE",
             "params": {
                 "M": 16,  # maximum number of connections per graph node
                 "efConstruction": 200,  # search breadth while building
             },
         },
         # Sparse vector index: SPARSE_INVERTED_INDEX scored with BM25.
         {
             "field_name": "sparse",
             "index_name": "sparse_vector_idx",
             "index_type": "SPARSE_INVERTED_INDEX",
             "metric_type": "BM25",
         },
         # Remaining scalar fields (everything except pk and sparse).
         *(inverted(name) for name in ("document_id", "parent_id", "index", "tag_list")),
         # NOTE(review): these two carry json_cast_type but no json_path;
         # verify that the server accepts a JSON index without an explicit path.
         inverted("permission", json_cast_type="VARCHAR"),
         inverted("metadata", json_cast_type="VARCHAR"),
         *(
             inverted(name)
             for name in (
                 "file_name",
                 "plan_type",
                 "file_url",
                 "chapter_title",
                 "chapter_level_1",
                 "chapter_level_2",
                 "chapter_level_3",
                 "is_deleted",
                 "created_by",
                 "created_time",
                 "updated_by",
                 "updated_time",
             )
         ),
     ]

     # Prefer asking the client which indexes exist over parsing error text.
     # Clients without list_indexes fall back to the error-text check below.
     list_indexes = getattr(client, "list_indexes", None)
     if callable(list_indexes):
         existing = set(list_indexes(collection_name=collection_name) or ())
     else:
         existing = set()

     for spec in index_specs:
         label = f"{collection_name}.{spec['index_name']}"
         if spec["index_name"] in existing:
             print(f"ℹ️ 索引已存在,跳过: {label}")
             continue

         # A fresh params object per index keeps every request independent.
         index_params = client.prepare_index_params()
         index_params.add_index(**spec)
         try:
             client.create_index(collection_name=collection_name, index_params=index_params)
         except Exception as e:
             # Fallback for races and for clients that cannot list indexes:
             # treat "already exists"-style errors as a skip, re-raise the rest.
             error_text = str(e).lower()
             if "already exist" in error_text or "duplicate" in error_text:
                 print(f"ℹ️ 索引已存在,跳过: {label}")
                 continue
             raise
         else:
             print(f"✅ 索引创建成功: {label}")
 
 
 
 
 def ensure_collection(collection_name: str, create_idx: bool = True, auto_load: bool = True):
 def ensure_collection(collection_name: str, create_idx: bool = True, auto_load: bool = True):
@@ -236,6 +274,9 @@ def ensure_collection(collection_name: str, create_idx: bool = True, auto_load:
     
     
     if client.has_collection(collection_name=collection_name):
     if client.has_collection(collection_name=collection_name):
         print(f"ℹ️ Collection已存在: {collection_name}")
         print(f"ℹ️ Collection已存在: {collection_name}")
+        if create_idx:
+            print(f"🔍 Collection已存在,补建缺失索引: {collection_name}")
+            create_index(client, collection_name)
         # 即使已存在也尝试加载
         # 即使已存在也尝试加载
         if auto_load:
         if auto_load:
             try:
             try:
@@ -252,7 +293,7 @@ def ensure_collection(collection_name: str, create_idx: bool = True, auto_load:
     client.create_collection(
     client.create_collection(
         collection_name=collection_name, 
         collection_name=collection_name, 
         schema=schema,
         schema=schema,
-        consistency_level="Strong"  # 强一致性
+        consistency_level="Bounded"  # 有界一致性
     )
     )
     
     
     if create_idx:
     if create_idx: