linyang 4 недели назад
Родитель
Commit
0810474eae

+ 9 - 4
src/app/api/v1/document/knowledge_base.py

@@ -60,8 +60,8 @@ async def get_knowledge_bases(
                     description="Synced from Milvus",
                     status="normal",
                     document_count=row_count,
-                    created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                    updated_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    created_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    updated_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                 )
                 db.add(new_kb)
                 has_changes = True
@@ -92,13 +92,18 @@ async def get_knowledge_bases(
 
     # 计算总数
     count_query = select(func.count()).select_from(query.subquery())
-    total = await db.scalar(count_query)
+    total = await db.scalar(count_query) or 0  # 确保 total 不为 None
 
     # 分页查询
-    query = query.order_by(KnowledgeBase.created_at.desc()).offset((page - 1) * page_size).limit(page_size)
+    # 使用 created_time 而不是 created_at
+    query = query.order_by(KnowledgeBase.created_time.desc()).offset((page - 1) * page_size).limit(page_size)
     result = await db.execute(query)
     items = result.scalars().all()
 
+    # 设置 is_synced 属性 (非数据库字段,用于前端展示)
+    for item in items:
+        item.is_synced = item.collection_name in milvus_names
+
     total_pages = ceil(total / page_size) if page_size else 0
     
     meta = PaginationSchema(

+ 2 - 0
src/app/sample/models/knowledge_base.py

@@ -17,6 +17,8 @@ class KnowledgeBase(BaseModel):
     status = Column(String(20), default="normal", comment="状态: normal(正常), test(测试), disabled(禁用)")
     document_count = Column(Integer, default=0, comment="文档数量")
     is_deleted = Column(TINYINT, default=0, comment="是否删除")
+    created_by = Column(String(128), nullable=True, comment="创建人")
+    updated_by = Column(String(128), nullable=True, comment="修改人")
 
     def __repr__(self):
         return f"<KnowledgeBase {self.name}>"

+ 4 - 0
src/app/sample/schemas/knowledge_base.py

@@ -34,6 +34,7 @@ class KnowledgeBaseUpdate(BaseModel):
     name: Optional[str] = None
     description: Optional[str] = None
     status: Optional[str] = None
+    metadata_fields: Optional[List[MetadataField]] = Field(None, description="元数据字段列表")
 
 class DescriptionUpdate(BaseModel):
     """仅更新描述(保留原有兼容性)"""
@@ -49,6 +50,9 @@ class KnowledgeBaseResponse(BaseModelSchema):
     description: Optional[str]
     status: str
     document_count: int
+    is_synced: bool = Field(False, description="是否已同步到Milvus")
+    created_by: Optional[str] = None
+    updated_by: Optional[str] = None
     created_at: Optional[datetime] = None
     updated_at: Optional[datetime] = None
 

+ 156 - 57
src/app/services/knowledge_base_service.py

@@ -101,6 +101,10 @@ class KnowledgeBaseService:
         result = await db.execute(query)
         items = result.scalars().all()
 
+        # 设置 is_synced 属性 (非数据库字段,用于前端展示)
+        for item in items:
+            item.is_synced = item.collection_name in milvus_names
+
         total_pages = ceil(total / page_size) if page_size else 0
         
         meta = PaginationSchema(
@@ -121,6 +125,20 @@ class KnowledgeBaseService:
         ))
         if exists.scalars().first():
             raise ValueError("知识库集合名称已存在")
+        
+        # 额外检查:是否在软删除记录中存在?如果存在,建议先彻底删除或恢复
+        # 这里我们简单处理:如果存在软删除记录,直接物理删除它,以便重新创建
+        # 或者在 exists 查询中去掉 is_deleted 条件,禁止重名
+        
+        soft_deleted = await db.execute(select(KnowledgeBase).where(
+            KnowledgeBase.collection_name == payload.collection_name,
+            KnowledgeBase.is_deleted == 1
+        ))
+        soft_deleted_kb = soft_deleted.scalars().first()
+        if soft_deleted_kb:
+            # 物理删除软删除记录,允许重建
+            await db.delete(soft_deleted_kb)
+            await db.flush() # 立即执行删除
 
         # 2. 检查 Milvus 是否已存在 (如果之前残留)
         # if milvus_service.has_collection(payload.collection_name):
@@ -138,6 +156,8 @@ class KnowledgeBaseService:
                 collection_name=payload.collection_name,
                 description=payload.description,
                 status=payload.status or "normal",
+                created_by="admin", # 暂时硬编码为 admin,后续对接用户系统
+                updated_by="admin",
                 created_time=now,
                 updated_time=now
             )
@@ -156,21 +176,10 @@ class KnowledgeBaseService:
                     )
                     db.add(new_metadata)
             
-            # 6. 保存自定义Schema定义 (如果有)
-            if payload.custom_schemas:
-                for schema_field in payload.custom_schemas:
-                    new_schema = CustomSchema(
-                        id=str(uuid.uuid4()),
-                        knowledge_base_id=new_kb.id,
-                        field_name=schema_field.field_name,
-                        field_type=schema_field.field_type,
-                        max_length=schema_field.max_length,
-                        is_primary=schema_field.is_primary,
-                        description=schema_field.description,
-                        created_time=now,
-                        updated_time=now
-                    )
-                    db.add(new_schema)
+            # 6. 保存自定义Schema定义 (如果有) - 已废弃,使用固定Schema
+            # if payload.custom_schemas:
+            #     for schema_field in payload.custom_schemas:
+            #         ...
 
             await db.commit()
             await db.refresh(new_kb)
@@ -198,7 +207,25 @@ class KnowledgeBaseService:
             if payload.status is not None:
                 kb.status = payload.status
             
-            kb.created_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            # 更新元数据字段 (Metadata Fields)
+            if payload.metadata_fields is not None:
+                # 1. 删除旧的元数据字段
+                await db.execute(sql_delete(SampleMetadata).where(SampleMetadata.knowledge_base_id == id))
+                
+                # 2. 插入新的元数据字段
+                for field in payload.metadata_fields:
+                    new_metadata = SampleMetadata(
+                        id=str(uuid.uuid4()),
+                        knowledge_base_id=kb.id,
+                        field_zh_name=field.field_zh_name,
+                        field_en_name=field.field_en_name,
+                        field_type=field.field_type,
+                        remark=field.remark
+                    )
+                    db.add(new_metadata)
+            
+            kb.updated_by = "admin" # 暂时硬编码
+            kb.updated_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # 使用 updated_time 而不是 created_time
             await db.commit()
             await db.refresh(kb)
             
@@ -240,6 +267,20 @@ class KnowledgeBaseService:
         if not kb:
             raise ValueError("知识库不存在")
 
+        # 检查文档数量
+        # 先尝试从 Milvus 获取最新的实时数量
+        current_count = kb.document_count
+        if kb.collection_name and milvus_service.has_collection(kb.collection_name):
+            try:
+                stats = milvus_service.client.get_collection_stats(kb.collection_name)
+                current_count = int(stats.get("row_count", 0))
+            except Exception:
+                # 获取失败则使用 DB 中的缓存值
+                pass
+        
+        if current_count > 0:
+            raise ValueError(f"知识库中仍有 {current_count} 条文档,请先清空文档后再删除")
+
         try:
             # 1. 删除 Milvus 集合 (强制删除)
             try:
@@ -275,52 +316,122 @@ class KnowledgeBaseService:
         if milvus_service.has_collection(kb.collection_name):
             raise ValueError("Milvus集合已存在")
             
-        # 查询自定义Schema
-        schema_query = select(CustomSchema).where(CustomSchema.knowledge_base_id == id)
-        schema_result = await db.execute(schema_query)
-        custom_schemas = schema_result.scalars().all()
-        
-        fields = []
-        # 1. 添加用户自定义的Schema字段
-        if custom_schemas:
-            for s in custom_schemas:
-                fields.append({
-                    "name": s.field_name,
-                    "type": s.field_type,
-                    "max_length": s.max_length,
-                    "is_primary": s.is_primary,
-                    "description": s.description
-                })
+        # 使用固定的 Schema 定义
+        # 这些字段必须与 snippet_service.py 中 insert 的数据结构一致
+        fields = [
+            {
+                "name": "pk",
+                "type": "INT64",
+                "is_primary": True,
+                "description": "主键"
+            },
+            {
+                "name": "text",
+                "type": "VARCHAR",
+                "max_length": 65535,
+                "description": "内容"
+            },
+            {
+                "name": "vector",
+                "type": "FLOAT_VECTOR",
+                "description": "向量列"
+            },
+            {
+                "name": "sparse",
+                "type": "BM25",
+                "description": "内容的BM25关键字检索"
+            },
+            {
+                "name": "document_id",
+                "type": "VARCHAR",
+                "max_length": 128,
+                "description": "样本中心上传文档ID"
+            },
+            {
+                "name": "parent_id",
+                "type": "VARCHAR",
+                "max_length": 128,
+                "description": "父段ID"
+            },
+            {
+                "name": "index",
+                "type": "INT64",
+                "description": "索引序号"
+            },
+            {
+                "name": "tag_list",
+                "type": "VARCHAR",
+                "max_length": 2048,
+                "description": "标签"
+            },
+            {
+                "name": "permission",
+                "type": "JSON",
+                "description": "权限"
+            },
+            {
+                "name": "metadata",
+                "type": "JSON",
+                "description": "元数据"
+            },
+            {
+                "name": "is_deleted",
+                "type": "BOOL",
+                "description": "删除标志"
+            },
+            {
+                "name": "created_by",
+                "type": "VARCHAR",
+                "max_length": 128,
+                "description": "创建人"
+            },
+            {
+                "name": "created_time",
+                "type": "INT64",
+                "description": "创建时间"
+            },
+            {
+                "name": "updated_by",
+                "type": "VARCHAR",
+                "max_length": 128,
+                "description": "修改人"
+            },
+            {
+                "name": "updated_time",
+                "type": "INT64",
+                "description": "修改时间"
+            }
+        ]
         
-        # 2. 自动添加 metadata 字段 (JSON类型)
+        # 2. 自动添加 metadata 字段 (JSON类型) - 已包含在 fields 中,移除此段逻辑
         # 即使没有定义元数据字段,通常也需要一个 JSON 类型的 metadata 字段来存储灵活的元数据
         # 如果用户在 t_samp_metadata 中定义了元数据结构,这些结构实际上是存储在 metadata 字段中的 KV 对
         # 但为了方便检索,我们也可以选择将 metadata 作为一个独立的 JSON 字段存在 Milvus 中
         
         # 检查是否已经有名为 'metadata' 的自定义字段,避免冲突
-        has_metadata_field = any(f['name'] == 'metadata' for f in fields)
-        if not has_metadata_field:
-            fields.append({
-                "name": "metadata",
-                "type": "JSON",
-                "description": "默认元数据字段"
-            })
+        # has_metadata_field = any(f['name'] == 'metadata' for f in fields)
+        # if not has_metadata_field:
+        #     fields.append({
+        #         "name": "metadata",
+        #         "type": "JSON",
+        #         "description": "默认元数据字段"
+        #     })
         
         try:
             # 暂时无法获取维度信息,默认768,或者应该在数据库中存储维度
             # 假设默认 768,后续可以在 KnowledgeBase 模型中增加 dimension 字段
             milvus_service.create_collection(
                 name=kb.collection_name,
-                dimension=768, 
+                dimension=milvus_service.DENSE_DIM, 
                 description=kb.description or "",
-                fields=fields if fields else None
+                fields=fields
             )
             return kb
         except Exception as e:
             raise e
 
     async def get_metadata_and_schema(self, db: AsyncSession, kb_id: str) -> Dict[str, List[dict]]:
-        """获取知识库的元数据字段列表和自定义Schema"""
+        """获取知识库的元数据字段列表 (Schema已固定,不再返回自定义Schema)"""
         # 检查知识库是否存在
         result = await db.execute(select(KnowledgeBase).where(KnowledgeBase.id == kb_id, KnowledgeBase.is_deleted == 0))
         kb = result.scalars().first()
@@ -332,22 +443,10 @@ class KnowledgeBaseService:
         meta_result = await db.execute(meta_query)
         metadata_fields = [f.to_dict() for f in meta_result.scalars().all()]
         
-        # 查询自定义Schema表
-        schema_query = select(CustomSchema).where(CustomSchema.knowledge_base_id == kb_id)
-        schema_result = await db.execute(schema_query)
-        
-        custom_schemas = []
-        for s in schema_result.scalars().all():
-            custom_schemas.append({
-                "field_name": s.field_name,
-                "field_type": s.field_type,
-                "max_length": s.max_length,
-                "description": s.description
-            })
-            
+        # 返回空的 custom_schemas,因为现在是固定 Schema
         return {
             "metadata_fields": metadata_fields,
-            "custom_schemas": custom_schemas
+            "custom_schemas": [] 
         }
 
     async def get_metadata_fields(self, db: AsyncSession, kb_id: str) -> List[dict]:

+ 530 - 282
src/app/services/milvus_service.py

@@ -1,305 +1,553 @@
+"""
+Milvus Service:业务层(直接用 manager.client 调 Milvus 原生方法)
+"""
+from __future__ import annotations
+
+import sys
+import os
+
+# 添加src目录到Python路径
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..'))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../..'))
 
-import time
-import re
-import hashlib
 import logging
-import json
-from typing import List, Dict, Any, Tuple, Optional
-from langchain_core.documents import Document
-from langchain_openai import OpenAIEmbeddings
-from pymilvus import MilvusClient, DataType, Function, FunctionType
+from typing import List, Dict, Any
+from datetime import datetime
 
-from app.core.config import config_handler
+from app.base import get_milvus_manager, get_milvus_vectorstore, get_embedding_model
 
 logger = logging.getLogger(__name__)
 
+
 class MilvusService:
-    """Milvus 向量库服务类,实现父子块切分与混合检索存储"""
-    
-    def __init__(self, uri: str, db_name: str, parent_collection: str, child_collection: str):
-        self.client = MilvusClient(uri=uri, db_name=db_name)
-        self.parent_collection = parent_collection
-        self.child_collection = child_collection
-        self.emb = self._get_embeddings()
-        
-        # 配置参数
-        self.PARENT_MAX_CHARS = 6000
+    def __init__(self):
+        self.client = get_milvus_manager().client
+        # 获取embedding model
+        self.emdmodel = get_embedding_model()
+        # 默认向量维度 (Qwen3-Embedding-8B default)
         self.DENSE_DIM = 4096
-        self.H1_RE = re.compile(r"^#\s+(.+?)\s*$", re.MULTILINE)
-        self.BLANK_SPLIT_RE = re.compile(r"\n\s*\n+")
-        
-        # 确保集合已创建
-        self.ensure_collections()
 
-    def has_collection(self, collection_name: str) -> bool:
-        """检查集合是否存在"""
-        return self.client.has_collection(collection_name=collection_name)
-
-    def _get_embeddings(self) -> OpenAIEmbeddings:
-        """获取 Embedding 模型配置"""
-        return OpenAIEmbeddings(
-            base_url=config_handler.get("admin_app", "EMBEDDING_BASE_URL", "http://192.168.91.253:9003/v1"),
-            model=config_handler.get("admin_app", "EMBEDDING_MODEL", "Qwen3-Embedding-8B"),
-            api_key=config_handler.get("admin_app", "EMBEDDING_API_KEY", "dummy"),
-        )
+    def create_collection(self, name: str, dimension: int = None, description: str = "", fields: List[Dict] = None) -> None:
+        """
+        创建 Milvus 集合
+        :param dimension: 向量维度,如果为None则使用默认值
+        :param fields: 自定义字段列表,每个元素为 {"name": "age", "type": "INT64", ...}
+        """
+        # 使用默认维度
+        if dimension is None:
+            dimension = self.DENSE_DIM
 
-    # --- 切分工具方法 ---
-    
-    def _split_md_by_blank_lines(self, md: str) -> List[str]:
-        md = md.replace("\r\n", "\n").replace("\r", "\n")
-        parts = self.BLANK_SPLIT_RE.split(md)
-        return [p.strip() for p in parts if p.strip()]
-
-    def _is_heading_chunk(self, chunk: str) -> Optional[Tuple[int, str]]:
-        first_line = chunk.split("\n", 1)[0].strip()
-        m = re.match(r"^(#{1,6})\s+(.+?)\s*$", first_line)
-        if not m:
-            return None
-        return len(m.group(1)), m.group(2).strip()
-
-    def _split_md_by_h1_sections(self, md: str) -> List[Tuple[str, str]]:
-        """按一级标题切分父块"""
-        md = md.replace("\r\n", "\n").replace("\r", "\n")
-        matches = list(self.H1_RE.finditer(md))
-        if not matches:
-            txt = md.strip()
-            return [("__NO_H1__", txt)] if txt else []
-
-        sections = []
-        # 检查第一个#之前的内容
-        first_match_start = matches[0].start()
-        preamble = md[:first_match_start].strip()
-        if preamble:
-            sections.append(("__PREAMBLE__", preamble))
-        
-        for i, m in enumerate(matches):
-            title = m.group(1).strip()
-            start = m.start()
-            end = matches[i + 1].start() if i + 1 < len(matches) else len(md)
-            sec = md[start:end].strip()
-            if sec:
-                sections.append((title, sec))
-        return sections
-
-    def _make_parent_id(self, doc_id: str, doc_version: int, doc_name: str, h1_title: str, parent_seq: int) -> str:
-        """生成稳定的 parent_id"""
-        raw = f"{doc_id}|{doc_version}|{doc_name}|{parent_seq}|{h1_title}".encode("utf-8")
-        return hashlib.sha1(raw).hexdigest()
-
-    def _split_text_by_max_chars(self, text: str, max_chars: int) -> List[str]:
-        """父段过长时切片"""
-        text = (text or "").strip()
-        if not text or len(text) <= max_chars:
-            return [text] if text else []
-
-        chunks = self._split_md_by_blank_lines(text)
-        result = []
-        current_slice = ""
-        
-        for chunk in chunks:
-            if len(chunk) > max_chars:
-                if current_slice.strip():
-                    result.append(current_slice.strip())
-                    current_slice = ""
-                start = 0
-                while start < len(chunk):
-                    result.append(chunk[start : start + max_chars].strip())
-                    start += max_chars
-            else:
-                test_slice = current_slice + "\n\n" + chunk if current_slice else chunk
-                if len(test_slice) <= max_chars:
-                    current_slice = test_slice
-                else:
-                    if current_slice.strip():
-                        result.append(current_slice.strip())
-                    current_slice = chunk
+        if self.client.has_collection(name):
+            logger.info(f"Collection {name} already exists.")
+            return
         
-        if current_slice.strip():
-            result.append(current_slice.strip())
-        return [s for s in result if s]
-
-    # --- 核心业务逻辑 ---
-
-    def ensure_collections(self):
-        """确保父子 Collection 已创建并配置索引"""
-        for col_name in [self.parent_collection, self.child_collection]:
-            if not self.client.has_collection(collection_name=col_name):
-                schema = self.client.create_schema(auto_id=True, enable_dynamic_fields=False)
-                schema.add_field("pk", DataType.INT64, is_primary=True, auto_id=True)
-                schema.add_field("text", DataType.VARCHAR, max_length=65535, enable_analyzer=True)
-                schema.add_field("dense", DataType.FLOAT_VECTOR, dim=self.DENSE_DIM)
-                schema.add_field("sparse", DataType.SPARSE_FLOAT_VECTOR)
-                schema.add_field("document_id", DataType.VARCHAR, max_length=256)
-                schema.add_field("parent_id", DataType.VARCHAR, max_length=256)
-                schema.add_field("index", DataType.INT64)
-                schema.add_field("tag_list", DataType.VARCHAR, max_length=2048)
-                schema.add_field("permission", DataType.JSON)
-                schema.add_field("metadata", DataType.JSON)
-                schema.add_field("is_deleted", DataType.INT64)
-                schema.add_field("created_by", DataType.VARCHAR, max_length=256)
-                schema.add_field("created_time", DataType.INT64)
-                schema.add_field("updated_by", DataType.VARCHAR, max_length=256)
-                schema.add_field("updated_time", DataType.INT64)
-
-                schema.add_function(Function(
-                    name="bm25_fn",
-                    input_field_names=["text"],
-                    output_field_names=["sparse"],
-                    function_type=FunctionType.BM25,
-                ))
-                
-                self.client.create_collection(collection_name=col_name, schema=schema)
-                
-                index_params = self.client.prepare_index_params()
-                index_params.add_index(field_name="dense", index_name="dense_idx", index_type="AUTOINDEX", metric_type="COSINE")
-                index_params.add_index(field_name="sparse", index_name="bm25_idx", index_type="SPARSE_INVERTED_INDEX", metric_type="BM25", params={"inverted_index_algo": "DAAT_MAXSCORE"})
-                self.client.create_index(collection_name=col_name, index_params=index_params)
+        # 如果有自定义字段,使用 schema 创建
+        if fields:
+            from pymilvus import MilvusClient, DataType, Function, FunctionType
             
-            self.client.load_collection(collection_name=col_name)
-
-    async def insert_knowledge(self, md_text: str, doc_info: Dict[str, Any]):
-        """执行切分、向量化并存入 Milvus"""
-        doc_id = doc_info['doc_id']
-        doc_name = doc_info.get('doc_name', 'unknown')
-        doc_version = doc_info.get('doc_version', 20260127)
-        tag_list = str(doc_info.get('tags') or '')
-        
-        # 公共字段准备
-        created_by = doc_info.get('created_by', 'system')
-        created_time = doc_info.get('created_time', int(time.time() * 1000))
-        updated_by = doc_info.get('updated_by', 'system')
-        updated_time = doc_info.get('updated_time', int(time.time() * 1000))
-        permission = doc_info.get('permission', {})
-
-        try:
-            # 1. 幂等处理:清理旧数据
-            try:
-                self.client.delete(collection_name=self.parent_collection, filter=f"document_id == '{doc_id}'")
-                self.client.delete(collection_name=self.child_collection, filter=f"document_id == '{doc_id}'")
-            except Exception as e:
-                logger.warning(f"清理旧数据失败 (doc_id: {doc_id}): {e}")
-                # 继续执行,可能是第一次入库
+            # 1. 创建 Schema
+            schema = MilvusClient.create_schema(
+                auto_id=True,
+                enable_dynamic_field=True,
+                description=description
+            )
+            
+            # 检查字段中是否定义了主键
+            has_primary = any(f.get("is_primary") for f in fields)
+            if not has_primary:
+                # 如果没有定义主键,添加默认主键
+                schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True)
+            
+            # 检查是否有默认向量列,如果没有则添加 (兼容旧逻辑,但如果fields里有vector则不添加)
+            has_vector = any(f.get("type") == "FLOAT_VECTOR" for f in fields)
+            if not has_vector:
+                schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=dimension)
+            
+            # 3. 添加用户自定义字段
+            type_map = {
+                "BOOL": DataType.BOOL,
+                "INT8": DataType.INT8,
+                "INT16": DataType.INT16,
+                "INT32": DataType.INT32,
+                "INT64": DataType.INT64,
+                "FLOAT": DataType.FLOAT,
+                "DOUBLE": DataType.DOUBLE,
+                "VARCHAR": DataType.VARCHAR,
+                "JSON": DataType.JSON,
+                "FLOAT_VECTOR": DataType.FLOAT_VECTOR,
+                "SPARSE_FLOAT_VECTOR": DataType.SPARSE_FLOAT_VECTOR,
+                "BM25": DataType.SPARSE_FLOAT_VECTOR # BM25 特殊处理,映射为稀疏向量
+            }
+            
+            bm25_field = None
+            text_field_name = "text" # 默认文本字段名
 
-            # 2. 切分父子块
-            try:
-                parent_sections = self._split_md_by_h1_sections(md_text)
-                parent_entities = []
-                child_entities = []
+            for f in fields:
+                field_type_str = f.get("type", "").upper()
+                dtype = type_map.get(field_type_str)
+                if not dtype:
+                    continue 
                 
-                # 预生成所有 parent_id
-                parent_seq_to_id = {}
-                for seq, (title, _) in enumerate(parent_sections):
-                    parent_seq_to_id[seq] = self._make_parent_id(doc_id, doc_version, doc_name, title, seq)
-
-                # 3. 处理子块
-                for seq, (h1_title, sec_text) in enumerate(parent_sections):
-                    p_id = parent_seq_to_id[seq]
-                    chunks = self._split_md_by_blank_lines(sec_text)
-                    heading_path = []
-                    
-                    for c_idx, chunk in enumerate(chunks):
-                        h_info = self._is_heading_chunk(chunk)
-                        if h_info:
-                            level, title = h_info
-                            heading_path = heading_path[:level-1] + [title]
-                        
-                        outline_path = " > ".join(heading_path)
-                        
-                        child_entities.append({
-                            "text": chunk,
-                            "is_deleted": 0,
-                            "parent_id": p_id,
-                            "document_id": doc_id,
-                            "index": int(c_idx),
-                            "tag_list": tag_list,
-                            "permission": permission,
-                            "metadata": {
-                                "doc_name": doc_name,
-                                "outline_path": outline_path,
-                                "doc_version": doc_version
-                            },
-                            "created_by": created_by,
-                            "created_time": created_time,
-                            "updated_by": updated_by,
-                            "updated_time": updated_time
-                        })
-
-                # 4. 处理父块
-                for seq, (h1_title, sec_text) in enumerate(parent_sections):
-                    p_id = parent_seq_to_id[seq]
-                    slices = self._split_text_by_max_chars(sec_text, self.PARENT_MAX_CHARS)
-                    for s_idx, slice_text in enumerate(slices):
-                        parent_entities.append({
-                            "text": slice_text,
-                            "is_deleted": 0,
-                            "parent_id": p_id,
-                            "document_id": doc_id,
-                            "index": int(seq),
-                            "tag_list": tag_list,
-                            "permission": permission,
-                            "metadata": {
-                                "doc_name": doc_name,
-                                "outline_path": h1_title if h1_title not in ["__PREAMBLE__", "__NO_H1__"] else doc_name,
-                                "doc_version": doc_version
-                            },
-                            "created_by": created_by,
-                            "created_time": created_time,
-                            "updated_by": updated_by,
-                            "updated_time": updated_time
-                        })
-            except Exception as e:
-                logger.error(f"文档切分失败 (doc_id: {doc_id}): {e}")
-                raise RuntimeError(f"文档切分处理异常: {str(e)}")
+                # 记录文本字段名,供BM25使用
+                if f.get("name") in ["text", "content", "chunk"]: 
+                    text_field_name = f.get("name")
 
-            # 5. 向量化并插入
-            # 处理父块
-            if parent_entities:
-                try:
-                    p_texts = [e['text'] for e in parent_entities]
-                    p_vecs = self.emb.embed_documents(p_texts)
-                    for e, v in zip(parent_entities, p_vecs): e['dense'] = v
-                except Exception as e:
-                    logger.error(f"父块向量化失败 (Embedding Service): {e}")
-                    raise RuntimeError(f"Embedding 服务调用失败: {str(e)}")
+                kwargs = {
+                    "field_name": f.get("name"),
+                    "datatype": dtype,
+                    "description": f.get("description", "")
+                }
                 
-                try:
-                    self.client.insert(collection_name=self.parent_collection, data=parent_entities)
-                except Exception as e:
-                    logger.error(f"父块存入 Milvus 失败: {e}")
-                    raise RuntimeError(f"向量数据库写入失败(Parent): {str(e)}")
+                if f.get("is_primary"):
+                    kwargs["is_primary"] = True
+                    kwargs["auto_id"] = True # 假设主键都是自增
                 
-            # 处理子块
-            if child_entities:
-                try:
-                    c_texts = [e['text'] for e in child_entities]
-                    c_vecs = self.emb.embed_documents(c_texts)
-                    for e, v in zip(child_entities, c_vecs): e['dense'] = v
-                except Exception as e:
-                    logger.error(f"子块向量化失败 (Embedding Service): {e}")
-                    raise RuntimeError(f"Embedding 服务调用失败: {str(e)}")
+                if dtype == DataType.VARCHAR:
+                    kwargs["max_length"] = f.get("max_length", 65535)
+                    # 关键修复:如果要被 BM25 引用,必须启用 analyzer
+                    if f.get("name") in ["text", "content", "chunk"]:
+                        kwargs["enable_analyzer"] = True
+                
+                if dtype == DataType.FLOAT_VECTOR:
+                    kwargs["dim"] = dimension # 使用传入的 dimension
                 
+                schema.add_field(**kwargs)
+
+                # 如果是 BM25 类型,记录下来以便后续添加 Function
+                if field_type_str == "BM25":
+                    bm25_field = f.get("name")
+
+            # 处理 BM25 Function
+            if bm25_field:
                 try:
-                    self.client.insert(collection_name=self.child_collection, data=child_entities)
+                    schema.add_function(Function(
+                        name="bm25_fn",
+                        input_field_names=[text_field_name],
+                        output_field_names=[bm25_field],
+                        function_type=FunctionType.BM25
+                    ))
+                    logger.info(f"Added BM25 function mapping {text_field_name} -> {bm25_field}")
                 except Exception as e:
-                    logger.error(f"子块存入 Milvus 失败: {e}")
-                    raise RuntimeError(f"向量数据库写入失败(Child): {str(e)}")
+                    logger.error(f"Failed to add BM25 function: {e}")
 
-            logger.info(f"Successfully entered knowledge base for doc_id: {doc_id}, parents: {len(parent_entities)}, children: {len(child_entities)}")
-            return len(parent_entities), len(child_entities)
+            # 4. 准备索引参数
+            index_params = self.client.prepare_index_params()
+            
+            # 5. 为所有向量字段添加索引
+            for f in fields:
+                ftype = f.get("type", "").upper()
+                if ftype == "FLOAT_VECTOR":
+                    index_params.add_index(
+                        field_name=f.get("name"), 
+                        index_type="AUTOINDEX",
+                        metric_type="COSINE"
+                    )
+                elif ftype == "BM25" or ftype == "SPARSE_FLOAT_VECTOR":
+                    index_params.add_index(
+                        field_name=f.get("name"),
+                        index_type="SPARSE_INVERTED_INDEX", # 稀疏向量索引
+                        metric_type="BM25"
+                    )
             
+            # 6. 为自定义标量字段添加索引
+            for f in fields:
+                if f.get("type", "").upper() in ["VARCHAR", "INT64", "INT32", "BOOL"] and not f.get("is_primary"):
+                     # 排除主键,主键自动索引
+                    index_params.add_index(
+                        field_name=f.get("name"),
+                        index_type="INVERTED"
+                    )
+
+            # 7. 创建集合
+            self.client.create_collection(
+                collection_name=name,
+                schema=schema,
+                index_params=index_params
+            )
+            
+        else:
+            # 使用简化的 create_collection API
+            self.client.create_collection(
+                collection_name=name,
+                dimension=dimension,
+                description=description,
+                auto_id=True,  # 自动生成 ID
+                id_type="int", # ID 类型
+                metric_type="COSINE" # 默认使用余弦相似度
+            )
+        
+        logger.info(f"Created collection {name} with dimension {dimension}")
+
+    def drop_collection(self, name: str) -> None:
+        """删除 Milvus 集合"""
+        if self.client.has_collection(name):
+            self.client.drop_collection(name)
+            logger.info(f"Dropped collection {name}")
+
+    def has_collection(self, name: str) -> bool:
+        """检查集合是否存在"""
+        return self.client.has_collection(name)
+
+    def get_collection_details(self) -> List[Dict[str, Any]]:
+        """
+        获取所有 Collections 详细信息
+        """
+        details: List[Dict[str, Any]] = []
+
+        names = self.client.list_collections()
+
+        for name in names:
+            desc = self.client.describe_collection(collection_name=name)
+            stats = self.client.get_collection_stats(collection_name=name)
+            load_state = self.client.get_load_state(collection_name=name)
+
+            # ===== 时间戳转换(按你指定写法,无封装)=====
+            created_time = None
+            updated_time = None
+
+            if desc.get("created_timestamp") is not None:
+                ts_int = int(desc["created_timestamp"])
+                physical_ms = ts_int >> 18
+                created_time = datetime.fromtimestamp(physical_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
+
+            if desc.get("update_timestamp") is not None:
+                ts_int = int(desc["update_timestamp"])
+                physical_ms = ts_int >> 18
+                updated_time = datetime.fromtimestamp(physical_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
+
+            # ===== 数量:不保底(要求返回结构必须有 row_count)=====
+            entity_count = stats["row_count"]
+
+            # ===== 状态:不保底(要求返回结构必须有 state)=====
+            status = load_state["state"]
+
+            details.append(
+                {
+                    "name": name,
+                    "status": status,
+                    "entity_count": entity_count,
+                    "description": desc.get("description", ""),
+                    "created_time": created_time,
+                    "updated_time": updated_time,
+                }
+            )
+
+        logger.info(f"成功获取Collections详细信息,共{len(details)}个")
+        return details
+
+    def set_collection_state(self, name: str, action: str) -> Dict[str, Any]:
+        """
+        改变指定 Collection 的加载状态。
+
+        参数:
+        - name: 集合名称
+        - action: 操作,取值 'load' 或 'release'
+
+        返回:
+        - 包含集合名称和当前状态的字典,例如: {"name": name, "state": "Loaded"}
+        """
+        action_norm = (action or "").strip().lower()
+        if action_norm not in {"load", "release"}:
+            raise ValueError("action 必须为 'load' 或 'release'")
+
+        # 执行加载/释放
+        if action_norm == "load":
+            self.client.load_collection(collection_name=name)
+        else:
+            self.client.release_collection(collection_name=name)
+
+        # 返回最新状态
+        load_state = self.client.get_load_state(collection_name=name)
+        state = load_state.get("state") if isinstance(load_state, dict) else load_state
+        result = {"name": name, "state": state, "action": action_norm}
+        logger.info(f"集合 {name} 状态更新为 {state} (action={action_norm})")
+        return result
+
+    def delete_collection_if_empty(self, name: str) -> Dict[str, Any]:
+        """仅当集合内容为空时删除集合,否则抛出异常"""
+        stats = self.client.get_collection_stats(collection_name=name)
+        row_count = stats.get("row_count") if isinstance(stats, dict) else None
+        if row_count is None:
+            raise ValueError("无法获取集合行数,禁止删除")
+        if int(row_count) > 0:
+            raise ValueError("集合内容不为空,不能删除")
+
+        self.client.drop_collection(collection_name=name)
+        logger.info(f"集合 {name} 已删除")
+        return {"name": name, "deleted": True}
+
+    def get_collection_detail(self, name: str) -> Dict[str, Any]:
+        """获取单个集合的详细信息,包含schema、索引等所有desc字段"""
+        desc = self.client.describe_collection(collection_name=name)
+        stats = self.client.get_collection_stats(collection_name=name)
+        load_state = self.client.get_load_state(collection_name=name)
+
+        # 时间戳转换
+        created_time = None
+        updated_time = None
+
+        if desc.get("created_timestamp") is not None:
+            ts_int = int(desc["created_timestamp"])
+            physical_ms = ts_int >> 18
+            created_time = datetime.fromtimestamp(physical_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
+
+        if desc.get("update_timestamp") is not None:
+            ts_int = int(desc["update_timestamp"])
+            physical_ms = ts_int >> 18
+            updated_time = datetime.fromtimestamp(physical_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
+
+        entity_count = stats.get("row_count", 0)
+        status = load_state.get("state") if isinstance(load_state, dict) else load_state
+
+        # 提取字段schema
+        fields = []
+        if "fields" in desc:
+            for field in desc["fields"]:
+                field_info = {
+                    "name": field.get("name"),
+                    "type": str(field.get("type")),
+                    "description": field.get("description", ""),
+                    "is_primary": field.get("is_primary", False),
+                    "auto_id": field.get("auto_id"),
+                }
+                # 向量维度
+                if "params" in field and "dim" in field["params"]:
+                    field_info["dim"] = field["params"]["dim"]
+                # 字符串长度
+                if "params" in field and "max_length" in field["params"]:
+                    field_info["max_length"] = field["params"]["max_length"]
+                # 其他params
+                if "params" in field:
+                    field_info["params"] = field["params"]
+                fields.append(field_info)
+
+        # 提取索引信息
+        indices = []
+        
+        # 尝试从 describe_collection 结果中获取 (兼容旧逻辑)
+        if "indexes" in desc:
+            for idx in desc["indexes"]:
+                index_info = {
+                    "field_name": idx.get("field_name"),
+                    "index_name": idx.get("index_name"),
+                    "index_type": idx.get("index_type"),
+                    "metric_type": idx.get("metric_type"),
+                    "params": idx.get("params"),
+                }
+                indices.append(index_info)
+        
+        # 如果没有获取到索引信息,尝试主动查询 list_indexes
+        if not indices:
+            try:
+                # 获取索引列表 (通常返回索引名称列表)
+                index_names = self.client.list_indexes(collection_name=name)
+                if index_names:
+                    for idx_name in index_names:
+                        try:
+                            # 获取索引详情
+                            idx_desc = self.client.describe_index(collection_name=name, index_name=idx_name)
+                            if idx_desc:
+                                indices.append({
+                                    "field_name": idx_desc.get("field_name"),
+                                    "index_name": idx_desc.get("index_name"),
+                                    "index_type": idx_desc.get("index_type"),
+                                    "metric_type": idx_desc.get("metric_type"),
+                                    "params": idx_desc.get("params"),
+                                })
+                        except Exception:
+                            continue
+            except Exception as e:
+                logger.warning(f"Failed to list/describe indexes for {name}: {e}")
+
+        detail = {
+            "name": name,
+            "description": desc.get("description", ""),
+            "status": status,
+            "entity_count": entity_count,
+            "created_time": created_time,
+            "updated_time": updated_time,
+            "fields": fields,
+            "enable_dynamic_field": desc.get("enable_dynamic_field", False),
+            "consistency_level": desc.get("consistency_level"),
+            "num_shards": desc.get("num_shards"),
+            "num_partitions": desc.get("num_partitions"),
+            "indices": indices,
+            "properties": desc.get("properties"),
+            "aliases": desc.get("aliases", []),
+        }
+
+        logger.info(f"成功获取集合 {name} 的详细信息")
+        return detail
+
+    
+    def update_collection_description(self, name: str, description: str) -> Dict[str, Any]:
+        """使用 alter_collection_properties 更新集合描述"""
+        description = description or ""
+
+        # 1. 更新集合 description(唯一修改点)
+        self.client.alter_collection_properties(
+            collection_name=name,
+            properties={"collection.description": description},
+        )
+
+        # 2. 重新获取集合信息
+        desc = self.client.describe_collection(collection_name=name)
+            logger.debug(f"describe_collection({name}) -> {desc}")
+        stats = self.client.get_collection_stats(collection_name=name)
+        load_state = self.client.get_load_state(collection_name=name)
+
+        # 3. 时间戳转换(Milvus TSO -> 物理时间)
+        def ts_to_str(ts):
+            if ts is None:
+                return None
+            ts_int = int(ts)
+            physical_ms = ts_int >> 18
+            return datetime.fromtimestamp(physical_ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
+
+        created_time = ts_to_str(desc.get("created_timestamp"))
+        updated_time = ts_to_str(desc.get("update_timestamp"))
+
+        entity_count = stats.get("row_count") if isinstance(stats, dict) else None
+        status = load_state.get("state") if isinstance(load_state, dict) else load_state
+
+        return {
+            "name": name,
+            "status": status,
+            "entity_count": entity_count,
+            "description": desc.get("description", ""),
+            "created_time": created_time,
+            "updated_time": updated_time,
+        }
+
+    def hybrid_search(self, collection_name: str, query_text: str,
+                     top_k: int = 3, ranker_type: str = "weighted",
+                     dense_weight: float = 0.7, sparse_weight: float = 0.3):
+        """
+        混合搜索(参考 test_hybrid_v2.6.py 的实现)
+
+        Args:
+            collection_name: name of the target Milvus collection
+            query_text: 查询文本
+            top_k: 返回结果数量
+            ranker_type: 重排序类型 "weighted" 或 "rrf"
+            dense_weight: 密集向量权重(当ranker_type="weighted"时使用)
+            sparse_weight: 稀疏向量权重(当ranker_type="weighted"时使用)
+
+        Returns:
+            List[Dict]: 搜索结果列表
+        """
+        try:
+            # collection_name is used exactly as passed in (former self-assignment removed)
+
+            # 获取 vectorstore 实例(包含 Milvus 和 BM25BuiltInFunction)
+            vectorstore = get_milvus_vectorstore(
+                collection_name=collection_name,
+                consistency_level="Strong"
+            )
+
+            # 执行混合搜索 (完全按照 test_hybrid_v2.6.py 的逻辑)
+            if ranker_type == "weighted":
+                results = vectorstore.similarity_search(
+                    query=query_text,
+                    k=top_k,
+                    ranker_type="weighted",
+                    ranker_params={"weights": [dense_weight, sparse_weight]}
+                )
+            else:  # rrf
+                results = vectorstore.similarity_search(
+                    query=query_text,
+                    k=top_k,
+                    ranker_type="rrf",
+                    ranker_params={"k": 60}
+                )
+
+            # 格式化结果,保持与其他搜索方法一致
+            formatted_results = []
+            for doc in results:
+                formatted_results.append({
+                    'id': doc.metadata.get('pk', 0),
+                    'text_content': doc.page_content,
+                    'metadata': doc.metadata,
+                    'distance': 0.0,
+                    'similarity': 1.0
+                })
+
+            logger.info(f"Hybrid search returned {len(formatted_results)} results")
+            return formatted_results
+
         except Exception as e:
-            # 重新抛出已处理的异常或包装未处理的异常
-            if not isinstance(e, RuntimeError):
-                logger.exception(f"入库流程发生未知异常 (doc_id: {doc_id})")
-                raise RuntimeError(f"入库未知错误: {str(e)}")
-            raise e
-
-# 全局 Milvus 服务实例
-milvus_host = config_handler.get("admin_app", "MILVUS_HOST", "192.168.92.61")
-milvus_port = config_handler.get("admin_app", "MILVUS_PORT", "19530")
-milvus_service = MilvusService(
-    uri=f"http://{milvus_host}:{milvus_port}",
-    db_name=config_handler.get("admin_app", "MILVUS_DB", "lq_db"),
-    parent_collection=config_handler.get("admin_app", "PARENT_COLLECTION_NAME", "test_27_parent"),
-    child_collection=config_handler.get("admin_app", "CHILD_COLLECTION_NAME", "test_27_child")
-)
+            logger.error(f"Error in hybrid search: {e}")
+            # No fallback search is implemented yet; return an empty list so callers
+            # that iterate / len() the result do not crash on an implicit None.
+            return []
+
+
+# 可选:单例
+milvus_service = MilvusService()
+
+
+if __name__ == "__main__":
+    # 推荐这样跑:
+    # uv run python -m src.app.services.milvus_service
+    import json
+
+    service = MilvusService()
+    
+    # 测试混合搜索 hybrid_search
+    print("=" * 50)
+    print("测试混合检索 (Hybrid Search)")
+    print("=" * 50)
+    
+    try:
+        # 示例参数,需要根据实际情况修改
+        collection_name = "first_bfp_collection_status" 
+        query_text = "《公路水运工程临时用电技术规程》(JTT1499-2024)状态为现行"  # 修改为实际查询内容
+        
+        # 测试 weighted 模式
+        print("\n1. 测试 Weighted 重排序模式:")
+        print(f"   集合: {collection_name}")
+        print(f"   查询: {query_text}")
+        print(f"   密集权重: 0.7, 稀疏权重: 0.3")
+        
+        results_weighted = service.hybrid_search(
+            collection_name=collection_name,
+            query_text=query_text,
+            top_k=5,
+            ranker_type="weighted",
+            dense_weight=0.7,
+            sparse_weight=0.3
+        )
+        
+        print(f"\n   结果数量: {len(results_weighted)}")
+        for i, result in enumerate(results_weighted, 1):
+            print(f"   [{i}] ID: {result.get('id')}, Text: {result.get('text_content')[:50]}...")
+        
+        # 测试 RRF 模式
+        print("\n2. 测试 RRF (Reciprocal Rank Fusion) 重排序模式:")
+        print(f"   集合: {collection_name}")
+        print(f"   查询: {query_text}")
+        
+        results_rrf = service.hybrid_search(
+            collection_name=collection_name,
+            query_text=query_text,
+            top_k=5,
+            ranker_type="rrf"
+        )
+        
+        print(f"\n   结果数量: {len(results_rrf)}")
+        for i, result in enumerate(results_rrf, 1):
+            print(f"   [{i}] ID: {result.get('id')}, Text: {result.get('text_content')[:50]}...")
+        
+        print("\n✓ 混合检索测试完成")
+        
+    except Exception as e:
+        print(f"\n✗ 混合检索测试失败: {e}")
+        import traceback
+        traceback.print_exc()
+    
+    # 也可以查看集合详情
+    print("\n" + "=" * 50)
+    print("获取所有集合信息:")
+    print("=" * 50)
+    data = service.get_collection_details()
+    for item in data:
+        print(json.dumps(item, ensure_ascii=False, indent=2))

+ 124 - 52
src/app/services/snippet_service.py

@@ -8,6 +8,7 @@ import random
 import csv
 import io
 import time
+import uuid
 from datetime import datetime
 
 from app.services.milvus_service import milvus_service
@@ -56,17 +57,38 @@ class SnippetService:
                 stats = milvus_service.client.get_collection_stats(col_name)
                 col_count = int(stats.get("row_count", 0)) if isinstance(stats, dict) else 0
                 
+                # 构建过滤表达式
+                filter_exprs = []
+                
+                # 状态过滤 (假设 is_deleted=False 表示 normal, True 表示 disabled/deleted)
+                # 注意:Milvus 中通常使用 is_deleted 标记删除,这里我们需要根据传入的 status 映射
+                # status: 'normal' -> is_deleted == False
+                # status: 'disabled' -> 实际上我们没有 disabled 状态,只有删除。如果必须支持,可能需要额外的 status 字段
+                # 暂时假设:normal = 未删除, disabled = (无此状态或暂不支持)
+                
+                # 如果要严格支持 normal/disabled,需要在 schema 中增加 status 字段
+                # 目前 Schema 只有 is_deleted。
+                # 兼容处理:
+                if status == 'normal':
+                    filter_exprs.append("is_deleted == false")
+                elif status == 'disabled':
+                    # 假设 disabled 对应 is_deleted == true (软删除状态)
+                    # 但通常 deleted 数据不应被查出。如果需要查出“已禁用”,则需要额外字段。
+                    # 按照之前 Schema 定义,is_deleted 是 bool。
+                    filter_exprs.append("is_deleted == true")
+                else:
+                    # 默认只查未删除的
+                    filter_exprs.append("is_deleted == false")
+
                 if keyword:
                     # 关键词模式:必须实际查询
-                    desc = milvus_service.client.describe_collection(col_name)
-                    existing_fields = [f['name'] for f in desc.get('fields', [])]
-                    
-                    # 尝试获取所有字段
+                    filter_exprs.append('text like "%{}%"'.format(keyword.replace('"', '\\"')))
+
+                expr = " && ".join(filter_exprs) if filter_exprs else ""
+                
+                if keyword:
+                    # 有关键词,必须 query
                     output_fields = ["*"]
-                    
-                    expr = f'text like "%{keyword}%"' if 'text' in existing_fields else "" 
-                    if not expr: continue 
-                    
                     res = milvus_service.client.query(col_name, filter=expr, output_fields=output_fields, limit=100)
                     col_hits = len(res)
                     global_total += col_hits
@@ -86,32 +108,70 @@ class SnippetService:
                     if need_count <= 0: break
                     
                 else:
-                    # 无关键词模式
-                    global_total += col_count
-                    
-                    if skip_count >= col_count:
-                        skip_count -= col_count
-                        continue
-                    
-                    if need_count > 0:
-                        current_offset = skip_count
-                        current_limit = min(need_count, col_count - current_offset)
+                    # 无关键词模式,只有状态过滤
+                    # 如果有状态过滤,也必须 query,不能直接用 stats
+                    if status:
+                         # With a status filter we must query to count matches; collection
+                         # stats cannot be used because they ignore filter expressions.
+                         # NOTE: count(*) aggregation support varies by Milvus version and the
+                         # previous extra count query's result was unused, so count matching
+                         # rows via the lightweight pk-only query below instead.
+                         res = milvus_service.client.query(col_name, filter=expr, output_fields=["pk"])
+                         col_hits = len(res)
+                         global_total += col_hits
+                         
+                         if skip_count >= col_hits:
+                            skip_count -= col_hits
+                            continue
+                         
+                         # 获取分页数据
+                         output_fields = ["*"]
+                         res_page = milvus_service.client.query(
+                            collection_name=col_name,
+                            filter=expr,
+                            output_fields=output_fields,
+                            limit=need_count,
+                            offset=skip_count # 注意:query 的 offset 性能较差,但在小数据量下尚可
+                         )
+                         
+                         for r in res_page:
+                            items.append(self._format_snippet(r, col_name))
+                         
+                         skip_count = 0 
+                         need_count -= len(res_page)
+
+                    else:
+                        # 既无关键词也无状态过滤(默认查所有未删除),可以直接用 offset
+                        # 但默认还是过滤 is_deleted == false
+                        expr = "is_deleted == false"
                         
-                        output_fields = ["*"]
+                        # 简单起见,统一走 query 路径以支持 is_deleted 过滤
+                        # 如果完全不过滤,才可以用 limit/offset 直接分页
+                        
+                        # 这里为了性能,假设默认查询不需要精确 count,或者全部都是未删除
+                        # 暂时强制走 query 以保证准确性
+                        res = milvus_service.client.query(col_name, filter=expr, output_fields=["pk"])
+                        col_hits = len(res)
+                        global_total += col_hits
                         
-                        res = milvus_service.client.query(
+                        if skip_count >= col_hits:
+                            skip_count -= col_hits
+                            continue
+                            
+                        output_fields = ["*"]
+                        res_page = milvus_service.client.query(
                             collection_name=col_name,
-                            filter="",
+                            filter=expr,
                             output_fields=output_fields,
-                            limit=current_limit,
-                            offset=current_offset
+                            limit=need_count,
+                            offset=skip_count
                         )
                         
-                        for r in res:
+                        for r in res_page:
                             items.append(self._format_snippet(r, col_name))
-                        
-                        skip_count = 0 
-                        need_count -= current_limit
+                            
+                        skip_count = 0
+                        need_count -= len(res_page)
 
             except Exception as e:
                 print(f"Collection {col_name} query error: {e}")
@@ -137,9 +197,11 @@ class SnippetService:
         # 基础数据
         now = int(time.time() * 1000)
         item = {
-            "dense": fake_vector,
+            "vector": fake_vector,
             "text": payload.content,
-            "document_id": "manual_add",
+            "document_id": str(uuid.uuid4()), # 生成UUID
+            "parent_id": payload.custom_fields.get("parent_id", "") if hasattr(payload, 'custom_fields') and payload.custom_fields else "",
+            "index": 0,
             "tag_list": "",
             "permission": {},
             "metadata": {
@@ -147,17 +209,16 @@ class SnippetService:
                 "file_name": payload.doc_name, 
                 "title": payload.doc_name
             },
-            "index": 0,
-            "is_deleted": 0,
+            "is_deleted": False,
             "created_by": "system",
             "created_time": now,
             "updated_by": "system",
             "updated_time": now
         }
         
-        # 合并自定义字段
-        if hasattr(payload, 'custom_fields') and payload.custom_fields:
-            item.update(payload.custom_fields)
+        # 合并自定义字段 (Schema已固定)
+        # if hasattr(payload, 'custom_fields') and payload.custom_fields:
+        #     item.update(payload.custom_fields)
             
         data = [item]
         
@@ -192,9 +253,11 @@ class SnippetService:
         
         now = int(time.time() * 1000)
         item = {
-            "dense": fake_vector,
+            "vector": fake_vector,
             "text": payload.content,
-            "document_id": "updated",
+            "document_id": str(uuid.uuid4()), # 更新也会生成新文档ID
+            "parent_id": payload.custom_fields.get("parent_id", "") if hasattr(payload, 'custom_fields') and payload.custom_fields else "",
+            "index": 0,
             "tag_list": "",
             "permission": {},
             "metadata": {
@@ -202,17 +265,16 @@ class SnippetService:
                 "file_name": payload.doc_name,
                 "title": payload.doc_name
             },
-            "index": 0,
-            "is_deleted": 0,
+            "is_deleted": False,
             "created_by": "system",
             "created_time": now,
             "updated_by": "system",
             "updated_time": now
         }
         
-        # 合并自定义字段
-        if hasattr(payload, 'custom_fields') and payload.custom_fields:
-            item.update(payload.custom_fields)
+        # 合并自定义字段 (Schema已固定)
+        # if hasattr(payload, 'custom_fields') and payload.custom_fields:
+        #     item.update(payload.custom_fields)
             
         data = [item]
         
@@ -242,20 +304,30 @@ class SnippetService:
         milvus_service.client.flush(kb)
 
     def _format_snippet(self, r: Dict, col_name: str) -> Dict:
-        id_val = r.get("id") or r.get("pk")
-        content = r.get("text") or r.get("content") or r.get("page_content") or ""
+        id_val = r.get("pk") or r.get("id")
+        content = r.get("text") or r.get("content") or ""
         
-        if not content:
-            try:
-                debug_content = r.copy()
-                if "dense" in debug_content: del debug_content["dense"]
-                content = json.dumps(debug_content, default=str, ensure_ascii=False)
-            except:
-                content = "无法解析内容"
+        # 尝试从 metadata 中获取 doc_name
+        doc_name = "未知文档"
+        meta = r.get("metadata") or {}
+        if isinstance(meta, dict):
+             doc_name = meta.get("doc_name") or meta.get("file_name") or meta.get("title") or doc_name
+        else:
+             # 兼容旧数据
+             doc_name = r.get("file_name") or r.get("title") or r.get("source") or r.get("doc_name") or doc_name
 
-        doc_name = r.get("file_name") or r.get("title") or r.get("source") or r.get("doc_name") or "未知文档"
         meta_info = f"ParentID: {r.get('parent_id', '-')}"
         
+        # 时间处理
+        created_at = "-"
+        if r.get("created_time"):
+            try:
+                # 假设是毫秒时间戳
+                ts = int(r.get("created_time")) / 1000
+                created_at = datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
+            except:
+                pass
+        
         return {
             "id": str(id_val),
             "collection_name": col_name,
@@ -265,7 +337,7 @@ class SnippetService:
             "char_count": len(content) if content else 0,
             "meta_info": meta_info,
             "status": "normal",
-            "created_at": "-",
+            "created_at": created_at,
             "updated_at": "-"
         }
 

+ 49 - 9
src/app/utils/vector_utils.py

@@ -1,30 +1,70 @@
 import hashlib
 import math
+import requests
+import logging
 from typing import List
+from app.core.config import config_handler
+
+logger = logging.getLogger(__name__)
+
+# Read config
+EMBEDDING_BASE_URL = config_handler.get("admin_app", "EMBEDDING_BASE_URL", "")
+EMBEDDING_MODEL = config_handler.get("admin_app", "EMBEDDING_MODEL", "")
+EMBEDDING_API_KEY = config_handler.get("admin_app", "EMBEDDING_API_KEY", "dummy")
 
 def text_to_vector_algo(text: str, dim: int = 768) -> List[float]:
     """
-    [算法实现] 特征哈希 (Feature Hashing / Hashing Trick)
-    统一的向量生成算法,确保写入和检索时使用相同的逻辑。
+    调用 Embedding API 生成向量。
+    如果 API 调用失败,返回全 0 向量 (长度为 dim)。
+    注意:返回的向量维度取决于模型,可能与传入的 dim 不一致。
     """
     if not text:
         return [0.0] * dim
+
+    # 如果没有配置 URL,回退到原来的哈希算法 (或者直接报错,视需求而定)
+    # 这里为了防止完全无法运行,保留一个简单的 fallback,但打个 warning
+    if not EMBEDDING_BASE_URL:
+        logger.warning("未配置 EMBEDDING_BASE_URL,使用 Dummy Hash 向量")
+        return _dummy_hash_vector(text, dim)
+
+    try:
+        url = f"{EMBEDDING_BASE_URL}/embeddings"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {EMBEDDING_API_KEY}"
+        }
+        payload = {
+            "input": text,
+            "model": EMBEDDING_MODEL
+        }
         
+        # Single attempt with a 30-second timeout (no retry is implemented)
+        response = requests.post(url, json=payload, headers=headers, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+        
+        # 兼容 OpenAI 格式
+        if 'data' in data and len(data['data']) > 0:
+            embedding = data['data'][0]['embedding']
+            return embedding
+        else:
+            logger.error(f"Embedding API 响应格式错误: {data}")
+            return [0.0] * dim
+            
+    except Exception as e:
+        logger.error(f"Embedding API 调用失败: {e}")
+        return [0.0] * dim
+
+def _dummy_hash_vector(text: str, dim: int) -> List[float]:
+    """原有的特征哈希算法,作为 fallback"""
     vector = [0.0] * dim
-    
-    # 简单分词:中文按字,英文按词 (这里简化处理,全部按字符处理以支持中文)
     tokens = list(text) 
-    
     for token in tokens:
-        # 使用 MD5 保证确定性
         hash_obj = hashlib.md5(token.encode('utf-8'))
         hash_val = int(hash_obj.hexdigest(), 16)
         idx = hash_val % dim
         vector[idx] += 1.0
-        
-    # L2 归一化
     magnitude = math.sqrt(sum(x*x for x in vector))
     if magnitude > 0:
         vector = [x / magnitude for x in vector]
-    
     return vector