knowledge.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. import io
  2. import zipfile
  3. from enum import Enum
  4. import uuid_utils.compat as uuid
  5. from django.contrib.postgres.fields import ArrayField
  6. from django.contrib.postgres.search import SearchVectorField
  7. from django.db import models
  8. from django.db.models import QuerySet
  9. from django.db.models.signals import pre_delete
  10. from django.dispatch import receiver
  11. from mptt.fields import TreeForeignKey
  12. from mptt.models import MPTTModel
  13. from common.db.sql_execute import select_one
  14. from common.mixins.app_model_mixin import AppModelMixin
  15. from common.utils.common import get_sha256_hash
  16. from models_provider.models import Model
  17. from users.models import User
class KnowledgeType(models.IntegerChoices):
    """Integer choices used by the ``type`` field of ``Knowledge`` and ``Document``."""
    BASE = 0, '通用类型'  # label: "general type"
    WEB = 1, 'web站点类型'  # label: "web site type"
    LARK = 2, '飞书类型'  # label: "Lark (Feishu) type"
    YUQUE = 3, '语雀类型'  # label: "Yuque type"
    WORKFLOW = 4, '工作流类型'  # label: "workflow type"
    SAMPLE_CENTER = 5, '样本中心'  # label: "sample center"
class TaskType(Enum):
    """Task kinds tracked by ``Status``.

    ``Status`` uses ``value - 1`` as the position of the task's state
    character, counted from the right-hand end of the status string.
    """
    # Vectorization (embedding)
    EMBEDDING = 1
    # Problem (question) generation
    GENERATE_PROBLEM = 2
    # Synchronization
    SYNC = 3
class State(Enum):
    """Single-character state codes stored per task type inside a status string."""
    # Waiting
    PENDING = '0'
    # Running
    STARTED = '1'
    # Succeeded
    SUCCESS = '2'
    # Failed
    FAILURE = '3'
    # Cancel the task (cancellation requested)
    REVOKE = '4'
    # Cancelled successfully
    REVOKED = '5'
    # Ignored; also the code ``Status`` falls back to when the status string
    # has no character for a task type
    IGNORED = 'n'
class KnowledgeScope(models.TextChoices):
    """Text choices for ``Knowledge.scope``."""
    SHARED = "SHARED", '共享'  # label: "shared"
    WORKSPACE = "WORKSPACE", "工作空间可用"  # label: "available in the workspace"
class HitHandlingMethod(models.TextChoices):
    """Text choices for ``Document.hit_handling_method``."""
    optimization = 'optimization', '模型优化'  # label: "model optimization"
    directly_return = 'directly_return', '直接返回'  # label: "return directly"
  53. class Status:
  54. type_cls = TaskType
  55. state_cls = State
  56. def __init__(self, status: str = None):
  57. self.task_status = {}
  58. status_list = list(status[::-1] if status is not None else '')
  59. for _type in self.type_cls:
  60. index = _type.value - 1
  61. _state = self.state_cls(status_list[index] if len(status_list) > index else 'n')
  62. self.task_status[_type] = _state
  63. @staticmethod
  64. def of(status: str):
  65. return Status(status)
  66. def __str__(self):
  67. result = []
  68. for _type in sorted(self.type_cls, key=lambda item: item.value, reverse=True):
  69. result.insert(len(self.type_cls) - _type.value, self.task_status[_type].value)
  70. return ''.join(result)
  71. def __setitem__(self, key, value):
  72. self.task_status[key] = value
  73. def __getitem__(self, item):
  74. return self.task_status[item]
  75. def update_status(self, task_type: TaskType, state: State):
  76. self.task_status[task_type] = state
  77. def default_status_meta():
  78. return {"state_time": {}}
class KnowledgeFolder(MPTTModel, AppModelMixin):
    """Folder node of an MPTT tree, stored in the ``knowledge_folder`` table."""
    # Primary key; a string with no default, so the creator must supply it.
    id = models.CharField(primary_key=True, max_length=64, editable=False, verbose_name="主键id")
    # Folder name (indexed).
    name = models.CharField(max_length=64, verbose_name="文件夹名称", db_index=True)
    # Optional description.
    desc = models.CharField(max_length=200, null=True, blank=True, verbose_name="描述")
    # Related user; no database-level FK constraint, set to NULL when the user is deleted.
    user = models.ForeignKey(User, on_delete=models.SET_NULL, db_constraint=False, blank=True, null=True)
    # Workspace id; defaults to "default".
    workspace_id = models.CharField(max_length=64, verbose_name="工作空间id", default="default", db_index=True)
    # Parent folder; DO_NOTHING means Django does not cascade or null children on delete.
    parent = TreeForeignKey('self', on_delete=models.DO_NOTHING, null=True, blank=True, related_name='children')

    class Meta:
        db_table = "knowledge_folder"

    class MPTTMeta:
        # Sibling nodes are inserted in ``name`` order.
        order_insertion_by = ['name']
class Knowledge(AppModelMixin):
    """
    Knowledge base table.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Knowledge base name (indexed).
    name = models.CharField(max_length=150, verbose_name="知识库名称", db_index=True)
    # Workspace id; defaults to "default".
    workspace_id = models.CharField(max_length=64, verbose_name="工作空间id", default="default", db_index=True)
    # Description.
    desc = models.CharField(max_length=256, verbose_name="描述")
    # Related user; no database-level FK constraint, set to NULL when the user is deleted.
    user = models.ForeignKey(User, on_delete=models.SET_NULL, db_constraint=False, blank=True, null=True)
    # Knowledge base type, one of KnowledgeType.
    type = models.IntegerField(verbose_name='类型', choices=KnowledgeType.choices, default=KnowledgeType.BASE,
                               db_index=True)
    # Availability scope, one of KnowledgeScope.
    scope = models.CharField(max_length=20, verbose_name='可用范围', choices=KnowledgeScope.choices,
                             default=KnowledgeScope.WORKSPACE, db_index=True)
    # Containing folder; defaults to the folder whose id is 'default'
    # (presumably created elsewhere -- TODO confirm).
    folder = models.ForeignKey(KnowledgeFolder, on_delete=models.DO_NOTHING, verbose_name="文件夹id", default='default')
    # Embedding model; no database-level FK constraint, set to NULL when the model is deleted.
    embedding_model = models.ForeignKey(Model, on_delete=models.SET_NULL, db_constraint=False, blank=True, null=True)
    # File size limit (unit not visible here -- TODO confirm).
    file_size_limit = models.IntegerField(verbose_name="文件大小限制", default=100)
    # File count limit.
    file_count_limit = models.IntegerField(verbose_name="文件数量限制", default=50)
    # Free-form metadata.
    meta = models.JSONField(verbose_name="元数据", default=dict)

    class Meta:
        db_table = "knowledge"
class KnowledgeWorkflow(AppModelMixin):
    """
    Knowledge base workflow table.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Owning knowledge base (one workflow per knowledge base), reachable as
    # ``knowledge.workflow``; no database-level FK constraint.
    knowledge = models.OneToOneField(Knowledge, on_delete=models.CASCADE, verbose_name="知识库",
                                     db_constraint=False, related_name='workflow')
    # Workspace id; defaults to "default".
    workspace_id = models.CharField(max_length=64, verbose_name="工作空间id", default="default", db_index=True)
    # Workflow data.
    work_flow = models.JSONField(verbose_name="工作流数据", default=dict)
    # Whether the workflow has been published.
    is_publish = models.BooleanField(verbose_name="是否发布", default=False, db_index=True)
    # Publish time; NULL when not set.
    publish_time = models.DateTimeField(verbose_name="发布时间", null=True, blank=True)

    class Meta:
        db_table = "knowledge_workflow"
class KnowledgeWorkflowVersion(AppModelMixin):
    """
    Knowledge base workflow version table - records historical versions of a workflow.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Owning knowledge base; no database-level FK constraint.
    knowledge = models.ForeignKey(Knowledge, on_delete=models.CASCADE, verbose_name="知识库", db_constraint=False)
    # Workspace id; defaults to "default".
    workspace_id = models.CharField(max_length=64, verbose_name="工作空间id", default="default", db_index=True)
    # Version name.
    name = models.CharField(verbose_name="版本名称", max_length=128, default="")
    # Workflow data of this version.
    work_flow = models.JSONField(verbose_name="工作流数据", default=dict)
    # Id of the publishing user, stored as a plain UUID (not a ForeignKey).
    publish_user_id = models.UUIDField(verbose_name="发布者id", max_length=128, default=None, null=True)
    # Name of the publishing user.
    publish_user_name = models.CharField(verbose_name="发布者名称", max_length=128, default="")

    class Meta:
        db_table = "knowledge_workflow_version"
  136. def get_default_status():
  137. return Status('').__str__()
class Document(AppModelMixin):
    """
    Document table.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Owning knowledge base.
    knowledge = models.ForeignKey(Knowledge, on_delete=models.DO_NOTHING, verbose_name="知识库id")
    # Document name (indexed).
    name = models.CharField(max_length=150, verbose_name="文档名称", db_index=True)
    # Character count of the document; a redundant (denormalized) field.
    char_length = models.IntegerField(verbose_name="文档字符数 冗余字段")
    # Status string in the format produced by ``Status``; default from get_default_status().
    status = models.CharField(verbose_name='状态', max_length=20, default=get_default_status, db_index=True)
    # Status statistics data; defaults to {"state_time": {}}.
    status_meta = models.JSONField(verbose_name="状态统计数据", default=default_status_meta)
    is_active = models.BooleanField(default=True, db_index=True)
    # Document type, one of KnowledgeType.
    type = models.IntegerField(verbose_name='类型', choices=KnowledgeType.choices, default=KnowledgeType.BASE,
                               db_index=True)
    # How a hit is handled, one of HitHandlingMethod.
    hit_handling_method = models.CharField(verbose_name='命中处理方式', max_length=20,
                                           choices=HitHandlingMethod.choices,
                                           default=HitHandlingMethod.optimization)
    # Similarity value for answering directly (presumably a threshold used with
    # ``directly_return`` -- TODO confirm).
    directly_return_similarity = models.FloatField(verbose_name='直接回答相似度', default=0.9)
    # Free-form metadata.
    meta = models.JSONField(verbose_name="元数据", default=dict)

    class Meta:
        db_table = "document"
class Tag(AppModelMixin):
    """
    Tag table - stores the key-value definitions of tags.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Owning knowledge base; no database-level FK constraint.
    knowledge = models.ForeignKey(Knowledge, on_delete=models.DO_NOTHING, verbose_name="知识库", db_constraint=False)
    # Tag key (indexed).
    key = models.CharField(max_length=64, verbose_name="标签键", db_index=True)
    # Tag value (indexed).
    value = models.CharField(max_length=128, verbose_name="标签值", db_index=True)

    class Meta:
        db_table = "tag"
        unique_together = [['knowledge', 'key', 'value']]  # key-value pair is unique within one knowledge base
        indexes = [
            models.Index(fields=['knowledge', 'key']),
        ]
class DocumentTag(AppModelMixin):
    """
    Document-tag association table.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Tagged document; no database-level FK constraint.
    document = models.ForeignKey(Document, on_delete=models.DO_NOTHING, verbose_name="文档", db_constraint=False)
    # Applied tag; no database-level FK constraint.
    tag = models.ForeignKey(Tag, on_delete=models.DO_NOTHING, verbose_name="标签", db_constraint=False)

    class Meta:
        db_table = "document_tag"
        unique_together = [['document', 'tag']]  # a document/tag pair is unique
class Paragraph(AppModelMixin):
    """
    Paragraph table.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Owning document; no database-level FK constraint.
    document = models.ForeignKey(Document, on_delete=models.DO_NOTHING, db_constraint=False)
    # Owning knowledge base.
    knowledge = models.ForeignKey(Knowledge, on_delete=models.DO_NOTHING)
    # Paragraph content.
    content = models.CharField(max_length=102400, verbose_name="段落内容")
    # Title (indexed); empty by default.
    title = models.CharField(max_length=256, verbose_name="标题", default="", db_index=True)
    # Status string in the format produced by ``Status``; default from get_default_status().
    status = models.CharField(verbose_name='状态', max_length=20, default=get_default_status, db_index=True)
    # Status data; defaults to {"state_time": {}}.
    status_meta = models.JSONField(verbose_name="状态数据", default=default_status_meta)
    # Hit count.
    hit_num = models.IntegerField(verbose_name="命中次数", default=0)
    is_active = models.BooleanField(default=True, db_index=True)
    # Paragraph order (indexed).
    position = models.IntegerField(verbose_name="段落顺序", default=0, db_index=True)
    # Chunks, stored as a PostgreSQL array of strings.
    chunks = ArrayField(verbose_name="块", base_field=models.CharField(), default=list)

    class Meta:
        db_table = "paragraph"
class Problem(AppModelMixin):
    """
    Problem (question) table.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    # Owning knowledge base; no database-level FK constraint.
    knowledge = models.ForeignKey(Knowledge, on_delete=models.DO_NOTHING, db_constraint=False)
    # Problem content (indexed).
    content = models.CharField(max_length=256, verbose_name="问题内容", db_index=True)
    # Hit count.
    hit_num = models.IntegerField(verbose_name="命中次数", default=0)

    class Meta:
        db_table = "problem"
class ProblemParagraphMapping(AppModelMixin):
    """Association row linking a Problem to a Paragraph, with its Document and Knowledge.

    None of the four foreign keys has a database-level constraint, and all
    use DO_NOTHING on delete.
    """
    # Primary key, generated with uuid7.
    id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
    knowledge = models.ForeignKey(Knowledge, on_delete=models.DO_NOTHING, db_constraint=False)
    document = models.ForeignKey(Document, on_delete=models.DO_NOTHING, db_constraint=False)
    problem = models.ForeignKey(Problem, on_delete=models.DO_NOTHING, db_constraint=False)
    paragraph = models.ForeignKey(Paragraph, on_delete=models.DO_NOTHING, db_constraint=False)

    class Meta:
        db_table = "problem_paragraph_mapping"
class SourceType(models.IntegerChoices):
    """Source type choices used by ``Embedding.source_type``."""
    PROBLEM = 0, '问题'  # label: "problem"
    PARAGRAPH = 1, '段落'  # label: "paragraph"
    TITLE = 2, '标题'  # label: "title"
class SearchMode(models.TextChoices):
    """Search mode identifiers (presumably vector, keyword and combined search -- TODO confirm)."""
    embedding = 'embedding'
    keywords = 'keywords'
    blend = 'blend'
class FileSourceType(models.TextChoices):
    """Choices for ``File.source_type``: what a stored file belongs to."""
    # Knowledge base: deleted together with the knowledge base; source_id is the knowledge base id
    KNOWLEDGE = "KNOWLEDGE"
    # Application: deleted together with the application; source_id is the application id
    APPLICATION = "APPLICATION"
    # Tool: deleted together with the tool; source_id is the application id
    # NOTE(review): presumably this should read "tool id" -- confirm
    TOOL = "TOOL"
    # Document
    DOCUMENT = "DOCUMENT"
    # Chat
    CHAT = "CHAT"
    SYSTEM = "SYSTEM"
    # Temporary, 30 minutes: data is cleaned up after 30 minutes; source_id is TEMPORARY_30_MINUTE
    TEMPORARY_30_MINUTE = "TEMPORARY_30_MINUTE"
    # Temporary, 120 minutes: data is cleaned up after 120 minutes; source_id is TEMPORARY_120_MINUTE
    TEMPORARY_120_MINUTE = "TEMPORARY_120_MINUTE"
    # Temporary, 1 day: data is cleaned up after 1 day; source_id is TEMPORARY_1_DAY
    TEMPORARY_1_DAY = "TEMPORARY_1_DAY"
class VectorField(models.Field):
    """Custom model field whose database column type is ``vector``."""

    def db_type(self, connection):
        # Same column type for every connection; presumably provided by the
        # pgvector extension -- TODO confirm.
        return 'vector'
class Embedding(models.Model):
    """Embedding row: a vector plus a search vector for one source, stored in ``embedding``."""
    # Primary key; a string with no default, so the creator must supply it.
    id = models.CharField(max_length=128, primary_key=True, verbose_name="主键id")
    # Id of the source (resource) this embedding was built from.
    source_id = models.CharField(max_length=128, verbose_name="资源id", db_index=True)
    # Source type, one of SourceType.
    # NOTE(review): a CharField paired with IntegerChoices -- confirm how values are stored/compared.
    source_type = models.CharField(verbose_name='资源类型', max_length=5, choices=SourceType.choices,
                                   default=SourceType.PROBLEM, db_index=True)
    # Whether the embedding is usable.
    is_active = models.BooleanField(verbose_name="是否可用", max_length=1, default=True)
    # Owning knowledge base. NOTE(review): the verbose_name reads "document association",
    # which looks copied from the ``document`` field below.
    knowledge = models.ForeignKey(Knowledge, on_delete=models.DO_NOTHING, verbose_name="文档关联", db_constraint=False)
    # Owning document.
    document = models.ForeignKey(Document, on_delete=models.DO_NOTHING, verbose_name="文档关联", db_constraint=False)
    # Owning paragraph.
    paragraph = models.ForeignKey(Paragraph, on_delete=models.DO_NOTHING, verbose_name="段落关联", db_constraint=False)
    # The vector itself, stored in a ``vector`` column.
    embedding = VectorField(verbose_name="向量")
    # Search vector (verbose_name: "word segmentation").
    search_vector = SearchVectorField(verbose_name="分词", default="")
    # Free-form metadata.
    meta = models.JSONField(verbose_name="元数据", default=dict)

    class Meta:
        db_table = "embedding"
  261. class File(AppModelMixin):
  262. id = models.UUIDField(primary_key=True, max_length=128, default=uuid.uuid7, editable=False, verbose_name="主键id")
  263. file_name = models.CharField(max_length=256, verbose_name="文件名称", default="")
  264. file_size = models.IntegerField(verbose_name="文件大小", default=0)
  265. sha256_hash = models.CharField(verbose_name="文件sha256_hash标识", default="")
  266. source_type = models.CharField(verbose_name="资源类型", choices=FileSourceType,
  267. default=FileSourceType.TEMPORARY_120_MINUTE.value, db_index=True)
  268. source_id = models.CharField(verbose_name="资源id", default=FileSourceType.TEMPORARY_120_MINUTE.value,
  269. db_index=True)
  270. loid = models.IntegerField(verbose_name="loid")
  271. meta = models.JSONField(verbose_name="文件关联数据", default=dict)
  272. class Meta:
  273. db_table = "file"
  274. def save(self, bytea=None, force_insert=False, force_update=False, using=None, update_fields=None):
  275. if bytea is None:
  276. raise ValueError("bytea参数不能为空")
  277. sha256_hash = get_sha256_hash(bytea)
  278. self.sha256_hash = sha256_hash
  279. existing_file = QuerySet(File).filter(sha256_hash=sha256_hash).first()
  280. if existing_file:
  281. self.loid = existing_file.loid
  282. self.file_size = existing_file.file_size
  283. return super().save()
  284. compressed_data = self._compress_data(bytea)
  285. self.file_size = len(compressed_data)
  286. self.loid = self._create_large_object()
  287. self._write_compressed_data(compressed_data)
  288. # 调用父类保存
  289. return super().save()
  290. def _compress_data(self, data, compression_level=9):
  291. """压缩数据到内存"""
  292. buffer = io.BytesIO()
  293. with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
  294. zipinfo = zipfile.ZipInfo(self.file_name)
  295. zipinfo.compress_type = zipfile.ZIP_DEFLATED
  296. zip_file.writestr(zipinfo, data, compresslevel=compression_level)
  297. return buffer.getvalue()
  298. def _create_large_object(self):
  299. result = select_one("SELECT lo_creat(-1)::int8 as lo_id;", [])
  300. return result['lo_id']
  301. def _write_compressed_data(self, data, block_size=64 * 1024):
  302. buffer = io.BytesIO(data)
  303. offset = 0
  304. while True:
  305. chunk = buffer.read(block_size)
  306. if not chunk:
  307. break
  308. offset += len(chunk)
  309. select_one(
  310. "SELECT lo_put(%s::oid, %s::bigint, %s::bytea)::VARCHAR;",
  311. [self.loid, offset - len(chunk), chunk]
  312. )
  313. def get_bytes(self):
  314. buffer = io.BytesIO()
  315. for chunk in self.get_bytes_stream():
  316. buffer.write(chunk)
  317. try:
  318. # 解压数据
  319. with zipfile.ZipFile(buffer) as zip_file:
  320. # 用 zip 内实际存储的条目名,避免文件名不匹配
  321. name = zip_file.namelist()[0]
  322. return zip_file.read(name)
  323. except Exception as e:
  324. # 如果数据不是zip格式,直接返回原始数据
  325. return buffer.getvalue()
  326. def get_bytes_stream(self, start=0, end=None, chunk_size=64 * 1024):
  327. def _read_with_offset():
  328. offset = start
  329. while True:
  330. result = select_one(
  331. "SELECT lo_get(%s::oid, %s, %s) as chunk",
  332. [self.loid, offset, end - offset if end and (end - offset) < chunk_size else chunk_size]
  333. )
  334. chunk = result['chunk'] if result else None
  335. if not chunk:
  336. break
  337. yield chunk
  338. offset += len(chunk)
  339. if len(chunk) < chunk_size:
  340. break
  341. if end and offset > end:
  342. break
  343. return _read_with_offset()
  344. @receiver(pre_delete, sender=File)
  345. def on_delete_file(sender, instance, **kwargs):
  346. exist = QuerySet(File).filter(loid=instance.loid).exclude(id=instance.id).exists()
  347. if not exist:
  348. select_one(f'SELECT lo_unlink({instance.loid})', [])