knowledge.py 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313
  1. import io
  2. import json
  3. import os
  4. import pickle
  5. import re
  6. import tempfile
  7. import traceback
  8. import zipfile
  9. from collections import defaultdict
  10. from functools import reduce
  11. from tempfile import TemporaryDirectory
  12. from typing import Dict, List
  13. from urllib.parse import quote
  14. import requests
  15. import uuid_utils.compat as uuid
  16. from celery_once import AlreadyQueued
  17. from django.core import validators
  18. from django.core.files.uploadedfile import SimpleUploadedFile
  19. from django.db import transaction, models
  20. from django.db.models import QuerySet
  21. from django.db.models.functions import Reverse, Substr
  22. from django.db.models.query_utils import Q
  23. from django.http import HttpResponse
  24. from django.utils.translation import gettext_lazy as _, gettext
  25. from rest_framework import serializers
  26. from common.config.embedding_config import VectorStore
  27. from common.database_model_manage.database_model_manage import DatabaseModelManage
  28. from common.db.search import native_search, get_dynamics_model, native_page_search
  29. from common.db.sql_execute import select_list
  30. from common.event.listener_manage import ListenerManagement
  31. from common.exception.app_exception import AppApiException
  32. from common.field.common import UploadedFileField
  33. from common.utils.common import post, get_file_content, parse_image, bulk_create_in_batches
  34. from common.utils.fork import Fork, ChildLink
  35. from common.utils.logger import maxkb_logger
  36. from common.utils.split_model import get_split_model
  37. from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document, Paragraph, Problem, \
  38. ProblemParagraphMapping, TaskType, State, SearchMode, KnowledgeFolder, File, Tag, DocumentTag, KnowledgeWorkflow, \
  39. FileSourceType
  40. from knowledge.serializers.common import BatchSerializer, BatchMoveSerializer, ProblemParagraphObject
  41. from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, \
  42. get_embedding_model_id_by_knowledge_id, MetaSerializer, \
  43. GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir, \
  44. update_resource_mapping_by_knowledge
  45. from knowledge.serializers.document import DocumentSerializers
  46. from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
  47. from knowledge.task.generate import generate_related_by_knowledge_id
  48. from knowledge.task.sync import sync_web_knowledge, sync_replace_web_knowledge
  49. from maxkb.conf import PROJECT_DIR
  50. from maxkb.const import CONFIG
  51. from models_provider.models import Model
  52. from system_manage.models import WorkspaceUserResourcePermission, AuthTargetType
  53. from system_manage.models.resource_mapping import ResourceMapping
  54. from system_manage.serializers.resource_mapping_serializers import ResourceMappingSerializer
  55. from system_manage.serializers.user_resource_permission import UserResourcePermissionSerializer
  56. from users.serializers.user import is_workspace_manage
  57. class KnowledgeModelSerializer(serializers.ModelSerializer):
  58. class Meta:
  59. model = Knowledge
  60. fields = ['id', 'name', 'desc', 'meta', 'folder_id', 'type', 'workspace_id', 'create_time',
  61. 'update_time', 'file_size_limit', 'file_count_limit', 'embedding_model_id']
  62. class KnowledgeBaseCreateRequest(serializers.Serializer):
  63. name = serializers.CharField(required=True, label=_('knowledge name'))
  64. folder_id = serializers.CharField(required=True, label=_('folder id'))
  65. desc = serializers.CharField(required=False, allow_null=True, allow_blank=True, label=_('knowledge description'))
  66. embedding_model_id = serializers.CharField(required=True, label=_('knowledge embedding'))
  67. class KnowledgeImportRequest(serializers.Serializer):
  68. file = UploadedFileField(required=True, label=_("file"))
  69. class KnowledgeWebCreateRequest(serializers.Serializer):
  70. name = serializers.CharField(required=True, label=_('knowledge name'))
  71. folder_id = serializers.CharField(required=True, label=_('folder id'))
  72. desc = serializers.CharField(required=False, allow_null=True, allow_blank=True, label=_('knowledge description'))
  73. embedding_model_id = serializers.CharField(required=True, label=_('knowledge embedding'))
  74. source_url = serializers.CharField(required=True, label=_('source url'))
  75. selector = serializers.CharField(required=False, label=_('knowledge selector'), allow_null=True, allow_blank=True)
  76. class KnowledgeEditRequest(serializers.Serializer):
  77. name = serializers.CharField(required=False, max_length=64, min_length=1, label=_('knowledge name'))
  78. desc = serializers.CharField(required=False, max_length=256, min_length=1, label=_('knowledge description'))
  79. meta = serializers.DictField(required=False)
  80. application_id_list = serializers.ListSerializer(
  81. required=False,
  82. child=serializers.UUIDField(required=True, label=_('application id')),
  83. label=_('application id list')
  84. )
  85. file_size_limit = serializers.IntegerField(required=False, label=_('file size limit'))
  86. file_count_limit = serializers.IntegerField(required=False, label=_('file count limit'))
  87. @staticmethod
  88. def get_knowledge_meta_valid_map():
  89. knowledge_meta_valid_map = {
  90. KnowledgeType.BASE: MetaSerializer.BaseMeta,
  91. KnowledgeType.WEB: MetaSerializer.WebMeta
  92. }
  93. return knowledge_meta_valid_map
  94. def is_valid(self, *, knowledge: Knowledge = None):
  95. super().is_valid(raise_exception=True)
  96. if 'meta' in self.data and self.data.get('meta') is not None:
  97. knowledge_meta_valid_map = self.get_knowledge_meta_valid_map()
  98. valid_class = knowledge_meta_valid_map.get(knowledge.type)
  99. valid_class(data=self.data.get('meta')).is_valid(raise_exception=True)
  100. class HitTestSerializer(serializers.Serializer):
  101. query_text = serializers.CharField(required=True, label=_('query text'))
  102. top_number = serializers.IntegerField(required=True, max_value=10000, min_value=1, label=_("top number"))
  103. similarity = serializers.FloatField(required=True, max_value=2, min_value=0, label=_('similarity'))
  104. search_mode = serializers.CharField(required=True, label=_('search mode'), validators=[
  105. validators.RegexValidator(regex=re.compile("^embedding|keywords|blend$"),
  106. message=_('The type only supports embedding|keywords|blend'), code=500)
  107. ])
  108. class KnowledgeSerializer(serializers.Serializer):
  109. class Query(serializers.Serializer):
  110. workspace_id = serializers.CharField(required=True)
  111. folder_id = serializers.CharField(required=False, label=_('folder id'), allow_null=True)
  112. name = serializers.CharField(required=False, label=_('knowledge name'), allow_null=True, allow_blank=True,
  113. max_length=64, min_length=1)
  114. desc = serializers.CharField(required=False, label=_('knowledge description'), allow_null=True,
  115. allow_blank=True, max_length=256, min_length=1)
  116. user_id = serializers.UUIDField(required=False, label=_('user id'), allow_null=True)
  117. scope = serializers.CharField(required=False, label=_('knowledge scope'), allow_null=True)
  118. create_user = serializers.UUIDField(required=False, label=_('create user'), allow_null=True)
  119. @staticmethod
  120. def is_x_pack_ee():
  121. workspace_user_role_mapping_model = DatabaseModelManage.get_model("workspace_user_role_mapping")
  122. role_permission_mapping_model = DatabaseModelManage.get_model("role_permission_mapping_model")
  123. return workspace_user_role_mapping_model is not None and role_permission_mapping_model is not None
  124. def get_query_set(self, workspace_manage, is_x_pack_ee):
  125. self.is_valid(raise_exception=True)
  126. workspace_id = self.data.get("workspace_id")
  127. query_set_dict = {}
  128. query_set = QuerySet(model=get_dynamics_model({
  129. 'temp.name': models.CharField(),
  130. 'temp.desc': models.CharField(),
  131. "document_temp.char_length": models.IntegerField(),
  132. 'temp.create_time': models.DateTimeField(),
  133. 'temp.user_id': models.CharField(),
  134. 'temp.workspace_id': models.CharField(),
  135. 'temp.folder_id': models.CharField(),
  136. 'temp.id': models.CharField(),
  137. 'temp.scope': models.CharField(),
  138. }))
  139. folder_query_set = QuerySet(KnowledgeFolder)
  140. if "desc" in self.data and self.data.get('desc') is not None:
  141. query_set = query_set.filter(**{'temp.desc__icontains': self.data.get("desc")})
  142. folder_query_set = folder_query_set.filter(**{'desc__icontains': self.data.get("desc")})
  143. if "name" in self.data and self.data.get('name') is not None:
  144. query_set = query_set.filter(**{'temp.name__icontains': self.data.get("name")})
  145. folder_query_set = folder_query_set.filter(**{'name__icontains': self.data.get("name")})
  146. if "workspace_id" in self.data and self.data.get('workspace_id') is not None:
  147. query_set = query_set.filter(**{'temp.workspace_id': self.data.get("workspace_id")})
  148. folder_query_set = folder_query_set.filter(**{'workspace_id': self.data.get("workspace_id")})
  149. if "folder_id" in self.data and self.data.get('folder_id') is not None and self.data.get(
  150. 'workspace_id') != self.data.get('folder_id'):
  151. query_set = query_set.filter(**{'temp.folder_id': self.data.get("folder_id")})
  152. folder_query_set = folder_query_set.filter(**{'parent_id': self.data.get("folder_id")})
  153. if "scope" in self.data and self.data.get('scope') is not None:
  154. query_set = query_set.filter(**{'temp.scope': self.data.get("scope")})
  155. if "create_user" in self.data and self.data.get('create_user') is not None:
  156. query_set = query_set.filter(**{'temp.user_id': self.data.get("create_user")})
  157. query_set = query_set.order_by("-temp.create_time", "temp.id")
  158. query_set_dict['default_sql'] = query_set
  159. query_set_dict['knowledge_custom_sql'] = QuerySet(model=get_dynamics_model({
  160. 'knowledge.workspace_id': models.CharField(),
  161. })).filter(**{'knowledge.workspace_id': workspace_id})
  162. # query_set_dict['folder_query_set'] = folder_query_set
  163. if not workspace_manage:
  164. query_set_dict['workspace_user_resource_permission_query_set'] = QuerySet(
  165. WorkspaceUserResourcePermission).filter(
  166. auth_target_type="KNOWLEDGE",
  167. workspace_id=workspace_id,
  168. user_id=self.data.get("user_id"))
  169. return query_set_dict
  170. def page(self, current_page: int, page_size: int):
  171. self.is_valid(raise_exception=True)
  172. folder_id = self.data.get('folder_id', self.data.get("workspace_id"))
  173. root = KnowledgeFolder.objects.filter(id=folder_id).first()
  174. if not root:
  175. raise serializers.ValidationError(_('Folder not found'))
  176. workspace_manage = is_workspace_manage(self.data.get('user_id'), self.data.get('workspace_id'))
  177. is_x_pack_ee = self.is_x_pack_ee()
  178. result = native_page_search(
  179. current_page,
  180. page_size,
  181. self.get_query_set(workspace_manage, is_x_pack_ee),
  182. select_string=get_file_content(
  183. os.path.join(
  184. PROJECT_DIR,
  185. "apps",
  186. "knowledge", 'sql',
  187. 'list_knowledge.sql' if workspace_manage else (
  188. 'list_knowledge_user_ee.sql' if is_x_pack_ee else 'list_knowledge_user.sql'
  189. )
  190. )
  191. ),
  192. post_records_handler=lambda r: r
  193. )
  194. return ResourceMappingSerializer().get_resource_count(result)
  195. def list(self):
  196. self.is_valid(raise_exception=True)
  197. folder_id = self.data.get('folder_id')
  198. if not folder_id:
  199. folder_id = self.data.get('workspace_id')
  200. root = KnowledgeFolder.objects.filter(id=folder_id).first()
  201. if not root:
  202. raise serializers.ValidationError(_('Folder not found'))
  203. workspace_manage = is_workspace_manage(self.data.get('user_id'), self.data.get('workspace_id'))
  204. is_x_pack_ee = self.is_x_pack_ee()
  205. return native_search(
  206. self.get_query_set(workspace_manage, is_x_pack_ee),
  207. select_string=get_file_content(
  208. os.path.join(
  209. PROJECT_DIR,
  210. "apps",
  211. "knowledge", 'sql',
  212. 'list_knowledge.sql' if workspace_manage else (
  213. 'list_knowledge_user_ee.sql' if self.is_x_pack_ee() else 'list_knowledge_user.sql'
  214. )
  215. )
  216. ),
  217. )
  218. class Operate(serializers.Serializer):
  219. user_id = serializers.UUIDField(required=True, label=_('user id'))
  220. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  221. knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
  222. def is_valid(self, *, raise_exception=False):
  223. super().is_valid(raise_exception=True)
  224. workspace_id = self.data.get('workspace_id')
  225. query_set = QuerySet(Knowledge).filter(id=self.data.get('knowledge_id'))
  226. if workspace_id:
  227. query_set = query_set.filter(workspace_id=workspace_id)
  228. if not query_set.exists():
  229. raise AppApiException(500, _('Knowledge id does not exist'))
  230. @transaction.atomic
  231. def embedding(self, with_valid=True):
  232. if with_valid:
  233. self.is_valid(raise_exception=True)
  234. knowledge_id = self.data.get('knowledge_id')
  235. knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
  236. embedding_model_id = knowledge.embedding_model_id
  237. embedding_model = QuerySet(Model).filter(id=embedding_model_id).first()
  238. if embedding_model is None:
  239. raise AppApiException(500, _('Model does not exist'))
  240. ListenerManagement.update_status(
  241. QuerySet(Document).filter(knowledge_id=self.data.get('knowledge_id')),
  242. TaskType.EMBEDDING,
  243. State.PENDING
  244. )
  245. ListenerManagement.update_status(
  246. QuerySet(Paragraph).filter(knowledge_id=self.data.get('knowledge_id')),
  247. TaskType.EMBEDDING,
  248. State.PENDING
  249. )
  250. ListenerManagement.get_aggregation_document_status_by_knowledge_id(self.data.get('knowledge_id'))()
  251. embedding_model_id = get_embedding_model_id_by_knowledge_id(self.data.get('knowledge_id'))
  252. try:
  253. embedding_by_knowledge.delay(knowledge_id, embedding_model_id)
  254. except AlreadyQueued as e:
  255. raise AppApiException(500, _('Failed to send the vectorization task, please try again later!'))
  256. def generate_related(self, instance: Dict, with_valid=True):
  257. if with_valid:
  258. self.is_valid(raise_exception=True)
  259. GenerateRelatedSerializer(data=instance).is_valid(raise_exception=True)
  260. knowledge_id = self.data.get('knowledge_id')
  261. model_id = instance.get("model_id")
  262. prompt = instance.get("prompt")
  263. model_params_setting = instance.get("model_params_setting")
  264. state_list = instance.get('state_list')
  265. ListenerManagement.update_status(
  266. QuerySet(Document).filter(knowledge_id=knowledge_id),
  267. TaskType.GENERATE_PROBLEM,
  268. State.PENDING
  269. )
  270. ListenerManagement.update_status(
  271. QuerySet(Paragraph).annotate(
  272. reversed_status=Reverse('status'),
  273. task_type_status=Substr('reversed_status', TaskType.GENERATE_PROBLEM.value, 1),
  274. ).filter(
  275. task_type_status__in=state_list, knowledge_id=knowledge_id
  276. ).values('id'),
  277. TaskType.GENERATE_PROBLEM,
  278. State.PENDING
  279. )
  280. ListenerManagement.get_aggregation_document_status_by_knowledge_id(knowledge_id)()
  281. try:
  282. generate_related_by_knowledge_id.delay(knowledge_id, model_id, model_params_setting, prompt, state_list)
  283. except AlreadyQueued as e:
  284. raise AppApiException(500, _('Failed to send the vectorization task, please try again later!'))
  285. def list_application(self, with_valid=True):
  286. if with_valid:
  287. self.is_valid(raise_exception=True)
  288. # knowledge = QuerySet(Knowledge).get(id=self.data.get("knowledge_id"))
  289. return select_list(
  290. get_file_content(
  291. os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_knowledge_application.sql')
  292. ),
  293. [
  294. self.data.get('user_id'),
  295. ]
  296. )
  297. @staticmethod
  298. def is_x_pack_ee():
  299. workspace_user_role_mapping_model = DatabaseModelManage.get_model("workspace_user_role_mapping")
  300. role_permission_mapping_model = DatabaseModelManage.get_model("role_permission_mapping_model")
  301. return workspace_user_role_mapping_model is not None and role_permission_mapping_model is not None
  302. def one(self):
  303. self.is_valid()
  304. workspace_manage = is_workspace_manage(self.data.get('user_id'), self.data.get('workspace_id'))
  305. is_x_pack_ee = self.is_x_pack_ee()
  306. query_set_dict = {
  307. 'default_sql': QuerySet(
  308. model=get_dynamics_model({'temp.id': models.CharField()})
  309. ).filter(**{'temp.id': self.data.get("knowledge_id")}),
  310. 'knowledge_custom_sql': QuerySet(
  311. model=get_dynamics_model({'knowledge.id': models.CharField()})
  312. ).filter(**{'knowledge.id': self.data.get("knowledge_id")}),
  313. }
  314. if not workspace_manage:
  315. query_set_dict['workspace_user_resource_permission_query_set'] = QuerySet(
  316. WorkspaceUserResourcePermission).filter(
  317. auth_target_type="KNOWLEDGE",
  318. workspace_id=self.data.get('workspace_id'),
  319. user_id=self.data.get("user_id")
  320. )
  321. all_application_list = [str(adm.get('id')) for adm in self.list_application(with_valid=False)]
  322. knowledge_dict = native_search(query_set_dict, select_string=get_file_content(
  323. os.path.join(
  324. PROJECT_DIR, "apps", "knowledge", 'sql',
  325. 'list_knowledge.sql' if workspace_manage else (
  326. 'list_knowledge_user_ee.sql' if is_x_pack_ee else 'list_knowledge_user.sql'
  327. )
  328. )
  329. ), with_search_one=True)
  330. workflow = {}
  331. if knowledge_dict.get('type') == 4:
  332. from knowledge.models import KnowledgeWorkflow
  333. k = QuerySet(KnowledgeWorkflow).filter(knowledge_id=knowledge_dict.get('id')).first()
  334. if k:
  335. workflow['work_flow'] = k.work_flow
  336. workflow['is_publish'] = k.is_publish
  337. workflow['publish_time'] = k.publish_time
  338. return {
  339. **knowledge_dict,
  340. **workflow,
  341. 'meta': json.loads(knowledge_dict.get('meta', '{}')),
  342. 'application_id_list': list(filter(
  343. lambda application_id: all_application_list.__contains__(application_id),
  344. [
  345. str(
  346. application_knowledge_mapping.source_id
  347. ) for application_knowledge_mapping in
  348. QuerySet(ResourceMapping).filter(source_type='APPLICATION',
  349. target_type='KNOWLEDGE',
  350. target_id=self.data.get('knowledge_id'))
  351. ]
  352. ))
  353. }
  354. @transaction.atomic
  355. def edit(self, instance: Dict, select_one=True):
  356. self.is_valid()
  357. knowledge = QuerySet(Knowledge).get(id=self.data.get("knowledge_id"))
  358. KnowledgeEditRequest(data=instance).is_valid(knowledge=knowledge)
  359. if 'embedding_model_id' in instance:
  360. knowledge.embedding_model_id = instance.get('embedding_model_id')
  361. if "name" in instance:
  362. knowledge.name = instance.get("name")
  363. if 'desc' in instance:
  364. knowledge.desc = instance.get("desc")
  365. if 'meta' in instance:
  366. knowledge.meta = instance.get('meta')
  367. if 'folder_id' in instance:
  368. knowledge.folder_id = instance.get('folder_id')
  369. if 'file_size_limit' in instance:
  370. knowledge.file_size_limit = instance.get('file_size_limit')
  371. if 'file_count_limit' in instance:
  372. knowledge.file_count_limit = instance.get('file_count_limit')
  373. knowledge.save()
  374. update_resource_mapping_by_knowledge(str(knowledge.id))
  375. if select_one:
  376. return self.one()
  377. return None
  378. @transaction.atomic
  379. def delete(self):
  380. self.is_valid()
  381. knowledge = QuerySet(Knowledge).get(id=self.data.get("knowledge_id"))
  382. QuerySet(Document).filter(knowledge=knowledge).delete()
  383. QuerySet(ProblemParagraphMapping).filter(knowledge=knowledge).delete()
  384. QuerySet(Paragraph).filter(knowledge=knowledge).delete()
  385. QuerySet(Problem).filter(knowledge=knowledge).delete()
  386. QuerySet(WorkspaceUserResourcePermission).filter(target=knowledge.id).delete()
  387. drop_knowledge_index(knowledge_id=knowledge.id)
  388. knowledge.delete()
  389. File.objects.filter(
  390. source_id=knowledge.id,
  391. ).delete()
  392. QuerySet(ResourceMapping).filter(
  393. Q(target_id=self.data.get('knowledge_id')) | Q(source_id=self.data.get('knowledge_id'))
  394. ).delete()
  395. delete_embedding_by_knowledge(self.data.get('knowledge_id'))
  396. return True
  397. def export_excel(self, with_valid=True):
  398. if with_valid:
  399. self.is_valid(raise_exception=True)
  400. document_list = QuerySet(Document).filter(knowledge_id=self.data.get('knowledge_id'))
  401. paragraph_list = native_search(
  402. QuerySet(Paragraph).filter(knowledge_id=self.data.get("knowledge_id")),
  403. get_file_content(
  404. os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql')
  405. )
  406. )
  407. problem_mapping_list = native_search(
  408. QuerySet(ProblemParagraphMapping).filter(knowledge_id=self.data.get("knowledge_id")),
  409. get_file_content(os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
  410. with_table_name=True
  411. )
  412. data_dict, document_dict = DocumentSerializers.Operate.merge_problem(
  413. paragraph_list, problem_mapping_list, document_list
  414. )
  415. workbook = DocumentSerializers.Operate.get_workbook(data_dict, document_dict)
  416. response = HttpResponse(content_type='application/vnd.ms-excel')
  417. response['Content-Disposition'] = 'attachment; filename="knowledge.xlsx"'
  418. workbook.save(response)
  419. return response
  420. def export_zip(self, with_valid=True):
  421. if with_valid:
  422. self.is_valid(raise_exception=True)
  423. knowledge = QuerySet(Knowledge).filter(id=self.data.get("knowledge_id")).first()
  424. document_list = QuerySet(Document).filter(knowledge_id=self.data.get('knowledge_id'))
  425. paragraph_list = native_search(
  426. QuerySet(Paragraph).filter(knowledge_id=self.data.get("knowledge_id")),
  427. get_file_content(
  428. os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql')
  429. )
  430. )
  431. problem_mapping_list = native_search(
  432. QuerySet(ProblemParagraphMapping).filter(knowledge_id=self.data.get("knowledge_id")),
  433. get_file_content(os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
  434. with_table_name=True
  435. )
  436. data_dict, document_dict = DocumentSerializers.Operate.merge_problem(
  437. paragraph_list, problem_mapping_list, document_list
  438. )
  439. res = [parse_image(paragraph.get('content')) for paragraph in paragraph_list]
  440. workbook = DocumentSerializers.Operate.get_workbook(data_dict, document_dict)
  441. response = HttpResponse(content_type='application/zip')
  442. response['Content-Disposition'] = f'attachment; filename="{knowledge.name}.zip"'
  443. zip_buffer = io.BytesIO()
  444. with TemporaryDirectory() as tempdir:
  445. knowledge_file = os.path.join(tempdir, 'knowledge.xlsx')
  446. workbook.save(knowledge_file)
  447. for r in res:
  448. write_image(tempdir, r)
  449. zip_dir(tempdir, zip_buffer)
  450. response.write(zip_buffer.getvalue())
  451. return response
  452. def export_knowledge(self, with_valid=True):
  453. if with_valid:
  454. self.is_valid(raise_exception=True)
  455. knowledge_id = self.data.get("knowledge_id")
  456. knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
  457. document_list = QuerySet(Document).filter(knowledge_id=knowledge_id)
  458. paragraph_list = native_search(
  459. QuerySet(Paragraph).filter(knowledge_id=self.data.get("knowledge_id")),
  460. get_file_content(
  461. os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph_document_name.sql')
  462. )
  463. )
  464. problem_mapping_list = native_search(
  465. QuerySet(ProblemParagraphMapping).filter(knowledge_id=self.data.get("knowledge_id")),
  466. get_file_content(os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_problem_mapping.sql')),
  467. with_table_name=True
  468. )
  469. data_dict, document_dict = DocumentSerializers.Operate.merge_problem(
  470. paragraph_list, problem_mapping_list, document_list
  471. )
  472. # 查询标签和文档标签关联
  473. tag_list = list(QuerySet(Tag).filter(knowledge_id=knowledge_id).values('id', 'key', 'value'))
  474. document_tag_list = list(
  475. QuerySet(DocumentTag).filter(document__knowledge_id=knowledge_id).values('document_id', 'tag_id')
  476. )
  477. # 知识库标签map
  478. tag_map = {t['id']: t for t in tag_list}
  479. # 文档标签map
  480. doc_tag_map = defaultdict(list)
  481. for dt in document_tag_list:
  482. tag = tag_map.get(dt['tag_id'])
  483. if tag:
  484. doc_tag_map[dt['document_id']].append(f"{tag['key']}:{tag['value']}")
  485. # doc_id -> document_obj
  486. doc_obj_map = {doc.id: doc for doc in document_list}
  487. # paragraph_id -> is_active
  488. paragraph_active_map = {}
  489. for p in paragraph_list:
  490. doc_id = p.get('document_id')
  491. if doc_id not in paragraph_active_map:
  492. paragraph_active_map[doc_id] = []
  493. paragraph_active_map[doc_id].append('1' if p.get('is_active') else '0')
  494. res = [parse_image(paragraph.get('content')) for paragraph in paragraph_list]
  495. # 新增字段
  496. workbook = self._get_knowledge_workbook(data_dict, document_dict, doc_tag_map, doc_obj_map,
  497. paragraph_active_map)
  498. response = HttpResponse(content_type='application/zip')
  499. response['Content-Disposition'] = f"attachment; filename*=UTF-8''{quote(knowledge.name)}.zip"
  500. zip_buffer = io.BytesIO()
  501. with TemporaryDirectory() as tempdir:
  502. knowledge_file_path = os.path.join(tempdir, 'knowledge.xlsx')
  503. workbook.save(knowledge_file_path)
  504. for r in res:
  505. write_image(tempdir, r)
  506. knowledge_json = {
  507. 'name': knowledge.name,
  508. 'desc': knowledge.desc,
  509. 'type': knowledge.type,
  510. 'meta': {} if knowledge.type == KnowledgeType.LARK else (knowledge.meta if knowledge.meta else {}),
  511. 'file_size_limit': knowledge.file_size_limit,
  512. 'file_count_limit': knowledge.file_count_limit,
  513. 'tags': [{'key': t['key'], 'value': t['value']} for t in tag_list]
  514. }
  515. with open(os.path.join(tempdir, 'knowledge.json'), 'w', encoding='utf-8') as f:
  516. json.dump(knowledge_json, f, ensure_ascii=False)
  517. if knowledge.type == KnowledgeType.WORKFLOW:
  518. knowledge_workflow = QuerySet(KnowledgeWorkflow).filter(knowledge_id=knowledge_id).first()
  519. if knowledge_workflow:
  520. from knowledge.serializers.knowledge_workflow import KnowledgeWorkflowSerializer
  521. from knowledge.serializers.knowledge_workflow import KnowledgeWorkflowModelSerializer
  522. from application.flow.tools import get_tool_id_list
  523. from tools.models import Tool, ToolScope, ToolType, ToolWorkflow
  524. from knowledge.serializers.knowledge_workflow import KBWFInstance
  525. tool_id_list = get_tool_id_list(knowledge_workflow.work_flow, True)
  526. tool_list = []
  527. if len(tool_id_list) > 0:
  528. tool_list = QuerySet(Tool).filter(id__in=tool_id_list).exclude(scope=ToolScope.SHARED)
  529. tw_dict = {tw.tool_id: tw
  530. for tw in QuerySet(ToolWorkflow).filter(
  531. tool_id__in=[tool.id for tool in tool_list if tool.tool_type == ToolType.WORKFLOW])}
  532. knowledge_workflow_dict = KnowledgeWorkflowModelSerializer(knowledge_workflow).data
  533. kbwf_instance = KBWFInstance(
  534. knowledge_workflow_dict,
  535. [],
  536. 'v2',
  537. [KnowledgeWorkflowSerializer.Export.to_tool_dict(tool, tw_dict) for tool in tool_list]
  538. )
  539. knowledge_workflow_pickle = pickle.dumps(kbwf_instance)
  540. with open(os.path.join(tempdir, 'workflow.kbwf'), 'wb') as f:
  541. f.write(knowledge_workflow_pickle)
  542. zip_dir(tempdir, zip_buffer)
  543. response.write(zip_buffer.getvalue())
  544. return response
  545. @staticmethod
  546. def _get_knowledge_workbook(data_dict: dict, document_dict: dict, doc_tag_map: dict, doc_obj_map: dict,
  547. paragraph_active_map: dict):
  548. import openpyxl
  549. from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
  550. workbook = openpyxl.Workbook()
  551. workbook.remove(workbook.active)
  552. if len(data_dict.keys()) == 0:
  553. data_dict['sheet'] = []
  554. for sheet_id in data_dict:
  555. sheet_name = document_dict.get(sheet_id)
  556. worksheet = workbook.create_sheet(sheet_name)
  557. doc = doc_obj_map.get(sheet_id) if sheet_id in doc_obj_map else None
  558. tags_str = '|'.join(doc_tag_map.get(sheet_id, []))
  559. hit_method = doc.hit_handling_method if doc else ''
  560. similarity = doc.directly_return_similarity if doc else ''
  561. is_active = '1' if (doc and doc.is_active) else '0'
  562. doc_type = doc.type if doc else ''
  563. doc_meta = json.dumps(doc.meta, ensure_ascii=False) if (doc and doc.meta) else ''
  564. header = [gettext('Section title (optional)'),
  565. gettext('Section content (required, question answer, no more than 4096 characters)'),
  566. gettext('Question (optional, one per line in the cell)'),
  567. gettext('Tags'),
  568. gettext('Hit handling method'),
  569. gettext('Directly return similarity'),
  570. gettext('Is active'),
  571. gettext('Paragraph is active'),
  572. gettext('Document type'),
  573. gettext('Document meta')]
  574. rows = data_dict.get(sheet_id, [])
  575. para_active_list = paragraph_active_map.get(sheet_id, [])
  576. # 初始化标题
  577. data = [header]
  578. for row_idx, row in enumerate(rows):
  579. para_active = para_active_list[row_idx] if row_idx < len(para_active_list) else '1'
  580. # None 转为 ''
  581. row = [col if col is not None else '' for col in row]
  582. # 补齐到3列
  583. row = (row + ['','',''])[:3]
  584. if row_idx == 0:
  585. data.append(
  586. [*row, tags_str, hit_method, similarity, is_active, para_active, doc_type, doc_meta])
  587. else:
  588. data.append([*row, '', '', '', '', para_active, '', ''])
  589. for row_idx, row in enumerate(data):
  590. for col_idx, col in enumerate(row):
  591. cell = worksheet.cell(row=row_idx + 1, column=col_idx + 1)
  592. if isinstance(col, str):
  593. col = re.sub(ILLEGAL_CHARACTERS_RE, '', col)
  594. if col.startswith(('=', '+', '-', '@')):
  595. col = '\ufeff' + col
  596. cell.value = col
  597. return workbook
  598. @staticmethod
  599. def merge_problem(paragraph_list: List[Dict], problem_mapping_list: List[Dict]):
  600. result = {}
  601. document_dict = {}
  602. for paragraph in paragraph_list:
  603. problem_list = [problem_mapping.get('content') for problem_mapping in problem_mapping_list if
  604. problem_mapping.get('paragraph_id') == paragraph.get('id')]
  605. document_sheet = result.get(paragraph.get('document_id'))
  606. d = document_dict.get(paragraph.get('document_name'))
  607. if d is None:
  608. document_dict[paragraph.get('document_name')] = {paragraph.get('document_id')}
  609. else:
  610. d.add(paragraph.get('document_id'))
  611. if document_sheet is None:
  612. result[paragraph.get('document_id')] = [[paragraph.get('title'), paragraph.get('content'),
  613. '\n'.join(problem_list)]]
  614. else:
  615. document_sheet.append([paragraph.get('title'), paragraph.get('content'), '\n'.join(problem_list)])
  616. result_document_dict = {}
  617. for d_name in document_dict:
  618. for index, d_id in enumerate(document_dict.get(d_name)):
  619. result_document_dict[d_id] = d_name if index == 0 else d_name + str(index)
  620. return result, result_document_dict
  621. class ImportKnowledge(serializers.Serializer):
  622. user_id = serializers.UUIDField(required=True, label=_('user id'))
  623. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  624. folder_id = serializers.CharField(required=True, label=_('folder id'))
  625. @transaction.atomic
  626. def import_knowledge(self, file, is_import_tool=False, with_valid=True):
  627. if with_valid:
  628. self.is_valid(raise_exception=True)
  629. KnowledgeImportRequest(data={'file': file}).is_valid(raise_exception=True)
  630. try:
  631. zf = zipfile.ZipFile(file)
  632. except zipfile.BadZipFile:
  633. raise AppApiException(500, _('Not a valid zip file'))
  634. namelist = zf.namelist()
  635. if 'knowledge.json' not in namelist:
  636. raise AppApiException(500, _('Not a valid KB export file, missing knowledge.json'))
  637. if 'knowledge.xlsx' not in namelist:
  638. raise AppApiException(500, _('Not a valid KB export file, missing knowledge.xlsx'))
  639. # knowledge.json -> knowledge
  640. knowledge_data = json.loads(zf.read('knowledge.json'))
  641. workspace_id = self.data.get('workspace_id')
  642. user_id = self.data.get('user_id')
  643. knowledge_id = uuid.uuid7()
  644. folder_id = self.data.get('folder_id')
  645. knowledge = Knowledge(
  646. id=knowledge_id,
  647. name=knowledge_data.get('name', 'Untitled'),
  648. desc=knowledge_data.get('desc', ''),
  649. type=knowledge_data.get('type', KnowledgeType.BASE),
  650. scope=self.data.get('scope', KnowledgeScope.WORKSPACE),
  651. meta=knowledge_data.get('meta', {}),
  652. file_size_limit=knowledge_data.get('file_size_limit', 100),
  653. file_count_limit=knowledge_data.get('file_count_limit', 50),
  654. embedding_model=None,
  655. user_id=user_id,
  656. workspace_id=workspace_id,
  657. folder_id=folder_id
  658. )
  659. knowledge.save()
  660. # 图片
  661. old_to_new_file_map = {}
  662. for name in namelist:
  663. if name.startswith('oss/file/') and name != 'oss/file/':
  664. old_id = name.split('/')[-1]
  665. if not old_id:
  666. continue
  667. file_bytes = zf.read(name)
  668. new_file = File(
  669. id=uuid.uuid7(),
  670. file_name=old_id,
  671. source_type=FileSourceType.KNOWLEDGE,
  672. source_id=str(knowledge_id),
  673. meta={}
  674. )
  675. new_file.save(bytea=file_bytes)
  676. old_to_new_file_map[old_id] = str(new_file.id)
  677. # knowledge.xlsx -> doc + para + problem
  678. import openpyxl
  679. xlsx_bytes = io.BytesIO(zf.read('knowledge.xlsx'))
  680. workbook = openpyxl.load_workbook(xlsx_bytes)
  681. document_model_list = []
  682. paragraph_model_list = []
  683. problem_paragraph_object_list = []
  684. doc_tags_map = {}
  685. for sheet in workbook.worksheets:
  686. doc_name = sheet.title
  687. rows = list(sheet.iter_rows(min_row=2, values_only=True))
  688. if not rows:
  689. continue
  690. # 首行文档元数据
  691. first_row = rows[0]
  692. tags_str = first_row[3] if len(first_row) > 3 and first_row[3] else ''
  693. hit_method = first_row[4] if len(first_row) > 4 and first_row[4] else 'optimization'
  694. similarity = first_row[5] if len(first_row) > 5 and first_row[5] else 0.9
  695. doc_is_active = first_row[6] if len(first_row) > 6 and first_row[6] else '1'
  696. doc_type = first_row[8] if len(first_row) > 8 and first_row[8] else knowledge_data.get('type',
  697. KnowledgeType.BASE)
  698. doc_meta_str = first_row[9] if len(first_row) > 9 and first_row[9] else '{}'
  699. try:
  700. doc_meta = json.loads(doc_meta_str) if isinstance(doc_meta_str, str) else {}
  701. except (json.JSONDecodeError, TypeError):
  702. doc_meta = {}
  703. char_length = sum(len(row[1] or '') for row in rows)
  704. document_id = uuid.uuid7()
  705. document = Document(
  706. id=document_id,
  707. knowledge_id=knowledge_id,
  708. name=doc_name,
  709. char_length=char_length,
  710. is_active=str(doc_is_active) == '1',
  711. type=doc_type,
  712. hit_handling_method=hit_method,
  713. directly_return_similarity=float(similarity) if similarity else 0.9,
  714. meta=doc_meta
  715. )
  716. document_model_list.append(document)
  717. if tags_str:
  718. doc_tags_map[document_id] = tags_str
  719. # 逐行创建 para + problem
  720. for row_idx, row in enumerate(rows):
  721. title = str(row[0]) if len(row) > 0 and row[0] is not None else ''
  722. content = str(row[1]) if len(row) > 1 and row[1] is not None else ''
  723. problems_str = str(row[2]) if len(row) > 2 and row[2] is not None else ''
  724. para_is_active = row[7] if len(row) > 7 and row[7] else '1'
  725. # 图片 link 替换
  726. for old_id, new_id in old_to_new_file_map.items():
  727. content = content.replace(old_id, new_id)
  728. if title.startswith('\ufeff'):
  729. title = title[1:]
  730. if content.startswith('\ufeff'):
  731. content = content[1:]
  732. paragraph_id = uuid.uuid7()
  733. paragraph = Paragraph(
  734. id=paragraph_id,
  735. document_id=document_id,
  736. knowledge_id=knowledge_id,
  737. title=title,
  738. content=content,
  739. is_active=str(para_is_active) == '1',
  740. position=row_idx + 1
  741. )
  742. paragraph_model_list.append(paragraph)
  743. if problems_str:
  744. if problems_str.startswith('\ufeff'):
  745. problems_str = problems_str[1:]
  746. for problem_content in problems_str.split('\n'):
  747. problem_content = problem_content.strip()
  748. if problem_content:
  749. problem_paragraph_object_list.append(ProblemParagraphObject(
  750. knowledge_id, document_id, paragraph_id, problem_content
  751. ))
  752. # bulk create
  753. QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None
  754. QuerySet(Paragraph).bulk_create(paragraph_model_list) if len(paragraph_model_list) > 0 else None
  755. # 问题
  756. problem_model_list, problem_paragraph_mapping_list = (
  757. ProblemParagraphManage(problem_paragraph_object_list, knowledge_id).to_problem_model_list()
  758. )
  759. bulk_create_in_batches(Problem, problem_model_list, batch_size=1000)
  760. bulk_create_in_batches(ProblemParagraphMapping, problem_paragraph_mapping_list, batch_size=1000)
  761. # Tag
  762. tag_list = knowledge_data.get('tags', [])
  763. if tag_list:
  764. tag_model_list = []
  765. tag_key_value_to_model = {}
  766. for tag in tag_list:
  767. tag_model = Tag(
  768. id=uuid.uuid7(),
  769. knowledge_id=knowledge_id,
  770. key=tag['key'],
  771. value=tag['value']
  772. )
  773. tag_model_list.append(tag_model)
  774. tag_key_value_to_model[f"{tag['key']}:{tag['value']}"] = tag_model
  775. QuerySet(Tag).bulk_create(tag_model_list)
  776. # Document_Tag
  777. document_tag_model_list = []
  778. for doc_id, tags_str in doc_tags_map.items():
  779. for tag_str in tags_str.split('|'):
  780. tag_str = tag_str.strip()
  781. if tag_str and tag_str in tag_key_value_to_model:
  782. document_tag_model_list.append(DocumentTag(
  783. id=uuid.uuid7(),
  784. document_id=doc_id,
  785. tag_id=tag_key_value_to_model[tag_str].id
  786. ))
  787. QuerySet(DocumentTag).bulk_create(document_tag_model_list) if len(document_tag_model_list) > 0 else None
  788. # 工作流导入
  789. if 'workflow.kbwf' in namelist:
  790. workflow_bytes = zf.read('workflow.kbwf')
  791. from knowledge.serializers.knowledge_workflow import KnowledgeWorkflowSerializer
  792. workflow_file = SimpleUploadedFile('workflow.kbwf', workflow_bytes)
  793. KnowledgeWorkflowSerializer.Import(
  794. data={'knowledge_id': str(knowledge_id), 'user_id': user_id, 'workspace_id': workspace_id}
  795. ).import_({'file': workflow_file}, is_import_tool)
  796. # 授权 + 资源映射
  797. UserResourcePermissionSerializer(data={
  798. 'workspace_id': self.data.get('workspace_id'),
  799. 'user_id': self.data.get('user_id'),
  800. 'auth_target_type': AuthTargetType.KNOWLEDGE.value
  801. }).auth_resource(str(knowledge_id))
  802. update_resource_mapping_by_knowledge(str(knowledge_id))
  803. zf.close()
  804. return {'knowledge_id': str(knowledge_id), 'type': knowledge.type}
  805. class Create(serializers.Serializer):
  806. user_id = serializers.UUIDField(required=True, label=_('user id'))
  807. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  808. scope = serializers.ChoiceField(required=False, label=_('scope'), default=KnowledgeScope.WORKSPACE,
  809. choices=KnowledgeScope.choices)
  810. @staticmethod
  811. def post_embedding_knowledge(document_list, knowledge_id):
  812. model_id = get_embedding_model_id_by_knowledge_id(knowledge_id)
  813. embedding_by_knowledge.delay(knowledge_id, model_id)
  814. return document_list
  815. @post(post_function=post_embedding_knowledge)
  816. @transaction.atomic
  817. def save_base(self, instance, with_valid=True):
  818. if with_valid:
  819. self.is_valid(raise_exception=True)
  820. KnowledgeBaseCreateRequest(data=instance).is_valid(raise_exception=True)
  821. folder_id = instance.get('folder_id', self.data.get('workspace_id'))
  822. knowledge_id = uuid.uuid7()
  823. knowledge = Knowledge(
  824. id=knowledge_id,
  825. name=instance.get('name'),
  826. workspace_id=self.data.get('workspace_id'),
  827. desc=instance.get('desc'),
  828. type=instance.get('type', KnowledgeType.BASE),
  829. user_id=self.data.get('user_id'),
  830. scope=self.data.get('scope', KnowledgeScope.WORKSPACE),
  831. folder_id=folder_id,
  832. embedding_model_id=instance.get('embedding_model_id'),
  833. meta=instance.get('meta', {}),
  834. )
  835. document_model_list = []
  836. paragraph_model_list = []
  837. problem_paragraph_object_list = []
  838. # 插入文档
  839. for document in instance.get('documents') if 'documents' in instance else []:
  840. document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(knowledge_id,
  841. document)
  842. document_model_list.append(document_paragraph_dict_model.get('document'))
  843. for paragraph in document_paragraph_dict_model.get('paragraph_model_list'):
  844. paragraph_model_list.append(paragraph)
  845. for problem_paragraph_object in document_paragraph_dict_model.get('problem_paragraph_object_list'):
  846. problem_paragraph_object_list.append(problem_paragraph_object)
  847. problem_model_list, problem_paragraph_mapping_list = (
  848. ProblemParagraphManage(problem_paragraph_object_list, knowledge_id)
  849. .to_problem_model_list())
  850. # 插入知识库
  851. knowledge.save()
  852. # 插入文档
  853. QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None
  854. # 批量插入段落
  855. QuerySet(Paragraph).bulk_create(paragraph_model_list) if len(paragraph_model_list) > 0 else None
  856. # 批量插入问题
  857. QuerySet(Problem).bulk_create(problem_model_list) if len(problem_model_list) > 0 else None
  858. # 批量插入关联问题
  859. QuerySet(ProblemParagraphMapping).bulk_create(
  860. problem_paragraph_mapping_list
  861. ) if len(problem_paragraph_mapping_list) > 0 else None
  862. # 自动资源给授权当前用户
  863. UserResourcePermissionSerializer(data={
  864. 'workspace_id': self.data.get('workspace_id'),
  865. 'user_id': self.data.get('user_id'),
  866. 'auth_target_type': AuthTargetType.KNOWLEDGE.value
  867. }).auth_resource(str(knowledge_id))
  868. update_resource_mapping_by_knowledge(str(knowledge_id))
  869. return {
  870. **KnowledgeModelSerializer(knowledge).data,
  871. 'user_id': self.data.get('user_id'),
  872. 'document_list': document_model_list,
  873. "document_count": len(document_model_list),
  874. "char_length": reduce(lambda x, y: x + y, [d.char_length for d in document_model_list], 0)
  875. }, knowledge_id
  876. def save_web(self, instance: Dict, with_valid=True):
  877. if with_valid:
  878. self.is_valid(raise_exception=True)
  879. KnowledgeWebCreateRequest(data=instance).is_valid(raise_exception=True)
  880. folder_id = instance.get('folder_id', self.data.get('workspace_id'))
  881. knowledge_id = uuid.uuid7()
  882. knowledge = Knowledge(
  883. id=knowledge_id,
  884. name=instance.get('name'),
  885. desc=instance.get('desc'),
  886. user_id=self.data.get('user_id'),
  887. type=instance.get('type', KnowledgeType.WEB),
  888. scope=self.data.get('scope', KnowledgeScope.WORKSPACE),
  889. folder_id=folder_id,
  890. workspace_id=self.data.get('workspace_id'),
  891. embedding_model_id=instance.get('embedding_model_id'),
  892. meta={
  893. 'source_url': instance.get('source_url'),
  894. 'selector': instance.get('selector', 'body'),
  895. 'embedding_model_id': instance.get('embedding_model_id')
  896. },
  897. )
  898. knowledge.save()
  899. # 自动资源给授权当前用户
  900. UserResourcePermissionSerializer(data={
  901. 'workspace_id': self.data.get('workspace_id'),
  902. 'user_id': self.data.get('user_id'),
  903. 'auth_target_type': AuthTargetType.KNOWLEDGE.value
  904. }).auth_resource(str(knowledge_id))
  905. sync_web_knowledge.delay(str(knowledge_id), instance.get('source_url'), instance.get('selector'))
  906. update_resource_mapping_by_knowledge(str(knowledge_id))
  907. return {**KnowledgeModelSerializer(knowledge).data, 'document_list': []}
  908. class SyncWeb(serializers.Serializer):
  909. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  910. knowledge_id = serializers.CharField(required=True, label=_('knowledge id'))
  911. user_id = serializers.UUIDField(required=False, label=_('user id'))
  912. sync_type = serializers.CharField(required=True, label=_('sync type'), validators=[
  913. validators.RegexValidator(regex=re.compile("^replace|complete$"),
  914. message=_('The synchronization type only supports:replace|complete'), code=500)])
  915. def is_valid(self, *, raise_exception=False):
  916. super().is_valid(raise_exception=True)
  917. workspace_id = self.data.get('workspace_id')
  918. query_set = QuerySet(Knowledge).filter(id=self.data.get('knowledge_id'))
  919. if workspace_id:
  920. query_set = query_set.filter(workspace_id=workspace_id)
  921. if not query_set.exists():
  922. raise AppApiException(500, _('Knowledge id does not exist'))
  923. first = QuerySet(Knowledge).filter(id=self.data.get("knowledge_id")).first()
  924. if first is None:
  925. raise AppApiException(300, _('id does not exist'))
  926. if first.type != KnowledgeType.WEB:
  927. raise AppApiException(500, _('Synchronization is only supported for web site types'))
  928. def sync(self, with_valid=True):
  929. if with_valid:
  930. self.is_valid(raise_exception=True)
  931. sync_type = self.data.get('sync_type')
  932. knowledge_id = self.data.get('knowledge_id')
  933. knowledge = QuerySet(Knowledge).get(id=knowledge_id)
  934. self.__getattribute__(sync_type + '_sync')(knowledge)
  935. return True
  936. @staticmethod
  937. def get_sync_handler(knowledge):
  938. def handler(child_link: ChildLink, response: Fork.Response):
  939. if response.status == 200:
  940. try:
  941. document_name = child_link.tag.text if child_link.tag is not None and len(
  942. child_link.tag.text.strip()) > 0 else child_link.url
  943. paragraphs = get_split_model('web.md').parse(response.content)
  944. maxkb_logger.info(child_link.url.strip())
  945. first = QuerySet(Document).filter(
  946. meta__source_url=child_link.url.strip(),
  947. knowledge=knowledge
  948. ).first()
  949. if first is not None:
  950. # 如果存在,使用文档同步
  951. DocumentSerializers.Sync(data={'document_id': first.id}).sync()
  952. else:
  953. # 插入
  954. DocumentSerializers.Create(data={'knowledge_id': knowledge.id}).save(
  955. {'name': document_name, 'paragraphs': paragraphs,
  956. 'meta': {'source_url': child_link.url.strip(),
  957. 'selector': knowledge.meta.get('selector')},
  958. 'type': KnowledgeType.WEB}, with_valid=True)
  959. except Exception as e:
  960. maxkb_logger.error(f'{str(e)}:{traceback.format_exc()}')
  961. return handler
  962. def replace_sync(self, knowledge):
  963. """
  964. 替换同步
  965. :return:
  966. """
  967. url = knowledge.meta.get('source_url')
  968. selector = knowledge.meta.get('selector') if 'selector' in knowledge.meta else None
  969. sync_replace_web_knowledge.delay(str(knowledge.id), url, selector)
  970. def complete_sync(self, knowledge):
  971. """
  972. 完整同步 删掉当前数据集下所有的文档,再进行同步
  973. :return:
  974. """
  975. # 删除关联问题
  976. QuerySet(ProblemParagraphMapping).filter(knowledge=knowledge).delete()
  977. # 删除文档
  978. QuerySet(Document).filter(knowledge=knowledge).delete()
  979. # 删除段落
  980. QuerySet(Paragraph).filter(knowledge=knowledge).delete()
  981. # 删除向量
  982. delete_embedding_by_knowledge(self.data.get('knowledge_id'))
  983. # 同步
  984. self.replace_sync(knowledge)
  985. class HitTest(serializers.Serializer):
  986. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  987. knowledge_id = serializers.UUIDField(required=True, label=_("id"))
  988. user_id = serializers.UUIDField(required=False, label=_('user id'))
  989. query_text = serializers.CharField(required=True, label=_('query text'))
  990. top_number = serializers.IntegerField(required=True, max_value=10000, min_value=1, label=_("top number"))
  991. similarity = serializers.FloatField(required=True, max_value=2, min_value=0, label=_('similarity'))
  992. search_mode = serializers.CharField(required=True, label=_('search mode'), validators=[
  993. validators.RegexValidator(regex=re.compile("^embedding|keywords|blend$"),
  994. message=_('The type only supports embedding|keywords|blend'), code=500)
  995. ])
  996. def is_valid(self, *, raise_exception=True):
  997. super().is_valid(raise_exception=True)
  998. workspace_id = self.data.get('workspace_id')
  999. query_set = QuerySet(Knowledge).filter(id=self.data.get('knowledge_id'))
  1000. if workspace_id:
  1001. query_set = query_set.filter(workspace_id=workspace_id)
  1002. if not query_set.exists():
  1003. raise AppApiException(500, _('Knowledge id does not exist'))
  1004. if not QuerySet(Knowledge).filter(id=self.data.get("knowledge_id")).exists():
  1005. raise AppApiException(300, _('id does not exist'))
  1006. def hit_test(self):
  1007. self.is_valid()
  1008. vector = VectorStore.get_embedding_vector()
  1009. exclude_document_id_list = [
  1010. str(
  1011. document.id
  1012. ) for document in QuerySet(Document).filter(knowledge_id=self.data.get('knowledge_id'), is_active=False)
  1013. ]
  1014. model = get_embedding_model_by_knowledge_id(self.data.get('knowledge_id'))
  1015. # 向量库检索
  1016. hit_list = vector.hit_test(
  1017. self.data.get('query_text'),
  1018. [self.data.get('knowledge_id')],
  1019. exclude_document_id_list,
  1020. self.data.get('top_number'),
  1021. self.data.get('similarity'),
  1022. SearchMode(self.data.get('search_mode')),
  1023. model
  1024. )
  1025. hit_dict = reduce(lambda x, y: {**x, **y}, [{hit.get('paragraph_id'): hit} for hit in hit_list], {})
  1026. p_list = list_paragraph([h.get('paragraph_id') for h in hit_list])
  1027. return [
  1028. {
  1029. **p,
  1030. 'similarity': hit_dict.get(p.get('id')).get('similarity'),
  1031. 'comprehensive_score': hit_dict.get(p.get('id')).get('comprehensive_score')
  1032. } for p in p_list
  1033. ]
  1034. class StoreKnowledge(serializers.Serializer):
  1035. user_id = serializers.UUIDField(required=True, label=_("User ID"))
  1036. name = serializers.CharField(required=False, label=_("tool name"), allow_null=True, allow_blank=True)
  1037. def get_appstore_templates(self):
  1038. self.is_valid(raise_exception=True)
  1039. # 下载zip文件
  1040. try:
  1041. appstore_url = CONFIG.get('APPSTORE_URL', 'https://apps-assets.fit2cloud.com/stable/maxkb.json.zip')
  1042. res = requests.get(appstore_url, timeout=5)
  1043. res.raise_for_status()
  1044. # 创建临时文件保存zip
  1045. with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as temp_zip:
  1046. temp_zip.write(res.content)
  1047. temp_zip_path = temp_zip.name
  1048. try:
  1049. # 解压zip文件
  1050. with zipfile.ZipFile(temp_zip_path, 'r') as zip_ref:
  1051. # 获取zip中的第一个文件(假设只有一个json文件)
  1052. json_filename = zip_ref.namelist()[0]
  1053. json_content = zip_ref.read(json_filename)
  1054. # 将json转换为字典
  1055. tool_store = json.loads(json_content.decode('utf-8'))
  1056. tag_dict = {tag['name']: tag['key'] for tag in tool_store['additionalProperties']['tags']}
  1057. filter_apps = []
  1058. for tool in tool_store['apps']:
  1059. if self.data.get('name', '') != '':
  1060. if self.data.get('name').lower() not in tool.get('name', '').lower():
  1061. continue
  1062. if not tool['downloadUrl'].endswith('.kbwf'):
  1063. continue
  1064. versions = tool.get('versions', [])
  1065. tool['label'] = tag_dict[tool.get('tags')[0]] if tool.get('tags') else ''
  1066. tool['version'] = next(
  1067. (version.get('name') for version in versions if
  1068. version.get('downloadUrl') == tool['downloadUrl']),
  1069. )
  1070. filter_apps.append(tool)
  1071. tool_store['apps'] = filter_apps
  1072. return tool_store
  1073. finally:
  1074. # 清理临时文件
  1075. os.unlink(temp_zip_path)
  1076. except Exception as e:
  1077. maxkb_logger.error(f"fetch appstore tools error: {e}")
  1078. return {'apps': [], 'additionalProperties': {'tags': []}}
  1079. class TransformWorkflow(serializers.Serializer):
  1080. workspace_id = serializers.CharField(required=True, label=_('Workspace ID'))
  1081. knowledge_id = serializers.UUIDField(required=True, label=_('Knowledge ID'))
  1082. user_id = serializers.UUIDField(required=True, label=_('User ID'))
  1083. def transform(self, instance: Dict):
  1084. self.is_valid(raise_exception=True)
  1085. knowledge = QuerySet(Knowledge).filter(
  1086. id=self.data.get('knowledge_id'),
  1087. workspace_id=self.data.get('workspace_id')
  1088. ).first()
  1089. if not knowledge:
  1090. raise AppApiException(500, _('Knowledge not found'))
  1091. if knowledge.type == KnowledgeType.WORKFLOW:
  1092. raise AppApiException(500, _('Knowledge is already a workflow'))
  1093. knowledge.type = KnowledgeType.WORKFLOW
  1094. knowledge.save()
  1095. workflow_id = uuid.uuid7()
  1096. knowledge_workflow = KnowledgeWorkflow(
  1097. id=workflow_id,
  1098. workspace_id=knowledge.workspace_id,
  1099. knowledge_id=knowledge.id,
  1100. work_flow=instance.get('work_flow', {}),
  1101. )
  1102. knowledge_workflow.save()
  1103. return True
  1104. class Tags(serializers.Serializer):
  1105. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  1106. user_id = serializers.UUIDField(required=True, label=_('user id'))
  1107. knowledge_ids = serializers.ListField(
  1108. required=True, label=_('knowledge ids'),
  1109. child=serializers.UUIDField(required=True, label=_('id'))
  1110. )
  1111. def list(self):
  1112. self.is_valid(raise_exception=True)
  1113. if self.data.get('name'):
  1114. name = self.data.get('name')
  1115. tags = QuerySet(Tag).filter(
  1116. knowledge_id__in=self.data.get('knowledge_ids')
  1117. ).filter(
  1118. Q(key__icontains=name) | Q(value__icontains=name)
  1119. ).values('key', 'value', 'id', 'create_time', 'update_time').order_by('create_time', 'key', 'value')
  1120. else:
  1121. # 获取所有标签,按创建时间排序保持稳定顺序
  1122. tags = QuerySet(Tag).filter(
  1123. knowledge_id__in=self.data.get('knowledge_ids')
  1124. ).values('key', 'value', 'id', 'create_time', 'update_time').order_by('create_time', 'key', 'value')
  1125. # 按key分组
  1126. grouped_tags = defaultdict(list)
  1127. for tag in tags:
  1128. grouped_tags[tag['key']].append({
  1129. 'id': tag['id'],
  1130. 'value': tag['value'],
  1131. 'create_time': tag['create_time'],
  1132. 'update_time': tag['update_time']
  1133. })
  1134. # 转换为期望的格式,保持key的顺序
  1135. result = []
  1136. # 按key排序以确保结果顺序一致
  1137. for key in sorted(grouped_tags.keys()):
  1138. values = grouped_tags[key]
  1139. # 按创建时间对values进行排序
  1140. values.sort(key=lambda x: x['create_time'])
  1141. result.append({
  1142. 'key': key,
  1143. 'values': values,
  1144. })
  1145. return result
  1146. class KnowledgeBatchOperateSerializer(serializers.Serializer):
  1147. workspace_id = serializers.CharField(required=True, label=_('workspace id'))
  1148. def is_valid(self, *, raise_exception=False):
  1149. super().is_valid(raise_exception=True)
  1150. @transaction.atomic
  1151. def batch_delete(self, instance: Dict, with_valid=True):
  1152. if with_valid:
  1153. BatchSerializer(data=instance).is_valid(model=Knowledge, raise_exception=True)
  1154. self.is_valid(raise_exception=True)
  1155. id_list = instance.get('id_list')
  1156. workspace_id = self.data.get('workspace_id')
  1157. knowledge_query_set = QuerySet(Knowledge).filter(id__in=id_list, workspace_id=workspace_id)
  1158. # 删除所有关联
  1159. QuerySet(Document).filter(knowledge__in=knowledge_query_set).delete()
  1160. QuerySet(ProblemParagraphMapping).filter(knowledge__in=knowledge_query_set).delete()
  1161. QuerySet(Paragraph).filter(knowledge__in=knowledge_query_set).delete()
  1162. QuerySet(Problem).filter(knowledge__in=knowledge_query_set).delete()
  1163. QuerySet(WorkspaceUserResourcePermission).filter(target__in=id_list).delete()
  1164. for k_id in id_list:
  1165. drop_knowledge_index(knowledge_id=k_id)
  1166. delete_embedding_by_knowledge(k_id)
  1167. File.objects.filter(source_id__in=id_list).delete()
  1168. QuerySet(ResourceMapping).filter(
  1169. Q(target_id__in=id_list) | Q(source_id__in=id_list)
  1170. ).delete()
  1171. knowledge_query_set.delete()
  1172. return True
  1173. def batch_move(self, instance: Dict, with_valid=True):
  1174. if with_valid:
  1175. BatchMoveSerializer(data=instance).is_valid(model=Knowledge, raise_exception=True)
  1176. self.is_valid(raise_exception=True)
  1177. id_list = instance.get('id_list')
  1178. folder_id = instance.get('folder_id')
  1179. workspace_id = self.data.get('workspace_id')
  1180. QuerySet(Knowledge).filter(id__in=id_list, workspace_id=workspace_id).update(folder_id=folder_id)
  1181. return True