handler.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. # coding=utf-8
  2. import re
  3. import traceback
  4. from django.db.models import QuerySet
  5. from django.utils.translation import gettext_lazy as _
  6. from common.utils.fork import ChildLink, Fork
  7. from common.utils.logger import maxkb_logger
  8. from common.utils.split_model import get_split_model
  9. from knowledge.models import State
  10. from knowledge.models.knowledge import KnowledgeType, Document, Knowledge
  11. def get_save_handler(knowledge_id, selector):
  12. from knowledge.serializers.document import DocumentSerializers
  13. def handler(child_link: ChildLink, response: Fork.Response):
  14. if response.status == 200:
  15. try:
  16. document_name = child_link.tag.text if child_link.tag is not None and len(
  17. child_link.tag.text.strip()) > 0 else child_link.url
  18. paragraphs = get_split_model('web.md').parse(response.content)
  19. DocumentSerializers.Create(
  20. data={'knowledge_id': knowledge_id}
  21. ).save({
  22. 'name': document_name,
  23. 'paragraphs': paragraphs,
  24. 'meta': {'source_url': child_link.url, 'selector': selector},
  25. 'type': KnowledgeType.WEB
  26. }, with_valid=True)
  27. except Exception as e:
  28. maxkb_logger.error(f'{str(e)}:{traceback.format_exc()}')
  29. return handler
  30. def get_sync_handler(knowledge_id):
  31. from knowledge.serializers.document import DocumentSerializers
  32. knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
  33. def handler(child_link: ChildLink, response: Fork.Response):
  34. if response.status == 200:
  35. try:
  36. document_name = child_link.tag.text if child_link.tag is not None and len(
  37. child_link.tag.text.strip()) > 0 else child_link.url
  38. paragraphs = get_split_model('web.md').parse(response.content)
  39. first = QuerySet(Document).filter(meta__source_url=child_link.url.strip(), knowledge=knowledge).first()
  40. if first is not None:
  41. # 如果存在,使用文档同步
  42. DocumentSerializers.Sync(data={'document_id': first.id}).sync()
  43. else:
  44. # 插入
  45. DocumentSerializers.Create(
  46. data={'knowledge_id': knowledge.id}
  47. ).save({
  48. 'name': document_name,
  49. 'paragraphs': paragraphs,
  50. 'meta': {'source_url': child_link.url.strip(), 'selector': knowledge.meta.get('selector')},
  51. 'type': KnowledgeType.WEB
  52. }, with_valid=True)
  53. except Exception as e:
  54. maxkb_logger.error(f'{str(e)}:{traceback.format_exc()}')
  55. return handler
  56. def get_sync_web_document_handler(knowledge_id):
  57. from knowledge.serializers.document import DocumentSerializers
  58. def handler(source_url: str, selector, response: Fork.Response):
  59. if response.status == 200:
  60. try:
  61. paragraphs = get_split_model('web.md').parse(response.content)
  62. # 插入
  63. DocumentSerializers.Create(data={'knowledge_id': knowledge_id}).save(
  64. {'name': source_url[0:128], 'paragraphs': paragraphs,
  65. 'meta': {'source_url': source_url, 'selector': selector},
  66. 'type': KnowledgeType.WEB}, with_valid=True)
  67. except Exception as e:
  68. maxkb_logger.error(f'{str(e)}:{traceback.format_exc()}')
  69. else:
  70. Document(name=source_url[0:128],
  71. knowledge_id=knowledge_id,
  72. meta={'source_url': source_url, 'selector': selector, 'allow_download': True},
  73. type=KnowledgeType.WEB,
  74. char_length=0,
  75. status=State.FAILURE).save()
  76. return handler
  77. def save_problem(knowledge_id, document_id, paragraph_id, problem):
  78. from knowledge.serializers.paragraph import ParagraphSerializers
  79. # print(f"knowledge_id: {knowledge_id}")
  80. # print(f"document_id: {document_id}")
  81. # print(f"paragraph_id: {paragraph_id}")
  82. # print(f"problem: {problem}")
  83. problem = re.sub(r"^\d+\.\s*", "", problem)
  84. pattern = r"<question>(.*?)</question>"
  85. match = re.search(pattern, problem)
  86. problem = match.group(1) if match else None
  87. if problem is None or len(problem) == 0:
  88. return
  89. try:
  90. workspace_id = QuerySet(Knowledge).filter(id=knowledge_id).first().workspace_id
  91. ParagraphSerializers.Problem(
  92. data={
  93. 'workspace_id': workspace_id,
  94. "knowledge_id": knowledge_id,
  95. 'document_id': document_id,
  96. 'paragraph_id': paragraph_id
  97. }
  98. ).save(instance={"content": problem}, with_valid=True)
  99. except Exception as e:
  100. maxkb_logger.error(_('Association problem failed {error}').format(error=str(e)))