html_split_handle.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. # coding=utf-8
  2. """
  3. @project: maxkb
  4. @Author:虎
  5. @file: html_split_handle.py
  6. @date:2024/5/23 10:58
  7. @desc:
  8. """
  9. import re
  10. import traceback
  11. from typing import List
  12. from bs4 import BeautifulSoup
  13. from charset_normalizer import detect
  14. from html2text import html2text
  15. from common.handle.base_split_handle import BaseSplitHandle
  16. from common.utils.logger import maxkb_logger
  17. from common.utils.split_model import SplitModel
  18. default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
  19. re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
  20. re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
  21. re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
  22. re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
  23. re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
  24. def get_encoding(buffer):
  25. beautiful_soup = BeautifulSoup(buffer, "html.parser")
  26. meta_list = beautiful_soup.find_all('meta')
  27. charset_list = [meta.attrs.get('charset') for meta in meta_list if
  28. meta.attrs is not None and 'charset' in meta.attrs]
  29. if len(charset_list) > 0:
  30. charset = charset_list[0]
  31. return charset
  32. return detect(buffer)['encoding']
  33. class HTMLSplitHandle(BaseSplitHandle):
  34. def support(self, file, get_buffer):
  35. file_name: str = file.name.lower()
  36. if file_name.endswith(".html") or file_name.endswith(".HTML"):
  37. return True
  38. return False
  39. def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
  40. buffer = get_buffer(file)
  41. if type(limit) is str:
  42. limit = int(limit)
  43. if type(with_filter) is str:
  44. with_filter = with_filter.lower() == 'true'
  45. if pattern_list is not None and len(pattern_list) > 0:
  46. split_model = SplitModel(pattern_list, with_filter, limit)
  47. else:
  48. split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
  49. try:
  50. encoding = get_encoding(buffer)
  51. content = buffer.decode(encoding)
  52. content = html2text(content)
  53. except BaseException as e:
  54. maxkb_logger.error(f"Error processing HTML file {file.name}: {e}, {traceback.format_exc()}")
  55. return {
  56. 'name': file.name, 'content': []
  57. }
  58. return {
  59. 'name': file.name,
  60. 'content': split_model.parse(content)
  61. }
  62. def get_content(self, file, save_image):
  63. buffer = file.read()
  64. try:
  65. encoding = get_encoding(buffer)
  66. content = buffer.decode(encoding)
  67. return html2text(content)
  68. except BaseException as e:
  69. maxkb_logger.error(f'Exception: {e}', exc_info=True)
  70. return f'{e}'