mark_chunk_handle.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. # coding=utf-8
  2. """
  3. @project: MaxKB
  4. @Author:虎
  5. @file: mark_chunk_handle.py
  6. @date:2024/7/23 16:52
  7. @desc: Chunk handler that splits text at sentence punctuation into pieces of bounded length.
  8. """
  9. import re
  10. from typing import List
  11. from common.chunk.i_chunk_handle import IChunkHandle
  12. class MarkChunkHandle(IChunkHandle):
  13. def handle(self, chunk_list: List[str], chunk_size: int = 256):
  14. split_chunk_pattern = r'.{1,%d}[。| |\\.|!|;|;|!|\n]' % chunk_size
  15. max_chunk_pattern = r'.{1,%d}' % chunk_size
  16. result = []
  17. for chunk in chunk_list:
  18. chunk_result = re.findall(split_chunk_pattern, chunk, flags=re.DOTALL)
  19. for c_r in chunk_result:
  20. if len(c_r.strip()) > 0:
  21. result.append(c_r.strip())
  22. other_chunk_list = re.split(split_chunk_pattern, chunk, flags=re.DOTALL)
  23. for other_chunk in other_chunk_list:
  24. if len(other_chunk) > 0:
  25. if len(other_chunk) < chunk_size:
  26. if len(other_chunk.strip()) > 0:
  27. result.append(other_chunk.strip())
  28. else:
  29. max_chunk_list = re.findall(max_chunk_pattern, other_chunk, flags=re.DOTALL)
  30. for m_c in max_chunk_list:
  31. if len(m_c.strip()) > 0:
  32. result.append(m_c.strip())
  33. return result