csv_split_handle.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. # coding=utf-8
  2. """
  3. @project: maxkb
  4. @Author:虎
  5. @file: csv_split_handle.py
  6. @date:2024/5/21 14:59
  7. @desc: Split CSV files into Markdown-table paragraphs
  8. """
  9. import csv
  10. import io
  11. import os
  12. import traceback
  13. from typing import List
  14. from charset_normalizer import detect
  15. from common.handle.base_split_handle import BaseSplitHandle
  16. from common.utils.logger import maxkb_logger
  17. def post_cell(cell_value):
  18. return cell_value.replace('\n', '<br>').replace('|', '&#124;')
  19. def row_to_md(row):
  20. return '| ' + ' | '.join(
  21. [post_cell(cell) if cell is not None else '' for cell in row]) + ' |\n'
  22. class CsvSplitHandle(BaseSplitHandle):
  23. def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
  24. buffer = get_buffer(file)
  25. paragraphs = []
  26. file_name = os.path.basename(file.name)
  27. result = {'name': file_name, 'content': paragraphs}
  28. try:
  29. if type(limit) is str:
  30. limit = int(limit)
  31. reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
  32. try:
  33. title_row_list = reader.__next__()
  34. title_md_content = row_to_md(title_row_list)
  35. title_md_content += '| ' + ' | '.join(
  36. ['---' if cell is not None else '' for cell in title_row_list]) + ' |\n'
  37. except Exception as e:
  38. return result
  39. if len(title_row_list) == 0:
  40. return result
  41. result_item_content = ''
  42. for row in reader:
  43. next_md_content = row_to_md(row)
  44. next_md_content_len = len(next_md_content)
  45. result_item_content_len = len(result_item_content)
  46. if len(result_item_content) == 0:
  47. result_item_content += title_md_content
  48. result_item_content += next_md_content
  49. else:
  50. if result_item_content_len + next_md_content_len < limit:
  51. result_item_content += next_md_content
  52. else:
  53. paragraphs.append({'content': result_item_content, 'title': ''})
  54. result_item_content = title_md_content + next_md_content
  55. if len(result_item_content) > 0:
  56. paragraphs.append({'content': result_item_content, 'title': ''})
  57. return result
  58. except Exception as e:
  59. maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
  60. return result
  61. def get_content(self, file, save_image):
  62. buffer = file.read()
  63. try:
  64. reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
  65. rows = list(reader)
  66. if not rows:
  67. return ""
  68. # 构建 Markdown 表格
  69. md_lines = []
  70. # 添加表头
  71. header = [cell.replace('\n', '<br>').replace('\r', '') for cell in rows[0]]
  72. md_lines.append('| ' + ' | '.join(header) + ' |')
  73. # 添加分隔线
  74. md_lines.append('| ' + ' | '.join(['---'] * len(header)) + ' |')
  75. # 添加数据行
  76. for row in rows[1:]:
  77. if row: # 跳过空行
  78. # 确保行长度与表头一致,并将换行符转换为 <br>
  79. padded_row = [
  80. cell.replace('\n', '<br>').replace('\r', '') for cell in row
  81. ] + [''] * (len(header) - len(row))
  82. md_lines.append('| ' + ' | '.join(padded_row[:len(header)]) + ' |')
  83. return '\n'.join(md_lines)
  84. except Exception as e:
  85. maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
  86. return ""
  87. def support(self, file, get_buffer):
  88. file_name: str = file.name.lower()
  89. if file_name.endswith(".csv"):
  90. return True
  91. return False