csv_parse_table_handle.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. # coding=utf-8
  2. import csv
  3. import io
  4. import traceback
  5. from charset_normalizer import detect
  6. from common.handle.base_parse_qa_handle import get_title_row_index_dict, get_row_value
  7. from common.handle.base_parse_table_handle import BaseParseTableHandle
  8. from common.utils.logger import maxkb_logger
  9. class CsvParseTableHandle(BaseParseTableHandle):
  10. def support(self, file, get_buffer):
  11. file_name: str = file.name.lower()
  12. if file_name.endswith(".csv"):
  13. return True
  14. return False
  15. def handle(self, file, get_buffer, save_image):
  16. buffer = get_buffer(file)
  17. try:
  18. content = buffer.decode(detect(buffer)['encoding'])
  19. except BaseException as e:
  20. maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
  21. return [{'name': file.name, 'paragraphs': []}]
  22. csv_model = content.split('\n')
  23. paragraphs = []
  24. # 第一行为标题
  25. title = csv_model[0].split(',')
  26. for row in csv_model[1:]:
  27. if not row:
  28. continue
  29. line = '; '.join([f'{key}:{value}' for key, value in zip(title, row.split(','))])
  30. paragraphs.append({'title': '', 'content': line})
  31. return [{'name': file.name, 'paragraphs': paragraphs}]
  32. def get_content(self, file, save_image):
  33. buffer = file.read()
  34. try:
  35. reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
  36. rows = list(reader)
  37. if not rows:
  38. return ""
  39. # 构建 Markdown 表格
  40. md_lines = []
  41. # 添加表头
  42. header = [cell.replace('\n', '<br>').replace('\r', '') for cell in rows[0]]
  43. md_lines.append('| ' + ' | '.join(header) + ' |')
  44. # 添加分隔线
  45. md_lines.append('| ' + ' | '.join(['---'] * len(header)) + ' |')
  46. # 添加数据行
  47. for row in rows[1:]:
  48. if row: # 跳过空行
  49. # 确保行长度与表头一致,并将换行符转换为 <br>
  50. padded_row = [
  51. cell.replace('\n', '<br>').replace('\r', '') for cell in row
  52. ] + [''] * (len(header) - len(row))
  53. md_lines.append('| ' + ' | '.join(padded_row[:len(header)]) + ' |')
  54. return '\n'.join(md_lines)
  55. except Exception as e:
  56. maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
  57. return ""