csv_parse_qa_handle.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. # coding=utf-8
  2. """
  3. @project: maxkb
  4. @Author:虎
  5. @file: csv_parse_qa_handle.py
  6. @date:2024/5/21 14:59
  7. @desc:
  8. """
  9. import csv
  10. import io
  11. import traceback
  12. from charset_normalizer import detect
  13. from common.handle.base_parse_qa_handle import BaseParseQAHandle, get_title_row_index_dict, get_row_value
  14. from common.utils.logger import maxkb_logger
  15. def read_csv_standard(file_path):
  16. data = []
  17. with open(file_path, 'r') as file:
  18. reader = csv.reader(file)
  19. for row in reader:
  20. data.append(row)
  21. return data
  22. class CsvParseQAHandle(BaseParseQAHandle):
  23. def support(self, file, get_buffer):
  24. file_name: str = file.name.lower()
  25. if file_name.endswith(".csv"):
  26. return True
  27. return False
  28. def handle(self, file, get_buffer, save_image):
  29. buffer = get_buffer(file)
  30. try:
  31. reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=detect(buffer)['encoding']))
  32. try:
  33. title_row_list = reader.__next__()
  34. except Exception as e:
  35. return [{'name': file.name, 'paragraphs': []}]
  36. if len(title_row_list) == 0:
  37. return [{'name': file.name, 'paragraphs': []}]
  38. title_row_index_dict = get_title_row_index_dict(title_row_list)
  39. paragraph_list = []
  40. for row in reader:
  41. content = get_row_value(row, title_row_index_dict, 'content')
  42. if content is None:
  43. continue
  44. problem = get_row_value(row, title_row_index_dict, 'problem_list')
  45. problem = str(problem) if problem is not None else ''
  46. problem_list = [{'content': p[0:255]} for p in problem.split('\n') if len(p.strip()) > 0]
  47. title = get_row_value(row, title_row_index_dict, 'title')
  48. title = str(title) if title is not None else ''
  49. paragraph_list.append({'title': title[0:255],
  50. 'content': content[0:102400],
  51. 'problem_list': problem_list})
  52. return [{'name': file.name, 'paragraphs': paragraph_list}]
  53. except Exception as e:
  54. maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
  55. return [{'name': file.name, 'paragraphs': []}]