|
|
@@ -20,7 +20,7 @@ class PDFProcessor:
|
|
|
self.directory = directory # PDF 文件所在目录
|
|
|
self.file_group_num = kwargs.get('file_group_num', 20) # 每组处理的文件数
|
|
|
self.batch_num = kwargs.get('batch_num', 6) # 每次插入的批次数量
|
|
|
- self.chunksize = kwargs.get('chunksize', 500) # 切分文本的大小
|
|
|
+ self.chunksize = kwargs.get('chunksize', 1000) # 切分文本的大小
|
|
|
self.overlap = kwargs.get('overlap', 100) # 切分文本的重叠大小
|
|
|
self.file_suffix_list = kwargs.get('file_suffix_list', ['.pdf' , '.docx' , '.doc'])
|
|
|
server_logger.info(f"""
|
|
|
@@ -130,8 +130,16 @@ class PDFProcessor:
|
|
|
# 将文本切分成小段
|
|
|
docs = self.split_text(document_content)
|
|
|
pdf_contents.append(docs)
|
|
|
+ server_logger.info(f"Documents pdf_file_name:{pdf_file_name},docs:{len(docs)}")
|
|
|
|
|
|
# TODO 切分的问题 可以增加metadata元数据信息
|
|
|
server_logger.info(f"Processed Documents:{self.directory},docs:{len(pdf_contents)}")
|
|
|
return pdf_contents
|
|
|
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ def print_chunk_info(self, chunk):
|
|
|
+ """Log the page content and the metadata of one split text chunk via server_logger."""
|
|
|
+ server_logger.info(f"Chunk: {chunk.page_content}")
|
|
|
+ server_logger.info(f"Metadata: {chunk.metadata}")
|