|
@@ -98,18 +98,12 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
async def parse_pdf_content(self, file_content: bytes) -> Dict[str, Any]:
|
|
async def parse_pdf_content(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
"""解析PDF内容,使用doc_worker的智能处理能力"""
|
|
"""解析PDF内容,使用doc_worker的智能处理能力"""
|
|
|
- temp_file_path = None
|
|
|
|
|
try:
|
|
try:
|
|
|
- # 保存到临时文件
|
|
|
|
|
- with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
|
|
|
|
- temp_file.write(file_content)
|
|
|
|
|
- temp_file_path = temp_file.name
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"开始使用doc_worker处理PDF文档: {temp_file_path}")
|
|
|
|
|
|
|
+ logger.info("开始使用doc_worker处理PDF文档(内存模式)")
|
|
|
|
|
|
|
|
- # 创建DocumentSource
|
|
|
|
|
|
|
+ # 创建DocumentSource(纯内存模式,不使用临时文件)
|
|
|
source = DocumentSource(
|
|
source = DocumentSource(
|
|
|
- path=Path(temp_file_path),
|
|
|
|
|
|
|
+ path=None,
|
|
|
content=file_content,
|
|
content=file_content,
|
|
|
file_type='pdf'
|
|
file_type='pdf'
|
|
|
)
|
|
)
|
|
@@ -120,7 +114,7 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
if toc_info.get('toc_count', 0) == 0:
|
|
if toc_info.get('toc_count', 0) == 0:
|
|
|
logger.warning("未检测到目录,使用基础处理模式")
|
|
logger.warning("未检测到目录,使用基础处理模式")
|
|
|
- return await self._fallback_pdf_processing(temp_file_path)
|
|
|
|
|
|
|
+ return await self._fallback_pdf_processing(file_content)
|
|
|
|
|
|
|
|
logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
|
|
logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
|
|
|
|
|
|
|
@@ -156,7 +150,7 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
if not pages_content:
|
|
if not pages_content:
|
|
|
logger.warning("无法提取文档全文,使用基础处理模式")
|
|
logger.warning("无法提取文档全文,使用基础处理模式")
|
|
|
- return await self._fallback_pdf_processing(temp_file_path)
|
|
|
|
|
|
|
+ return await self._fallback_pdf_processing(file_content)
|
|
|
|
|
|
|
|
total_chars = sum(len(page.get('text', '')) for page in pages_content)
|
|
total_chars = sum(len(page.get('text', '')) for page in pages_content)
|
|
|
logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
|
|
logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
|
|
@@ -177,7 +171,7 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
if not chunks:
|
|
if not chunks:
|
|
|
logger.warning("未能生成任何文本块,使用基础处理模式")
|
|
logger.warning("未能生成任何文本块,使用基础处理模式")
|
|
|
- return await self._fallback_pdf_processing(temp_file_path)
|
|
|
|
|
|
|
+ return await self._fallback_pdf_processing(file_content)
|
|
|
|
|
|
|
|
logger.info(f"切分完成,共生成 {len(chunks)} 个文本块")
|
|
logger.info(f"切分完成,共生成 {len(chunks)} 个文本块")
|
|
|
|
|
|
|
@@ -221,35 +215,21 @@ class DocumentProcessor:
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"PDF解析失败: {str(e)}")
|
|
logger.error(f"PDF解析失败: {str(e)}")
|
|
|
# 如果智能处理失败,尝试基础处理
|
|
# 如果智能处理失败,尝试基础处理
|
|
|
- if temp_file_path and os.path.exists(temp_file_path):
|
|
|
|
|
- try:
|
|
|
|
|
- logger.info("尝试使用基础处理模式")
|
|
|
|
|
- return await self._fallback_pdf_processing(temp_file_path)
|
|
|
|
|
- except Exception as fallback_error:
|
|
|
|
|
- logger.error(f"基础处理模式也失败: {str(fallback_error)}")
|
|
|
|
|
- raise
|
|
|
|
|
- finally:
|
|
|
|
|
- # 清理临时文件
|
|
|
|
|
- if temp_file_path and os.path.exists(temp_file_path):
|
|
|
|
|
- try:
|
|
|
|
|
- os.unlink(temp_file_path)
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.warning(f"清理临时文件失败: {str(e)}")
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ logger.info("尝试使用基础处理模式")
|
|
|
|
|
+ return await self._fallback_pdf_processing(file_content)
|
|
|
|
|
+ except Exception as fallback_error:
|
|
|
|
|
+ logger.error(f"基础处理模式也失败: {str(fallback_error)}")
|
|
|
|
|
+ raise
|
|
|
|
|
|
|
|
async def parse_docx_content(self, file_content: bytes) -> Dict[str, Any]:
|
|
async def parse_docx_content(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
"""解析DOCX内容,使用doc_worker的智能处理能力"""
|
|
"""解析DOCX内容,使用doc_worker的智能处理能力"""
|
|
|
- temp_file_path = None
|
|
|
|
|
try:
|
|
try:
|
|
|
- # 保存到临时文件
|
|
|
|
|
- with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
|
|
|
|
|
- temp_file.write(file_content)
|
|
|
|
|
- temp_file_path = temp_file.name
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"开始使用doc_worker处理DOCX文档: {temp_file_path}")
|
|
|
|
|
|
|
+ logger.info("开始使用doc_worker处理DOCX文档(内存模式)")
|
|
|
|
|
|
|
|
- # 创建DocumentSource
|
|
|
|
|
|
|
+ # 创建DocumentSource(纯内存模式,不使用临时文件)
|
|
|
source = DocumentSource(
|
|
source = DocumentSource(
|
|
|
- path=Path(temp_file_path),
|
|
|
|
|
|
|
+ path=None,
|
|
|
content=file_content,
|
|
content=file_content,
|
|
|
file_type='docx'
|
|
file_type='docx'
|
|
|
)
|
|
)
|
|
@@ -260,7 +240,7 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
if toc_info.get('toc_count', 0) == 0:
|
|
if toc_info.get('toc_count', 0) == 0:
|
|
|
logger.warning("未检测到目录,使用基础处理模式")
|
|
logger.warning("未检测到目录,使用基础处理模式")
|
|
|
- return await self._fallback_docx_processing(temp_file_path)
|
|
|
|
|
|
|
+ return await self._fallback_docx_processing(file_content)
|
|
|
|
|
|
|
|
logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
|
|
logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
|
|
|
|
|
|
|
@@ -296,7 +276,7 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
if not pages_content:
|
|
if not pages_content:
|
|
|
logger.warning("无法提取文档全文,使用基础处理模式")
|
|
logger.warning("无法提取文档全文,使用基础处理模式")
|
|
|
- return await self._fallback_docx_processing(temp_file_path)
|
|
|
|
|
|
|
+ return await self._fallback_docx_processing(file_content)
|
|
|
|
|
|
|
|
total_chars = sum(len(page.get('text', '')) for page in pages_content)
|
|
total_chars = sum(len(page.get('text', '')) for page in pages_content)
|
|
|
logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
|
|
logger.info(f"提取完成,共 {len(pages_content)} 页,{total_chars} 个字符")
|
|
@@ -317,7 +297,7 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
if not chunks:
|
|
if not chunks:
|
|
|
logger.warning("未能生成任何文本块,使用基础处理模式")
|
|
logger.warning("未能生成任何文本块,使用基础处理模式")
|
|
|
- return await self._fallback_docx_processing(temp_file_path)
|
|
|
|
|
|
|
+ return await self._fallback_docx_processing(file_content)
|
|
|
|
|
|
|
|
logger.info(f"切分完成,共生成 {len(chunks)} 个文本块")
|
|
logger.info(f"切分完成,共生成 {len(chunks)} 个文本块")
|
|
|
|
|
|
|
@@ -366,29 +346,28 @@ class DocumentProcessor:
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"DOCX解析失败: {str(e)}")
|
|
logger.error(f"DOCX解析失败: {str(e)}")
|
|
|
# 如果智能处理失败,尝试基础处理
|
|
# 如果智能处理失败,尝试基础处理
|
|
|
- if temp_file_path and os.path.exists(temp_file_path):
|
|
|
|
|
- try:
|
|
|
|
|
- logger.info("尝试使用基础处理模式")
|
|
|
|
|
- return await self._fallback_docx_processing(temp_file_path)
|
|
|
|
|
- except Exception as fallback_error:
|
|
|
|
|
- logger.error(f"基础处理模式也失败: {str(fallback_error)}")
|
|
|
|
|
- raise
|
|
|
|
|
- finally:
|
|
|
|
|
- # 清理临时文件
|
|
|
|
|
- if temp_file_path and os.path.exists(temp_file_path):
|
|
|
|
|
- try:
|
|
|
|
|
- os.unlink(temp_file_path)
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.warning(f"清理临时文件失败: {str(e)}")
|
|
|
|
|
-
|
|
|
|
|
- async def _fallback_pdf_processing(self, file_path: str) -> Dict[str, Any]:
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ logger.info("尝试使用基础处理模式")
|
|
|
|
|
+ return await self._fallback_docx_processing(file_content)
|
|
|
|
|
+ except Exception as fallback_error:
|
|
|
|
|
+ logger.error(f"基础处理模式也失败: {str(fallback_error)}")
|
|
|
|
|
+ raise
|
|
|
|
|
+
|
|
|
|
|
+ async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
"""PDF基础处理模式(当智能处理失败时使用)"""
|
|
"""PDF基础处理模式(当智能处理失败时使用)"""
|
|
|
|
|
+ temp_file_path = None
|
|
|
try:
|
|
try:
|
|
|
from langchain_community.document_loaders import PyPDFLoader
|
|
from langchain_community.document_loaders import PyPDFLoader
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
|
|
|
|
logger.info("使用基础PDF处理模式")
|
|
logger.info("使用基础PDF处理模式")
|
|
|
- loader = PyPDFLoader(file_path)
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # PyPDFLoader需要文件路径,创建临时文件
|
|
|
|
|
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
|
|
|
|
|
+ temp_file.write(file_content)
|
|
|
|
|
+ temp_file_path = temp_file.name
|
|
|
|
|
+
|
|
|
|
|
+ loader = PyPDFLoader(temp_file_path)
|
|
|
documents = loader.load()
|
|
documents = loader.load()
|
|
|
|
|
|
|
|
# 文本分块
|
|
# 文本分块
|
|
@@ -432,14 +411,22 @@ class DocumentProcessor:
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f"基础PDF处理失败: {str(e)}")
|
|
logger.error(f"基础PDF处理失败: {str(e)}")
|
|
|
raise
|
|
raise
|
|
|
|
|
+ finally:
|
|
|
|
|
+ # 清理临时文件
|
|
|
|
|
+ if temp_file_path and os.path.exists(temp_file_path):
|
|
|
|
|
+ try:
|
|
|
|
|
+ os.unlink(temp_file_path)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"清理临时文件失败: {str(e)}")
|
|
|
|
|
|
|
|
- async def _fallback_docx_processing(self, file_path: str) -> Dict[str, Any]:
|
|
|
|
|
|
|
+ async def _fallback_docx_processing(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
"""DOCX基础处理模式(当智能处理失败时使用)"""
|
|
"""DOCX基础处理模式(当智能处理失败时使用)"""
|
|
|
try:
|
|
try:
|
|
|
from docx import Document
|
|
from docx import Document
|
|
|
|
|
+ from io import BytesIO
|
|
|
|
|
|
|
|
- logger.info("使用基础DOCX处理模式")
|
|
|
|
|
- doc = Document(file_path)
|
|
|
|
|
|
|
+ logger.info("使用基础DOCX处理模式(内存模式)")
|
|
|
|
|
+ doc = Document(BytesIO(file_content))
|
|
|
full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
|
|
full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
|
|
|
|
|
|
|
|
# 简单分块,并过滤空内容
|
|
# 简单分块,并过滤空内容
|