4 tháng trước cách đây · 48ca139db6
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -98,18 +98,12 @@ class DocumentProcessor:
 
				 
			
 
				     async def parse_pdf_content(self, file_content: bytes) -> Dict[str, Any]:
			
 
				         """解析PDF内容，使用doc_worker的智能处理能力"""
			
 
				-        temp_file_path = None
			
 
				         try:
			
 
				-            # 保存到临时文件
			
 
				-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
			
 
				-                temp_file.write(file_content)
			
 
				-                temp_file_path = temp_file.name
			
 
				-
			
 
				-            logger.info(f"开始使用doc_worker处理PDF文档: {temp_file_path}")
			
 
				+            logger.info("开始使用doc_worker处理PDF文档（内存模式）")
			
 
				 
			
 
				-            # 创建DocumentSource
			
 
				+            # 创建DocumentSource（纯内存模式，不使用临时文件）
			
 
				             source = DocumentSource(
			
 
				-                path=Path(temp_file_path),
			
 
				+                path=None,
			
 
				                 content=file_content,
			
 
				                 file_type='pdf'
			
 
				             )
			
@@ -120,7 +114,7 @@ class DocumentProcessor:
 
				             
			
 
				             if toc_info.get('toc_count', 0) == 0:
			
 
				                 logger.warning("未检测到目录，使用基础处理模式")
			
 
				-                return await self._fallback_pdf_processing(temp_file_path)
			
 
				+                return await self._fallback_pdf_processing(file_content)
			
 
				 
			
 
				             logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
			
 
				 
			
@@ -156,7 +150,7 @@ class DocumentProcessor:
 
				             
			
 
				             if not pages_content:
			
 
				                 logger.warning("无法提取文档全文，使用基础处理模式")
			
 
				-                return await self._fallback_pdf_processing(temp_file_path)
			
 
				+                return await self._fallback_pdf_processing(file_content)
			
 
				 
			
 
				             total_chars = sum(len(page.get('text', '')) for page in pages_content)
			
 
				             logger.info(f"提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
@@ -177,7 +171,7 @@ class DocumentProcessor:
 
				 
			
 
				             if not chunks:
			
 
				                 logger.warning("未能生成任何文本块，使用基础处理模式")
			
 
				-                return await self._fallback_pdf_processing(temp_file_path)
			
 
				+                return await self._fallback_pdf_processing(file_content)
			
 
				 
			
 
				             logger.info(f"切分完成，共生成 {len(chunks)} 个文本块")
			
 
				 
			
@@ -221,35 +215,21 @@ class DocumentProcessor:
 
				         except Exception as e:
			
 
				             logger.error(f"PDF解析失败: {str(e)}")
			
 
				             # 如果智能处理失败，尝试基础处理
			
 
				-            if temp_file_path and os.path.exists(temp_file_path):
			
 
				-                try:
			
 
				-                    logger.info("尝试使用基础处理模式")
			
 
				-                    return await self._fallback_pdf_processing(temp_file_path)
			
 
				-                except Exception as fallback_error:
			
 
				-                    logger.error(f"基础处理模式也失败: {str(fallback_error)}")
			
 
				-            raise
			
 
				-        finally:
			
 
				-            # 清理临时文件
			
 
				-            if temp_file_path and os.path.exists(temp_file_path):
			
 
				-                try:
			
 
				-                    os.unlink(temp_file_path)
			
 
				-                except Exception as e:
			
 
				-                    logger.warning(f"清理临时文件失败: {str(e)}")
			
 
				+            try:
			
 
				+                logger.info("尝试使用基础处理模式")
			
 
				+                return await self._fallback_pdf_processing(file_content)
			
 
				+            except Exception as fallback_error:
			
 
				+                logger.error(f"基础处理模式也失败: {str(fallback_error)}")
			
 
				+                raise
			
 
				 
			
 
				     async def parse_docx_content(self, file_content: bytes) -> Dict[str, Any]:
			
 
				         """解析DOCX内容，使用doc_worker的智能处理能力"""
			
 
				-        temp_file_path = None
			
 
				         try:
			
 
				-            # 保存到临时文件
			
 
				-            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
			
 
				-                temp_file.write(file_content)
			
 
				-                temp_file_path = temp_file.name
			
 
				-
			
 
				-            logger.info(f"开始使用doc_worker处理DOCX文档: {temp_file_path}")
			
 
				+            logger.info("开始使用doc_worker处理DOCX文档（内存模式）")
			
 
				 
			
 
				-            # 创建DocumentSource
			
 
				+            # 创建DocumentSource（纯内存模式，不使用临时文件）
			
 
				             source = DocumentSource(
			
 
				-                path=Path(temp_file_path),
			
 
				+                path=None,
			
 
				                 content=file_content,
			
 
				                 file_type='docx'
			
 
				             )
			
@@ -260,7 +240,7 @@ class DocumentProcessor:
 
				             
			
 
				             if toc_info.get('toc_count', 0) == 0:
			
 
				                 logger.warning("未检测到目录，使用基础处理模式")
			
 
				-                return await self._fallback_docx_processing(temp_file_path)
			
 
				+                return await self._fallback_docx_processing(file_content)
			
 
				 
			
 
				             logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
			
 
				 
			
@@ -296,7 +276,7 @@ class DocumentProcessor:
 
				             
			
 
				             if not pages_content:
			
 
				                 logger.warning("无法提取文档全文，使用基础处理模式")
			
 
				-                return await self._fallback_docx_processing(temp_file_path)
			
 
				+                return await self._fallback_docx_processing(file_content)
			
 
				 
			
 
				             total_chars = sum(len(page.get('text', '')) for page in pages_content)
			
 
				             logger.info(f"提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
@@ -317,7 +297,7 @@ class DocumentProcessor:
 
				 
			
 
				             if not chunks:
			
 
				                 logger.warning("未能生成任何文本块，使用基础处理模式")
			
 
				-                return await self._fallback_docx_processing(temp_file_path)
			
 
				+                return await self._fallback_docx_processing(file_content)
			
 
				 
			
 
				             logger.info(f"切分完成，共生成 {len(chunks)} 个文本块")
			
 
				 
			
@@ -366,29 +346,28 @@ class DocumentProcessor:
 
				         except Exception as e:
			
 
				             logger.error(f"DOCX解析失败: {str(e)}")
			
 
				             # 如果智能处理失败，尝试基础处理
			
 
				-            if temp_file_path and os.path.exists(temp_file_path):
			
 
				-                try:
			
 
				-                    logger.info("尝试使用基础处理模式")
			
 
				-                    return await self._fallback_docx_processing(temp_file_path)
			
 
				-                except Exception as fallback_error:
			
 
				-                    logger.error(f"基础处理模式也失败: {str(fallback_error)}")
			
 
				-            raise
			
 
				-        finally:
			
 
				-            # 清理临时文件
			
 
				-            if temp_file_path and os.path.exists(temp_file_path):
			
 
				-                try:
			
 
				-                    os.unlink(temp_file_path)
			
 
				-                except Exception as e:
			
 
				-                    logger.warning(f"清理临时文件失败: {str(e)}")
			
 
				-
			
 
				-    async def _fallback_pdf_processing(self, file_path: str) -> Dict[str, Any]:
			
 
				+            try:
			
 
				+                logger.info("尝试使用基础处理模式")
			
 
				+                return await self._fallback_docx_processing(file_content)
			
 
				+            except Exception as fallback_error:
			
 
				+                logger.error(f"基础处理模式也失败: {str(fallback_error)}")
			
 
				+                raise
			
 
				+
			
 
				+    async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
			
 
				         """PDF基础处理模式（当智能处理失败时使用）"""
			
 
				+        temp_file_path = None
			
 
				         try:
			
 
				             from langchain_community.document_loaders import PyPDFLoader
			
 
				             from langchain.text_splitter import RecursiveCharacterTextSplitter
			
 
				             
			
 
				             logger.info("使用基础PDF处理模式")
			
 
				-            loader = PyPDFLoader(file_path)
			
 
				+            
			
 
				+            # PyPDFLoader需要文件路径，创建临时文件
			
 
				+            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
			
 
				+                temp_file.write(file_content)
			
 
				+                temp_file_path = temp_file.name
			
 
				+            
			
 
				+            loader = PyPDFLoader(temp_file_path)
			
 
				             documents = loader.load()
			
 
				 
			
 
				             # 文本分块
			
@@ -432,14 +411,22 @@ class DocumentProcessor:
 
				         except Exception as e:
			
 
				             logger.error(f"基础PDF处理失败: {str(e)}")
			
 
				             raise
			
 
				+        finally:
			
 
				+            # 清理临时文件
			
 
				+            if temp_file_path and os.path.exists(temp_file_path):
			
 
				+                try:
			
 
				+                    os.unlink(temp_file_path)
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"清理临时文件失败: {str(e)}")
			
 
				 
			
 
				-    async def _fallback_docx_processing(self, file_path: str) -> Dict[str, Any]:
			
 
				+    async def _fallback_docx_processing(self, file_content: bytes) -> Dict[str, Any]:
			
 
				         """DOCX基础处理模式（当智能处理失败时使用）"""
			
 
				         try:
			
 
				             from docx import Document
			
 
				+            from io import BytesIO
			
 
				             
			
 
				-            logger.info("使用基础DOCX处理模式")
			
 
				-            doc = Document(file_path)
			
 
				+            logger.info("使用基础DOCX处理模式（内存模式）")
			
 
				+            doc = Document(BytesIO(file_content))
			
 
				             full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
			
 
				 
			
 
				             # 简单分块，并过滤空内容