4 месяцев назад · 48ca139db6
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -98,18 +98,12 @@ class DocumentProcessor:
 
															     async def parse_pdf_content(self, file_content: bytes) -> Dict[str, Any]:
														
 
															         """解析PDF内容，使用doc_worker的智能处理能力"""
														
 
															-        temp_file_path = None
														
 
															         try:
														
 
															-            # 保存到临时文件
														
 
															-            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
														
 
															-                temp_file.write(file_content)
														
 
															-                temp_file_path = temp_file.name
														
 
															-
														
 
															-            logger.info(f"开始使用doc_worker处理PDF文档: {temp_file_path}")
														
 
															+            logger.info("开始使用doc_worker处理PDF文档（内存模式）")
														
 
															-            # 创建DocumentSource
														
 
															+            # 创建DocumentSource（纯内存模式，不使用临时文件）
														
 
															             source = DocumentSource(
														
 
															-                path=Path(temp_file_path),
														
 
															+                path=None,
														
 
															                 content=file_content,
														
 
															                 file_type='pdf'
														
 
															             )
														
@@ -120,7 +114,7 @@ class DocumentProcessor:
 
															             if toc_info.get('toc_count', 0) == 0:
														
 
															                 logger.warning("未检测到目录，使用基础处理模式")
														
 
															-                return await self._fallback_pdf_processing(temp_file_path)
														
 
															+                return await self._fallback_pdf_processing(file_content)
														
 
															             logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
														
@@ -156,7 +150,7 @@ class DocumentProcessor:
 
															             if not pages_content:
														
 
															                 logger.warning("无法提取文档全文，使用基础处理模式")
														
 
															-                return await self._fallback_pdf_processing(temp_file_path)
														
 
															+                return await self._fallback_pdf_processing(file_content)
														
 
															             total_chars = sum(len(page.get('text', '')) for page in pages_content)
														
 
															             logger.info(f"提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
														
@@ -177,7 +171,7 @@ class DocumentProcessor:
 
															             if not chunks:
														
 
															                 logger.warning("未能生成任何文本块，使用基础处理模式")
														
 
															-                return await self._fallback_pdf_processing(temp_file_path)
														
 
															+                return await self._fallback_pdf_processing(file_content)
														
 
															             logger.info(f"切分完成，共生成 {len(chunks)} 个文本块")
														
@@ -221,35 +215,21 @@ class DocumentProcessor:
 
															         except Exception as e:
														
 
															             logger.error(f"PDF解析失败: {str(e)}")
														
 
															             # 如果智能处理失败，尝试基础处理
														
 
															-            if temp_file_path and os.path.exists(temp_file_path):
														
 
															-                try:
														
 
															-                    logger.info("尝试使用基础处理模式")
														
 
															-                    return await self._fallback_pdf_processing(temp_file_path)
														
 
															-                except Exception as fallback_error:
														
 
															-                    logger.error(f"基础处理模式也失败: {str(fallback_error)}")
														
 
															-            raise
														
 
															-        finally:
														
 
															-            # 清理临时文件
														
 
															-            if temp_file_path and os.path.exists(temp_file_path):
														
 
															-                try:
														
 
															-                    os.unlink(temp_file_path)
														
 
															-                except Exception as e:
														
 
															-                    logger.warning(f"清理临时文件失败: {str(e)}")
														
 
															+            try:
														
 
															+                logger.info("尝试使用基础处理模式")
														
 
															+                return await self._fallback_pdf_processing(file_content)
														
 
															+            except Exception as fallback_error:
														
 
															+                logger.error(f"基础处理模式也失败: {str(fallback_error)}")
														
 
															+                raise
														
 
															     async def parse_docx_content(self, file_content: bytes) -> Dict[str, Any]:
														
 
															         """解析DOCX内容，使用doc_worker的智能处理能力"""
														
 
															-        temp_file_path = None
														
 
															         try:
														
 
															-            # 保存到临时文件
														
 
															-            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
														
 
															-                temp_file.write(file_content)
														
 
															-                temp_file_path = temp_file.name
														
 
															-
														
 
															-            logger.info(f"开始使用doc_worker处理DOCX文档: {temp_file_path}")
														
 
															+            logger.info("开始使用doc_worker处理DOCX文档（内存模式）")
														
 
															-            # 创建DocumentSource
														
 
															+            # 创建DocumentSource（纯内存模式，不使用临时文件）
														
 
															             source = DocumentSource(
														
 
															-                path=Path(temp_file_path),
														
 
															+                path=None,
														
 
															                 content=file_content,
														
 
															                 file_type='docx'
														
 
															             )
														
@@ -260,7 +240,7 @@ class DocumentProcessor:
 
															             if toc_info.get('toc_count', 0) == 0:
														
 
															                 logger.warning("未检测到目录，使用基础处理模式")
														
 
															-                return await self._fallback_docx_processing(temp_file_path)
														
 
															+                return await self._fallback_docx_processing(file_content)
														
 
															             logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
														
@@ -296,7 +276,7 @@ class DocumentProcessor:
 
															             if not pages_content:
														
 
															                 logger.warning("无法提取文档全文，使用基础处理模式")
														
 
															-                return await self._fallback_docx_processing(temp_file_path)
														
 
															+                return await self._fallback_docx_processing(file_content)
														
 
															             total_chars = sum(len(page.get('text', '')) for page in pages_content)
														
 
															             logger.info(f"提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
														
@@ -317,7 +297,7 @@ class DocumentProcessor:
 
															             if not chunks:
														
 
															                 logger.warning("未能生成任何文本块，使用基础处理模式")
														
 
															-                return await self._fallback_docx_processing(temp_file_path)
														
 
															+                return await self._fallback_docx_processing(file_content)
														
 
															             logger.info(f"切分完成，共生成 {len(chunks)} 个文本块")
														
@@ -366,29 +346,28 @@ class DocumentProcessor:
 
															         except Exception as e:
														
 
															             logger.error(f"DOCX解析失败: {str(e)}")
														
 
															             # 如果智能处理失败，尝试基础处理
														
 
															-            if temp_file_path and os.path.exists(temp_file_path):
														
 
															-                try:
														
 
															-                    logger.info("尝试使用基础处理模式")
														
 
															-                    return await self._fallback_docx_processing(temp_file_path)
														
 
															-                except Exception as fallback_error:
														
 
															-                    logger.error(f"基础处理模式也失败: {str(fallback_error)}")
														
 
															-            raise
														
 
															-        finally:
														
 
															-            # 清理临时文件
														
 
															-            if temp_file_path and os.path.exists(temp_file_path):
														
 
															-                try:
														
 
															-                    os.unlink(temp_file_path)
														
 
															-                except Exception as e:
														
 
															-                    logger.warning(f"清理临时文件失败: {str(e)}")
														
 
															-
														
 
															-    async def _fallback_pdf_processing(self, file_path: str) -> Dict[str, Any]:
														
 
															+            try:
														
 
															+                logger.info("尝试使用基础处理模式")
														
 
															+                return await self._fallback_docx_processing(file_content)
														
 
															+            except Exception as fallback_error:
														
 
															+                logger.error(f"基础处理模式也失败: {str(fallback_error)}")
														
 
															+                raise
														
 
															+
														
 
															+    async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
														
 
															         """PDF基础处理模式（当智能处理失败时使用）"""
														
 
															+        temp_file_path = None
														
 
															         try:
														
 
															             from langchain_community.document_loaders import PyPDFLoader
														
 
															             from langchain.text_splitter import RecursiveCharacterTextSplitter
														
 
															             logger.info("使用基础PDF处理模式")
														
 
															-            loader = PyPDFLoader(file_path)
														
 
															+            
														
 
															+            # PyPDFLoader需要文件路径，创建临时文件
														
 
															+            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
														
 
															+                temp_file.write(file_content)
														
 
															+                temp_file_path = temp_file.name
														
 
															+            
														
 
															+            loader = PyPDFLoader(temp_file_path)
														
 
															             documents = loader.load()
														
 
															             # 文本分块
														
@@ -432,14 +411,22 @@ class DocumentProcessor:
 
															         except Exception as e:
														
 
															             logger.error(f"基础PDF处理失败: {str(e)}")
														
 
															             raise
														
 
															+        finally:
														
 
															+            # 清理临时文件
														
 
															+            if temp_file_path and os.path.exists(temp_file_path):
														
 
															+                try:
														
 
															+                    os.unlink(temp_file_path)
														
 
															+                except Exception as e:
														
 
															+                    logger.warning(f"清理临时文件失败: {str(e)}")
														
 
															-    async def _fallback_docx_processing(self, file_path: str) -> Dict[str, Any]:
														
 
															+    async def _fallback_docx_processing(self, file_content: bytes) -> Dict[str, Any]:
														
 
															         """DOCX基础处理模式（当智能处理失败时使用）"""
														
 
															         try:
														
 
															             from docx import Document
														
 
															+            from io import BytesIO
														
 
															-            logger.info("使用基础DOCX处理模式")
														
 
															-            doc = Document(file_path)
														
 
															+            logger.info("使用基础DOCX处理模式（内存模式）")
														
 
															+            doc = Document(BytesIO(file_content))
														
 
															             full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
														
 
															             # 简单分块，并过滤空内容