5 달 전 · 1e9570efec
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -1,23 +1,43 @@
 
				 """
			
 
				 文档处理器
			
 
				 负责文档解析、内容提取和结构化处理
			
 
				+集成doc_worker模块的智能处理能力
			
 
				 """
			
 
				 
			
 
				-import io   
			
 
				-from docx import Document
			
 
				+import io
			
 
				+import os
			
 
				+import tempfile
			
 
				+from pathlib import Path
			
 
				 from typing import Dict, Any, Optional, Callable
			
 
				 from datetime import datetime
			
 
				 
			
 
				 from foundation.logger.loggering import server_logger as logger
			
 
				 
			
 
				-from langchain_community.document_loaders import PyPDFLoader
			
 
				-from langchain.text_splitter import RecursiveCharacterTextSplitter
			
 
				+# 引入doc_worker核心组件
			
 
				+try:
			
 
				+    from ..doc_worker import TOCExtractor, TextSplitter, LLMClassifier
			
 
				+    from ..doc_worker.config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from core.construction_review.doc_worker import TOCExtractor, TextSplitter, LLMClassifier
			
 
				+    from core.construction_review.doc_worker.config_loader import get_config
			
 
				 
			
 
				 class DocumentProcessor:
			
 
				     """文档处理器"""
			
 
				 
			
 
				     def __init__(self):
			
 
				         self.supported_types = ['pdf', 'docx']
			
 
				+        # 初始化doc_worker组件
			
 
				+        self.toc_extractor = TOCExtractor()
			
 
				+        self.text_splitter = TextSplitter()
			
 
				+        self.config = get_config()
			
 
				+        # LLM分类器可选，如果配置了模型URL则初始化
			
 
				+        self.llm_classifier = None
			
 
				+        try:
			
 
				+            model_url = self.config.llm_model_url
			
 
				+            if model_url:
			
 
				+                self.llm_classifier = LLMClassifier(model_url)
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"LLM分类器初始化失败，将使用基础处理模式: {str(e)}")
			
 
				 
			
 
				     async def process_document(self, file_content: bytes, file_type: str,
			
 
				                              progress_callback: Optional[Callable[[int, str], None]] = None) -> Dict[str, Any]:
			
@@ -56,16 +76,286 @@ class DocumentProcessor:
 
				             raise
			
 
				 
			
 
				     async def parse_pdf_content(self, file_content: bytes) -> Dict[str, Any]:
			
 
				-        """解析PDF内容"""
			
 
				+        """解析PDF内容，使用doc_worker的智能处理能力"""
			
 
				+        temp_file_path = None
			
 
				         try:
			
 
				             # 保存到临时文件
			
 
				-            import tempfile
			
 
				             with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
			
 
				                 temp_file.write(file_content)
			
 
				                 temp_file_path = temp_file.name
			
 
				 
			
 
				-            # 使用PyPDFLoader解析
			
 
				-            loader = PyPDFLoader(temp_file_path)
			
 
				+            logger.info(f"开始使用doc_worker处理PDF文档: {temp_file_path}")
			
 
				+
			
 
				+            # 步骤1: 提取目录
			
 
				+            logger.info("步骤1: 提取文档目录")
			
 
				+            toc_info = self.toc_extractor.extract_toc(temp_file_path)
			
 
				+            
			
 
				+            if toc_info['toc_count'] == 0:
			
 
				+                logger.warning("未检测到目录，使用基础处理模式")
			
 
				+                return await self._fallback_pdf_processing(temp_file_path)
			
 
				+
			
 
				+            logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")
			
 
				+
			
 
				+            # 步骤2: 使用LLM进行分类（如果可用）
			
 
				+            classified_items = None
			
 
				+            target_level = self.config.target_level
			
 
				+            
			
 
				+            if self.llm_classifier:
			
 
				+                try:
			
 
				+                    logger.info(f"步骤2: 使用LLM对{target_level}级目录进行分类")
			
 
				+                    classification_result = self.llm_classifier.classify(
			
 
				+                        toc_info['toc_items'],
			
 
				+                        target_level=target_level
			
 
				+                    )
			
 
				+                    if classification_result:
			
 
				+                        classified_items = classification_result['items']
			
 
				+                        logger.info(f"分类完成，共分类 {len(classified_items)} 个目录项")
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"LLM分类失败，使用目录项直接处理: {str(e)}")
			
 
				+            
			
 
				+            # 如果没有分类结果，使用原始目录项（筛选目标层级）
			
 
				+            if not classified_items:
			
 
				+                classified_items = [
			
 
				+                    item for item in toc_info['toc_items'] 
			
 
				+                    if item['level'] == target_level
			
 
				+                ]
			
 
				+                # 为每个目录项添加默认分类信息
			
 
				+                for item in classified_items:
			
 
				+                    item['category'] = '未分类'
			
 
				+                    item['category_code'] = 'other'
			
 
				+
			
 
				+            # 步骤3: 提取文档全文
			
 
				+            logger.info("步骤3: 提取文档全文")
			
 
				+            pages_content = self.text_splitter.extract_full_text(temp_file_path)
			
 
				+            
			
 
				+            if not pages_content:
			
 
				+                logger.warning("无法提取文档全文，使用基础处理模式")
			
 
				+                return await self._fallback_pdf_processing(temp_file_path)
			
 
				+
			
 
				+            total_chars = sum(len(page['text']) for page in pages_content)
			
 
				+            logger.info(f"提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
 
				+
			
 
				+            # 步骤4: 按分类标题智能切分文本
			
 
				+            logger.info("步骤4: 按分类标题智能切分文本")
			
 
				+            max_chunk_size = self.config.max_chunk_size
			
 
				+            min_chunk_size = self.config.min_chunk_size
			
 
				+            
			
 
				+            chunks = self.text_splitter.split_by_hierarchy(
			
 
				+                classified_items,
			
 
				+                pages_content,
			
 
				+                toc_info,
			
 
				+                target_level=target_level,
			
 
				+                max_chunk_size=max_chunk_size,
			
 
				+                min_chunk_size=min_chunk_size
			
 
				+            )
			
 
				+
			
 
				+            if not chunks:
			
 
				+                logger.warning("未能生成任何文本块，使用基础处理模式")
			
 
				+                return await self._fallback_pdf_processing(temp_file_path)
			
 
				+
			
 
				+            logger.info(f"切分完成，共生成 {len(chunks)} 个文本块")
			
 
				+
			
 
				+            # 适配返回格式
			
 
				+            return {
			
 
				+                'document_type': 'pdf',
			
 
				+                'total_pages': len(pages_content),
			
 
				+                'total_chunks': len(chunks),
			
 
				+                'chunks': [
			
 
				+                    {
			
 
				+                        'page': chunk.get('element_tag', {}).get('page', 0),
			
 
				+                        'content': chunk.get('review_chunk_content', ''),
			
 
				+                        'metadata': {
			
 
				+                            'chunk_id': chunk.get('chunk_id', ''),
			
 
				+                            'section_label': chunk.get('section_label', ''),
			
 
				+                            'project_plan_type': chunk.get('project_plan_type', ''),
			
 
				+                            'element_tag': chunk.get('element_tag', {})
			
 
				+                        }
			
 
				+                    }
			
 
				+                    for chunk in chunks
			
 
				+                ],
			
 
				+                'splits': [
			
 
				+                    {
			
 
				+                        'content': chunk.get('review_chunk_content', ''),
			
 
				+                        'metadata': {
			
 
				+                            'chunk_id': chunk.get('chunk_id', ''),
			
 
				+                            'section_label': chunk.get('section_label', ''),
			
 
				+                            'page': chunk.get('element_tag', {}).get('page', 0)
			
 
				+                        }
			
 
				+                    }
			
 
				+                    for chunk in chunks
			
 
				+                ],
			
 
				+                'toc_info': toc_info,
			
 
				+                'classification': {
			
 
				+                    'items': classified_items,
			
 
				+                    'target_level': target_level
			
 
				+                } if classified_items else None
			
 
				+            }
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            logger.error(f"PDF解析失败: {str(e)}")
			
 
				+            # 如果智能处理失败，尝试基础处理
			
 
				+            if temp_file_path and os.path.exists(temp_file_path):
			
 
				+                try:
			
 
				+                    logger.info("尝试使用基础处理模式")
			
 
				+                    return await self._fallback_pdf_processing(temp_file_path)
			
 
				+                except Exception as fallback_error:
			
 
				+                    logger.error(f"基础处理模式也失败: {str(fallback_error)}")
			
 
				+            raise
			
 
				+        finally:
			
 
				+            # 清理临时文件
			
 
				+            if temp_file_path and os.path.exists(temp_file_path):
			
 
				+                try:
			
 
				+                    os.unlink(temp_file_path)
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"清理临时文件失败: {str(e)}")
			
 
				+
			
 
    async def parse_docx_content(self, file_content: bytes) -> Dict[str, Any]:
        """Parse DOCX bytes with the doc_worker pipeline, falling back to basic parsing.

        The bytes are written to a temporary ``.docx`` file and processed in
        four steps: TOC extraction, optional LLM classification of the TOC
        items, full-text extraction, and hierarchy-based splitting.  When no
        TOC is detected, no text can be extracted, or no chunk is produced,
        ``_fallback_docx_processing`` is used instead.  The same fallback is
        attempted once if the pipeline raises.

        Args:
            file_content: Raw bytes of the DOCX document.

        Returns:
            On the doc_worker path, a dict with ``document_type``,
            ``total_pages``, ``total_chunks``, ``chunks``, ``splits``,
            ``full_text``, ``toc_info``, ``classification`` and ``metadata``.
            Otherwise, whatever ``_fallback_docx_processing`` returns.

        Raises:
            Exception: The pipeline error is re-raised when the fallback also
                fails, was already attempted, or the temporary file is gone.
        """
        temp_file_path = None
        # Set as soon as the basic fallback has been invoked, so that a
        # failure inside the fallback is not answered by running the very
        # same fallback a second time from the ``except`` handler.
        fallback_attempted = False
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
                # Record the path before writing: the file is created with
                # delete=False, so ``finally`` must be able to remove it even
                # when the write itself fails.
                temp_file_path = temp_file.name
                temp_file.write(file_content)

            logger.info(f"开始使用doc_worker处理DOCX文档: {temp_file_path}")

            # Step 1: extract the table of contents.
            logger.info("步骤1: 提取文档目录")
            toc_info = self.toc_extractor.extract_toc(temp_file_path)

            if toc_info['toc_count'] == 0:
                logger.warning("未检测到目录，使用基础处理模式")
                fallback_attempted = True
                return await self._fallback_docx_processing(temp_file_path)

            logger.info(f"成功提取 {toc_info['toc_count']} 个目录项")

            # Step 2: classify TOC items with the LLM, when one is configured.
            classified_items = None
            target_level = self.config.target_level

            if self.llm_classifier:
                try:
                    logger.info(f"步骤2: 使用LLM对{target_level}级目录进行分类")
                    classification_result = self.llm_classifier.classify(
                        toc_info['toc_items'],
                        target_level=target_level
                    )
                    if classification_result:
                        classified_items = classification_result['items']
                        logger.info(f"分类完成，共分类 {len(classified_items)} 个目录项")
                except Exception as e:
                    # Classification is best-effort; continue with raw TOC items.
                    logger.warning(f"LLM分类失败，使用目录项直接处理: {str(e)}")

            # Without a classification result, use the raw TOC items of the
            # target level and tag them with a default category.
            if not classified_items:
                classified_items = [
                    item for item in toc_info['toc_items']
                    if item['level'] == target_level
                ]
                # NOTE(review): these are the same dict objects held by
                # toc_info['toc_items'], so the default category also shows up
                # in the returned ``toc_info`` — confirm this is intended.
                for item in classified_items:
                    item['category'] = '未分类'
                    item['category_code'] = 'other'

            # Step 3: extract the full text, page by page.
            logger.info("步骤3: 提取文档全文")
            pages_content = self.text_splitter.extract_full_text(temp_file_path)

            if not pages_content:
                logger.warning("无法提取文档全文，使用基础处理模式")
                fallback_attempted = True
                return await self._fallback_docx_processing(temp_file_path)

            total_chars = sum(len(page['text']) for page in pages_content)
            logger.info(f"提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")

            # Step 4: split the text along the classified headings.
            logger.info("步骤4: 按分类标题智能切分文本")
            max_chunk_size = self.config.max_chunk_size
            min_chunk_size = self.config.min_chunk_size

            chunks = self.text_splitter.split_by_hierarchy(
                classified_items,
                pages_content,
                toc_info,
                target_level=target_level,
                max_chunk_size=max_chunk_size,
                min_chunk_size=min_chunk_size
            )

            if not chunks:
                logger.warning("未能生成任何文本块，使用基础处理模式")
                fallback_attempted = True
                return await self._fallback_docx_processing(temp_file_path)

            logger.info(f"切分完成，共生成 {len(chunks)} 个文本块")

            # Adapt the doc_worker chunks to the processor's return format.
            chunk_records = []
            split_records = []
            for chunk in chunks:
                element_tag = chunk.get('element_tag', {})
                page = element_tag.get('page', 0)
                content = chunk.get('review_chunk_content', '')
                chunk_id = chunk.get('chunk_id', '')
                section_label = chunk.get('section_label', '')

                chunk_records.append({
                    'page': page,
                    'content': content,
                    'metadata': {
                        'chunk_id': chunk_id,
                        'section_label': section_label,
                        'project_plan_type': chunk.get('project_plan_type', ''),
                        'element_tag': element_tag
                    }
                })
                split_records.append({
                    'content': content,
                    'metadata': {
                        'chunk_id': chunk_id,
                        'section_label': section_label,
                        'page': page
                    }
                })

            return {
                'document_type': 'docx',
                'total_pages': len(pages_content),
                'total_chunks': len(chunks),
                'chunks': chunk_records,
                'splits': split_records,
                'full_text': ''.join(page['text'] for page in pages_content),
                'toc_info': toc_info,
                'classification': {
                    'items': classified_items,
                    'target_level': target_level
                } if classified_items else None,
                'metadata': {
                    'total_pages': len(pages_content),
                    'total_chars': total_chars
                }
            }

        except Exception as e:
            logger.error(f"DOCX解析失败: {str(e)}")
            # Try the basic mode once; skip it when it is the step that failed.
            if (not fallback_attempted
                    and temp_file_path and os.path.exists(temp_file_path)):
                try:
                    logger.info("尝试使用基础处理模式")
                    return await self._fallback_docx_processing(temp_file_path)
                except Exception as fallback_error:
                    logger.error(f"基础处理模式也失败: {str(fallback_error)}")
            raise
        finally:
            # Always remove the temporary file; a cleanup failure is only logged.
            if temp_file_path and os.path.exists(temp_file_path):
                try:
                    os.unlink(temp_file_path)
                except Exception as e:
                    logger.warning(f"清理临时文件失败: {str(e)}")
			
 
				+
			
 
				+    async def _fallback_pdf_processing(self, file_path: str) -> Dict[str, Any]:
			
 
				+        """PDF基础处理模式（当智能处理失败时使用）"""
			
 
				+        try:
			
 
				+            from langchain_community.document_loaders import PyPDFLoader
			
 
				+            from langchain.text_splitter import RecursiveCharacterTextSplitter
			
 
				+            
			
 
				+            logger.info("使用基础PDF处理模式")
			
 
				+            loader = PyPDFLoader(file_path)
			
 
				             documents = loader.load()
			
 
				 
			
 
				             # 文本分块
			
@@ -75,23 +365,21 @@ class DocumentProcessor:
 
				                 separators=["\n\n", "\n", " ", ""]
			
 
				             )
			
 
				             splits = text_splitter.split_documents(documents)
			
 
				-            original_count = len(splits)  # 记录原始分块数量
			
 
				 
			
 
				-            # 过滤空内容切块，确保每个切块内容不为空
			
 
				+            # 过滤空内容切块
			
 
				             valid_splits = []
			
 
				             for split in splits:
			
 
				                 content = split.page_content.strip()
			
 
				-                if content:  # 确保内容不为空
			
 
				-                    split.page_content = content  # 更新清理后的内容
			
 
				+                if content:
			
 
				+                    split.page_content = content
			
 
				                     valid_splits.append(split)
			
 
				 
			
 
				-            splits = valid_splits  # 使用过滤后的切块
			
 
				-            logger.info(f"PDF解析完成，过滤前分块数量: {original_count}，过滤后有效分块数量: {len(splits)}")
			
 
				+            logger.info(f"基础处理完成，有效分块数量: {len(valid_splits)}")
			
 
				 
			
 
				             return {
			
 
				                 'document_type': 'pdf',
			
 
				                 'total_pages': len(documents),
			
 
				-                'total_chunks': len(splits),
			
 
				+                'total_chunks': len(valid_splits),
			
 
				                 'chunks': [
			
 
				                     {
			
 
				                         'page': doc.metadata.get('page', 0),
			
@@ -105,22 +393,20 @@ class DocumentProcessor:
 
				                         'content': split.page_content,
			
 
				                         'metadata': split.metadata
			
 
				                     }
			
 
				-                    for split in splits
			
 
				+                    for split in valid_splits
			
 
				                 ]
			
 
				             }
			
 
				-
			
 
				         except Exception as e:
			
 
				-            logger.error(f"PDF解析失败: {str(e)}")
			
 
				+            logger.error(f"基础PDF处理失败: {str(e)}")
			
 
				             raise
			
 
				 
			
 
				-    async def parse_docx_content(self, file_content: bytes) -> Dict[str, Any]:
			
 
				-        """解析DOCX内容"""
			
 
				+    async def _fallback_docx_processing(self, file_path: str) -> Dict[str, Any]:
			
 
				+        """DOCX基础处理模式（当智能处理失败时使用）"""
			
 
				         try:
			
 
				-            # 简化实现：直接返回文本内容
			
 
				-            # 实际实现中可以使用python-docx库
			
 
				-
			
 
				-
			
 
				-            doc = Document(io.BytesIO(file_content))
			
 
				+            from docx import Document
			
 
				+            
			
 
				+            logger.info("使用基础DOCX处理模式")
			
 
				+            doc = Document(file_path)
			
 
				             full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
			
 
				 
			
 
				             # 简单分块，并过滤空内容
			
@@ -129,7 +415,7 @@ class DocumentProcessor:
 
				             chunk_index = 1
			
 
				             for i in range(0, len(full_text), chunk_size):
			
 
				                 chunk_text = full_text[i:i+chunk_size].strip()
			
 
				-                if chunk_text:  # 确保切块内容不为空
			
 
				+                if chunk_text:
			
 
				                     chunks.append({
			
 
				                         'chunk_id': f'chunk_{chunk_index}',
			
 
				                         'content': chunk_text,
			
@@ -137,7 +423,7 @@ class DocumentProcessor:
 
				                     })
			
 
				                     chunk_index += 1
			
 
				 
			
 
				-            logger.info(f"DOCX解析完成，有效分块数量: {len(chunks)}")
			
 
				+            logger.info(f"基础处理完成，有效分块数量: {len(chunks)}")
			
 
				 
			
 
				             return {
			
 
				                 'document_type': 'docx',
			
@@ -149,48 +435,84 @@ class DocumentProcessor:
 
				                     'word_count': len(full_text.split())
			
 
				                 }
			
 
				             }
			
 
				-
			
 
				         except Exception as e:
			
 
				-            logger.error(f"DOCX解析失败: {str(e)}")
			
 
				+            logger.error(f"基础DOCX处理失败: {str(e)}")
			
 
				             raise
			
 
				 
			
 
				     def structure_content(self, raw_content: Dict[str, Any]) -> Dict[str, Any]:
			
 
				-        """结构化处理"""
			
 
				+        """结构化处理，适配doc_worker返回的格式"""
			
 
				         try:
			
 
				-            if raw_content['document_type'] == 'pdf':
			
 
				-                # PDF结构化
			
 
				+            document_type = raw_content.get('document_type', 'unknown')
			
 
				+            
			
 
				+            # 检查是否使用了doc_worker的智能处理（有toc_info或classification字段）
			
 
				+            is_smart_processing = 'toc_info' in raw_content or 'classification' in raw_content
			
 
				+            
			
 
				+            if is_smart_processing:
			
 
				+                # 使用doc_worker智能处理的结果
			
 
				                 chunks = []
			
 
				-                for i, chunk in enumerate(raw_content['chunks']):
			
 
				-                    content = chunk['content'].strip()
			
 
				-                    if content:  # 确保内容不为空
			
 
				+                for chunk in raw_content.get('chunks', []):
			
 
				+                    content = chunk.get('content', '').strip()
			
 
				+                    if content:
			
 
				+                        metadata = chunk.get('metadata', {})
			
 
				+                        element_tag = metadata.get('element_tag', {})
			
 
				+                        
			
 
				                         chunks.append({
			
 
				-                            'chunk_id': f'chunk_{i+1}',
			
 
				-                            'page': chunk['page'],
			
 
				+                            'chunk_id': metadata.get('chunk_id', ''),
			
 
				+                            'page': chunk.get('page', 0),
			
 
				                             'content': content,
			
 
				-                            'chapter': f'第{chunk["page"]}页',
			
 
				-                            'title': f'内容块{i+1}',
			
 
				+                            'section_label': metadata.get('section_label', ''),
			
 
				+                            'project_plan_type': metadata.get('project_plan_type', ''),
			
 
				+                            'element_tag': element_tag,
			
 
				+                            'chapter': metadata.get('section_label', f'第{chunk.get("page", 0)}页'),
			
 
				+                            'title': metadata.get('section_label', ''),
			
 
				                             'original_content': content[:100] + '...' if len(content) > 100 else content
			
 
				                         })
			
 
				             else:
			
 
				-                # DOCX结构化 - 也进行空内容检查
			
 
				-                all_chunks = raw_content.get('chunks', [])
			
 
				-                chunks = []
			
 
				-                for chunk in all_chunks:
			
 
				-                    content = chunk.get('content', '').strip()
			
 
				-                    if content:  # 确保内容不为空
			
 
				-                        chunks.append({
			
 
				-                            'chunk_id': chunk.get('chunk_id', f'chunk_{len(chunks)+1}'),
			
 
				-                            'content': content,
			
 
				-                            'metadata': chunk.get('metadata', {})
			
 
				-                        })
			
 
				+                # 使用基础处理的结果
			
 
				+                if document_type == 'pdf':
			
 
				+                    chunks = []
			
 
				+                    for i, chunk in enumerate(raw_content.get('chunks', [])):
			
 
				+                        content = chunk.get('content', '').strip() if isinstance(chunk, dict) else str(chunk).strip()
			
 
				+                        if content:
			
 
				+                            page = chunk.get('page', 0) if isinstance(chunk, dict) else 0
			
 
				+                            chunks.append({
			
 
				+                                'chunk_id': f'chunk_{i+1}',
			
 
				+                                'page': page,
			
 
				+                                'content': content,
			
 
				+                                'chapter': f'第{page}页',
			
 
				+                                'title': f'内容块{i+1}',
			
 
				+                                'original_content': content[:100] + '...' if len(content) > 100 else content
			
 
				+                            })
			
 
				+                else:
			
 
				+                    # DOCX基础处理
			
 
				+                    all_chunks = raw_content.get('chunks', [])
			
 
				+                    chunks = []
			
 
				+                    for chunk in all_chunks:
			
 
				+                        content = chunk.get('content', '').strip()
			
 
				+                        if content:
			
 
				+                            chunks.append({
			
 
				+                                'chunk_id': chunk.get('chunk_id', f'chunk_{len(chunks)+1}'),
			
 
				+                                'content': content,
			
 
				+                                'metadata': chunk.get('metadata', {})
			
 
				+                            })
			
 
				 
			
 
				-            return {
			
 
				-                'document_name': f"施工方案文档_{raw_content.get('document_type', 'unknown')}",
			
 
				-                'document_type': raw_content['document_type'],
			
 
				-                'total_chunks': len(chunks),  # 使用实际的切块数量
			
 
				+            # 构建返回结果
			
 
				+            result = {
			
 
				+                'document_name': f"施工方案文档_{document_type}",
			
 
				+                'document_type': document_type,
			
 
				+                'total_chunks': len(chunks),
			
 
				                 'chunks': chunks,
			
 
				                 'metadata': raw_content.get('metadata', {})
			
 
				             }
			
 
				+            
			
 
				+            # 如果使用了智能处理，保留额外信息
			
 
				+            if is_smart_processing:
			
 
				+                if 'toc_info' in raw_content:
			
 
				+                    result['toc_info'] = raw_content['toc_info']
			
 
				+                if 'classification' in raw_content:
			
 
				+                    result['classification'] = raw_content['classification']
			
 
				+
			
 
				+            return result
			
 
				 
			
 
				         except Exception as e:
			
 
				             logger.error(f"内容结构化失败: {str(e)}")
			
--- a/core/construction_review/doc_worker/__init__.py
+++ b/core/construction_review/doc_worker/__init__.py
@@ -0,0 +1,50 @@
 
				+"""
			
 
				+文档分类切分库
			
 
				+支持PDF和Word文档的目录提取、智能分类和文本切分
			
 
				+
			
 
				+主要功能：
			
 
				+1. 提取PDF/Word文档的目录结构
			
 
				+2. 使用大语言模型对目录进行智能分类
			
 
				+3. 按目录层级和字符数智能切分文本
			
 
				+4. 保存分类结果到多种格式
			
 
				+
			
 
				+使用示例：
			
 
				+    from core.construction_review.doc_worker import DocumentClassifier
			
 
				+    
			
 
				+    # 创建分类器实例
			
 
				+    classifier = DocumentClassifier(
			
 
				+        model_url="http://172.16.35.50:8000/v1/chat/completions"
			
 
				+    )
			
 
				+    
			
 
				+    # 处理文档
			
 
				+    result = classifier.process_document(
			
 
				+        file_path="document.pdf",
			
 
				+        target_level=2,
			
 
				+        output_dir="./output"
			
 
				+    )
			
 
				+"""
			
 
				+
			
 
				+__version__ = "2.0.0"
			
 
				+__author__ = "Your Name"
			
 
				+
			
 
				+try:
			
 
				+    from .core import DocumentClassifier
			
 
				+    from .toc_extractor import TOCExtractor
			
 
				+    from .text_splitter import TextSplitter
			
 
				+    from .llm_classifier import LLMClassifier
			
 
				+    from .result_saver import ResultSaver
			
 
				+except ImportError:
			
 
				+    from core import DocumentClassifier
			
 
				+    from toc_extractor import TOCExtractor
			
 
				+    from text_splitter import TextSplitter
			
 
				+    from llm_classifier import LLMClassifier
			
 
				+    from result_saver import ResultSaver
			
 
				+
			
 
				+__all__ = [
			
 
				+    'DocumentClassifier',
			
 
				+    'TOCExtractor',
			
 
				+    'TextSplitter',
			
 
				+    'LLMClassifier',
			
 
				+    'ResultSaver'
			
 
				+]
			
 
				+
			
--- a/core/construction_review/doc_worker/config.yaml
+++ b/core/construction_review/doc_worker/config.yaml
@@ -0,0 +1,173 @@
 
				+# 文档分类切分库配置文件
			
 
				+
			
 
				+# 大语言模型配置
			
 
				+llm:
			
 
				+  # 模型API地址
			
 
				+  model_url: "http://172.16.35.50:8000/v1/chat/completions"
			
 
				+  # 模型名称
			
 
				+  model_name: "Qwen2.5-7B-Instruct"
			
 
				+  # 温度参数（越低越确定）
			
 
				+  temperature: 0.1
			
 
				+  # 请求超时时间（秒）
			
 
				+  timeout: 60
			
 
				+
			
 
				+# 文本切分配置
			
 
				+text_splitting:
			
 
				+  # 目标层级（默认按几级目录分类）
			
 
				+  target_level: 1
			
 
				+  # 最大分块字符数
			
 
				+  max_chunk_size: 1000
			
 
				+  # 最小分块字符数
			
 
				+  min_chunk_size: 500
			
 
				+  # 模糊匹配阈值（0-1）
			
 
				+  fuzzy_threshold: 0.80
			
 
				+
			
 
				+# 目录提取配置
			
 
				+toc_extraction:
			
 
				+  # 最多读取的页数（目录通常在前几页）
			
 
				+  max_pages: 15
			
 
				+  # Word文档每页段落数（模拟分页）
			
 
				+  paragraphs_per_page: 30
			
 
				+
			
 
				+# 分类类别配置
			
 
				+categories:
			
 
				+  # 中文名称到英文代码的映射
			
 
				+  mapping:
			
 
				+    编制依据: basis
			
 
				+    工程概况: overview
			
 
				+    施工计划: plan
			
 
				+    施工工艺计算: technology
			
 
				+    安全保证措施: safety
			
 
				+    质量保证措施: quality
			
 
				+    环境保证措施: environment
			
 
				+    施工管理及作业人员配备与分工: management
			
 
				+    验收要求: acceptance
			
 
				+    其它资料: other
			
 
				+  
			
 
				+  # 类别描述（用于LLM分类提示词）
			
 
				+  descriptions:
			
 
				+    编制依据: "包括编制依据、编制说明、规范标准、设计文件、相关法律法规等内容"
			
 
				+    工程概况: "包括项目概况、工程概况、项目背景、建设概况、工程特点等内容"
			
 
				+    施工计划: "包括施工计划、施工进度计划、施工部署、施工准备、总体安排等内容"
			
 
				+    施工工艺计算: "包括施工工艺、施工方法、工艺流程、技术方案、施工计算等内容"
			
 
				+    安全保证措施: "包括安全保证措施、安全管理、安全施工、安全防护、安全生产等内容"
			
 
				+    质量保证措施: "包括质量保证措施、质量管理、质量控制、质量检验、质量标准等内容"
			
 
				+    环境保证措施: "包括环境保护措施、环保施工、水土保持、文明施工、环境管理等内容"
			
 
				+    施工管理及作业人员配备与分工: "包括人员配置、组织机构、人员分工、劳动力安排、管理体系等内容"
			
 
				+    验收要求: "包括验收标准、验收程序、验收要求、交工验收、竣工验收等内容"
			
 
				+    其它资料: "其他说明等不属于以上任何类别的内容"
			
 
				+
			
 
				+# LLM分类提示词模板
			
 
				+prompts:
			
 
				+  classification: |
			
 
				+    你是一个专业的工程文档分析助手。现在需要你对以下目录项进行分类。
			
 
				+
			
 
				+    【分类类别说明】
			
 
				+    {category_descriptions}
			
 
				+
			
 
				+    【待分类的目录项】
			
 
				+    {toc_items}
			
 
				+
			
 
				+    【任务要求】
			
 
				+    1. 请仔细阅读每个目录项的标题
			
 
				+    2. 根据标题的语义，将每个目录项分配到最合适的类别中
			
 
				+    3. 每个目录项只能属于一个类别
			
 
				+    4. 如果某个目录项不确定或不属于任何明确类别，请归类到"其它资料"
			
 
				+
			
 
				+    【输出格式】
			
 
				+    请严格按照以下JSON格式输出，不要包含任何其他文字说明：
			
 
				+    {{
			
 
				+      "分类结果": [
			
 
				+        {{
			
 
				+          "序号": 1,
			
 
				+          "标题": "目录项标题",
			
 
				+          "类别": "所属类别名称"
			
 
				+        }}
			
 
				+      ]
			
 
				+    }}
			
 
				+
			
 
				+    请开始分类：
			
 
				+
			
 
				+# 输出配置
			
 
				+output:
			
 
				+  # 默认输出目录名称
			
 
				+  default_dir_name: "分类切分结果"
			
 
				+  # 是否默认保存结果
			
 
				+  save_results: true
			
 
				+  # 文件名最大长度
			
 
				+  max_filename_length: 200
			
 
				+
			
 
				+# 标题层级识别配置
			
 
				+title_patterns:
			
 
				+  # 一级标题模式
			
 
				+  level1:
			
 
				+    - '^【\d+】'
			
 
				+    - '^第[一二三四五六七八九十\d]+章'
			
 
				+    - '^第[一二三四五六七八九十\d]+部分'
			
 
				+    - '^[一二三四五六七八九十]、'
			
 
				+    - '^\d+、'
			
 
				+    - '^第\d+条'
			
 
				+  
			
 
				+  # 二级标题模式
			
 
				+  level2:
			
 
				+    - '^第[一二三四五六七八九十\d]+节'
			
 
				+    - '^[一二三四五六七八九十]+、'
			
 
				+    - '^\(\d+\)'
			
 
				+    - '^（[一二三四五六七八九十\d]+）'
			
 
				+    - '^〖\d+(?:\.\d+)*〗'
			
 
				+  
			
 
				+  # 三级标题模式
			
 
				+  level3:
			
 
				+    - '^\([一二三四五六七八九十]+\)'
			
 
				+    - '^[①②③④⑤⑥⑦⑧⑨⑩]'
			
 
				+
			
 
				+# 编号格式配置
			
 
				+numbering:
			
 
				+  # 支持的编号格式
			
 
				+  formats:
			
 
				+    - '^【\d+】'
			
 
				+    - '^第[一二三四五六七八九十\d]+[章节条款]'
			
 
				+    - '^\d+[、．.]'
			
 
				+    - '^[一二三四五六七八九十]+[、．.]'
			
 
				+    - '^\d+\.\d+'
			
 
				+    - '^\(\d+\)'
			
 
				+    - '^（[一二三四五六七八九十\d]+）'
			
 
				+    - '^\([一二三四五六七八九十]+\)'
			
 
				+    - '^[①②③④⑤⑥⑦⑧⑨⑩]'
			
 
				+    - '^〖\d+(?:\.\d+)*〗'
			
 
				+
			
 
				+# 噪音过滤配置
			
 
				+noise_filters:
			
 
				+  # 噪音模式（用于过滤非目录内容）
			
 
				+  patterns:
			
 
				+    - '^\d{4}[-年]\d{1,2}[-月]\d{1,2}'
			
 
				+    - '^[A-Za-z0-9\-]{20,}$'
			
 
				+    - '^http[s]?://'
			
 
				+    - '^第\s*\d+\s*页'
			
 
				+    - '^共\s*\d+\s*页'
			
 
				+    - '^[\d\s\-_.]+$'
			
 
				+
			
 
				+# 目录识别配置
			
 
				+toc_detection:
			
 
				+  # 目录行的正则模式
			
 
				+  patterns:
			
 
				+    - '^(第[一二三四五六七八九十\d]+[章节条款].+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(〖\d+(?:\.\d+)*〗.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(\d+[、．.]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^([一二三四五六七八九十]+[、．.]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(\d+(?:\.\d+)+\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+  
			
 
				+  # 标题长度限制
			
 
				+  min_length: 3
			
 
				+  max_length: 200
			
 
				+
			
 
				+# 日志配置
			
 
				+logging:
			
 
				+  # 日志级别（DEBUG, INFO, WARNING, ERROR）
			
 
				+  level: INFO
			
 
				+  # 日志格式
			
 
				+  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
			
 
				+  # 日志文件名
			
 
				+  filename: 'doc_classifier.log'
			
 
				+
			
--- a/core/construction_review/doc_worker/config_loader.py
+++ b/core/construction_review/doc_worker/config_loader.py
@@ -0,0 +1,194 @@
 
				+"""
			
 
				+配置加载模块
			
 
				+从config.yaml文件加载配置参数
			
 
				+"""
			
 
				+
			
 
				+import yaml
			
 
				+from pathlib import Path
			
 
				+
			
 
				+
			
 
				+class Config:
			
 
				+    """配置类，用于加载和访问配置参数"""
			
 
				+    
			
 
				+    _instance = None
			
 
				+    _config = None
			
 
				+    
			
 
				+    def __new__(cls):
			
 
				+        """单例模式"""
			
 
				+        if cls._instance is None:
			
 
				+            cls._instance = super().__new__(cls)
			
 
				+        return cls._instance
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        """初始化配置"""
			
 
				+        if self._config is None:
			
 
				+            self.load_config()
			
 
				+    
			
 
				+    def load_config(self, config_path=None):
			
 
				+        """
			
 
				+        加载配置文件
			
 
				+        
			
 
				+        参数:
			
 
				+            config_path: 配置文件路径，默认为当前目录下的config.yaml
			
 
				+        """
			
 
				+        if config_path is None:
			
 
				+            config_path = Path(__file__).parent / 'config.yaml'
			
 
				+        else:
			
 
				+            config_path = Path(config_path)
			
 
				+        
			
 
				+        if not config_path.exists():
			
 
				+            raise FileNotFoundError(f"配置文件不存在: {config_path}")
			
 
				+        
			
 
				+        with open(config_path, 'r', encoding='utf-8') as f:
			
 
				+            self._config = yaml.safe_load(f)
			
 
				+    
			
 
				+    def get(self, key_path, default=None):
			
 
				+        """
			
 
				+        获取配置值
			
 
				+        
			
 
				+        参数:
			
 
				+            key_path: 配置键路径，用点号分隔，如 'llm.model_url'
			
 
				+            default: 默认值
			
 
				+            
			
 
				+        返回:
			
 
				+            配置值
			
 
				+        """
			
 
				+        keys = key_path.split('.')
			
 
				+        value = self._config
			
 
				+        
			
 
				+        for key in keys:
			
 
				+            if isinstance(value, dict) and key in value:
			
 
				+                value = value[key]
			
 
				+            else:
			
 
				+                return default
			
 
				+        
			
 
				+        return value
			
 
				+    
			
 
				+    # LLM配置
			
 
				+    @property
			
 
				+    def llm_model_url(self):
			
 
				+        return self.get('llm.model_url', 'http://172.16.35.50:8000/v1/chat/completions')
			
 
				+    
			
 
				+    @property
			
 
				+    def llm_model_name(self):
			
 
				+        return self.get('llm.model_name', 'Qwen2.5-7B-Instruct')
			
 
				+    
			
 
				+    @property
			
 
				+    def llm_temperature(self):
			
 
				+        return self.get('llm.temperature', 0.1)
			
 
				+    
			
 
				+    @property
			
 
				+    def llm_timeout(self):
			
 
				+        return self.get('llm.timeout', 60)
			
 
				+    
			
 
				+    # 文本切分配置
			
 
				+    @property
			
 
				+    def target_level(self):
			
 
				+        return self.get('text_splitting.target_level', 2)
			
 
				+    
			
 
				+    @property
			
 
				+    def max_chunk_size(self):
			
 
				+        return self.get('text_splitting.max_chunk_size', 1000)
			
 
				+    
			
 
				+    @property
			
 
				+    def min_chunk_size(self):
			
 
				+        return self.get('text_splitting.min_chunk_size', 500)
			
 
				+    
			
 
				+    @property
			
 
				+    def fuzzy_threshold(self):
			
 
				+        return self.get('text_splitting.fuzzy_threshold', 0.80)
			
 
				+    
			
 
				+    # 目录提取配置
			
 
				+    @property
			
 
				+    def toc_max_pages(self):
			
 
				+        return self.get('toc_extraction.max_pages', 15)
			
 
				+    
			
 
				+    @property
			
 
				+    def paragraphs_per_page(self):
			
 
				+        return self.get('toc_extraction.paragraphs_per_page', 30)
			
 
				+    
			
 
				+    # 分类配置
			
 
				+    @property
			
 
				+    def category_mapping(self):
			
 
				+        return self.get('categories.mapping', {})
			
 
				+    
			
 
				+    @property
			
 
				+    def category_descriptions(self):
			
 
				+        return self.get('categories.descriptions', {})
			
 
				+    
			
 
				+    # 提示词配置
			
 
				+    @property
			
 
				+    def classification_prompt_template(self):
			
 
				+        return self.get('prompts.classification', '')
			
 
				+    
			
 
				+    # 输出配置
			
 
				+    @property
			
 
				+    def default_output_dir(self):
			
 
				+        return self.get('output.default_dir_name', '分类切分结果')
			
 
				+    
			
 
				+    @property
			
 
				+    def save_results_default(self):
			
 
				+        return self.get('output.save_results', True)
			
 
				+    
			
 
				+    @property
			
 
				+    def max_filename_length(self):
			
 
				+        return self.get('output.max_filename_length', 200)
			
 
				+    
			
 
				+    # 标题模式配置
			
 
				+    @property
			
 
				+    def level1_patterns(self):
			
 
				+        return self.get('title_patterns.level1', [])
			
 
				+    
			
 
				+    @property
			
 
				+    def level2_patterns(self):
			
 
				+        return self.get('title_patterns.level2', [])
			
 
				+    
			
 
				+    @property
			
 
				+    def level3_patterns(self):
			
 
				+        return self.get('title_patterns.level3', [])
			
 
				+    
			
 
				+    # 编号格式配置
			
 
				+    @property
			
 
				+    def numbering_formats(self):
			
 
				+        return self.get('numbering.formats', [])
			
 
				+    
			
 
				+    # 噪音过滤配置
			
 
				+    @property
			
 
				+    def noise_patterns(self):
			
 
				+        return self.get('noise_filters.patterns', [])
			
 
				+    
			
 
				+    # 目录检测配置
			
 
				+    @property
			
 
				+    def toc_patterns(self):
			
 
				+        return self.get('toc_detection.patterns', [])
			
 
				+    
			
 
				+    @property
			
 
				+    def toc_min_length(self):
			
 
				+        return self.get('toc_detection.min_length', 3)
			
 
				+    
			
 
				+    @property
			
 
				+    def toc_max_length(self):
			
 
				+        return self.get('toc_detection.max_length', 200)
			
 
				+    
			
 
				+    # 日志配置
			
 
				+    @property
			
 
				+    def log_level(self):
			
 
				+        return self.get('logging.level', 'INFO')
			
 
				+    
			
 
				+    @property
			
 
				+    def log_format(self):
			
 
				+        return self.get('logging.format', '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+    
			
 
				+    @property
			
 
				+    def log_filename(self):
			
 
				+        return self.get('logging.filename', 'doc_classifier.log')
			
 
				+
			
 
				+
			
 
				+# 全局配置实例
			
 
				+config = Config()
			
 
				+
			
 
				+
			
 
				+def get_config():
			
 
				+    """获取全局配置实例"""
			
 
				+    return config
			
 
				+
			
--- a/core/construction_review/doc_worker/core.py
+++ b/core/construction_review/doc_worker/core.py
@@ -0,0 +1,205 @@
 
				+"""
			
 
				+核心处理模块
			
 
				+提供统一的文档处理接口
			
 
				+"""
			
 
				+
			
 
				+from pathlib import Path
			
 
				+from collections import Counter
			
 
				+
			
 
				+try:
			
 
				+    from .toc_extractor import TOCExtractor
			
 
				+    from .llm_classifier import LLMClassifier
			
 
				+    from .text_splitter import TextSplitter
			
 
				+    from .result_saver import ResultSaver
			
 
				+    from .config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from toc_extractor import TOCExtractor
			
 
				+    from llm_classifier import LLMClassifier
			
 
				+    from text_splitter import TextSplitter
			
 
				+    from result_saver import ResultSaver
			
 
				+    from config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class DocumentClassifier:
			
 
				+    """
			
 
				+    文档分类切分器
			
 
				+    
			
 
				+    支持PDF和Word文档的目录提取、分类和文本切分
			
 
				+    """
			
 
				+    
			
 
				+    def __init__(self, model_url=None):
			
 
				+        """
			
 
				+        初始化文档分类器
			
 
				+        
			
 
				+        参数:
			
 
				+            model_url: 大语言模型API地址（可选，默认从配置文件读取）
			
 
				+        """
			
 
				+        self.config = get_config()
			
 
				+        self.model_url = model_url or self.config.llm_model_url
			
 
				+        self.toc_extractor = TOCExtractor()
			
 
				+        self.llm_classifier = LLMClassifier(model_url)
			
 
				+        self.text_splitter = TextSplitter()
			
 
				+        self.result_saver = ResultSaver()
			
 
				+    
			
 
				+    def process_document(self, file_path, target_level=None, output_dir=None, 
			
 
				+                        max_chunk_size=None, min_chunk_size=None, save_results=None):
			
 
				+        """
			
 
				+        处理文档：提取目录、分类、切分文本块
			
 
				+        
			
 
				+        参数:
			
 
				+            file_path: 文档文件路径(PDF或Word)
			
 
				+            target_level: 要分类的目标层级(可选，默认从配置文件读取)
			
 
				+            output_dir: 输出目录(可选，仅在save_results=True时使用)
			
 
				+            max_chunk_size: 最大分块字符数(可选，默认从配置文件读取)
			
 
				+            min_chunk_size: 最小分块字符数(可选，默认从配置文件读取)
			
 
				+            save_results: 是否保存结果到文件(可选，默认从配置文件读取)
			
 
				+            
			
 
				+        返回:
			
 
				+            dict: 处理结果，包含目录、分类和文本块信息
			
 
				+        """
			
 
				+        # 从配置文件读取默认值
			
 
				+        if target_level is None:
			
 
				+            target_level = self.config.target_level
			
 
				+        if max_chunk_size is None:
			
 
				+            max_chunk_size = self.config.max_chunk_size
			
 
				+        if min_chunk_size is None:
			
 
				+            min_chunk_size = self.config.min_chunk_size
			
 
				+        if save_results is None:
			
 
				+            save_results = self.config.save_results_default
			
 
				+        file_path = Path(file_path)
			
 
				+        
			
 
				+        # 检查文件是否存在
			
 
				+        if not file_path.exists():
			
 
				+            raise FileNotFoundError(f"文件不存在: {file_path}")
			
 
				+        
			
 
				+        # 检查文件格式
			
 
				+        file_ext = file_path.suffix.lower()
			
 
				+        if file_ext not in ['.pdf', '.docx', '.doc']:
			
 
				+            raise ValueError(f"不支持的文件格式: {file_ext}")
			
 
				+        
			
 
				+        print("=" * 100)
			
 
				+        print("文档分类切分工具 v2.0")
			
 
				+        print("=" * 100)
			
 
				+        print(f"\n文件: {file_path}")
			
 
				+        print(f"格式: {file_ext.upper()}")
			
 
				+        print(f"目标层级: {target_level}级")
			
 
				+        print(f"分块大小: {min_chunk_size}-{max_chunk_size}字符")
			
 
				+        print(f"模型地址: {self.model_url}")
			
 
				+        
			
 
				+        # 设置输出目录
			
 
				+        if output_dir is None:
			
 
				+            output_dir = file_path.parent / self.config.default_output_dir
			
 
				+        else:
			
 
				+            output_dir = Path(output_dir)
			
 
				+        
			
 
				+        # ========== 步骤1: 提取目录 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("步骤1: 提取文档目录")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        toc_info = self.toc_extractor.extract_toc(file_path)
			
 
				+        
			
 
				+        if toc_info['toc_count'] == 0:
			
 
				+            raise ValueError("未在文档中检测到目录，无法继续处理")
			
 
				+        
			
 
				+        print(f"\n成功提取 {toc_info['toc_count']} 个目录项")
			
 
				+        print(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}")
			
 
				+        
			
 
				+        # 显示目录层级统计
			
 
				+        level_counts = Counter([item['level'] for item in toc_info['toc_items']])
			
 
				+        print("\n目录层级分布:")
			
 
				+        for level in sorted(level_counts.keys()):
			
 
				+            print(f"  {level}级: {level_counts[level]} 项")
			
 
				+        
			
 
				+        # ========== 步骤2: 调用模型进行分类 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("步骤2: 调用模型进行智能分类")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        classification_result = self.llm_classifier.classify(
			
 
				+            toc_info['toc_items'],
			
 
				+            target_level=target_level
			
 
				+        )
			
 
				+        
			
 
				+        if classification_result is None:
			
 
				+            raise ValueError("分类失败，无法继续处理")
			
 
				+        
			
 
				+        # 显示分类统计
			
 
				+        category_counts = Counter([item['category'] for item in classification_result['items']])
			
 
				+        print(f"\n分类统计:")
			
 
				+        for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
			
 
				+            print(f"  {category}: {count} 项")
			
 
				+        
			
 
				+        # ========== 步骤3: 提取文档全文 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("步骤3: 提取文档全文")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        pages_content = self.text_splitter.extract_full_text(file_path)
			
 
				+        
			
 
				+        if not pages_content:
			
 
				+            raise ValueError("无法提取文档全文")
			
 
				+        
			
 
				+        total_chars = sum(len(page['text']) for page in pages_content)
			
 
				+        print(f"\n提取完成，共 {len(pages_content)} 页，{total_chars} 个字符")
			
 
				+        
			
 
				+        # ========== 步骤4: 按分类标题切分文本 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("步骤4: 按分类标题智能切分文本")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        chunks = self.text_splitter.split_by_hierarchy(
			
 
				+            classification_result['items'],
			
 
				+            pages_content,
			
 
				+            toc_info,
			
 
				+            target_level=target_level,
			
 
				+            max_chunk_size=max_chunk_size,
			
 
				+            min_chunk_size=min_chunk_size
			
 
				+        )
			
 
				+        
			
 
				+        if not chunks:
			
 
				+            raise ValueError("未能生成任何文本块")
			
 
				+        
			
 
				+        print(f"\n切分完成，共生成 {len(chunks)} 个文本块")
			
 
				+        
			
 
				+        # 显示前5个文本块的信息
			
 
				+        print("\n文本块预览:")
			
 
				+        for i, chunk in enumerate(chunks[:5], 1):
			
 
				+            print(f"  [{i}] {chunk['section_label']} ({len(chunk['review_chunk_content'])} 字符)")
			
 
				+        if len(chunks) > 5:
			
 
				+            print(f"  ... 还有 {len(chunks) - 5} 个文本块")
			
 
				+        
			
 
				+        # ========== 步骤5: 保存结果（可选） ==========
			
 
				+        saved_files = None
			
 
				+        if save_results:
			
 
				+            print("\n" + "=" * 100)
			
 
				+            print("步骤5: 保存结果")
			
 
				+            print("=" * 100)
			
 
				+            
			
 
				+            # 保存结果
			
 
				+            saved_files = self.result_saver.save_all(
			
 
				+                file_path, 
			
 
				+                toc_info, 
			
 
				+                classification_result, 
			
 
				+                chunks, 
			
 
				+                output_dir
			
 
				+            )
			
 
				+        
			
 
				+        # ========== 完成 ==========
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("处理完成！")
			
 
				+        print("=" * 100)
			
 
				+        
			
 
				+        if save_results:
			
 
				+            print(f"\n结果已保存到: {output_dir}")
			
 
				+        print(f"文本块总数: {len(chunks)}")
			
 
				+        print(f"类别数量: {len(category_counts)}")
			
 
				+        
			
 
				+        return {
			
 
				+            'toc_info': toc_info,
			
 
				+            'classification': classification_result,
			
 
				+            'chunks': chunks,
			
 
				+            'saved_files': saved_files,
			
 
				+            'output_dir': str(output_dir) if output_dir else None
			
 
				+        }
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/README.md
+++ b/core/construction_review/doc_worker/extra/README.md
@@ -0,0 +1,251 @@
 
				+# 文档分类切分库 (doc_classifier)
			
 
				+
			
 
				+一个统一的Python库，用于处理PDF和Word文档的目录提取、智能分类和文本切分。
			
 
				+
			
 
				+## 主要特性
			
 
				+
			
 
				+- 支持PDF和Word文档格式
			
 
				+- 自动提取文档目录结构
			
 
				+- 使用大语言模型进行智能分类
			
 
				+- 按目录层级和字符数智能切分文本
			
 
				+- 支持自定义分块大小
			
 
				+- 输出多种格式（JSON、Markdown、统计报告）
			
 
				+
			
 
				+## 安装
			
 
				+
			
 
				+```bash
			
 
				+pip install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+## 快速开始
			
 
				+
			
 
				+### 基础使用
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+# 创建分类器实例
			
 
				+classifier = DocumentClassifier(
			
 
				+    model_url="http://172.16.35.50:8000/v1/chat/completions"
			
 
				+)
			
 
				+
			
 
				+# 处理文档
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.pdf",
			
 
				+    target_level=2,
			
 
				+    output_dir="./output"
			
 
				+)
			
 
				+
			
 
				+print(f"生成了 {len(result['chunks'])} 个文本块")
			
 
				+```
			
 
				+
			
 
				+### 处理Word文档
			
 
				+
			
 
				+```python
			
 
				+# 支持.docx和.doc格式
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.docx",
			
 
				+    target_level=2
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 自定义分块大小
			
 
				+
			
 
				+```python
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.pdf",
			
 
				+    target_level=2,
			
 
				+    max_chunk_size=1500,  # 最大1500字符
			
 
				+    min_chunk_size=800    # 最小800字符
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+## 分块逻辑
			
 
				+
			
 
				+新的智能分块逻辑：
			
 
				+
			
 
				+1. **按目录层级定位**：定位到指定层级的正文标题
			
 
				+2. **子标题切分**：在正文块中按最低层级子标题进行初步分块
			
 
				+3. **大块分割**：超过`max_chunk_size`的块按句子级分割（保持语义完整）
			
 
				+4. **小块合并**：不足`min_chunk_size`的块尝试合并（合并后不超过`max_chunk_size`）
			
 
				+
			
 
				+注意：分割产生的块不参与合并，确保语义完整性。
			
 
				+
			
 
				+## 输出数据格式
			
 
				+
			
 
				+每个文本块包含以下字段：
			
 
				+
			
 
				+```python
			
 
				+{
			
 
				+    "file_name": "文档名称.pdf",
			
 
				+    "chunk_id": "doc_chunk_1.2.1",
			
 
				+    "section_label": "第一章.工程概况->【1】 工程简介->〖1.2〗 自然及环境条件 ->1.2.1 位置及交通",
			
 
				+    "context_summary": "自然及环境条件",
			
 
				+    "project_plan_type": "overview",  # 分类代码
			
 
				+    "element_tag": {
			
 
				+        "chunk_id": "doc_chunk_1.2.1",
			
 
				+        "page": 5,
			
 
				+        "serial_number": "1.2.1"
			
 
				+    },
			
 
				+    "review_chunk_content": "正文内容..."
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+## 分类类别
			
 
				+
			
 
				+支持以下预定义分类：
			
 
				+
			
 
				+| 中文名称 | 英文代码 | 说明 |
			
 
				+|---------|---------|------|
			
 
				+| 编制依据 | basis | 编制依据、规范标准等 |
			
 
				+| 工程概况 | overview | 项目概况、工程特点等 |
			
 
				+| 施工计划 | plan | 施工计划、进度安排等 |
			
 
				+| 施工工艺计算 | technology | 施工工艺、技术方案等 |
			
 
				+| 安全保证措施 | safety | 安全管理、安全防护等 |
			
 
				+| 质量保证措施 | quality | 质量管理、质量控制等 |
			
 
				+| 环境保证措施 | environment | 环境保护、文明施工等 |
			
 
				+| 施工管理及作业人员配备与分工 | management | 人员配置、组织机构等 |
			
 
				+| 验收要求 | acceptance | 验收标准、验收程序等 |
			
 
				+| 其它资料 | other | 其他内容 |
			
 
				+
			
 
				+## 输出文件
			
 
				+
			
 
				+处理完成后会生成以下文件：
			
 
				+
			
 
				+1. **完整结果JSON**：包含所有数据的JSON文件
			
 
				+2. **分类文件夹**：按类别分组的Markdown文件
			
 
				+3. **索引文件**：README.md，包含所有文本块的索引
			
 
				+4. **统计报告**：详细的统计信息
			
 
				+
			
 
				+## API参考
			
 
				+
			
 
				+### DocumentClassifier
			
 
				+
			
 
				+主要的文档处理类。
			
 
				+
			
 
				+#### 初始化
			
 
				+
			
 
				+```python
			
 
				+classifier = DocumentClassifier(model_url="http://...")
			
 
				+```
			
 
				+
			
 
				+参数：
			
 
				+- `model_url` (str，可选): 大语言模型API地址；省略时使用配置文件中的 `llm.model_url`
			
 
				+
			
 
				+#### process_document
			
 
				+
			
 
				+处理文档的主要方法。
			
 
				+
			
 
				+```python
			
 
				+result = classifier.process_document(
			
 
				+    file_path,
			
 
				+    target_level=2,
			
 
				+    output_dir=None,
			
 
				+    max_chunk_size=1000,
			
 
				+    min_chunk_size=500
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+参数：
			
 
				+- `file_path` (str): 文档路径（PDF或Word）
			
 
				+- `target_level` (int): 要分类的目标层级，默认2
			
 
				+- `output_dir` (str): 输出目录，默认为源文件同目录下的"分类切分结果"
			
 
				+- `max_chunk_size` (int): 最大分块字符数，默认1000
			
 
				+- `min_chunk_size` (int): 最小分块字符数，默认500
			
 
				+- `save_results` (bool): 是否将结果保存到文件，省略时使用配置文件中的 `output.save_results`
			
 
				+
			
 
				+返回：
			
 
				+- `dict`: 包含处理结果的字典
			
 
				+
			
 
				+## 高级用法
			
 
				+
			
 
				+### 单独使用各模块
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import TOCExtractor, LLMClassifier, TextSplitter
			
 
				+
			
 
				+# 只提取目录
			
 
				+toc_extractor = TOCExtractor()
			
 
				+toc_info = toc_extractor.extract_toc("document.pdf")
			
 
				+
			
 
				+# 只进行分类
			
 
				+llm_classifier = LLMClassifier(model_url="http://...")
			
 
				+classification = llm_classifier.classify(toc_info['toc_items'], target_level=2)
			
 
				+
			
 
				+# 只切分文本
			
 
				+text_splitter = TextSplitter()
			
 
				+pages_content = text_splitter.extract_full_text("document.pdf")
			
 
				+chunks = text_splitter.split_by_hierarchy(
			
 
				+    classification['items'],
			
 
				+    pages_content,
			
 
				+    toc_info,
			
 
				+    target_level=2
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 批量处理
			
 
				+
			
 
				+```python
			
 
				+from pathlib import Path
			
 
				+
			
 
				+classifier = DocumentClassifier()
			
 
				+
			
 
				+for file_path in Path("./documents").glob("*.pdf"):
			
 
				+    try:
			
 
				+        result = classifier.process_document(file_path)
			
 
				+        print(f"完成: {file_path}")
			
 
				+    except Exception as e:
			
 
				+        print(f"错误: {file_path} - {str(e)}")
			
 
				+```
			
 
				+
			
 
				+## 注意事项
			
 
				+
			
 
				+1. 文档必须包含目录结构，否则无法处理
			
 
				+2. 目录项需要有明确的编号格式（如1.1、第一章等）
			
 
				+3. 需要确保大语言模型API可访问
			
 
				+4. 处理大文档时可能需要较长时间
			
 
				+
			
 
				+## 与旧版本对比
			
 
				+
			
 
				+### 主要改进
			
 
				+
			
 
				+1. **统一接口**：整合PDF和Word处理为单一接口
			
 
				+2. **智能分块**：新增按字符数的智能分割和合并
			
 
				+3. **新数据格式**：更规范的输出数据结构
			
 
				+4. **库模块化**：可作为Python库被其他项目调用
			
 
				+5. **更好的文档**：完整的API文档和使用示例
			
 
				+
			
 
				+### 迁移指南
			
 
				+
			
 
				+旧版本：
			
 
				+```python
			
 
				+from pdf_classifier.main import process_pdf_with_classification_and_split
			
 
				+
			
 
				+process_pdf_with_classification_and_split(
			
 
				+    pdf_path="doc.pdf",
			
 
				+    target_level=2
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+新版本：
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+classifier = DocumentClassifier()
			
 
				+result = classifier.process_document(
			
 
				+    file_path="doc.pdf",
			
 
				+    target_level=2
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+## 示例
			
 
				+
			
 
				+更多示例请参考 `example.py` 文件。
			
 
				+
			
 
				+## 许可证
			
 
				+
			
 
				+MIT License
			
 
				+
			
 
				+## 贡献
			
 
				+
			
 
				+欢迎提交问题和改进建议。
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/config.yaml
+++ b/core/construction_review/doc_worker/extra/config.yaml
@@ -0,0 +1,173 @@
 
				+# 文档分类切分库配置文件
			
 
				+
			
 
				+# 大语言模型配置
			
 
				+llm:
			
 
				+  # 模型API地址
			
 
				+  model_url: "http://172.16.35.50:8000/v1/chat/completions"
			
 
				+  # 模型名称
			
 
				+  model_name: "Qwen2.5-7B-Instruct"
			
 
				+  # 温度参数（越低越确定）
			
 
				+  temperature: 0.1
			
 
				+  # 请求超时时间（秒）
			
 
				+  timeout: 60
			
 
				+
			
 
				+# 文本切分配置
			
 
				+text_splitting:
			
 
				+  # 目标层级（默认按几级目录分类）
			
 
				+  target_level: 2
			
 
				+  # 最大分块字符数
			
 
				+  max_chunk_size: 1000
			
 
				+  # 最小分块字符数
			
 
				+  min_chunk_size: 500
			
 
				+  # 模糊匹配阈值（0-1）
			
 
				+  fuzzy_threshold: 0.80
			
 
				+
			
 
				+# 目录提取配置
			
 
				+toc_extraction:
			
 
				+  # 最多读取的页数（目录通常在前几页）
			
 
				+  max_pages: 15
			
 
				+  # Word文档每页段落数（模拟分页）
			
 
				+  paragraphs_per_page: 30
			
 
				+
			
 
				+# 分类类别配置
			
 
				+categories:
			
 
				+  # 中文名称到英文代码的映射
			
 
				+  mapping:
			
 
				+    编制依据: basis
			
 
				+    工程概况: overview
			
 
				+    施工计划: plan
			
 
				+    施工工艺计算: technology
			
 
				+    安全保证措施: safety
			
 
				+    质量保证措施: quality
			
 
				+    环境保证措施: environment
			
 
				+    施工管理及作业人员配备与分工: management
			
 
				+    验收要求: acceptance
			
 
				+    其它资料: other
			
 
				+  
			
 
				+  # 类别描述（用于LLM分类提示词）
			
 
				+  descriptions:
			
 
				+    编制依据: "包括编制依据、编制说明、规范标准、设计文件、相关法律法规等内容"
			
 
				+    工程概况: "包括项目概况、工程概况、项目背景、建设概况、工程特点等内容"
			
 
				+    施工计划: "包括施工计划、施工进度计划、施工部署、施工准备、总体安排等内容"
			
 
				+    施工工艺计算: "包括施工工艺、施工方法、工艺流程、技术方案、施工计算等内容"
			
 
				+    安全保证措施: "包括安全保证措施、安全管理、安全施工、安全防护、安全生产等内容"
			
 
				+    质量保证措施: "包括质量保证措施、质量管理、质量控制、质量检验、质量标准等内容"
			
 
				+    环境保证措施: "包括环境保护措施、环保施工、水土保持、文明施工、环境管理等内容"
			
 
				+    施工管理及作业人员配备与分工: "包括人员配置、组织机构、人员分工、劳动力安排、管理体系等内容"
			
 
				+    验收要求: "包括验收标准、验收程序、验收要求、交工验收、竣工验收等内容"
			
 
				+    其它资料: "其他说明等不属于以上任何类别的内容"
			
 
				+
			
 
				+# LLM分类提示词模板
			
 
				+prompts:
			
 
				+  classification: |
			
 
				+    你是一个专业的工程文档分析助手。现在需要你对以下目录项进行分类。
			
 
				+
			
 
				+    【分类类别说明】
			
 
				+    {category_descriptions}
			
 
				+
			
 
				+    【待分类的目录项】
			
 
				+    {toc_items}
			
 
				+
			
 
				+    【任务要求】
			
 
				+    1. 请仔细阅读每个目录项的标题
			
 
				+    2. 根据标题的语义，将每个目录项分配到最合适的类别中
			
 
				+    3. 每个目录项只能属于一个类别
			
 
				+    4. 如果某个目录项不确定或不属于任何明确类别，请归类到"其它资料"
			
 
				+
			
 
				+    【输出格式】
			
 
				+    请严格按照以下JSON格式输出，不要包含任何其他文字说明：
			
 
				+    {{
			
 
				+      "分类结果": [
			
 
				+        {{
			
 
				+          "序号": 1,
			
 
				+          "标题": "目录项标题",
			
 
				+          "类别": "所属类别名称"
			
 
				+        }}
			
 
				+      ]
			
 
				+    }}
			
 
				+
			
 
				+    请开始分类：
			
 
				+
			
 
				+# 输出配置
			
 
				+output:
			
 
				+  # 默认输出目录名称
			
 
				+  default_dir_name: "分类切分结果"
			
 
				+  # 是否默认保存结果
			
 
				+  save_results: true
			
 
				+  # 文件名最大长度
			
 
				+  max_filename_length: 200
			
 
				+
			
 
				+# 标题层级识别配置
			
 
				+title_patterns:
			
 
				+  # 一级标题模式
			
 
				+  level1:
			
 
				+    - '^【\d+】'
			
 
				+    - '^第[一二三四五六七八九十\d]+章'
			
 
				+    - '^第[一二三四五六七八九十\d]+部分'
			
 
				+    - '^[一二三四五六七八九十]、'
			
 
				+    - '^\d+、'
			
 
				+    - '^第\d+条'
			
 
				+  
			
 
				+  # 二级标题模式
			
 
				+  level2:
			
 
				+    - '^第[一二三四五六七八九十\d]+节'
			
 
				+    - '^[一二三四五六七八九十]+、'
			
 
				+    - '^\(\d+\)'
			
 
				+    - '^（[一二三四五六七八九十\d]+）'
			
 
				+    - '^〖\d+(?:\.\d+)*〗'
			
 
				+  
			
 
				+  # 三级标题模式
			
 
				+  level3:
			
 
				+    - '^\([一二三四五六七八九十]+\)'
			
 
				+    - '^[①②③④⑤⑥⑦⑧⑨⑩]'
			
 
				+
			
 
				+# 编号格式配置
			
 
				+numbering:
			
 
				+  # 支持的编号格式
			
 
				+  formats:
			
 
				+    - '^【\d+】'
			
 
				+    - '^第[一二三四五六七八九十\d]+[章节条款]'
			
 
				+    - '^\d+[、．.]'
			
 
				+    - '^[一二三四五六七八九十]+[、．.]'
			
 
				+    - '^\d+\.\d+'
			
 
				+    - '^\(\d+\)'
			
 
				+    - '^（[一二三四五六七八九十\d]+）'
			
 
				+    - '^\([一二三四五六七八九十]+\)'
			
 
				+    - '^[①②③④⑤⑥⑦⑧⑨⑩]'
			
 
				+    - '^〖\d+(?:\.\d+)*〗'
			
 
				+
			
 
				+# 噪音过滤配置
			
 
				+noise_filters:
			
 
				+  # 噪音模式（用于过滤非目录内容）
			
 
				+  patterns:
			
 
				+    - '^\d{4}[-年]\d{1,2}[-月]\d{1,2}'
			
 
				+    - '^[A-Za-z0-9\-]{20,}$'
			
 
				+    - '^http[s]?://'
			
 
				+    - '^第\s*\d+\s*页'
			
 
				+    - '^共\s*\d+\s*页'
			
 
				+    - '^[\d\s\-_.]+$'
			
 
				+
			
 
				+# 目录识别配置
			
 
				+toc_detection:
			
 
				+  # 目录行的正则模式
			
 
				+  patterns:
			
 
				+    - '^(第[一二三四五六七八九十\d]+[章节条款].+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(〖\d+(?:\.\d+)*〗.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(\d+[、．.]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^([一二三四五六七八九十]+[、．.]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(\d+(?:\.\d+)+\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+  
			
 
				+  # 标题长度限制
			
 
				+  min_length: 3
			
 
				+  max_length: 200
			
 
				+
			
 
				+# 日志配置
			
 
				+logging:
			
 
				+  # 日志级别（DEBUG, INFO, WARNING, ERROR）
			
 
				+  level: INFO
			
 
				+  # 日志格式
			
 
				+  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
			
 
				+  # 日志文件名
			
 
				+  filename: 'doc_classifier.log'
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/example.py
+++ b/core/construction_review/doc_worker/extra/example.py
@@ -0,0 +1,175 @@
 
				+"""
			
 
				+使用示例
			
 
				+演示如何使用doc_classifier库
			
 
				+"""
			
 
				+
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+
			
 
				+def example_basic():
			
 
				+    """基础使用示例"""
			
 
				+    print("=" * 100)
			
 
				+    print("示例1: 基础使用")
			
 
				+    print("=" * 100)
			
 
				+    
			
 
				+    # 创建分类器实例
			
 
				+    classifier = DocumentClassifier(
			
 
				+        model_url="http://172.16.35.50:8000/v1/chat/completions"
			
 
				+    )
			
 
				+    
			
 
				+    # 处理PDF文档
			
 
				+    result = classifier.process_document(
			
 
				+        file_path="example.pdf",
			
 
				+        target_level=2,
			
 
				+        output_dir="./output"
			
 
				+    )
			
 
				+    
			
 
				+    print(f"\n处理完成！")
			
 
				+    print(f"生成了 {len(result['chunks'])} 个文本块")
			
 
				+    print(f"结果保存在: {result['output_dir']}")
			
 
				+
			
 
				+
			
 
				+def example_word():
			
 
				+    """处理Word文档示例"""
			
 
				+    print("\n" + "=" * 100)
			
 
				+    print("示例2: 处理Word文档")
			
 
				+    print("=" * 100)
			
 
				+    
			
 
				+    classifier = DocumentClassifier()
			
 
				+    
			
 
				+    # 处理Word文档
			
 
				+    result = classifier.process_document(
			
 
				+        file_path="example.docx",
			
 
				+        target_level=2,
			
 
				+        output_dir="./output_word"
			
 
				+    )
			
 
				+    
			
 
				+    print(f"\n处理完成！")
			
 
				+    print(f"生成了 {len(result['chunks'])} 个文本块")
			
 
				+
			
 
				+
			
 
				+def example_custom_chunk_size():
			
 
				+    """自定义分块大小示例"""
			
 
				+    print("\n" + "=" * 100)
			
 
				+    print("示例3: 自定义分块大小")
			
 
				+    print("=" * 100)
			
 
				+    
			
 
				+    classifier = DocumentClassifier()
			
 
				+    
			
 
				+    # 自定义分块大小
			
 
				+    result = classifier.process_document(
			
 
				+        file_path="example.pdf",
			
 
				+        target_level=2,
			
 
				+        max_chunk_size=1500,  # 最大1500字符
			
 
				+        min_chunk_size=800,   # 最小800字符
			
 
				+        output_dir="./output_custom"
			
 
				+    )
			
 
				+    
			
 
				+    print(f"\n处理完成！")
			
 
				+    print(f"生成了 {len(result['chunks'])} 个文本块")
			
 
				+
			
 
				+
			
 
				+def example_no_save():
			
 
				+    """不保存文件，只获取数据示例"""
			
 
				+    print("\n" + "=" * 100)
			
 
				+    print("示例4: 不保存文件，只获取数据")
			
 
				+    print("=" * 100)
			
 
				+    
			
 
				+    classifier = DocumentClassifier()
			
 
				+    
			
 
				+    # 不保存文件，只处理数据
			
 
				+    result = classifier.process_document(
			
 
				+        file_path="example.pdf",
			
 
				+        target_level=2,
			
 
				+        save_results=False  # 不保存文件
			
 
				+    )
			
 
				+    
			
 
				+    # 直接使用返回的数据
			
 
				+    print(f"\n获取到 {len(result['chunks'])} 个文本块")
			
 
				+    print("可以直接在内存中处理这些数据，无需保存文件")
			
 
				+    
			
 
				+    # 示例：筛选特定类别的内容
			
 
				+    safety_chunks = [
			
 
				+        chunk for chunk in result['chunks']
			
 
				+        if chunk['project_plan_type'] == 'safety'
			
 
				+    ]
			
 
				+    print(f"其中安全保证措施类别有 {len(safety_chunks)} 个块")
			
 
				+
			
 
				+
			
 
				+def example_access_chunks():
			
 
				+    """访问文本块数据示例"""
			
 
				+    print("\n" + "=" * 100)
			
 
				+    print("示例5: 访问文本块数据")
			
 
				+    print("=" * 100)
			
 
				+    
			
 
				+    classifier = DocumentClassifier()
			
 
				+    
			
 
				+    result = classifier.process_document(
			
 
				+        file_path="example.pdf",
			
 
				+        target_level=2,
			
 
				+        save_results=False  # 不保存文件
			
 
				+    )
			
 
				+    
			
 
				+    # 访问文本块数据
			
 
				+    print(f"\n文本块数据结构示例：")
			
 
				+    if result['chunks']:
			
 
				+        chunk = result['chunks'][0]
			
 
				+        print(f"文件名: {chunk['file_name']}")
			
 
				+        print(f"块ID: {chunk['chunk_id']}")
			
 
				+        print(f"章节标签: {chunk['section_label']}")
			
 
				+        print(f"上下文摘要: {chunk['context_summary']}")
			
 
				+        print(f"方案类型: {chunk['project_plan_type']}")
			
 
				+        print(f"页码: {chunk['element_tag']['page']}")
			
 
				+        print(f"内容长度: {len(chunk['review_chunk_content'])} 字符")
			
 
				+        print(f"内容预览: {chunk['review_chunk_content'][:100]}...")
			
 
				+
			
 
				+
			
 
				+def example_batch_processing():
			
 
				+    """批量处理示例"""
			
 
				+    print("\n" + "=" * 100)
			
 
				+    print("示例6: 批量处理多个文档")
			
 
				+    print("=" * 100)
			
 
				+    
			
 
				+    from pathlib import Path
			
 
				+    
			
 
				+    classifier = DocumentClassifier()
			
 
				+    
			
 
				+    # 要处理的文件列表
			
 
				+    files = [
			
 
				+        "doc1.pdf",
			
 
				+        "doc2.docx",
			
 
				+        "doc3.pdf"
			
 
				+    ]
			
 
				+    
			
 
				+    results = []
			
 
				+    for file_path in files:
			
 
				+        if not Path(file_path).exists():
			
 
				+            print(f"跳过不存在的文件: {file_path}")
			
 
				+            continue
			
 
				+        
			
 
				+        try:
			
 
				+            print(f"\n处理文件: {file_path}")
			
 
				+            result = classifier.process_document(
			
 
				+                file_path=file_path,
			
 
				+                target_level=2
			
 
				+            )
			
 
				+            results.append(result)
			
 
				+            print(f"完成: {file_path}")
			
 
				+        except Exception as e:
			
 
				+            print(f"错误: {file_path} - {str(e)}")
			
 
				+    
			
 
				+    print(f"\n批量处理完成！共处理 {len(results)} 个文件")
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # 运行示例（根据需要取消注释）
			
 
				+    
			
 
				+    # example_basic()
			
 
				+    # example_word()
			
 
				+    # example_custom_chunk_size()
			
 
				+    # example_no_save()
			
 
				+    # example_access_chunks()
			
 
				+    # example_batch_processing()
			
 
				+    
			
 
				+    print("\n请取消注释要运行的示例函数")
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/main.py
+++ b/core/construction_review/doc_worker/extra/main.py
@@ -0,0 +1,112 @@
 
				+"""
			
 
				+命令行入口程序
			
 
				+提供命令行接口来使用doc_classifier库
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+import argparse
			
 
				+from pathlib import Path
			
 
				+from core import DocumentClassifier
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='文档分类切分工具 - 支持PDF和Word文档',
			
 
				+        formatter_class=argparse.RawDescriptionHelpFormatter,
			
 
				+        epilog="""
			
 
				+使用示例:
			
 
				+  python main.py document.pdf
			
 
				+  python main.py document.docx -l 2 -o ./output
			
 
				+  python main.py document.pdf --max-size 1500 --min-size 800
			
 
				+        """
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        'file_path',
			
 
				+        help='文档路径（PDF或Word）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '-l', '--level',
			
 
				+        type=int,
			
 
				+        default=2,
			
 
				+        help='要分类的目标层级（默认: 2）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '-o', '--output',
			
 
				+        help='输出目录（默认: 源文件同目录下的"分类切分结果"）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--max-size',
			
 
				+        type=int,
			
 
				+        default=1000,
			
 
				+        help='最大分块字符数（默认: 1000）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--min-size',
			
 
				+        type=int,
			
 
				+        default=500,
			
 
				+        help='最小分块字符数（默认: 500）'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--model-url',
			
 
				+        default='http://172.16.35.50:8000/v1/chat/completions',
			
 
				+        help='大语言模型API地址'
			
 
				+    )
			
 
				+    
			
 
				+    parser.add_argument(
			
 
				+        '--no-save',
			
 
				+        action='store_true',
			
 
				+        help='不保存结果到文件（仅返回数据）'
			
 
				+    )
			
 
				+    
			
 
				+    args = parser.parse_args()
			
 
				+    
			
 
				+    # 检查文件是否存在
			
 
				+    file_path = Path(args.file_path)
			
 
				+    if not file_path.exists():
			
 
				+        print(f"错误: 文件不存在: {args.file_path}")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    # 检查文件格式
			
 
				+    if file_path.suffix.lower() not in ['.pdf', '.docx', '.doc']:
			
 
				+        print(f"错误: 不支持的文件格式: {file_path.suffix}")
			
 
				+        print("支持的格式: .pdf, .docx, .doc")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    try:
			
 
				+        # 创建分类器
			
 
				+        classifier = DocumentClassifier(model_url=args.model_url)
			
 
				+        
			
 
				+        # 处理文档
			
 
				+        result = classifier.process_document(
			
 
				+            file_path=str(file_path),
			
 
				+            target_level=args.level,
			
 
				+            output_dir=args.output,
			
 
				+            max_chunk_size=args.max_size,
			
 
				+            min_chunk_size=args.min_size,
			
 
				+            save_results=not args.no_save
			
 
				+        )
			
 
				+        
			
 
				+        print("\n" + "=" * 100)
			
 
				+        print("处理成功！")
			
 
				+        print("=" * 100)
			
 
				+        print(f"\n文本块总数: {len(result['chunks'])}")
			
 
				+        if not args.no_save:
			
 
				+            print(f"输出目录: {result['output_dir']}")
			
 
				+        
			
 
				+    except Exception as e:
			
 
				+        print(f"\n错误: {str(e)}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
if __name__ == "__main__":
    # Run the CLI only when this file is executed as a script.
    main()
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/requirements.txt
+++ b/core/construction_review/doc_worker/extra/requirements.txt
@@ -0,0 +1,5 @@
 
				+pymupdf>=1.23.0
			
 
				+python-docx>=0.8.11
			
 
				+requests>=2.31.0
			
 
				+pyyaml>=6.0
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/test_basic.py
+++ b/core/construction_review/doc_worker/extra/test_basic.py
@@ -0,0 +1,270 @@
 
				+"""
			
 
				+基础测试脚本
			
 
				+用于验证各模块的基本功能
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+from pathlib import Path
			
 
				+
			
 
				+# 添加当前目录到Python路径，以便导入模块
			
 
				+sys.path.insert(0, str(Path(__file__).parent))
			
 
				+
			
 
				+
			
 
				+def test_imports():
			
 
				+    """测试模块导入"""
			
 
				+    print("=" * 80)
			
 
				+    print("测试1: 模块导入")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    try:
			
 
				+        from core import DocumentClassifier
			
 
				+        print("✓ DocumentClassifier 导入成功")
			
 
				+        
			
 
				+        from toc_extractor import TOCExtractor
			
 
				+        print("✓ TOCExtractor 导入成功")
			
 
				+        
			
 
				+        from llm_classifier import LLMClassifier
			
 
				+        print("✓ LLMClassifier 导入成功")
			
 
				+        
			
 
				+        from text_splitter import TextSplitter
			
 
				+        print("✓ TextSplitter 导入成功")
			
 
				+        
			
 
				+        from result_saver import ResultSaver
			
 
				+        print("✓ ResultSaver 导入成功")
			
 
				+        
			
 
				+        print("\n所有模块导入成功！")
			
 
				+        return True
			
 
				+    except Exception as e:
			
 
				+        print(f"\n✗ 导入失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def test_dependencies():
			
 
				+    """测试依赖包"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试2: 依赖包检查")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    dependencies = {
			
 
				+        'fitz': 'PyMuPDF',
			
 
				+        'docx': 'python-docx',
			
 
				+        'requests': 'requests'
			
 
				+    }
			
 
				+    
			
 
				+    all_ok = True
			
 
				+    for module, package in dependencies.items():
			
 
				+        try:
			
 
				+            __import__(module)
			
 
				+            print(f"✓ {package} 已安装")
			
 
				+        except ImportError:
			
 
				+            print(f"✗ {package} 未安装")
			
 
				+            all_ok = False
			
 
				+    
			
 
				+    if all_ok:
			
 
				+        print("\n所有依赖包已安装！")
			
 
				+    else:
			
 
				+        print("\n请运行: pip install -r requirements.txt")
			
 
				+    
			
 
				+    return all_ok
			
 
				+
			
 
				+
			
 
				+def test_classifier_init():
			
 
				+    """测试分类器初始化"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试3: 分类器初始化")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    try:
			
 
				+        from core import DocumentClassifier
			
 
				+        
			
 
				+        classifier = DocumentClassifier()
			
 
				+        print("✓ 使用默认参数初始化成功")
			
 
				+        
			
 
				+        classifier = DocumentClassifier(
			
 
				+            model_url="http://test.com/api"
			
 
				+        )
			
 
				+        print("✓ 使用自定义参数初始化成功")
			
 
				+        
			
 
				+        print("\n分类器初始化测试通过！")
			
 
				+        return True
			
 
				+    except Exception as e:
			
 
				+        print(f"\n✗ 初始化失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def test_toc_extractor():
			
 
				+    """测试目录提取器"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试4: 目录提取器")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    try:
			
 
				+        from toc_extractor import TOCExtractor
			
 
				+        
			
 
				+        extractor = TOCExtractor()
			
 
				+        print("✓ TOCExtractor 创建成功")
			
 
				+        
			
 
				+        # 测试层级检测
			
 
				+        test_titles = [
			
 
				+            "第一章 工程概况",
			
 
				+            "1.1 项目背景",
			
 
				+            "1.1.1 位置交通",
			
 
				+            "〖1.2〗 自然条件"
			
 
				+        ]
			
 
				+        
			
 
				+        for title in test_titles:
			
 
				+            level = extractor._detect_level(title)
			
 
				+            print(f"  '{title}' -> 层级: {level}")
			
 
				+        
			
 
				+        print("\n目录提取器测试通过！")
			
 
				+        return True
			
 
				+    except Exception as e:
			
 
				+        print(f"\n✗ 测试失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def test_text_splitter():
			
 
				+    """测试文本切分器"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试5: 文本切分器")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    try:
			
 
				+        from text_splitter import TextSplitter
			
 
				+        
			
 
				+        splitter = TextSplitter()
			
 
				+        print("✓ TextSplitter 创建成功")
			
 
				+        
			
 
				+        # 测试标题标准化
			
 
				+        test_title = "  第一章   工程概况  "
			
 
				+        normalized = splitter._normalize_title(test_title)
			
 
				+        print(f"  标题标准化: '{test_title}' -> '{normalized}'")
			
 
				+        
			
 
				+        # 测试大块分割
			
 
				+        long_text = "这是一个测试句子。" * 200  # 约2000字符
			
 
				+        chunks = splitter._split_large_chunk(long_text, 1000, "测试标题")
			
 
				+        print(f"  大块分割: {len(long_text)}字符 -> {len(chunks)}个块")
			
 
				+        
			
 
				+        print("\n文本切分器测试通过！")
			
 
				+        return True
			
 
				+    except Exception as e:
			
 
				+        print(f"\n✗ 测试失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def test_llm_classifier():
			
 
				+    """测试LLM分类器"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试6: LLM分类器")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    try:
			
 
				+        from llm_classifier import LLMClassifier
			
 
				+        
			
 
				+        classifier = LLMClassifier()
			
 
				+        print("✓ LLMClassifier 创建成功")
			
 
				+        
			
 
				+        # 显示分类映射
			
 
				+        print("\n  分类类别映射:")
			
 
				+        for cn, en in classifier.category_mapping.items():
			
 
				+            print(f"    {cn} -> {en}")
			
 
				+        
			
 
				+        print("\nLLM分类器测试通过！")
			
 
				+        return True
			
 
				+    except Exception as e:
			
 
				+        print(f"\n✗ 测试失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def test_result_saver():
			
 
				+    """测试结果保存器"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试7: 结果保存器")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    try:
			
 
				+        from result_saver import ResultSaver
			
 
				+        
			
 
				+        saver = ResultSaver()
			
 
				+        print("✓ ResultSaver 创建成功")
			
 
				+        
			
 
				+        # 测试文件名清理
			
 
				+        test_names = [
			
 
				+            "正常文件名",
			
 
				+            "包含<>:特殊字符",
			
 
				+            "很长的文件名" * 50
			
 
				+        ]
			
 
				+        
			
 
				+        for name in test_names:
			
 
				+            sanitized = saver._sanitize_filename(name)
			
 
				+            print(f"  '{name[:30]}...' -> '{sanitized[:30]}...'")
			
 
				+        
			
 
				+        print("\n结果保存器测试通过！")
			
 
				+        return True
			
 
				+    except Exception as e:
			
 
				+        print(f"\n✗ 测试失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """运行所有测试"""
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("文档分类切分库 - 基础测试")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    tests = [
			
 
				+        ("模块导入", test_imports),
			
 
				+        ("依赖包检查", test_dependencies),
			
 
				+        ("分类器初始化", test_classifier_init),
			
 
				+        ("目录提取器", test_toc_extractor),
			
 
				+        ("文本切分器", test_text_splitter),
			
 
				+        ("LLM分类器", test_llm_classifier),
			
 
				+        ("结果保存器", test_result_saver)
			
 
				+    ]
			
 
				+    
			
 
				+    results = []
			
 
				+    for name, test_func in tests:
			
 
				+        try:
			
 
				+            result = test_func()
			
 
				+            results.append((name, result))
			
 
				+        except Exception as e:
			
 
				+            print(f"\n测试 '{name}' 发生异常: {e}")
			
 
				+            results.append((name, False))
			
 
				+    
			
 
				+    # 总结
			
 
				+    print("\n" + "=" * 80)
			
 
				+    print("测试总结")
			
 
				+    print("=" * 80)
			
 
				+    
			
 
				+    passed = sum(1 for _, result in results if result)
			
 
				+    total = len(results)
			
 
				+    
			
 
				+    for name, result in results:
			
 
				+        status = "✓ 通过" if result else "✗ 失败"
			
 
				+        print(f"{status} - {name}")
			
 
				+    
			
 
				+    print(f"\n总计: {passed}/{total} 个测试通过")
			
 
				+    
			
 
				+    if passed == total:
			
 
				+        print("\n所有测试通过！库已准备就绪。")
			
 
				+        return 0
			
 
				+    else:
			
 
				+        print("\n部分测试失败，请检查错误信息。")
			
 
				+        return 1
			
 
				+
			
 
				+
			
 
if __name__ == "__main__":
    # Use main()'s return value (0 = all passed, 1 = failures) as exit status.
    sys.exit(main())
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/优化完成说明.md
+++ b/core/construction_review/doc_worker/extra/优化完成说明.md
@@ -0,0 +1,305 @@
 
				+# 文档分类切分库 - 优化完成说明
			
 
				+
			
 
				+## 项目概述
			
 
				+
			
 
				+已成功将`pdf_classifier`和`word_classifier`两个独立项目整合为统一的`doc_classifier`库模块，并实现了全新的智能分块逻辑。
			
 
				+
			
 
				+## 完成的优化内容
			
 
				+
			
 
				+### 1. 配置文件管理 ✓
			
 
				+
			
 
				+**新增文件**:
			
 
				+- `config.yaml` - 统一配置文件
			
 
				+- `config_loader.py` - 配置加载模块
			
 
				+- `配置说明.md` - 配置文档
			
 
				+
			
 
				+**配置项包括**:
			
 
				+- LLM参数（model_url, model_name, temperature, timeout）
			
 
				+- 文本切分参数（target_level, max_chunk_size, min_chunk_size, fuzzy_threshold）
			
 
				+- 目录提取参数（max_pages, paragraphs_per_page）
			
 
				+- 分类类别（mapping, descriptions）
			
 
				+- 提示词模板（classification prompt）
			
 
				+- 输出配置（default_dir_name, save_results, max_filename_length）
			
 
				+- 标题模式（level1_patterns, level2_patterns, level3_patterns）
			
 
				+- 编号格式（numbering formats）
			
 
				+- 噪音过滤（noise patterns）
			
 
				+- 目录检测（toc patterns, min/max length）
			
 
				+- 日志配置（level, format, filename）
			
 
				+
			
 
				+**特点**:
			
 
				+- 所有配置集中管理，易于维护
			
 
				+- 支持配置文件和函数参数两种方式
			
 
				+- 配置优先级：函数参数 > 配置文件 > 默认值
			
 
				+- 单例模式，全局共享配置
			
 
				+
			
 
				+### 2. 统一的库模块结构 ✓
			
 
				+
			
 
				+**位置**: `script/doc_classifier/`
			
 
				+
			
 
				+**模块组成**:
			
 
				+- `__init__.py` - 库入口，导出主要类
			
 
				+- `core.py` - 核心处理类 DocumentClassifier
			
 
				+- `toc_extractor.py` - 目录提取模块（支持PDF和Word）
			
 
				+- `llm_classifier.py` - LLM分类模块
			
 
				+- `text_splitter.py` - 文本切分模块（新的智能分块逻辑）
			
 
				+- `result_saver.py` - 结果保存模块
			
 
				+- `config.yaml` - 配置文件
			
 
				+- `config_loader.py` - 配置加载模块
			
 
				+- `main.py` - 命令行入口
			
 
				+- `example.py` - 使用示例
			
 
				+- `test_basic.py` - 基础测试
			
 
				+
			
 
				+**特点**:
			
 
				+- 统一接口，自动识别PDF和Word格式
			
 
				+- 可作为Python库被其他项目导入
			
 
				+- 支持相对导入和绝对导入
			
 
				+
			
 
				+### 3. 新的分块逻辑 ✓
			
 
				+
			
 
				+**核心改进**:
			
 
				+
			
 
				+#### 分块流程
			
 
				+1. **定位标题**: 在正文中定位指定层级的标题（跳过目录页）
			
 
				+2. **子标题切分**: 在每个标题块中查找更低层级的子标题进行切分
			
 
				+3. **大块分割**: 超过`max_chunk_size`的块按句子级分割（保持语义完整）
			
 
				+4. **小块合并**: 不足`min_chunk_size`的块尝试合并（仅在同一父标题内合并）
			
 
				+
			
 
				+#### 关键特性
			
 
				+- **局部索引**: `chunk_id`是在每个指定层级标题内部的局部分块索引（整数：1, 2, 3, ...）
			
 
				+- **父标题追踪**: 通过`serial_number`记录当前标题在指定层级中的位置
			
 
				+- **智能合并**: 只在同一个父标题内合并小块，合并后重新编号
			
 
				+- **分割块保护**: 分割产生的块不参与合并，确保语义完整
			
 
				+
			
 
				+#### 参数控制
			
 
				+- `max_chunk_size`: 最大分块字符数（默认1000）
			
 
				+- `min_chunk_size`: 最小分块字符数（默认500）
			
 
				+- `target_level`: 目标层级（默认2级）
			
 
				+
			
 
				+### 4. 新的数据格式 ✓
			
 
				+
			
 
				+**文本块数据结构**:
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "file_name": "文档名称.pdf",
			
 
				+  "chunk_id": 1,
			
 
				+  "section_label": "第一章.工程概况->1.2 自然条件->1.2.1 位置交通",
			
 
				+  "context_summary": "自然条件",
			
 
				+  "project_plan_type": "overview",
			
 
				+  "element_tag": {
			
 
				+    "chunk_id": 1,
			
 
				+    "page": 5,
			
 
				+    "serial_number": 1
			
 
				+  },
			
 
				+  "review_chunk_content": "正文内容..."
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+**字段说明**:
			
 
				+- `chunk_id`: 在当前指定层级标题内部的局部分块索引（整数）
			
 
				+- `serial_number`: 当前标题在指定层级中的索引（第几个指定层级的标题）
			
 
				+- `section_label`: 完整的层级路径
			
 
				+- `context_summary`: 上下文摘要（父标题）
			
 
				+- `project_plan_type`: 分类代码（英文）
			
 
				+
			
 
				+### 5. 统一的API接口 ✓
			
 
				+
			
 
				+**主要接口**:
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+classifier = DocumentClassifier(
			
 
				+    model_url="http://172.16.35.50:8000/v1/chat/completions"
			
 
				+)
			
 
				+
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.pdf",      # 支持PDF和Word
			
 
				+    target_level=2,                 # 目标层级
			
 
				+    max_chunk_size=1000,            # 最大分块大小
			
 
				+    min_chunk_size=500,             # 最小分块大小
			
 
				+    output_dir="./output",          # 输出目录
			
 
				+    save_results=True               # 是否保存文件
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+**新增功能**:
			
 
				+- `save_results=False`: 不保存文件，只返回数据
			
 
				+- 命令行参数 `--no-save`: 不保存文件
			
 
				+
			
 
				+### 6. 完整的文档和示例 ✓
			
 
				+
			
 
				+**文档文件**:
			
 
				+- `README.md` - 完整的API文档和使用说明
			
 
				+- `快速开始.md` - 快速上手指南
			
 
				+- `项目总览.md` - 项目架构和技术细节
			
 
				+- `配置说明.md` - 配置文件详细说明
			
 
				+- `example.py` - 6个使用示例
			
 
				+- `优化完成说明.md` - 本文件
			
 
				+
			
 
				+**示例内容**:
			
 
				+1. 基础使用
			
 
				+2. 处理Word文档
			
 
				+3. 自定义分块大小
			
 
				+4. 不保存文件，只获取数据
			
 
				+5. 访问文本块数据
			
 
				+6. 批量处理
			
 
				+
			
 
				+## 测试结果
			
 
				+
			
 
				+运行 `test_basic.py` 测试结果：
			
 
				+
			
 
				+```
			
 
				+✓ 通过 - 模块导入
			
 
				+✓ 通过 - 依赖包检查
			
 
				+✓ 通过 - 分类器初始化
			
 
				+✓ 通过 - 目录提取器
			
 
				+✓ 通过 - 文本切分器
			
 
				+✓ 通过 - LLM分类器
			
 
				+✓ 通过 - 结果保存器
			
 
				+
			
 
				+总计: 7/7 个测试通过
			
 
				+```
			
 
				+
			
 
				+所有测试通过，库已准备就绪！
			
 
				+
			
 
				+## 使用方法
			
 
				+
			
 
				+### 方式1: 使用默认配置
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+# 自动从config.yaml加载配置
			
 
				+classifier = DocumentClassifier()
			
 
				+
			
 
				+# 使用配置文件中的默认参数
			
 
				+result = classifier.process_document("document.pdf")
			
 
				+```
			
 
				+
			
 
				+### 方式2: 覆盖部分配置
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+# 覆盖model_url，其他使用配置文件
			
 
				+classifier = DocumentClassifier(model_url="http://custom:8000/v1/chat/completions")
			
 
				+
			
 
				+# 覆盖分块参数，其他使用配置文件
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.pdf",
			
 
				+    max_chunk_size=1500,
			
 
				+    min_chunk_size=800,
			
 
				+    save_results=False
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 方式3: 修改配置文件
			
 
				+
			
 
				+直接编辑`config.yaml`：
			
 
				+
			
 
				+```yaml
			
 
				+text_splitting:
			
 
				+  max_chunk_size: 1500  # 修改默认值
			
 
				+  min_chunk_size: 800
			
 
				+```
			
 
				+
			
 
				+然后正常使用：
			
 
				+
			
 
				+```python
			
 
				+classifier = DocumentClassifier()
			
 
				+result = classifier.process_document("document.pdf")  # 使用新的默认值
			
 
				+```
			
 
				+
			
 
				+### 方式4: 命令行
			
 
				+
			
 
				+```bash
			
 
				+# 处理文档并保存
			
 
				+python main.py document.pdf
			
 
				+
			
 
				+# 不保存文件
			
 
				+python main.py document.pdf --no-save
			
 
				+
			
 
				+# 自定义参数
			
 
				+python main.py document.pdf -l 2 --max-size 1500 --min-size 800 -o ./output
			
 
				+```
			
 
				+
			
 
				+## 与旧版本对比
			
 
				+
			
 
				+| 特性 | 旧版本 | 新版本 |
			
 
				+|-----|--------|--------|
			
 
				+| 接口统一性 | PDF和Word分离 | 统一接口 |
			
 
				+| 分块逻辑 | 仅按标题切分 | 智能分割+合并 |
			
 
				+| chunk_id | 全局编号 | 局部索引（整数形式） |
			
 
				+| 数据格式 | 简单格式 | 规范化格式+serial_number |
			
 
				+| 使用方式 | 独立脚本 | 可作为库调用 |
			
 
				+| 保存选项 | 必须保存 | 可选保存 |
			
 
				+| 文档完整性 | 基础文档 | 完整文档+示例 |
			
 
				+
			
 
				+## 核心优势
			
 
				+
			
 
				+1. **局部索引**: chunk_id在每个父标题内独立编号（整数形式），便于理解和管理
			
 
				+2. **父标题追踪**: serial_number明确标识当前块属于哪个指定层级的标题
			
 
				+3. **智能合并**: 只在同一父标题内合并，避免跨标题混淆
			
 
				+4. **灵活使用**: 可选保存，支持纯数据处理模式
			
 
				+5. **统一接口**: 一个接口处理所有格式
			
 
				+
			
 
				+## 文件结构
			
 
				+
			
 
				+```
			
 
				+doc_classifier/
			
 
				+├── __init__.py              # 库入口
			
 
				+├── core.py                  # 核心类
			
 
				+├── toc_extractor.py         # 目录提取
			
 
				+├── llm_classifier.py        # LLM分类
			
 
				+├── text_splitter.py         # 文本切分（新逻辑）
			
 
				+├── result_saver.py          # 结果保存
			
 
				+├── config.yaml              # 配置文件
			
 
				+├── config_loader.py         # 配置加载
			
 
				+├── main.py                  # 命令行入口
			
 
				+├── example.py               # 使用示例
			
 
				+├── test_basic.py            # 基础测试
			
 
				+├── requirements.txt         # 依赖包
			
 
				+├── README.md                # 完整文档
			
 
				+├── 快速开始.md              # 快速指南
			
 
				+├── 项目总览.md              # 项目架构
			
 
				+├── 配置说明.md              # 配置文档
			
 
				+└── 优化完成说明.md          # 本文件
			
 
				+```
			
 
				+
			
 
				+## 依赖包
			
 
				+
			
 
				+```
			
 
				+pymupdf>=1.23.0
			
 
				+python-docx>=0.8.11
			
 
				+requests>=2.31.0
			
 
				+pyyaml>=6.0
			
 
				+```
			
 
				+
			
 
				+安装：
			
 
				+```bash
			
 
				+pip install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+## 下一步建议
			
 
				+
			
 
				+1. **实际测试**: 使用真实的PDF/Word文档进行测试
			
 
				+2. **性能优化**: 如需处理大量文档，可考虑并行处理
			
 
				+3. **功能扩展**: 根据实际需求添加新的分类类别
			
 
				+4. **错误处理**: 完善异常情况的处理逻辑
			
 
				+
			
 
				+## 技术支持
			
 
				+
			
 
				+如有问题，请查看：
			
 
				+1. `README.md` - 完整文档
			
 
				+2. `快速开始.md` - 快速上手
			
 
				+3. `配置说明.md` - 配置详解
			
 
				+4. `example.py` - 使用示例
			
 
				+5. `test_basic.py` - 测试代码
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**优化完成时间**: 2025-11-13  
			
 
				+**版本**: 2.0.0  
			
 
				+**状态**: ✓ 已完成并通过测试
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/快速开始.md
+++ b/core/construction_review/doc_worker/extra/快速开始.md
@@ -0,0 +1,231 @@
 
				+# 快速开始指南
			
 
				+
			
 
				+## 安装依赖
			
 
				+
			
 
				+首先安装所需的Python包：
			
 
				+
			
 
				+```bash
			
 
				+pip install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+需要的包：
			
 
				+- pymupdf (处理PDF)
			
 
				+- python-docx (处理Word)
			
 
				+- requests (调用API)
				+- pyyaml (读取config.yaml配置)
			
 
				+
			
 
				+## 方式1: 作为Python库使用
			
 
				+
			
 
				+### 最简单的例子
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+# 创建分类器
			
 
				+classifier = DocumentClassifier()
			
 
				+
			
 
				+# 处理文档
			
 
				+result = classifier.process_document("document.pdf")
			
 
				+
			
 
				+print(f"完成！生成了 {len(result['chunks'])} 个文本块")
			
 
				+```
			
 
				+
			
 
				+### 处理Word文档
			
 
				+
			
 
				+```python
			
 
				+# 同样的接口，自动识别格式
			
 
				+result = classifier.process_document("document.docx")
			
 
				+```
			
 
				+
			
 
				+### 自定义参数
			
 
				+
			
 
				+```python
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.pdf",
			
 
				+    target_level=2,           # 按2级目录分类
			
 
				+    max_chunk_size=1500,      # 最大1500字符
			
 
				+    min_chunk_size=800,       # 最小800字符
			
 
				+    output_dir="./my_output", # 自定义输出目录
			
 
				+    save_results=True         # 是否保存结果到文件
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 不保存文件，只获取数据
			
 
				+
			
 
				+```python
			
 
				+# 只处理数据，不保存到文件
			
 
				+result = classifier.process_document(
			
 
				+    file_path="document.pdf",
			
 
				+    save_results=False  # 不保存文件
			
 
				+)
			
 
				+
			
 
				+# 直接使用返回的数据
			
 
				+for chunk in result['chunks']:
			
 
				+    print(chunk['section_label'])
			
 
				+    print(chunk['review_chunk_content'])
			
 
				+```
			
 
				+
			
 
				+### 访问结果数据
			
 
				+
			
 
				+```python
			
 
				+result = classifier.process_document("document.pdf")
			
 
				+
			
 
				+# 访问文本块
			
 
				+for chunk in result['chunks']:
			
 
				+    print(f"章节: {chunk['section_label']}")
			
 
				+    print(f"类别: {chunk['project_plan_type']}")
			
 
				+    print(f"页码: {chunk['element_tag']['page']}")
			
 
				+    print(f"内容: {chunk['review_chunk_content'][:100]}...")
			
 
				+    print("-" * 50)
			
 
				+```
			
 
				+
			
 
				+## 方式2: 使用命令行
			
 
				+
			
 
				+### 基础用法
			
 
				+
			
 
				+```bash
			
 
				+python main.py document.pdf
			
 
				+```
			
 
				+
			
 
				+### 指定参数
			
 
				+
			
 
				+```bash
			
 
				+# 指定目标层级和输出目录
			
 
				+python main.py document.pdf -l 2 -o ./output
			
 
				+
			
 
				+# 自定义分块大小
			
 
				+python main.py document.pdf --max-size 1100 --min-size 900
			
 
				+
			
 
				+# 不保存文件，只处理数据
			
 
				+python main.py document.pdf --no-save
			
 
				+
			
 
				+# 完整参数
			
 
				+python main.py document.pdf -l 2 -o ./output --max-size 1500 --min-size 800
			
 
				+```
			
 
				+
			
 
				+### 查看帮助
			
 
				+
			
 
				+```bash
			
 
				+python main.py -h
			
 
				+```
			
 
				+
			
 
				+## 输出结果说明
			
 
				+
			
 
				+处理完成后，会在输出目录生成以下文件：
			
 
				+
			
 
				+```
			
 
				+分类切分结果/
			
 
				+├── README.md                    # 索引文件
			
 
				+├── 文档名_完整结果_时间戳.json   # 完整JSON数据
			
 
				+├── 文档名_统计报告_时间戳.txt    # 统计报告
			
 
				+├── overview/                    # 工程概况类别
			
 
				+│   ├── 001_第一章_工程概况.md
			
 
				+│   └── 002_1.1_项目背景.md
			
 
				+├── technology/                  # 施工工艺计算类别
			
 
				+│   ├── 001_第二章_施工工艺.md
			
 
				+│   └── ...
			
 
				+└── safety/                      # 安全保证措施类别
			
 
				+    └── ...
			
 
				+```
			
 
				+
			
 
				+## 数据格式说明
			
 
				+
			
 
				+每个文本块的JSON格式：
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "file_name": "文档名称.pdf",
			
 
				+  "chunk_id": 1,
			
 
				+  "section_label": "第一章.工程概况->1.2 自然条件->1.2.1 位置交通",
			
 
				+  "context_summary": "自然条件",
			
 
				+  "project_plan_type": "overview",
			
 
				+  "element_tag": {
			
 
				+    "chunk_id": 1,
			
 
				+    "page": 5,
			
 
				+    "serial_number": 1
			
 
				+  },
			
 
				+  "review_chunk_content": "正文内容..."
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+说明：
			
 
				+- `chunk_id`: 在当前指定层级标题内部的局部分块索引（整数：1, 2, 3, ...）
			
 
				+- `serial_number`: 当前标题在指定层级中的索引（第几个指定层级的标题）
			
 
				+
			
 
				+## 分类类别对照表
			
 
				+
			
 
				+| 中文名称 | 英文代码 |
			
 
				+|---------|---------|
			
 
				+| 编制依据 | basis |
			
 
				+| 工程概况 | overview |
			
 
				+| 施工计划 | plan |
			
 
				+| 施工工艺计算 | technology |
			
 
				+| 安全保证措施 | safety |
			
 
				+| 质量保证措施 | quality |
			
 
				+| 环境保证措施 | environment |
			
 
				+| 施工管理及作业人员配备与分工 | management |
			
 
				+| 验收要求 | acceptance |
			
 
				+| 其它资料 | other |
			
 
				+
			
 
				+## 批量处理示例
			
 
				+
			
 
				+```python
			
 
				+from pathlib import Path
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+classifier = DocumentClassifier()
			
 
				+
			
 
				+# 处理目录下所有PDF文件
			
 
				+for pdf_file in Path("./documents").glob("*.pdf"):
			
 
				+    print(f"处理: {pdf_file}")
			
 
				+    try:
			
 
				+        result = classifier.process_document(pdf_file)
			
 
				+        print(f"完成: {len(result['chunks'])} 个块")
			
 
				+    except Exception as e:
			
 
				+        print(f"错误: {e}")
			
 
				+```
			
 
				+
			
 
				+## 常见问题
			
 
				+
			
 
				+### Q: 提示"未检测到目录"怎么办？
			
 
				+
			
 
				+A: 确保文档包含目录结构，目录项需要有明确的编号格式（如1.1、第一章等）。
			
 
				+
			
 
				+### Q: 如何修改分类类别？
			
 
				+
			
 
				+A: 在 `config.yaml` 的 `categories` 配置项中修改分类映射（详见 `配置说明.md`）。
			
 
				+
			
 
				+### Q: 处理很慢怎么办？
			
 
				+
			
 
				+A: 大文档需要较长时间，可以：
			
 
				+1. 减小目标层级（如改为1级）
			
 
				+2. 增大最小分块大小
			
 
				+3. 检查模型API响应速度
			
 
				+
			
 
				+### Q: 如何调整分块大小？
			
 
				+
			
 
				+A: 使用参数控制：
			
 
				+```python
			
 
				+result = classifier.process_document(
			
 
				+    file_path="doc.pdf",
			
 
				+    max_chunk_size=2000,  # 增大最大值
			
 
				+    min_chunk_size=1000   # 增大最小值
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### Q: 支持其他文档格式吗？
			
 
				+
			
 
				+A: 目前支持：
			
 
				+- PDF (.pdf)
			
 
				+- Word (.docx, .doc)
			
 
				+
			
 
				+## 更多示例
			
 
				+
			
 
				+查看 `example.py` 文件获取更多使用示例。
			
 
				+
			
 
				+## 技术支持
			
 
				+
			
 
				+如有问题，请查看：
			
 
				+1. README.md - 完整文档
			
 
				+2. example.py - 示例代码
			
 
				+3. 项目Issues - 已知问题和解决方案
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/配置说明.md
+++ b/core/construction_review/doc_worker/extra/配置说明.md
@@ -0,0 +1,394 @@
 
				+# 配置说明文档
			
 
				+
			
 
				+## 概述
			
 
				+
			
 
				+`doc_classifier`库使用`config.yaml`文件进行统一的配置管理，包括LLM参数、分块逻辑、分类类别、提示词等所有可配置项。
			
 
				+
			
 
				+## 配置文件位置
			
 
				+
			
 
				+```
			
 
				+doc_classifier/
			
 
				+├── config.yaml          # 配置文件
			
 
				+├── config_loader.py     # 配置加载模块
			
 
				+└── ...
			
 
				+```
			
 
				+
			
 
				+## 配置项说明
			
 
				+
			
 
				+### 1. 大语言模型配置 (llm)
			
 
				+
			
 
				+```yaml
			
 
				+llm:
			
 
				+  model_url: "http://172.16.35.50:8000/v1/chat/completions"  # 模型API地址
			
 
				+  model_name: "Qwen2.5-7B-Instruct"                          # 模型名称
			
 
				+  temperature: 0.1                                            # 温度参数
			
 
				+  timeout: 60                                                 # 请求超时时间（秒）
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `model_url`: LLM服务的API端点
			
 
				+- `model_name`: 使用的模型名称
			
 
				+- `temperature`: 控制输出的随机性，越低越确定
			
 
				+- `timeout`: API请求超时时间
			
 
				+
			
 
				+### 2. 文本切分配置 (text_splitting)
			
 
				+
			
 
				+```yaml
			
 
				+text_splitting:
			
 
				+  target_level: 2           # 目标层级
			
 
				+  max_chunk_size: 1000      # 最大分块字符数
			
 
				+  min_chunk_size: 500       # 最小分块字符数
			
 
				+  fuzzy_threshold: 0.80     # 模糊匹配阈值
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `target_level`: 按几级目录进行分类（1=一级，2=二级）
			
 
				+- `max_chunk_size`: 超过此字符数的块会被分割
			
 
				+- `min_chunk_size`: 小于此字符数的块会尝试合并
			
 
				+- `fuzzy_threshold`: 标题匹配的相似度阈值（0-1）
			
 
				+
			
 
				+### 3. 目录提取配置 (toc_extraction)
			
 
				+
			
 
				+```yaml
			
 
				+toc_extraction:
			
 
				+  max_pages: 15                  # 最多读取的页数
			
 
				+  paragraphs_per_page: 30        # Word文档每页段落数
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `max_pages`: 目录通常在前几页，限制读取页数提高效率
			
 
				+- `paragraphs_per_page`: Word文档模拟分页的段落数
			
 
				+
			
 
				+### 4. 分类类别配置 (categories)
			
 
				+
			
 
				+```yaml
			
 
				+categories:
			
 
				+  # 中文名称到英文代码的映射
			
 
				+  mapping:
			
 
				+    编制依据: basis
			
 
				+    工程概况: overview
			
 
				+    施工计划: plan
			
 
				+    # ... 更多类别
			
 
				+  
			
 
				+  # 类别描述（用于LLM分类提示词）
			
 
				+  descriptions:
			
 
				+    编制依据: "包括编制依据、编制说明、规范标准..."
			
 
				+    工程概况: "包括项目概况、工程概况、项目背景..."
			
 
				+    # ... 更多描述
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `mapping`: 中文类别名称与英文代码的对应关系
			
 
				+- `descriptions`: 每个类别的详细描述，用于生成LLM提示词
			
 
				+
			
 
				+**如何添加新类别**：
			
 
				+1. 在`mapping`中添加新的键值对
			
 
				+2. 在`descriptions`中添加对应的描述
			
 
				+
			
 
				+### 5. LLM分类提示词模板 (prompts)
			
 
				+
			
 
				+```yaml
			
 
				+prompts:
			
 
				+  classification: |
			
 
				+    你是一个专业的工程文档分析助手...
			
 
				+    
			
 
				+    【分类类别说明】
			
 
				+    {category_descriptions}
			
 
				+    
			
 
				+    【待分类的目录项】
			
 
				+    {toc_items}
			
 
				+    
			
 
				+    ...
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- 使用`|`表示多行文本
			
 
				+- `{category_descriptions}`和`{toc_items}`是占位符，会被实际内容替换
			
 
				+- 可以自定义提示词内容和格式
			
 
				+
			
 
				+### 6. 输出配置 (output)
			
 
				+
			
 
				+```yaml
			
 
				+output:
			
 
				+  default_dir_name: "分类切分结果"   # 默认输出目录名称
			
 
				+  save_results: true                 # 是否默认保存结果
			
 
				+  max_filename_length: 200           # 文件名最大长度
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `default_dir_name`: 未指定输出目录时使用的默认名称
			
 
				+- `save_results`: 是否默认保存结果到文件
			
 
				+- `max_filename_length`: 文件名超过此长度会被截断
			
 
				+
			
 
				+### 7. 标题层级识别配置 (title_patterns)
			
 
				+
			
 
				+```yaml
			
 
				+title_patterns:
			
 
				+  level1:
			
 
				+    - '^【\d+】'
			
 
				+    - '^第[一二三四五六七八九十\d]+章'
			
 
				+    # ... 更多模式
			
 
				+  
			
 
				+  level2:
			
 
				+    - '^第[一二三四五六七八九十\d]+节'
			
 
				+    - '^〖\d+(?:\.\d+)*〗'
			
 
				+    # ... 更多模式
			
 
				+  
			
 
				+  level3:
			
 
				+    - '^\([一二三四五六七八九十]+\)'
			
 
				+    # ... 更多模式
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- 使用正则表达式匹配不同层级的标题
			
 
				+- 可以添加新的模式以支持更多标题格式
			
 
				+
			
 
				+### 8. 编号格式配置 (numbering)
			
 
				+
			
 
				+```yaml
			
 
				+numbering:
			
 
				+  formats:
			
 
				+    - '^【\d+】'
			
 
				+    - '^第[一二三四五六七八九十\d]+[章节条款]'
			
 
				+    - '^\d+[、．.]'
			
 
				+    # ... 更多格式
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- 定义哪些格式被认为是有效的编号
			
 
				+- 用于判断文本是否为标题
			
 
				+
			
 
				+### 9. 噪音过滤配置 (noise_filters)
			
 
				+
			
 
				+```yaml
			
 
				+noise_filters:
			
 
				+  patterns:
			
 
				+    - '^\d{4}[-年]\d{1,2}[-月]\d{1,2}'  # 日期
			
 
				+    - '^http[s]?://'                    # URL
			
 
				+    - '^第\s*\d+\s*页'                  # 页码
			
 
				+    # ... 更多模式
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- 用于过滤非目录内容
			
 
				+- 匹配这些模式的文本会被忽略
			
 
				+
			
 
				+### 10. 目录识别配置 (toc_detection)
			
 
				+
			
 
				+```yaml
			
 
				+toc_detection:
			
 
				+  patterns:
			
 
				+    - '^(第[一二三四五六七八九十\d]+[章节条款].+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    - '^(〖\d+(?:\.\d+)*〗.+?)[.·]{2,}\s*(\d{1,4})\s*$'
			
 
				+    # ... 更多模式
			
 
				+  
			
 
				+  min_length: 3      # 标题最小长度
			
 
				+  max_length: 200    # 标题最大长度
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `patterns`: 目录行的正则表达式模式
			
 
				+- `min_length`/`max_length`: 标题长度限制
			
 
				+
			
 
				+### 11. 日志配置 (logging)
			
 
				+
			
 
				+```yaml
			
 
				+logging:
			
 
				+  level: INFO                                                  # 日志级别
			
 
				+  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'  # 日志格式
			
 
				+  filename: 'doc_classifier.log'                              # 日志文件名
			
 
				+```
			
 
				+
			
 
				+**说明**：
			
 
				+- `level`: DEBUG, INFO, WARNING, ERROR
			
 
				+- `format`: Python logging格式字符串
			
 
				+- `filename`: 日志文件名
			
 
				+
			
 
				+## 使用方法
			
 
				+
			
 
				+### 方式1: 使用默认配置
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+# 自动加载config.yaml中的配置
			
 
				+classifier = DocumentClassifier()
			
 
				+result = classifier.process_document("document.pdf")
			
 
				+```
			
 
				+
			
 
				+### 方式2: 覆盖部分配置
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+
			
 
				+# 覆盖model_url，其他使用配置文件
			
 
				+classifier = DocumentClassifier(model_url="http://custom-url:8000/v1/chat/completions")
			
 
				+
			
 
				+# 覆盖分块大小，其他使用配置文件
			
 
				+result = classifier.process_document(
			
 
				+    "document.pdf",
			
 
				+    max_chunk_size=1500,
			
 
				+    min_chunk_size=800
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+### 方式3: 修改配置文件
			
 
				+
			
 
				+直接编辑`config.yaml`文件，修改后重新运行程序即可生效。
			
 
				+
			
 
				+### 方式4: 使用自定义配置文件
			
 
				+
			
 
				+```python
			
 
				+from doc_classifier.config_loader import Config
			
 
				+
			
 
				+# 加载自定义配置文件
			
 
				+config = Config()
			
 
				+config.load_config("my_custom_config.yaml")
			
 
				+
			
 
				+# 然后正常使用
			
 
				+from doc_classifier import DocumentClassifier
			
 
				+classifier = DocumentClassifier()
			
 
				+```
			
 
				+
			
 
				+## 配置优先级
			
 
				+
			
 
				+参数来源的优先级（从高到低）：
			
 
				+
			
 
				+1. **函数参数**: 直接传递给函数的参数
			
 
				+2. **配置文件**: config.yaml中的配置
			
 
				+3. **默认值**: 代码中的硬编码默认值
			
 
				+
			
 
				+示例：
			
 
				+```python
			
 
				+# config.yaml中: max_chunk_size: 1000
			
 
				+classifier = DocumentClassifier()
			
 
				+
			
 
				+# 使用配置文件的值（1000）
			
 
				+result = classifier.process_document("doc.pdf")
			
 
				+
			
 
				+# 使用函数参数的值（1500），覆盖配置文件
			
 
				+result = classifier.process_document("doc.pdf", max_chunk_size=1500)
			
 
				+```
			
 
				+
			
 
				+## 常见配置场景
			
 
				+
			
 
				+### 场景1: 调整分块大小
			
 
				+
			
 
				+```yaml
			
 
				+text_splitting:
			
 
				+  max_chunk_size: 1500  # 增大到1500
			
 
				+  min_chunk_size: 800   # 增大到800
			
 
				+```
			
 
				+
			
 
				+### 场景2: 更换LLM服务
			
 
				+
			
 
				+```yaml
			
 
				+llm:
			
 
				+  model_url: "http://new-server:8000/v1/chat/completions"
			
 
				+  model_name: "new-model-name"
			
 
				+```
			
 
				+
			
 
				+### 场景3: 添加新的分类类别
			
 
				+
			
 
				+```yaml
			
 
				+categories:
			
 
				+  mapping:
			
 
				+    # ... 现有类别
			
 
				+    新类别: new_category
			
 
				+  
			
 
				+  descriptions:
			
 
				+    # ... 现有描述
			
 
				+    新类别: "新类别的描述说明"
			
 
				+```
			
 
				+
			
 
				+### 场景4: 自定义提示词
			
 
				+
			
 
				+```yaml
			
 
				+prompts:
			
 
				+  classification: |
			
 
				+    你是一个专业助手。
			
 
				+    
			
 
				+    【自定义的提示词内容】
			
 
				+    {category_descriptions}
			
 
				+    
			
 
				+    【自定义的格式】
			
 
				+    {toc_items}
			
 
				+```
			
 
				+
			
 
				+### 场景5: 支持新的标题格式
			
 
				+
			
 
				+```yaml
			
 
				+title_patterns:
			
 
				+  level2:
			
 
				+    - '^〖\d+(?:\.\d+)*〗'  # 现有格式
			
 
				+    - '^§\d+\.\d+'         # 新增格式
			
 
				+```
			
 
				+
			
 
				+## 配置验证
			
 
				+
			
 
				+运行测试以验证配置是否正确：
			
 
				+
			
 
				+```bash
			
 
				+python test_basic.py
			
 
				+```
			
 
				+
			
 
				+如果所有测试通过，说明配置正确。
			
 
				+
			
 
				+## 配置备份
			
 
				+
			
 
				+建议在修改配置前备份原文件：
			
 
				+
			
 
				+```bash
			
 
				+cp config.yaml config.yaml.backup
			
 
				+```
			
 
				+
			
 
				+## 故障排除
			
 
				+
			
 
				+### 问题1: 配置文件未找到
			
 
				+
			
 
				+**错误**: `FileNotFoundError: 配置文件不存在`
			
 
				+
			
 
				+**解决**: 确保`config.yaml`文件在`doc_classifier`目录下
			
 
				+
			
 
				+### 问题2: YAML格式错误
			
 
				+
			
 
				+**错误**: `yaml.scanner.ScannerError`
			
 
				+
			
 
				+**解决**: 检查YAML语法，注意缩进（使用空格，不要用Tab）
			
 
				+
			
 
				+### 问题3: 配置未生效
			
 
				+
			
 
				+**解决**: 
			
 
				+1. 检查配置文件是否正确保存
			
 
				+2. 重新运行程序
			
 
				+3. 确认没有函数参数覆盖配置
			
 
				+
			
 
				+### 问题4: 缺少pyyaml依赖
			
 
				+
			
 
				+**错误**: `ModuleNotFoundError: No module named 'yaml'`
			
 
				+
			
 
				+**解决**: 
			
 
				+```bash
			
 
				+pip install pyyaml
			
 
				+```
			
 
				+
			
 
				+## 最佳实践
			
 
				+
			
 
				+1. **版本控制**: 将`config.yaml`加入版本控制
			
 
				+2. **环境区分**: 为不同环境创建不同的配置文件
			
 
				+3. **文档更新**: 修改配置后更新相关文档
			
 
				+4. **测试验证**: 修改配置后运行测试确保正常
			
 
				+5. **备份重要配置**: 定期备份配置文件
			
 
				+
			
 
				+## 相关文件
			
 
				+
			
 
				+- `config.yaml` - 配置文件
			
 
				+- `config_loader.py` - 配置加载模块
			
 
				+- `test_basic.py` - 测试脚本
			
 
				+- `README.md` - 完整文档
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**更新时间**: 2025-11-13  
			
 
				+**版本**: 2.0.0
			
 
				+
			
--- a/core/construction_review/doc_worker/extra/项目总览.md
+++ b/core/construction_review/doc_worker/extra/项目总览.md
@@ -0,0 +1,341 @@
 
				+# 文档分类切分库 - 项目总览
			
 
				+
			
 
				+## 项目简介
			
 
				+
			
 
				+这是一个统一的Python库，用于处理PDF和Word文档的目录提取、智能分类和文本切分。它整合了原有的`pdf_classifier`和`word_classifier`两个独立项目，提供了更加统一、灵活和强大的文档处理能力。
			
 
				+
			
 
				+## 核心功能
			
 
				+
			
 
				+1. **目录提取**：自动从PDF和Word文档中提取目录结构
			
 
				+2. **智能分类**：使用大语言模型对目录项进行语义分类
			
 
				+3. **智能切分**：按目录层级和字符数进行智能文本切分
			
 
				+4. **多格式输出**：支持JSON、Markdown、统计报告等多种输出格式
			
 
				+
			
 
				+## 项目架构
			
 
				+
			
 
				+```
			
 
				+doc_classifier/
			
 
				+├── __init__.py           # 库入口，导出主要类
			
 
				+├── core.py              # 核心处理类 DocumentClassifier
			
 
				+├── toc_extractor.py     # 目录提取模块 TOCExtractor
			
 
				+├── llm_classifier.py    # LLM分类模块 LLMClassifier
			
 
				+├── text_splitter.py     # 文本切分模块 TextSplitter
			
 
				+├── result_saver.py      # 结果保存模块 ResultSaver
			
 
				+├── config_loader.py     # 配置加载模块
			
 
				+├── config.yaml          # 配置文件
			
 
				+├── main.py              # 命令行入口
			
 
				+├── example.py           # 使用示例
			
 
				+├── requirements.txt     # 依赖包
			
 
				+├── README.md            # 完整文档
			
 
				+├── 快速开始.md          # 快速开始指南
			
 
				+└── 项目总览.md          # 本文件
			
 
				+```
			
 
				+
			
 
				+## 模块说明
			
 
				+
			
 
				+### 1. core.py - 核心处理模块
			
 
				+
			
 
				+**主要类**: `DocumentClassifier`
			
 
				+
			
 
				+**功能**:
			
 
				+- 统一的文档处理接口
			
 
				+- 协调各个子模块的工作流程
			
 
				+- 提供简单易用的API
			
 
				+
			
 
				+**主要方法**:
			
 
				+- `process_document()`: 处理文档的主要方法
			
 
				+
			
 
				+### 2. toc_extractor.py - 目录提取模块
			
 
				+
			
 
				+**主要类**: `TOCExtractor`
			
 
				+
			
 
				+**功能**:
			
 
				+- 从PDF中提取目录（基于文本模式匹配）
			
 
				+- 从Word中提取目录（内置目录结构 + 文本模式）
			
 
				+- 识别目录项的层级结构
			
 
				+
			
 
				+**支持的目录格式**:
			
 
				+- 数字编号：1.1、1.1.1、1.1.1.1
			
 
				+- 中文编号：第一章、第二节
			
 
				+- 特殊格式：【1】、〖1.2〗
			
 
				+- 其他格式：一、二、三、(1)、(2)
			
 
				+
			
 
				+### 3. llm_classifier.py - LLM分类模块
			
 
				+
			
 
				+**主要类**: `LLMClassifier`
			
 
				+
			
 
				+**功能**:
			
 
				+- 调用大语言模型API
			
 
				+- 构建分类提示词
			
 
				+- 解析分类结果
			
 
				+- 映射中英文类别名称
			
 
				+
			
 
				+**预定义分类**:
			
 
				+- 编制依据 (basis)
			
 
				+- 工程概况 (overview)
			
 
				+- 施工计划 (plan)
			
 
				+- 施工工艺计算 (technology)
			
 
				+- 安全保证措施 (safety)
			
 
				+- 质量保证措施 (quality)
			
 
				+- 环境保证措施 (environment)
			
 
				+- 施工管理及作业人员配备与分工 (management)
			
 
				+- 验收要求 (acceptance)
			
 
				+- 其它资料 (other)
			
 
				+
			
 
				+### 4. text_splitter.py - 文本切分模块
			
 
				+
			
 
				+**主要类**: `TextSplitter`
			
 
				+
			
 
				+**功能**:
			
 
				+- 提取PDF/Word全文
			
 
				+- 在正文中定位标题
			
 
				+- 按目录层级切分文本
			
 
				+- 智能分割大块
			
 
				+- 智能合并小块
			
 
				+
			
 
				+**切分逻辑**:
			
 
				+
			
 
				+1. **定位标题**: 在正文中定位指定层级的标题（跳过目录页）
			
 
				+2. **子标题切分**: 在每个标题块中查找更低层级的子标题进行切分
			
 
				+3. **大块分割**: 超过`max_chunk_size`的块按句子级分割（保持语义完整）
			
 
				+4. **小块合并**: 不足`min_chunk_size`的块尝试合并（合并后不超过`max_chunk_size`）
			
 
				+
			
 
				+**特点**:
			
 
				+- 分割产生的块不参与合并，确保语义完整
			
 
				+- 支持表格内容提取（Word）
			
 
				+- 模糊匹配标题，应对OCR错误
			
 
				+
			
 
				+### 5. result_saver.py - 结果保存模块
			
 
				+
			
 
				+**主要类**: `ResultSaver`
			
 
				+
			
 
				+**功能**:
			
 
				+- 保存完整JSON结果
			
 
				+- 按类别保存Markdown文件
			
 
				+- 生成索引文件
			
 
				+- 生成统计报告
			
 
				+
			
 
				+**输出文件**:
			
 
				+- `{文档名}_完整结果_{时间戳}.json`: 包含所有数据
			
 
				+- `{文档名}_统计报告_{时间戳}.txt`: 详细统计信息
			
 
				+- `README.md`: 索引文件
			
 
				+- `{类别}/`: 按类别分组的Markdown文件
			
 
				+
			
 
				+## 数据流程
			
 
				+
			
 
				+```
			
 
				+输入文档 (PDF/Word)
			
 
				+    ↓
			
 
				+[TOCExtractor] 提取目录
			
 
				+    ↓
			
 
				+目录项列表
			
 
				+    ↓
			
 
				+[LLMClassifier] 智能分类
			
 
				+    ↓
			
 
				+已分类的目录项
			
 
				+    ↓
			
 
				+[TextSplitter] 提取全文
			
 
				+    ↓
			
 
				+页面内容列表
			
 
				+    ↓
			
 
				+[TextSplitter] 定位标题
			
 
				+    ↓
			
 
				+标题位置列表
			
 
				+    ↓
			
 
				+[TextSplitter] 智能切分
			
 
				+    ↓
			
 
				+文本块列表
			
 
				+    ↓
			
 
				+[ResultSaver] 保存结果
			
 
				+    ↓
			
 
				+输出文件
			
 
				+```
			
 
				+
			
 
				+## 数据格式
			
 
				+
			
 
				+### 目录项格式
			
 
				+
			
 
				+```python
			
 
				+{
			
 
				+    'title': '1.1 工程概况',
			
 
				+    'page': '5',
			
 
				+    'level': 2,
			
 
				+    'category': '工程概况',
			
 
				+    'category_code': 'overview',
			
 
				+    'original': '1.1 工程概况 ......... 5'
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### 文本块格式
			
 
				+
			
 
				+```python
			
 
				+{
			
 
				+    'file_name': '文档名.pdf',
			
 
				+    'chunk_id': 'doc_chunk_1.2.1',
			
 
				+    'section_label': '第一章->1.2 自然条件->1.2.1 位置交通',
			
 
				+    'context_summary': '自然条件',
			
 
				+    'project_plan_type': 'overview',
			
 
				+    'element_tag': {
			
 
				+        'chunk_id': 'doc_chunk_1.2.1',
			
 
				+        'page': 5,
			
 
				+        'serial_number': '1.2.1'
			
 
				+    },
			
 
				+    'review_chunk_content': '正文内容...'
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+## 与旧版本对比
			
 
				+
			
 
				+### 主要改进
			
 
				+
			
 
				+| 特性 | 旧版本 | 新版本 |
			
 
				+|-----|--------|--------|
			
 
				+| 接口统一性 | PDF和Word分离 | 统一接口 |
			
 
				+| 分块逻辑 | 仅按标题切分 | 智能分割+合并 |
			
 
				+| 数据格式 | 简单格式 | 规范化格式 |
			
 
				+| 使用方式 | 独立脚本 | 可作为库调用 |
			
 
				+| 文档完整性 | 基础文档 | 完整文档+示例 |
			
 
				+| 可扩展性 | 较低 | 高度模块化 |
			
 
				+
			
 
				+### 新增功能
			
 
				+
			
 
				+1. **智能分块**: 根据字符数自动分割和合并
			
 
				+2. **统一接口**: 一个接口处理所有格式
			
 
				+3. **库模块化**: 可被其他项目导入使用
			
 
				+4. **命令行工具**: 提供命令行接口
			
 
				+5. **更好的文档**: 完整的API文档和使用示例
			
 
				+
			
 
				+## 使用场景
			
 
				+
			
 
				+### 场景1: 文档预处理
			
 
				+
			
 
				+将长文档切分成适合RAG系统的小块：
			
 
				+
			
 
				+```python
			
 
				+classifier = DocumentClassifier()
			
 
				+result = classifier.process_document(
			
 
				+    "long_document.pdf",
			
 
				+    max_chunk_size=1000,
			
 
				+    min_chunk_size=500
			
 
				+)
			
 
				+# 使用result['chunks']进行后续处理
			
 
				+```
			
 
				+
			
 
				+### 场景2: 文档分类整理
			
 
				+
			
 
				+按类别整理大量文档：
			
 
				+
			
 
				+```python
			
 
				+for doc in documents:
			
 
				+    result = classifier.process_document(doc)
			
 
				+    # 结果自动按类别保存到文件夹
			
 
				+```
			
 
				+
			
 
				+### 场景3: 批量数据提取
			
 
				+
			
 
				+从多个文档中提取特定类别的内容：
			
 
				+
			
 
				+```python
			
 
				+all_safety_content = []
			
 
				+for doc in documents:
			
 
				+    result = classifier.process_document(doc)
			
 
				+    safety_chunks = [
			
 
				+        chunk for chunk in result['chunks']
			
 
				+        if chunk['project_plan_type'] == 'safety'
			
 
				+    ]
			
 
				+    all_safety_content.extend(safety_chunks)
			
 
				+```
			
 
				+
			
 
				+## 技术栈
			
 
				+
			
 
				+- **Python 3.7+**
			
 
				+- **PyMuPDF (fitz)**: PDF处理
			
 
				+- **python-docx**: Word处理
			
 
				+- **requests**: HTTP请求
			
 
				+- **大语言模型**: 文本分类
			
 
				+
			
 
				+## 性能考虑
			
 
				+
			
 
				+### 处理速度
			
 
				+
			
 
				+- 小文档（<50页）: 1-2分钟
			
 
				+- 中等文档（50-200页）: 2-5分钟
			
 
				+- 大文档（>200页）: 5-15分钟
			
 
				+
			
 
				+主要耗时在：
			
 
				+1. LLM分类调用
			
 
				+2. 全文提取
			
 
				+3. 标题定位
			
 
				+
			
 
				+### 优化建议
			
 
				+
			
 
				+1. 减小目标层级（处理更少的目录项）
			
 
				+2. 增大分块大小（生成更少的块）
			
 
				+3. 使用更快的LLM服务
			
 
				+4. 批量处理时使用多进程
			
 
				+
			
 
				+## 扩展性
			
 
				+
			
 
				+### 添加新的分类类别
			
 
				+
			
 
				+在`config.yaml`的`categories`配置中添加（`llm_classifier.py`通过配置加载类别映射和类别描述）：
			
 
				+
			
 
				+```yaml
			
 
				+categories:
			
 
				+  mapping:
			
 
				+    编制依据: basis
			
 
				+    新类别: new_category  # 添加新类别
			
 
				+    # ...
			
 
				+  descriptions:
			
 
				+    新类别: "新类别的描述说明"
			
 
				+    # ...
			
 
				+```
			
 
				+
			
 
				+### 支持新的文档格式
			
 
				+
			
 
				+在`toc_extractor.py`和`text_splitter.py`中添加新的提取方法。
			
 
				+
			
 
				+### 自定义分块逻辑
			
 
				+
			
 
				+继承`TextSplitter`类并重写`split_by_hierarchy`方法。
			
 
				+
			
 
				+## 测试
			
 
				+
			
 
				+建议测试的场景：
			
 
				+
			
 
				+1. 不同格式的文档（PDF、Word）
			
 
				+2. 不同的目录结构（数字编号、中文编号）
			
 
				+3. 不同大小的文档（小、中、大）
			
 
				+4. 边界情况（无目录、目录格式异常）
			
 
				+
			
 
				+## 未来计划
			
 
				+
			
 
				+1. 支持更多文档格式（如HTML、Markdown）
			
 
				+2. 增加更多分类类别
			
 
				+3. 优化处理速度
			
 
				+4. 添加可视化界面
			
 
				+5. 支持分布式处理
			
 
				+
			
 
				+## 贡献指南
			
 
				+
			
 
				+欢迎贡献代码！请遵循以下步骤：
			
 
				+
			
 
				+1. Fork项目
			
 
				+2. 创建特性分支
			
 
				+3. 提交更改
			
 
				+4. 推送到分支
			
 
				+5. 创建Pull Request
			
 
				+
			
 
				+## 许可证
			
 
				+
			
 
				+MIT License
			
 
				+
			
 
				+## 联系方式
			
 
				+
			
 
				+如有问题或建议，请通过以下方式联系：
			
 
				+
			
 
				+- 提交Issue
			
 
				+- 发送邮件
			
 
				+- 项目讨论区
			
 
				+
			
 
				+---
			
 
				+
			
 
				+**版本**: 2.0.0  
			
 
				+**更新日期**: 2025-11-13  
			
 
				+**作者**: Your Name
			
 
				+
			
--- a/core/construction_review/doc_worker/llm_classifier.py
+++ b/core/construction_review/doc_worker/llm_classifier.py
@@ -0,0 +1,212 @@
 
				+"""
			
 
				+大语言模型分类模块
			
 
				+使用LLM对目录项进行智能分类
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import re
			
 
				+import requests
			
 
				+
			
 
				+try:
			
 
				+    from .config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class LLMClassifier:
			
 
				+    """大语言模型分类器"""
			
 
				+    
			
 
				+    def __init__(self, model_url=None, model_name=None):
			
 
				+        """
			
 
				+        初始化分类器
			
 
				+        
			
 
				+        参数:
			
 
				+            model_url: 模型API地址（可选，默认从配置文件读取）
			
 
				+            model_name: 模型名称（可选，默认从配置文件读取）
			
 
				+        """
			
 
				+        self.config = get_config()
			
 
				+        self.model_url = model_url or self.config.llm_model_url
			
 
				+        self.model_name = model_name or self.config.llm_model_name
			
 
				+        self.category_mapping = self.config.category_mapping
			
 
				+    
			
 
				+    def classify(self, toc_items, target_level=2):
			
 
				+        """
			
 
				+        对目录项进行智能分类
			
 
				+        
			
 
				+        参数:
			
 
				+            toc_items: 目录项列表
			
 
				+            target_level: 要分类的目标层级
			
 
				+            
			
 
				+        返回:
			
 
				+            dict: 分类结果
			
 
				+        """
			
 
				+        print(f"\n正在对{target_level}级目录进行智能分类...")
			
 
				+        
			
 
				+        # 构建提示词
			
 
				+        prompt_result = self._build_prompt(toc_items, target_level)
			
 
				+        if prompt_result is None:
			
 
				+            print(f"  警告: 未找到{target_level}级目录项")
			
 
				+            return None
			
 
				+        
			
 
				+        prompt, filtered_items = prompt_result
			
 
				+        
			
 
				+        print(f"  找到 {len(filtered_items)} 个{target_level}级目录项")
			
 
				+        print("  正在调用模型进行分类...")
			
 
				+        
			
 
				+        # 调用模型
			
 
				+        llm_response = self._call_api(prompt)
			
 
				+        
			
 
				+        if llm_response is None:
			
 
				+            print("  错误: 模型调用失败")
			
 
				+            return None
			
 
				+        
			
 
				+        print("  模型调用成功，正在解析结果...")
			
 
				+        
			
 
				+        # 解析结果
			
 
				+        classification = self._parse_result(llm_response)
			
 
				+        
			
 
				+        if classification is None:
			
 
				+            print("  错误: 结果解析失败")
			
 
				+            print(f"  模型原始返回:\n{llm_response[:500]}...")
			
 
				+            return None
			
 
				+        
			
 
				+        if "分类结果" not in classification:
			
 
				+            print(f"  警告: 解析结果中没有'分类结果'字段")
			
 
				+            print(f"  模型原始返回:\n{llm_response[:500]}...")
			
 
				+            return None
			
 
				+        
			
 
				+        # 整合分类结果到原始目录项
			
 
				+        classified_items = []
			
 
				+        classification_map = {}
			
 
				+        
			
 
				+        if "分类结果" in classification:
			
 
				+            for item in classification["分类结果"]:
			
 
				+                title = item.get("标题", "")
			
 
				+                category = item.get("类别", "其他")
			
 
				+                classification_map[title] = category
			
 
				+        
			
 
				+        for item in filtered_items:
			
 
				+            title = item['title']
			
 
				+            
			
 
				+            # 尝试直接匹配
			
 
				+            category_cn = classification_map.get(title, None)
			
 
				+            
			
 
				+            # 如果直接匹配失败，尝试去掉编号后匹配
			
 
				+            if category_cn is None:
			
 
				+                # 去掉开头的编号（如 "1 ", "1. ", "第一章 " 等）
			
 
				+                title_without_number = re.sub(r'^[\d一二三四五六七八九十]+[、\.\s]+', '', title)
			
 
				+                title_without_number = re.sub(r'^第[一二三四五六七八九十\d]+[章节条款]\s*', '', title_without_number)
			
 
				+                category_cn = classification_map.get(title_without_number, None)
			
 
				+            
			
 
				+            # 如果还是没找到，尝试模糊匹配
			
 
				+            if category_cn is None:
			
 
				+                for map_title, map_category in classification_map.items():
			
 
				+                    if map_title in title or title in map_title:
			
 
				+                        category_cn = map_category
			
 
				+                        break
			
 
				+            
			
 
				+            # 最后的默认值
			
 
				+            if category_cn is None:
			
 
				+                category_cn = "未分类"
			
 
				+            
			
 
				+            category_en = self.category_mapping.get(category_cn, "other")
			
 
				+            
			
 
				+            classified_items.append({
			
 
				+                'title': title,
			
 
				+                'page': item['page'],
			
 
				+                'level': item['level'],
			
 
				+                'category': category_cn,
			
 
				+                'category_code': category_en,
			
 
				+                'original': item.get('original', '')
			
 
				+            })
			
 
				+        
			
 
				+        print(f"  分类完成！共分类 {len(classified_items)} 个目录项")
			
 
				+        
			
 
				+        return {
			
 
				+            'items': classified_items,
			
 
				+            'total_count': len(classified_items),
			
 
				+            'target_level': target_level
			
 
				+        }
			
 
				+    
			
 
				+    def _build_prompt(self, toc_items, target_level=2):
			
 
				+        """构建目录分类的提示词"""
			
 
				+        # 从配置文件读取分类类别描述
			
 
				+        categories = self.config.category_descriptions
			
 
				+        
			
 
				+        # 筛选出指定层级的目录项
			
 
				+        filtered_items = [item for item in toc_items if item['level'] == target_level]
			
 
				+        
			
 
				+        if not filtered_items:
			
 
				+            return None
			
 
				+        
			
 
				+        # 构建目录项列表字符串
			
 
				+        toc_list_str = "\n".join([f"{i+1}. {item['title']}" for i, item in enumerate(filtered_items)])
			
 
				+        
			
 
				+        # 构建分类说明字符串
			
 
				+        category_desc = "\n".join([f"- {cat}: {desc}" for cat, desc in categories.items()])
			
 
				+        
			
 
				+        # 从配置文件读取提示词模板
			
 
				+        prompt_template = self.config.classification_prompt_template
			
 
				+        
			
 
				+        # 替换模板中的占位符
			
 
				+        prompt = prompt_template.format(
			
 
				+            category_descriptions=category_desc,
			
 
				+            toc_items=toc_list_str
			
 
				+        )
			
 
				+        
			
 
				+        return prompt, filtered_items
			
 
				+    
			
 
				+    def _call_api(self, prompt, temperature=None):
			
 
				+        """调用大语言模型API进行目录分类"""
			
 
				+        if temperature is None:
			
 
				+            temperature = self.config.llm_temperature
			
 
				+        
			
 
				+        try:
			
 
				+            headers = {
			
 
				+                "Content-Type": "application/json"
			
 
				+            }
			
 
				+            
			
 
				+            data = {
			
 
				+                "model": self.model_name,
			
 
				+                "messages": [
			
 
				+                    {
			
 
				+                        "role": "user",
			
 
				+                        "content": prompt
			
 
				+                    }
			
 
				+                ],
			
 
				+                "stream": False,
			
 
				+                "temperature": temperature
			
 
				+            }
			
 
				+            
			
 
				+            timeout = self.config.llm_timeout
			
 
				+            response = requests.post(self.model_url, headers=headers, json=data, timeout=timeout)
			
 
				+            response.raise_for_status()
			
 
				+            
			
 
				+            result = response.json()
			
 
				+            content = result.get("choices", [{}])[0].get("message", {}).get("content", "")
			
 
				+            
			
 
				+            return content
			
 
				+        
			
 
				+        except requests.exceptions.RequestException as e:
			
 
				+            print(f"  错误: 调用模型API失败 - {str(e)}")
			
 
				+            return None
			
 
				+        except Exception as e:
			
 
				+            print(f"  错误: 解析模型返回结果失败 - {str(e)}")
			
 
				+            return None
			
 
				+    
			
 
				+    def _parse_result(self, llm_response):
			
 
				+        """解析模型返回的分类结果"""
			
 
				+        try:
			
 
				+            # 尝试提取JSON部分
			
 
				+            json_match = re.search(r'\{[\s\S]*\}', llm_response)
			
 
				+            if json_match:
			
 
				+                json_str = json_match.group(0)
			
 
				+                result = json.loads(json_str)
			
 
				+                return result
			
 
				+            else:
			
 
				+                print("  警告: 无法从模型返回中提取JSON格式")
			
 
				+                return None
			
 
				+        except json.JSONDecodeError as e:
			
 
				+            print(f"  错误: 解析JSON失败 - {str(e)}")
			
 
				+            return None
			
 
				+
			
--- a/core/construction_review/doc_worker/result_saver.py
+++ b/core/construction_review/doc_worker/result_saver.py
@@ -0,0 +1,294 @@
 
				+"""
			
 
				+结果保存模块
			
 
				+保存分类和切分结果到多种格式
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+from pathlib import Path
			
 
				+from datetime import datetime
			
 
				+from collections import defaultdict, Counter
			
 
				+
			
 
				+try:
			
 
				+    from .config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class ResultSaver:
			
 
				+    """结果保存器"""
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        self.config = get_config()
			
 
				+    
			
 
				+    def save_all(self, file_path, toc_info, classification_result, chunks, output_dir):
			
 
				+        """
			
 
				+        保存所有结果
			
 
				+        
			
 
				+        参数:
			
 
				+            file_path: 源文件路径
			
 
				+            toc_info: 目录信息
			
 
				+            classification_result: 分类结果
			
 
				+            chunks: 文本块列表
			
 
				+            output_dir: 输出目录
			
 
				+            
			
 
				+        返回:
			
 
				+            dict: 保存的文件路径
			
 
				+        """
			
 
				+        output_path = Path(output_dir)
			
 
				+        output_path.mkdir(parents=True, exist_ok=True)
			
 
				+        
			
 
				+        saved_files = {}
			
 
				+        
			
 
				+        # 保存完整JSON
			
 
				+        json_file = self._save_json(file_path, toc_info, classification_result, chunks, output_dir)
			
 
				+        saved_files['json'] = json_file
			
 
				+        
			
 
				+        # 按类别保存文本块
			
 
				+        print("\n按类别保存文本块:")
			
 
				+        category_files = self._save_by_category(chunks, file_path, output_dir)
			
 
				+        saved_files['category_files'] = category_files
			
 
				+        
			
 
				+        # 创建索引
			
 
				+        index_file = self._create_index(chunks, file_path, output_dir)
			
 
				+        saved_files['index'] = index_file
			
 
				+        
			
 
				+        # 保存统计报告
			
 
				+        report_file = self._save_report(file_path, toc_info, classification_result, chunks, output_dir)
			
 
				+        saved_files['report'] = report_file
			
 
				+        
			
 
				+        return saved_files
			
 
				+    
			
 
				+    def _save_json(self, file_path, toc_info, classification_result, chunks, output_dir):
			
 
				+        """保存完整的分类和切分结果到JSON"""
			
 
				+        output_path = Path(output_dir)
			
 
				+        output_path.mkdir(parents=True, exist_ok=True)
			
 
				+        
			
 
				+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
			
 
				+        file_name = Path(file_path).stem
			
 
				+        
			
 
				+        json_file = output_path / f"{file_name}_完整结果_{timestamp}.json"
			
 
				+        
			
 
				+        output_data = {
			
 
				+            'source_file': str(file_path),
			
 
				+            'process_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
			
 
				+            'toc_summary': {
			
 
				+                'total_items': toc_info['toc_count'],
			
 
				+                'toc_pages': toc_info['toc_pages']
			
 
				+            },
			
 
				+            'classification': classification_result,
			
 
				+            'chunks': chunks
			
 
				+        }
			
 
				+        
			
 
				+        with open(json_file, 'w', encoding='utf-8') as f:
			
 
				+            json.dump(output_data, f, ensure_ascii=False, indent=2)
			
 
				+        
			
 
				+        print(f"已保存完整结果JSON: {json_file}")
			
 
				+        return str(json_file)
			
 
				+    
			
 
				+    def _save_by_category(self, chunks, file_path, output_dir):
			
 
				+        """按类别保存文本块到独立的Markdown文件"""
			
 
				+        output_path = Path(output_dir)
			
 
				+        file_name = Path(file_path).stem
			
 
				+        
			
 
				+        # 按类别分组
			
 
				+        category_groups = defaultdict(list)
			
 
				+        for chunk in chunks:
			
 
				+            category = chunk['project_plan_type']
			
 
				+            category_groups[category].append(chunk)
			
 
				+        
			
 
				+        saved_files = {}
			
 
				+        
			
 
				+        # 为每个类别创建子文件夹并保存文件
			
 
				+        for category, category_chunks in category_groups.items():
			
 
				+            category_dir = output_path / self._sanitize_filename(category)
			
 
				+            category_dir.mkdir(parents=True, exist_ok=True)
			
 
				+            
			
 
				+            category_files = []
			
 
				+            
			
 
				+            # 为每个文本块创建一个MD文件
			
 
				+            for i, chunk in enumerate(category_chunks, 1):
			
 
				+                section_label = chunk['section_label']
			
 
				+                safe_label = self._sanitize_filename(section_label)
			
 
				+                
			
 
				+                md_filename = f"{i:03d}_{safe_label}.md"
			
 
				+                md_file = category_dir / md_filename
			
 
				+                
			
 
				+                with open(md_file, 'w', encoding='utf-8') as f:
			
 
				+                    f.write(f"# {section_label}\n\n")
			
 
				+                    f.write(f"**类别**: {category}\n\n")
			
 
				+                    f.write(f"**来源文件**: {chunk['file_name']}\n\n")
			
 
				+                    f.write(f"**页码**: {chunk['element_tag']['page']}\n\n")
			
 
				+                    f.write(f"**块ID**: {chunk['chunk_id']}\n\n")
			
 
				+                    f.write(f"**字符数**: {len(chunk['review_chunk_content'])}\n\n")
			
 
				+                    f.write("---\n\n")
			
 
				+                    f.write(chunk['review_chunk_content'])
			
 
				+                    
			
 
				+                    if not chunk['review_chunk_content'].endswith('\n'):
			
 
				+                        f.write('\n')
			
 
				+                
			
 
				+                category_files.append(str(md_file))
			
 
				+            
			
 
				+            saved_files[category] = category_files
			
 
				+            print(f"  [{category}] 保存了 {len(category_files)} 个文件到: {category_dir}")
			
 
				+        
			
 
				+        return saved_files
			
 
				+    
			
 
				+    def _create_index(self, chunks, file_path, output_dir):
			
 
				+        """创建按类别分组的索引文件"""
			
 
				+        output_path = Path(output_dir)
			
 
				+        output_path.mkdir(parents=True, exist_ok=True)
			
 
				+        
			
 
				+        file_name = Path(file_path).stem
			
 
				+        index_file = output_path / "README.md"
			
 
				+        
			
 
				+        # 按类别分组
			
 
				+        category_groups = defaultdict(list)
			
 
				+        for chunk in chunks:
			
 
				+            category_groups[chunk['project_plan_type']].append(chunk)
			
 
				+        
			
 
				+        with open(index_file, 'w', encoding='utf-8') as f:
			
 
				+            f.write(f"# {file_name} - 分类切分结果索引\n\n")
			
 
				+            f.write(f"**来源文件**: {Path(file_path).name}\n\n")
			
 
				+            f.write(f"**处理时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
			
 
				+            f.write(f"**文本块总数**: {len(chunks)}\n\n")
			
 
				+            f.write(f"**类别数量**: {len(category_groups)}\n\n")
			
 
				+            
			
 
				+            # 统计信息
			
 
				+            total_chars = sum(len(chunk['review_chunk_content']) for chunk in chunks)
			
 
				+            f.write(f"**总字符数**: {total_chars}\n\n")
			
 
				+            
			
 
				+            f.write("---\n\n")
			
 
				+            f.write("## 分类统计\n\n")
			
 
				+            
			
 
				+            # 按类别统计
			
 
				+            for category, category_chunks in sorted(category_groups.items()):
			
 
				+                category_chars = sum(len(chunk['review_chunk_content']) for chunk in category_chunks)
			
 
				+                f.write(f"- **{category}**: {len(category_chunks)} 个文本块, {category_chars} 字符\n")
			
 
				+            
			
 
				+            f.write("\n---\n\n")
			
 
				+            f.write("## 详细目录\n\n")
			
 
				+            
			
 
				+            # 按类别输出详细目录
			
 
				+            for category, category_chunks in sorted(category_groups.items()):
			
 
				+                f.write(f"### {category}\n\n")
			
 
				+                
			
 
				+                for i, chunk in enumerate(category_chunks, 1):
			
 
				+                    section_label = chunk['section_label']
			
 
				+                    safe_label = self._sanitize_filename(section_label)
			
 
				+                    category_safe = self._sanitize_filename(category)
			
 
				+                    md_filename = f"{i:03d}_{safe_label}.md"
			
 
				+                    
			
 
				+                    char_count = len(chunk['review_chunk_content'])
			
 
				+                    page = chunk['element_tag']['page']
			
 
				+                    
			
 
				+                    f.write(f"{i}. [{section_label}]({category_safe}/{md_filename}) - 页码: {page}, 字符数: {char_count}\n")
			
 
				+                
			
 
				+                f.write("\n")
			
 
				+        
			
 
				+        print(f"已保存索引文件: {index_file}")
			
 
				+        return str(index_file)
			
 
				+    
			
 
				+    def _save_report(self, file_path, toc_info, classification_result, chunks, output_dir):
			
 
				+        """保存详细的统计报告"""
			
 
				+        output_path = Path(output_dir)
			
 
				+        output_path.mkdir(parents=True, exist_ok=True)
			
 
				+        
			
 
				+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
			
 
				+        file_name = Path(file_path).stem
			
 
				+        
			
 
				+        report_file = output_path / f"{file_name}_统计报告_{timestamp}.txt"
			
 
				+        
			
 
				+        with open(report_file, 'w', encoding='utf-8') as f:
			
 
				+            f.write("=" * 100 + "\n")
			
 
				+            f.write("文档分类切分统计报告\n")
			
 
				+            f.write("=" * 100 + "\n\n")
			
 
				+            
			
 
				+            f.write(f"源文件: {Path(file_path).name}\n")
			
 
				+            f.write(f"处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
			
 
				+            
			
 
				+            # 目录统计
			
 
				+            f.write("=" * 100 + "\n")
			
 
				+            f.write("目录提取统计\n")
			
 
				+            f.write("=" * 100 + "\n\n")
			
 
				+            f.write(f"目录项总数: {toc_info['toc_count']}\n")
			
 
				+            f.write(f"目录所在页: {', '.join(map(str, toc_info['toc_pages']))}\n\n")
			
 
				+            
			
 
				+            # 层级统计
			
 
				+            level_counts = Counter([item['level'] for item in toc_info['toc_items']])
			
 
				+            f.write("目录层级分布:\n")
			
 
				+            for level in sorted(level_counts.keys()):
			
 
				+                f.write(f"  {level}级: {level_counts[level]} 项\n")
			
 
				+            f.write("\n")
			
 
				+            
			
 
				+            # 分类统计
			
 
				+            if classification_result:
			
 
				+                f.write("=" * 100 + "\n")
			
 
				+                f.write("分类统计\n")
			
 
				+                f.write("=" * 100 + "\n\n")
			
 
				+                
			
 
				+                category_counts = Counter([item['category'] for item in classification_result['items']])
			
 
				+                f.write(f"已分类项数: {classification_result['total_count']}\n")
			
 
				+                f.write(f"分类数量: {len(category_counts)}\n\n")
			
 
				+                
			
 
				+                f.write("各类别统计:\n")
			
 
				+                for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
			
 
				+                    f.write(f"  {category}: {count} 项\n")
			
 
				+                f.write("\n")
			
 
				+            
			
 
				+            # 文本块统计
			
 
				+            f.write("=" * 100 + "\n")
			
 
				+            f.write("文本块切分统计\n")
			
 
				+            f.write("=" * 100 + "\n\n")
			
 
				+            
			
 
				+            f.write(f"文本块总数: {len(chunks)}\n\n")
			
 
				+            
			
 
				+            total_chars = sum(len(chunk['review_chunk_content']) for chunk in chunks)
			
 
				+            avg_chars = total_chars / len(chunks) if chunks else 0
			
 
				+            
			
 
				+            f.write(f"总字符数: {total_chars}\n")
			
 
				+            f.write(f"平均每块字符数: {avg_chars:.1f}\n\n")
			
 
				+            
			
 
				+            # 按类别统计文本块
			
 
				+            category_groups = defaultdict(list)
			
 
				+            for chunk in chunks:
			
 
				+                category_groups[chunk['project_plan_type']].append(chunk)
			
 
				+            
			
 
				+            f.write("按类别统计:\n")
			
 
				+            for category, category_chunks in sorted(category_groups.items()):
			
 
				+                category_chars = sum(len(chunk['review_chunk_content']) for chunk in category_chunks)
			
 
				+                f.write(f"  {category}: {len(category_chunks)} 块, {category_chars} 字符\n")
			
 
				+            f.write("\n")
			
 
				+            
			
 
				+            # 详细列表
			
 
				+            f.write("=" * 100 + "\n")
			
 
				+            f.write("文本块详细列表\n")
			
 
				+            f.write("=" * 100 + "\n\n")
			
 
				+            
			
 
				+            for category, category_chunks in sorted(category_groups.items()):
			
 
				+                f.write(f"\n【{category}】\n")
			
 
				+                f.write("-" * 100 + "\n")
			
 
				+                
			
 
				+                for i, chunk in enumerate(category_chunks, 1):
			
 
				+                    char_count = len(chunk['review_chunk_content'])
			
 
				+                    page = chunk['element_tag']['page']
			
 
				+                    f.write(f"  [{i}] {chunk['section_label']}\n")
			
 
				+                    f.write(f"      页码: {page}, 字符数: {char_count}, 块ID: {chunk['chunk_id']}\n")
			
 
				+        
			
 
				+        print(f"已保存统计报告: {report_file}")
			
 
				+        return str(report_file)
			
 
				+    
			
 
				+    def _sanitize_filename(self, filename):
			
 
				+        """清理文件名，移除或替换不合法字符"""
			
 
				+        invalid_chars = r'<>:"/\|?*'
			
 
				+        for char in invalid_chars:
			
 
				+            filename = filename.replace(char, '_')
			
 
				+        
			
 
				+        filename = filename.strip()
			
 
				+        
			
 
				+        # 从配置读取最大文件名长度
			
 
				+        max_length = self.config.max_filename_length
			
 
				+        if len(filename) > max_length:
			
 
				+            filename = filename[:max_length]
			
 
				+        
			
 
				+        return filename
			
 
				+
			
--- a/core/construction_review/doc_worker/text_splitter.py
+++ b/core/construction_review/doc_worker/text_splitter.py
@@ -0,0 +1,814 @@
 
				+"""
			
 
				+文本切分模块
			
 
				+实现按目录层级和字符数的智能切分逻辑
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+from pathlib import Path
			
 
				+from difflib import SequenceMatcher
			
 
				+import fitz  # PyMuPDF
			
 
				+from docx import Document
			
 
				+
			
 
				+try:
			
 
				+    from .config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class TextSplitter:
			
 
				+    """文本切分器，支持PDF和Word格式"""
			
 
				+    
			
 
    def __init__(self):
        """Store the object returned by ``get_config()`` as ``self.config``.

        Other methods read settings from it (e.g. ``fuzzy_threshold`` when
        locating titles).
        """
        self.config = get_config()
			
 
				+    
			
 
    def extract_full_text(self, file_path):
        """Extract the full text of a document, one entry per page.

        The extractor is chosen from the lower-cased file suffix.

        Args:
            file_path: Path of a PDF or Word document.

        Returns:
            list: Per-page content as produced by the format-specific
            extractor.

        Raises:
            ValueError: If the suffix is not ``.pdf``, ``.docx`` or ``.doc``.
        """
        document = Path(file_path)
        suffix = document.suffix.lower()

        if suffix == '.pdf':
            return self._extract_from_pdf(document)

        # NOTE(review): ".doc" goes to the same reader as ".docx"; confirm
        # the Word reader can open legacy .doc files.
        if suffix in ('.docx', '.doc'):
            return self._extract_from_word(document)

        raise ValueError(f"不支持的文件格式: {suffix}")
			
 
				+    
			
 
				+    def _extract_from_pdf(self, pdf_path):
			
 
				+        """提取PDF的全文内容"""
			
 
				+        try:
			
 
				+            doc = fitz.open(pdf_path)
			
 
				+            pages_content = []
			
 
				+            current_pos = 0
			
 
				+            
			
 
				+            for page_num in range(len(doc)):
			
 
				+                page = doc[page_num]
			
 
				+                text = page.get_text()
			
 
				+                
			
 
				+                pages_content.append({
			
 
				+                    'page_num': page_num + 1,
			
 
				+                    'text': text,
			
 
				+                    'start_pos': current_pos,
			
 
				+                    'end_pos': current_pos + len(text),
			
 
				+                    'source_file': str(pdf_path)
			
 
				+                })
			
 
				+                
			
 
				+                current_pos += len(text)
			
 
				+            
			
 
				+            doc.close()
			
 
				+            return pages_content
			
 
				+        except Exception as e:
			
 
				+            print(f"  错误: 无法读取PDF全文 - {str(e)}")
			
 
				+            return []
			
 
				+    
			
 
				+    def _extract_from_word(self, word_path):
			
 
				+        """提取Word的全文内容（包括段落和表格）"""
			
 
				+        try:
			
 
				+            doc = Document(word_path)
			
 
				+            pages_content = []
			
 
				+            current_pos = 0
			
 
				+            
			
 
				+            # 提取所有内容（段落和表格按文档顺序）
			
 
				+            all_content = []
			
 
				+            
			
 
				+            # 遍历文档的所有元素（段落和表格）
			
 
				+            for element in doc.element.body:
			
 
				+                # 检查是段落还是表格
			
 
				+                if element.tag.endswith('p'):  # 段落
			
 
				+                    for para in doc.paragraphs:
			
 
				+                        if para._element == element:
			
 
				+                            text = para.text
			
 
				+                            if text.strip():
			
 
				+                                all_content.append(text)
			
 
				+                            break
			
 
				+                elif element.tag.endswith('tbl'):  # 表格
			
 
				+                    for table in doc.tables:
			
 
				+                        if table._element == element:
			
 
				+                            table_text = self._extract_table_text(table)
			
 
				+                            all_content.append(table_text)
			
 
				+                            break
			
 
				+            
			
 
				+            # 模拟分页：每30个元素作为一"页"
			
 
				+            elements_per_page = 30
			
 
				+            for page_num in range(0, len(all_content), elements_per_page):
			
 
				+                page_elements = all_content[page_num:page_num + elements_per_page]
			
 
				+                page_text = '\n'.join(page_elements)
			
 
				+                
			
 
				+                pages_content.append({
			
 
				+                    'page_num': page_num // elements_per_page + 1,
			
 
				+                    'text': page_text,
			
 
				+                    'start_pos': current_pos,
			
 
				+                    'end_pos': current_pos + len(page_text),
			
 
				+                    'source_file': str(word_path)
			
 
				+                })
			
 
				+                
			
 
				+                current_pos += len(page_text)
			
 
				+            
			
 
				+            return pages_content
			
 
				+        except Exception as e:
			
 
				+            print(f"  错误: 无法读取Word全文 - {str(e)}")
			
 
				+            return []
			
 
				+    
			
 
				+    def _extract_table_text(self, table):
			
 
				+        """提取表格内容为文本格式"""
			
 
				+        table_text = []
			
 
				+        for row in table.rows:
			
 
				+            row_text = []
			
 
				+            for cell in row.cells:
			
 
				+                cell_text = cell.text.strip().replace('\n', ' ')
			
 
				+                row_text.append(cell_text)
			
 
				+            table_text.append('\t'.join(row_text))
			
 
				+        
			
 
				+        return '\n[表格开始]\n' + '\n'.join(table_text) + '\n[表格结束]\n'
			
 
				+    
			
 
				+    def split_by_hierarchy(self, classified_items, pages_content, toc_info, 
			
 
				+                          target_level=2, max_chunk_size=1000, min_chunk_size=500):
			
 
				+        """
			
 
				+        按目录层级和字符数智能切分文本
			
 
				+        
			
 
				+        新的分块逻辑：
			
 
				+        1. 按目录项定位到指定层级的正文标题
			
 
				+        2. 在指定层级正文标题所属的正文块中，先按目录项的最低层级子标题进行分块
			
 
				+        3. 然后逐个判断字符数：
			
 
				+           - 超过max_chunk_size的进行分割(句子级，保持语义完整，分割的块不参与合并)
			
 
				+           - 不足min_chunk_size的块进行合并(合并后不能超过max_chunk_size，否则不合并)
			
 
				+        
			
 
				+        参数:
			
 
				+            classified_items: 已分类的目录项列表
			
 
				+            pages_content: 文档全文内容（按页）
			
 
				+            toc_info: 目录信息
			
 
				+            target_level: 目标层级
			
 
				+            max_chunk_size: 最大分块字符数
			
 
				+            min_chunk_size: 最小分块字符数
			
 
				+            
			
 
				+        返回:
			
 
				+            list: 带分类信息的文本块列表
			
 
				+        """
			
 
				+        full_text = ''.join([page['text'] for page in pages_content])
			
 
				+        
			
 
				+        print(f"  正在定位{len(classified_items)}个已分类的标题...")
			
 
				+        print(f"  目录所在页: {toc_info['toc_pages']}")
			
 
				+        
			
 
				+        # 步骤1: 在正文中定位已分类的标题（跳过目录页）
			
 
				+        located_titles = self._find_title_positions(
			
 
				+            classified_items, 
			
 
				+            full_text, 
			
 
				+            pages_content, 
			
 
				+            toc_info['toc_pages']
			
 
				+        )
			
 
				+        
			
 
				+        # 只保留成功定位的标题
			
 
				+        found_titles = [t for t in located_titles if t['found']]
			
 
				+        
			
 
				+        if not found_titles:
			
 
				+            print(f"  错误: 未能在正文中定位任何标题")
			
 
				+            return []
			
 
				+        
			
 
				+        print(f"  成功定位 {len(found_titles)}/{len(classified_items)} 个标题")
			
 
				+        
			
 
				+        # 按位置排序
			
 
				+        found_titles.sort(key=lambda x: x['position'])
			
 
				+        
			
 
				+        # 步骤2: 提取所有层级的目录项，用于在正文块中查找子标题
			
 
				+        all_toc_items = toc_info['toc_items']
			
 
				+        
			
 
				+        # 步骤3: 对每个目标层级的标题，提取其正文块并进行智能切分
			
 
				+        all_chunks = []
			
 
				+        
			
 
				+        for i, title_info in enumerate(found_titles):
			
 
				+            start_pos = title_info['position']
			
 
				+            
			
 
				+            # 确定正文块的结束位置（下一个同级标题的位置）
			
 
				+            if i + 1 < len(found_titles):
			
 
				+                end_pos = found_titles[i + 1]['position']
			
 
				+            else:
			
 
				+                end_pos = len(full_text)
			
 
				+            
			
 
				+            # 提取正文块
			
 
				+            content_block = full_text[start_pos:end_pos]
			
 
				+            
			
 
				+            # 在正文块中查找子标题（比目标层级更低的层级）
			
 
				+            sub_chunks = self._split_by_sub_titles(
			
 
				+                content_block,
			
 
				+                all_toc_items,
			
 
				+                title_info,
			
 
				+                target_level,
			
 
				+                max_chunk_size,
			
 
				+                min_chunk_size
			
 
				+            )
			
 
				+            
			
 
				+            # 为每个子块添加元数据
			
 
				+            for j, sub_chunk in enumerate(sub_chunks, 1):
			
 
				+                # 计算实际页码
			
 
				+                chunk_start_pos = start_pos + sub_chunk['relative_start']
			
 
				+                page_num = self._get_page_number(chunk_start_pos, pages_content)
			
 
				+                
			
 
				+                # 构建section_label（层级路径）
			
 
				+                section_label = self._build_section_label(
			
 
				+                    title_info['title'],
			
 
				+                    sub_chunk.get('sub_title', '')
			
 
				+                )
			
 
				+                
			
 
				+                # 提取最低层级标题的编号
			
 
				+                sub_title = sub_chunk.get('sub_title', '')
			
 
				+                if sub_title:
			
 
				+                    title_number = self._extract_title_number(sub_title)
			
 
				+                else:
			
 
				+                    # 如果没有子标题，从父标题提取
			
 
				+                    title_number = self._extract_title_number(title_info['title'])
			
 
				+                
			
 
				+                # 构建chunk_id格式：doc_chunk_<serial_number>_<序号>
			
 
				+                # 序号从1开始（如果合并了会从0开始）
			
 
				+                chunk_id_str = f"doc_chunk_{title_number}_{j}" if title_number else f"doc_chunk_{j}"
			
 
				+                
			
 
				+                all_chunks.append({
			
 
				+                    'file_name': Path(pages_content[0].get('source_file', 'unknown')).name if pages_content else 'unknown',
			
 
				+                    'chunk_id': chunk_id_str,
			
 
				+                    'section_label': section_label,
			
 
				+                    'project_plan_type': 'bridge_up_part',
			
 
				+                    'element_tag': {
			
 
				+                        'chunk_id': chunk_id_str,
			
 
				+                        'page': page_num,
			
 
				+                        'serial_number': title_number if title_number else str(i + 1)
			
 
				+                    },
			
 
				+                    'review_chunk_content': sub_chunk['content'],
			
 
				+                    '_title_number': title_number,  # 临时存储，用于合并时判断
			
 
				+                    '_local_index': j  # 临时存储局部索引
			
 
				+                })
			
 
				+        
			
 
				+        # 步骤4: 对小块进行合并
			
 
				+        merged_chunks = self._merge_small_chunks(all_chunks, max_chunk_size, min_chunk_size)
			
 
				+        
			
 
				+        # 步骤5: 生成最终的chunk_id和serial_number
			
 
				+        final_chunks = self._finalize_chunk_ids(merged_chunks)
			
 
				+        
			
 
				+        print(f"  初始切分: {len(all_chunks)} 个块")
			
 
				+        print(f"  合并后: {len(merged_chunks)} 个块")
			
 
				+        
			
 
				+        return final_chunks
			
 
				+    
			
 
				+    def _find_title_positions(self, classified_items, full_text, pages_content, toc_pages):
			
 
				+        """在正文中定位已分类的标题位置（跳过目录页）"""
			
 
				+        # 计算目录页的文本范围
			
 
				+        toc_start_pos = float('inf')
			
 
				+        toc_end_pos = 0
			
 
				+        
			
 
				+        for page in pages_content:
			
 
				+            if page['page_num'] in toc_pages:
			
 
				+                toc_start_pos = min(toc_start_pos, page['start_pos'])
			
 
				+                toc_end_pos = max(toc_end_pos, page['end_pos'])
			
 
				+        
			
 
				+        print(f"    目录页范围: {toc_start_pos} - {toc_end_pos}")
			
 
				+        
			
 
				+        located_titles = []
			
 
				+        
			
 
				+        for item in classified_items:
			
 
				+            title = item['title']
			
 
				+            category = item['category']
			
 
				+            category_code = item.get('category_code', 'other')
			
 
				+            
			
 
				+            # 在全文中查找标题（使用配置的模糊匹配阈值）
			
 
				+            fuzzy_threshold = self.config.fuzzy_threshold
			
 
				+            pos = self._find_title_in_text(title, full_text, fuzzy_threshold=fuzzy_threshold)
			
 
				+            
			
 
				+            # 如果找到的位置在目录页范围内，继续查找下一个出现
			
 
				+            if pos >= 0 and toc_start_pos <= pos < toc_end_pos:
			
 
				+                print(f"    [跳过目录] {title} -> 位置: {pos} (在目录页)")
			
 
				+                
			
 
				+                # 尝试在目录页之后继续查找
			
 
				+                search_start = toc_end_pos
			
 
				+                remaining_text = full_text[search_start:]
			
 
				+                pos_in_remaining = self._find_title_in_text(title, remaining_text, fuzzy_threshold=fuzzy_threshold)
			
 
				+                
			
 
				+                if pos_in_remaining >= 0:
			
 
				+                    pos = search_start + pos_in_remaining
			
 
				+                    print(f"    [找到正文] {title} -> 位置: {pos}")
			
 
				+                else:
			
 
				+                    pos = -1
			
 
				+                    print(f"    [未找到] {title} (目录页之后)")
			
 
				+            
			
 
				+            if pos >= 0:
			
 
				+                # 确认位置不在目录页
			
 
				+                if not (toc_start_pos <= pos < toc_end_pos):
			
 
				+                    # 找到对应的页码
			
 
				+                    page_num = self._get_page_number(pos, pages_content)
			
 
				+                    
			
 
				+                    located_titles.append({
			
 
				+                        'title': title,
			
 
				+                        'category': category,
			
 
				+                        'category_code': category_code,
			
 
				+                        'position': pos,
			
 
				+                        'toc_page': item.get('page', ''),
			
 
				+                        'actual_page': page_num,
			
 
				+                        'found': True
			
 
				+                    })
			
 
				+                    print(f"    [确认] {title} -> 页码: {page_num}, 位置: {pos}")
			
 
				+                else:
			
 
				+                    print(f"    [未找到] {title} (只在目录页)")
			
 
				+                    located_titles.append({
			
 
				+                        'title': title,
			
 
				+                        'category': category,
			
 
				+                        'category_code': category_code,
			
 
				+                        'position': -1,
			
 
				+                        'toc_page': item.get('page', ''),
			
 
				+                        'found': False
			
 
				+                    })
			
 
				+            else:
			
 
				+                print(f"    [未找到] {title}")
			
 
				+                located_titles.append({
			
 
				+                    'title': title,
			
 
				+                    'category': category,
			
 
				+                    'category_code': category_code,
			
 
				+                    'position': -1,
			
 
				+                    'toc_page': item.get('page', ''),
			
 
				+                    'found': False
			
 
				+                })
			
 
				+        
			
 
				+        return located_titles
			
 
				+    
			
 
				+    def _find_title_in_text(self, title, text, fuzzy_threshold=0.85):
			
 
				+        """在文本中查找标题的位置"""
			
 
				+        normalized_title = self._normalize_title(title)
			
 
				+        
			
 
				+        # 方法1: 精确匹配
			
 
				+        if normalized_title in text:
			
 
				+            return text.index(normalized_title)
			
 
				+        
			
 
				+        # 方法2: 移除所有空格后匹配
			
 
				+        title_no_space = normalized_title.replace(' ', '')
			
 
				+        text_no_space = text.replace(' ', '')
			
 
				+        if title_no_space in text_no_space:
			
 
				+            pos_no_space = text_no_space.index(title_no_space)
			
 
				+            return pos_no_space
			
 
				+        
			
 
				+        # 方法3: 按行查找，匹配度最高的行
			
 
				+        lines = text.split('\n')
			
 
				+        current_pos = 0
			
 
				+        best_ratio = 0
			
 
				+        best_pos = -1
			
 
				+        
			
 
				+        for line in lines:
			
 
				+            line_stripped = line.strip()
			
 
				+            
			
 
				+            if len(line_stripped) < 3:
			
 
				+                current_pos += len(line) + 1
			
 
				+                continue
			
 
				+            
			
 
				+            # 计算相似度
			
 
				+            ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
			
 
				+            
			
 
				+            if ratio > best_ratio:
			
 
				+                best_ratio = ratio
			
 
				+                best_pos = current_pos
			
 
				+            
			
 
				+            current_pos += len(line) + 1
			
 
				+        
			
 
				+        # 如果找到相似度足够高的行
			
 
				+        if best_ratio >= fuzzy_threshold:
			
 
				+            return best_pos
			
 
				+        
			
 
				+        return -1
			
 
				+    
			
 
				+    def _normalize_title(self, title):
			
 
				+        """标准化标题用于匹配"""
			
 
				+        normalized = re.sub(r'\s+', ' ', title)
			
 
				+        normalized = normalized.strip()
			
 
				+        return normalized
			
 
				+    
			
 
				+    def _extract_title_number(self, title):
			
 
				+        """
			
 
				+        从标题中提取编号部分
			
 
				+        
			
 
				+        例如：
			
 
				+        "1.5 施工条件" -> "1.5"
			
 
				+        "1.6 风险辨识与分级" -> "1.6"
			
 
				+        "1 工程概况" -> "1"
			
 
				+        
			
 
				+        参数:
			
 
				+            title: 标题字符串
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 编号部分，如果未找到则返回空字符串
			
 
				+        """
			
 
				+        # 匹配数字编号格式（如 1.5, 1.6, 1.2.3等）
			
 
				+        number_match = re.match(r'^(\d+(?:\.\d+)*)', title)
			
 
				+        if number_match:
			
 
				+            return number_match.group(1)
			
 
				+        
			
 
				+        # 匹配中文编号格式（如 一、二、三等）
			
 
				+        chinese_match = re.match(r'^([一二三四五六七八九十]+)[、．.]', title)
			
 
				+        if chinese_match:
			
 
				+            return chinese_match.group(1)
			
 
				+        
			
 
				+        return ""
			
 
				+    
			
 
				+    def _get_page_number(self, position, pages_content):
			
 
				+        """根据位置获取页码"""
			
 
				+        for page in pages_content:
			
 
				+            if page['start_pos'] <= position < page['end_pos']:
			
 
				+                return page['page_num']
			
 
				+        return 1
			
 
				+    
			
 
				+    def _split_by_sub_titles(self, content_block, all_toc_items, parent_title_info, 
			
 
				+                            target_level, max_chunk_size, min_chunk_size):
			
 
				+        """
			
 
				+        在正文块中按子标题进行切分
			
 
				+        
			
 
				+        参数:
			
 
				+            content_block: 正文块内容
			
 
				+            all_toc_items: 所有目录项
			
 
				+            parent_title_info: 父标题信息
			
 
				+            target_level: 目标层级
			
 
				+            max_chunk_size: 最大分块字符数
			
 
				+            min_chunk_size: 最小分块字符数
			
 
				+            
			
 
				+        返回:
			
 
				+            list: 子块列表
			
 
				+        """
			
 
				+        # 查找比目标层级更低的子标题
			
 
				+        sub_titles = []
			
 
				+        fuzzy_threshold = self.config.fuzzy_threshold
			
 
				+        for toc_item in all_toc_items:
			
 
				+            if toc_item['level'] > target_level:
			
 
				+                # 在正文块中查找这个子标题
			
 
				+                pos = self._find_title_in_text(toc_item['title'], content_block, fuzzy_threshold=fuzzy_threshold)
			
 
				+                if pos >= 0:
			
 
				+                    sub_titles.append({
			
 
				+                        'title': toc_item['title'],
			
 
				+                        'level': toc_item['level'],
			
 
				+                        'position': pos
			
 
				+                    })
			
 
				+        
			
 
				+        # 按位置排序
			
 
				+        sub_titles.sort(key=lambda x: x['position'])
			
 
				+        
			
 
				+        # 如果没有找到子标题，将整个正文块作为一个块
			
 
				+        if not sub_titles:
			
 
				+            # 检查是否需要分割
			
 
				+            if len(content_block) > max_chunk_size:
			
 
				+                return self._split_large_chunk(content_block, max_chunk_size, parent_title_info['title'])
			
 
				+            else:
			
 
				+                return [{
			
 
				+                    'content': content_block,
			
 
				+                    'relative_start': 0,
			
 
				+                    'sub_title': '',
			
 
				+                    'serial_number': ''
			
 
				+                }]
			
 
				+        
			
 
				+        # 按子标题切分
			
 
				+        chunks = []
			
 
				+        for i, sub_title in enumerate(sub_titles):
			
 
				+            start_pos = sub_title['position']
			
 
				+            
			
 
				+            # 确定结束位置
			
 
				+            if i + 1 < len(sub_titles):
			
 
				+                end_pos = sub_titles[i + 1]['position']
			
 
				+            else:
			
 
				+                end_pos = len(content_block)
			
 
				+            
			
 
				+            chunk_content = content_block[start_pos:end_pos]
			
 
				+            
			
 
				+            # 检查是否需要分割
			
 
				+            if len(chunk_content) > max_chunk_size:
			
 
				+                split_chunks = self._split_large_chunk(chunk_content, max_chunk_size, sub_title['title'])
			
 
				+                for j, split_chunk in enumerate(split_chunks):
			
 
				+                    split_chunk['relative_start'] = start_pos + split_chunk['relative_start']
			
 
				+                    split_chunk['sub_title'] = sub_title['title']
			
 
				+                    chunks.append(split_chunk)
			
 
				+            else:
			
 
				+                chunks.append({
			
 
				+                    'content': chunk_content,
			
 
				+                    'relative_start': start_pos,
			
 
				+                    'sub_title': sub_title['title']
			
 
				+                })
			
 
				+        
			
 
				+        return chunks
			
 
				+    
			
 
				+    def _split_large_chunk(self, content, max_chunk_size, title):
			
 
				+        """
			
 
				+        将超大块按句子级分割（保持语义完整）
			
 
				+        
			
 
				+        参数:
			
 
				+            content: 内容
			
 
				+            max_chunk_size: 最大分块字符数
			
 
				+            title: 标题
			
 
				+            
			
 
				+        返回:
			
 
				+            list: 分割后的块列表
			
 
				+        """
			
 
				+        # 按句子分割（中文句号、问号、感叹号）
			
 
				+        sentences = re.split(r'([。！？\n])', content)
			
 
				+        
			
 
				+        # 重新组合句子和标点
			
 
				+        combined_sentences = []
			
 
				+        for i in range(0, len(sentences) - 1, 2):
			
 
				+            if i + 1 < len(sentences):
			
 
				+                combined_sentences.append(sentences[i] + sentences[i + 1])
			
 
				+            else:
			
 
				+                combined_sentences.append(sentences[i])
			
 
				+        
			
 
				+        if not combined_sentences:
			
 
				+            combined_sentences = [content]
			
 
				+        
			
 
				+        # 按max_chunk_size组合句子
			
 
				+        chunks = []
			
 
				+        current_chunk = ""
			
 
				+        current_start = 0
			
 
				+        
			
 
				+        for sentence in combined_sentences:
			
 
				+            if len(current_chunk) + len(sentence) <= max_chunk_size:
			
 
				+                current_chunk += sentence
			
 
				+            else:
			
 
				+                if current_chunk:
			
 
				+                    chunks.append({
			
 
				+                        'content': current_chunk,
			
 
				+                        'relative_start': current_start,
			
 
				+                        'is_split': True  # 标记为分割块，不参与合并
			
 
				+                    })
			
 
				+                    current_start += len(current_chunk)
			
 
				+                current_chunk = sentence
			
 
				+        
			
 
				+        # 添加最后一个块
			
 
				+        if current_chunk:
			
 
				+            chunks.append({
			
 
				+                'content': current_chunk,
			
 
				+                'relative_start': current_start,
			
 
				+                'is_split': True
			
 
				+            })
			
 
				+        
			
 
				+        return chunks
			
 
				+    
			
 
				+    def _merge_small_chunks(self, chunks, max_chunk_size, min_chunk_size):
			
 
				+        """
			
 
				+        合并小于min_chunk_size的块
			
 
				+        
			
 
				+        参数:
			
 
				+            chunks: 块列表
			
 
				+            max_chunk_size: 最大分块字符数
			
 
				+            min_chunk_size: 最小分块字符数
			
 
				+            
			
 
				+        返回:
			
 
				+            list: 合并后的块列表
			
 
				+        """
			
 
				+        if not chunks:
			
 
				+            return []
			
 
				+        
			
 
				+        # 先按最低层级标题编号分组处理（在同一标题内合并）
			
 
				+        current_title_number = None
			
 
				+        title_groups = []
			
 
				+        current_group = []
			
 
				+        
			
 
				+        for chunk in chunks:
			
 
				+            title_number = chunk.get('_title_number', '')
			
 
				+            
			
 
				+            if title_number != current_title_number:
			
 
				+                # 保存上一组
			
 
				+                if current_group:
			
 
				+                    title_groups.append({
			
 
				+                        'title_number': current_title_number,
			
 
				+                        'chunks': current_group
			
 
				+                    })
			
 
				+                # 开始新组
			
 
				+                current_title_number = title_number
			
 
				+                current_group = [chunk]
			
 
				+            else:
			
 
				+                current_group.append(chunk)
			
 
				+        
			
 
				+        # 保存最后一组
			
 
				+        if current_group:
			
 
				+            title_groups.append({
			
 
				+                'title_number': current_title_number,
			
 
				+                'chunks': current_group
			
 
				+            })
			
 
				+        
			
 
				+        # 在每个组内合并小块
			
 
				+        merged_groups = []
			
 
				+        for group in title_groups:
			
 
				+            merged_chunks = self._merge_within_title(group['chunks'], max_chunk_size, min_chunk_size)
			
 
				+            merged_groups.append({
			
 
				+                'title_number': group['title_number'],
			
 
				+                'chunks': merged_chunks
			
 
				+            })
			
 
				+        
			
 
				+        # 处理跨标题合并：如果上一组的最后一个块与当前组的第一个块都是小块，可以合并
			
 
				+        final_merged = []
			
 
				+        for i, group in enumerate(merged_groups):
			
 
				+            if i == 0:
			
 
				+                final_merged.extend(group['chunks'])
			
 
				+            else:
			
 
				+                # 检查是否可以与上一组的最后一个块合并
			
 
				+                prev_group = merged_groups[i - 1]
			
 
				+                if prev_group['chunks'] and group['chunks']:
			
 
				+                    prev_last = prev_group['chunks'][-1]
			
 
				+                    curr_first = group['chunks'][0]
			
 
				+                    
			
 
				+                    prev_content = prev_last['review_chunk_content']
			
 
				+                    curr_content = curr_first['review_chunk_content']
			
 
				+                    
			
 
				+                    # 如果两个块都是小块且不是分割块，可以合并
			
 
				+                    if (not prev_last.get('is_split', False) and 
			
 
				+                        not curr_first.get('is_split', False) and
			
 
				+                        len(prev_content) < min_chunk_size and
			
 
				+                        len(curr_content) < min_chunk_size and
			
 
				+                        len(prev_content) + len(curr_content) <= max_chunk_size):
			
 
				+                        
			
 
				+                        # 合并
			
 
				+                        merged_content = prev_content + '\n\n' + curr_content
			
 
				+                        merged_chunk = prev_last.copy()
			
 
				+                        merged_chunk['review_chunk_content'] = merged_content
			
 
				+                        merged_chunk['section_label'] = self._merge_section_labels(
			
 
				+                            prev_last['section_label'],
			
 
				+                            curr_first['section_label']
			
 
				+                        )
			
 
				+                        # 合并标题编号
			
 
				+                        prev_title_num = prev_last.get('_title_number', '')
			
 
				+                        curr_title_num = curr_first.get('_title_number', '')
			
 
				+                        if prev_title_num and curr_title_num and prev_title_num != curr_title_num:
			
 
				+                            # chunk_id中使用+号（无空格）
			
 
				+                            merged_chunk['_title_number'] = f"{prev_title_num}+{curr_title_num}"
			
 
				+                            # serial_number中使用空格（用于显示）
			
 
				+                            merged_chunk['_title_number_display'] = f"{prev_title_num} + {curr_title_num}"
			
 
				+                        merged_chunk['_is_merged'] = True
			
 
				+                        
			
 
				+                        # 替换上一组的最后一个块
			
 
				+                        final_merged[-1] = merged_chunk
			
 
				+                        # 跳过当前组的第一个块
			
 
				+                        final_merged.extend(group['chunks'][1:])
			
 
				+                    else:
			
 
				+                        final_merged.extend(group['chunks'])
			
 
				+                else:
			
 
				+                    final_merged.extend(group['chunks'])
			
 
				+        
			
 
				+        return final_merged
			
 
				+    
			
 
				+    def _merge_within_title(self, title_chunks, max_chunk_size, min_chunk_size):
			
 
				+        """在同一个最低层级标题内合并小块"""
			
 
				+        if not title_chunks:
			
 
				+            return []
			
 
				+        
			
 
				+        merged = []
			
 
				+        i = 0
			
 
				+        
			
 
				+        while i < len(title_chunks):
			
 
				+            current_chunk = title_chunks[i]
			
 
				+            current_content = current_chunk['review_chunk_content']
			
 
				+            
			
 
				+            # 如果当前块是分割块，不参与合并
			
 
				+            if current_chunk.get('is_split', False):
			
 
				+                merged.append(current_chunk)
			
 
				+                i += 1
			
 
				+                continue
			
 
				+            
			
 
				+            # 如果当前块小于最小值，尝试与下一个块合并
			
 
				+            if len(current_content) < min_chunk_size and i + 1 < len(title_chunks):
			
 
				+                next_chunk = title_chunks[i + 1]
			
 
				+                next_content = next_chunk['review_chunk_content']
			
 
				+                
			
 
				+                # 检查下一个块是否也是小块且不是分割块
			
 
				+                if (not next_chunk.get('is_split', False) and 
			
 
				+                    len(current_content) + len(next_content) <= max_chunk_size):
			
 
				+                    # 合并
			
 
				+                    merged_content = current_content + '\n\n' + next_content
			
 
				+                    merged_chunk = current_chunk.copy()
			
 
				+                    merged_chunk['review_chunk_content'] = merged_content
			
 
				+                    # 使用优化的标签合并函数
			
 
				+                    merged_chunk['section_label'] = self._merge_section_labels(
			
 
				+                        current_chunk['section_label'], 
			
 
				+                        next_chunk['section_label']
			
 
				+                    )
			
 
				+                    merged.append(merged_chunk)
			
 
				+                    i += 2  # 跳过下一个块
			
 
				+                    continue
			
 
				+            
			
 
				+            # 否则直接添加
			
 
				+            merged.append(current_chunk)
			
 
				+            i += 1
			
 
				+        
			
 
				+        return merged
			
 
				+    
			
 
				+    def _finalize_chunk_ids(self, chunks):
			
 
				+        """
			
 
				+        生成最终的chunk_id和serial_number
			
 
				+        
			
 
				+        参数:
			
 
				+            chunks: 合并后的块列表
			
 
				+            
			
 
				+        返回:
			
 
				+            list: 最终处理后的块列表
			
 
				+        """
			
 
				+        final_chunks = []
			
 
				+        current_title_number = None
			
 
				+        local_index = 1
			
 
				+        
			
 
				+        for i, chunk in enumerate(chunks):
			
 
				+            title_number = chunk.get('_title_number', '')
			
 
				+            is_merged = chunk.get('_is_merged', False)
			
 
				+            
			
 
				+            # 提取标题编号的主要部分（用于判断是否在同一标题内）
			
 
				+            # 如果包含+号，说明是跨标题合并的块
			
 
				+            if '+' in str(title_number):
			
 
				+                # 跨标题合并的块，序号从0开始
			
 
				+                local_index = 0
			
 
				+                # chunk_id中使用+号（无空格），如"1.5+1.6"
			
 
				+                merged_title_number = title_number
			
 
				+                # serial_number中使用空格，如"1.5 + 1.6"
			
 
				+                serial_number_display = chunk.get('_title_number_display', title_number.replace('+', ' + '))
			
 
				+                # 更新current_title_number为合并后的编号，这样下一个块会重新开始
			
 
				+                current_title_number = title_number
			
 
				+            else:
			
 
				+                # 如果标题编号变化，重置索引
			
 
				+                if title_number != current_title_number:
			
 
				+                    current_title_number = title_number
			
 
				+                    # 如果上一个块是跨标题合并的，说明当前标题的第一个块已经被合并了，序号从1开始
			
 
				+                    # 否则序号从1开始
			
 
				+                    local_index = 1
			
 
				+                else:
			
 
				+                    local_index += 1
			
 
				+                merged_title_number = title_number
			
 
				+                serial_number_display = title_number
			
 
				+            
			
 
				+            # 生成chunk_id（使用无空格的编号）
			
 
				+            if merged_title_number:
			
 
				+                chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
			
 
				+            else:
			
 
				+                chunk_id_str = f"doc_chunk_{local_index}"
			
 
				+            
			
 
				+            # 更新chunk数据
			
 
				+            final_chunk = {
			
 
				+                'file_name': chunk['file_name'],
			
 
				+                'chunk_id': chunk_id_str,
			
 
				+                'section_label': chunk['section_label'],
			
 
				+                'project_plan_type': 'bridge_up_part',
			
 
				+                'element_tag': {
			
 
				+                    'chunk_id': chunk_id_str,
			
 
				+                    'page': chunk['element_tag']['page'],
			
 
				+                    'serial_number': serial_number_display if merged_title_number else ''
			
 
				+                },
			
 
				+                'review_chunk_content': chunk['review_chunk_content']
			
 
				+            }
			
 
				+            
			
 
				+            final_chunks.append(final_chunk)
			
 
				+        
			
 
				+        return final_chunks
			
 
				+    
			
 
				+    def _build_section_label(self, parent_title, sub_title):
			
 
				+        """构建section_label（层级路径）"""
			
 
				+        if sub_title:
			
 
				+            return f"{parent_title}->{sub_title}"
			
 
				+        else:
			
 
				+            return parent_title
			
 
				+    
			
 
				+    def _merge_section_labels(self, label1, label2):
			
 
				+        """
			
 
				+        合并两个section_label，提取公共前缀
			
 
				+        
			
 
				+        例如：
			
 
				+        "1 工程概况->1.3 工程地质" + "1 工程概况->1.4 气象水文"
			
 
				+        => "1 工程概况->1.3 工程地质 + 1.4 气象水文"
			
 
				+        
			
 
				+        参数:
			
 
				+            label1: 第一个标签
			
 
				+            label2: 第二个标签
			
 
				+            
			
 
				+        返回:
			
 
				+            str: 合并后的标签
			
 
				+        """
			
 
				+        # 按"->"分割标签
			
 
				+        parts1 = label1.split('->')
			
 
				+        parts2 = label2.split('->')
			
 
				+        
			
 
				+        # 找到公共前缀
			
 
				+        common_prefix = []
			
 
				+        for i in range(min(len(parts1), len(parts2))):
			
 
				+            if parts1[i] == parts2[i]:
			
 
				+                common_prefix.append(parts1[i])
			
 
				+            else:
			
 
				+                break
			
 
				+        
			
 
				+        # 如果有公共前缀
			
 
				+        if common_prefix:
			
 
				+            # 获取不同的部分
			
 
				+            diff1 = '->'.join(parts1[len(common_prefix):])
			
 
				+            diff2 = '->'.join(parts2[len(common_prefix):])
			
 
				+            
			
 
				+            # 构建合并后的标签
			
 
				+            prefix = '->'.join(common_prefix)
			
 
				+            if diff1 and diff2:
			
 
				+                return f"{prefix}->{diff1} + {diff2}"
			
 
				+            elif diff1:
			
 
				+                return f"{prefix}->{diff1}"
			
 
				+            elif diff2:
			
 
				+                return f"{prefix}->{diff2}"
			
 
				+            else:
			
 
				+                return prefix
			
 
				+        else:
			
 
				+            # 没有公共前缀，直接用+连接
			
 
				+            return f"{label1} + {label2}"
			
 
				+
			
--- a/core/construction_review/doc_worker/toc_extractor.py
+++ b/core/construction_review/doc_worker/toc_extractor.py
@@ -0,0 +1,348 @@
 
				+"""
			
 
				+目录提取模块
			
 
				+支持从PDF和Word文档中提取目录结构
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+from pathlib import Path
			
 
				+import fitz  # PyMuPDF
			
 
				+from docx import Document
			
 
				+
			
 
				+try:
			
 
				+    from .config_loader import get_config
			
 
				+except ImportError:
			
 
				+    from config_loader import get_config
			
 
				+
			
 
				+
			
 
				+class TOCExtractor:
			
 
				+    """目录提取器，支持PDF和Word格式"""
			
 
				+    
			
 
    def __init__(self):
        """Store the configuration object returned by ``get_config()``.

        The other methods read ``toc_max_pages``, ``paragraphs_per_page``,
        ``numbering_formats``, ``toc_patterns``, ``toc_min_length``,
        ``toc_max_length`` and ``noise_patterns`` from it.
        """
        self.config = get_config()
			
 
				+    
			
 
    def extract_toc(self, file_path):
        """Extract the table of contents of a document.

        The extractor is chosen from the file suffix, compared in lower case.

        Args:
            file_path: Path of the document (PDF or Word).

        Returns:
            dict: The result of the PDF or Word extractor.

        Raises:
            ValueError: If the suffix is not ``.pdf``, ``.docx`` or ``.doc``.
        """
        doc_path = Path(file_path)
        suffix = doc_path.suffix.lower()

        if suffix == '.pdf':
            return self._extract_from_pdf(doc_path)
        if suffix not in ['.docx', '.doc']:
            raise ValueError(f"不支持的文件格式: {suffix}")
        return self._extract_from_word(doc_path)
			
 
				+    
			
 
				+    def _extract_from_pdf(self, pdf_path, max_pages=None):
			
 
				+        """从PDF中提取目录"""
			
 
				+        if max_pages is None:
			
 
				+            max_pages = self.config.toc_max_pages
			
 
				+        pages_text = self._extract_pdf_pages(pdf_path, max_pages)
			
 
				+        
			
 
				+        all_toc_items = []
			
 
				+        toc_page_nums = []
			
 
				+        
			
 
				+        for page_info in pages_text:
			
 
				+            toc_items = self._detect_toc_patterns(page_info['text'])
			
 
				+            
			
 
				+            if toc_items:
			
 
				+                all_toc_items.extend(toc_items)
			
 
				+                toc_page_nums.append(page_info['page_num'])
			
 
				+        
			
 
				+        # 去重
			
 
				+        unique_toc = []
			
 
				+        seen = set()
			
 
				+        for item in all_toc_items:
			
 
				+            key = (item['title'], item['page'])
			
 
				+            if key not in seen:
			
 
				+                seen.add(key)
			
 
				+                unique_toc.append(item)
			
 
				+        
			
 
				+        return {
			
 
				+            'toc_items': unique_toc,
			
 
				+            'toc_count': len(unique_toc),
			
 
				+            'toc_pages': toc_page_nums
			
 
				+        }
			
 
				+    
			
 
				+    def _extract_from_word(self, word_path, max_pages=None):
			
 
				+        """从Word中提取目录"""
			
 
				+        if max_pages is None:
			
 
				+            max_pages = self.config.toc_max_pages
			
 
				+        
			
 
				+        # 方法1: 尝试提取内置目录结构
			
 
				+        builtin_toc = self._extract_builtin_toc(word_path)
			
 
				+        
			
 
				+        # 方法2: 文本模式匹配（作为补充）
			
 
				+        pages_text = self._extract_word_pages(word_path, max_pages)
			
 
				+        
			
 
				+        pattern_toc_items = []
			
 
				+        toc_page_nums = []
			
 
				+        
			
 
				+        for page_info in pages_text:
			
 
				+            toc_items = self._detect_toc_patterns(page_info['text'])
			
 
				+            
			
 
				+            if toc_items:
			
 
				+                pattern_toc_items.extend(toc_items)
			
 
				+                toc_page_nums.append(page_info['page_num'])
			
 
				+        
			
 
				+        # 合并两种方法的结果
			
 
				+        all_toc_items = []
			
 
				+        
			
 
				+        # 优先使用内置目录
			
 
				+        if builtin_toc:
			
 
				+            all_toc_items.extend(builtin_toc)
			
 
				+        
			
 
				+        # 如果内置目录为空或数量较少，使用模式匹配的结果
			
 
				+        if len(builtin_toc) < 3:
			
 
				+            all_toc_items.extend(pattern_toc_items)
			
 
				+        
			
 
				+        # 去重
			
 
				+        unique_toc = []
			
 
				+        seen = set()
			
 
				+        for item in all_toc_items:
			
 
				+            key = (item['title'], item.get('page', '?'))
			
 
				+            if key not in seen:
			
 
				+                seen.add(key)
			
 
				+                unique_toc.append(item)
			
 
				+        
			
 
				+        return {
			
 
				+            'toc_items': unique_toc,
			
 
				+            'toc_count': len(unique_toc),
			
 
				+            'toc_pages': toc_page_nums if toc_page_nums else [1]
			
 
				+        }
			
 
				+    
			
 
				+    def _extract_pdf_pages(self, pdf_path, max_pages=None):
			
 
				+        """从PDF文件的前几页提取文本"""
			
 
				+        if max_pages is None:
			
 
				+            max_pages = self.config.toc_max_pages
			
 
				+        try:
			
 
				+            doc = fitz.open(pdf_path)
			
 
				+            pages_text = []
			
 
				+            
			
 
				+            for page_num in range(min(len(doc), max_pages)):
			
 
				+                page = doc[page_num]
			
 
				+                text = page.get_text()
			
 
				+                pages_text.append({
			
 
				+                    'page_num': page_num + 1,
			
 
				+                    'text': text
			
 
				+                })
			
 
				+            
			
 
				+            doc.close()
			
 
				+            return pages_text
			
 
				+        except Exception as e:
			
 
				+            print(f"  错误: 无法读取PDF - {str(e)}")
			
 
				+            return []
			
 
				+    
			
 
				+    def _extract_word_pages(self, word_path, max_pages=None):
			
 
				+        """从Word文件的前几页提取文本"""
			
 
				+        if max_pages is None:
			
 
				+            max_pages = self.config.toc_max_pages
			
 
				+        
			
 
				+        try:
			
 
				+            doc = Document(word_path)
			
 
				+            pages_text = []
			
 
				+            
			
 
				+            all_text = []
			
 
				+            for para in doc.paragraphs:
			
 
				+                text = para.text.strip()
			
 
				+                if text:
			
 
				+                    all_text.append(text)
			
 
				+            
			
 
				+            # 模拟分页：从配置读取每页段落数
			
 
				+            paragraphs_per_page = self.config.paragraphs_per_page
			
 
				+            for i in range(0, min(len(all_text), max_pages * paragraphs_per_page), paragraphs_per_page):
			
 
				+                page_text = '\n'.join(all_text[i:i+paragraphs_per_page])
			
 
				+                pages_text.append({
			
 
				+                    'page_num': i // paragraphs_per_page + 1,
			
 
				+                    'text': page_text
			
 
				+                })
			
 
				+            
			
 
				+            return pages_text
			
 
				+        except Exception as e:
			
 
				+            print(f"  错误: 无法读取Word - {str(e)}")
			
 
				+            return []
			
 
				+    
			
 
				+    def _extract_builtin_toc(self, word_path):
			
 
				+        """提取Word文档的内置目录结构"""
			
 
				+        try:
			
 
				+            doc = Document(word_path)
			
 
				+            toc_items = []
			
 
				+            
			
 
				+            for para in doc.paragraphs:
			
 
				+                style_name = para.style.name if para.style else ""
			
 
				+                text = para.text.strip()
			
 
				+                
			
 
				+                if not text:
			
 
				+                    continue
			
 
				+                
			
 
				+                # 检查是否是标题样式
			
 
				+                if style_name.startswith('Heading'):
			
 
				+                    if not self._has_numbering(text):
			
 
				+                        continue
			
 
				+                    
			
 
				+                    try:
			
 
				+                        level = int(style_name.split()[-1]) if len(style_name.split()) > 1 else 1
			
 
				+                    except:
			
 
				+                        level = 1
			
 
				+                    
			
 
				+                    toc_items.append({
			
 
				+                        'title': text,
			
 
				+                        'level': level,
			
 
				+                        'page': '?',
			
 
				+                        'original': text,
			
 
				+                        'source': 'heading_style'
			
 
				+                    })
			
 
				+                # 检查是否是TOC样式
			
 
				+                elif 'TOC' in style_name or 'toc' in style_name.lower():
			
 
				+                    match = re.search(r'(\d+)\s*$', text)
			
 
				+                    page = match.group(1) if match else '?'
			
 
				+                    
			
 
				+                    title = re.sub(r'\s*\d+\s*$', '', text).strip()
			
 
				+                    
			
 
				+                    if not self._has_numbering(title):
			
 
				+                        continue
			
 
				+                    
			
 
				+                    level_match = re.search(r'TOC\s*(\d+)', style_name, re.IGNORECASE)
			
 
				+                    level = int(level_match.group(1)) if level_match else 1
			
 
				+                    
			
 
				+                    if title:
			
 
				+                        toc_items.append({
			
 
				+                            'title': title,
			
 
				+                            'level': level,
			
 
				+                            'page': page,
			
 
				+                            'original': text,
			
 
				+                            'source': 'toc_style'
			
 
				+                        })
			
 
				+            
			
 
				+            return toc_items
			
 
				+        except Exception as e:
			
 
				+            print(f"  错误: 无法读取Word内置目录 - {str(e)}")
			
 
				+            return []
			
 
				+    
			
 
				+    def _has_numbering(self, text):
			
 
				+        """检查文本是否包含编号格式"""
			
 
				+        # 从配置读取编号格式
			
 
				+        numbering_patterns = self.config.numbering_formats
			
 
				+        
			
 
				+        for pattern in numbering_patterns:
			
 
				+            if re.match(pattern, text):
			
 
				+                return True
			
 
				+        
			
 
				+        return False
			
 
				+    
			
 
    def _detect_toc_patterns(self, text):
        """Extract table-of-contents entries from raw text.

        The text is split on newlines and every line is stripped. A line that
        consists solely of a chapter-style heading (matched by the heading
        regex below) is glued to the following line when that following line
        ends in dot leaders plus a 1-4 digit number, so an entry broken across
        two lines is handled as one.

        Each resulting line is then filtered by the configured length bounds,
        skipped if it is purely digits, and matched against the configured
        TOC regexes. A pattern is expected to expose the title as group 1 and
        the page number as group 2.

        Args:
            text: Raw text to scan.

        Returns:
            A list of dicts with the keys ``original`` (the matched line),
            ``title`` (cleaned title), ``page`` (page number as a string) and
            ``level`` (result of ``self._detect_level`` for the title).
        """
        heading_only = r'^第[一二三四五六七八九十\d]+[章节条款]\s*$'
        leader_and_page = r'[.·]{2,}.*\d{1,4}\s*$'

        stripped = [raw.strip() for raw in text.split('\n')]
        total = len(stripped)

        # Pass 1: re-join entries whose heading and "....page" tail were
        # split onto two consecutive lines.
        candidates = []
        pos = 0
        while pos < total:
            current = stripped[pos]
            if (
                re.match(heading_only, current)
                and pos + 1 < total
                and re.search(leader_and_page, stripped[pos + 1])
            ):
                candidates.append(current + stripped[pos + 1])
                pos += 2
            else:
                candidates.append(current)
                pos += 1

        # TOC regexes and length bounds come from the configuration object.
        patterns = self.config.toc_patterns
        min_length = self.config.toc_min_length
        max_length = self.config.toc_max_length

        # Pass 2: keep the lines that look like real TOC entries.
        toc_items = []
        for candidate in candidates:
            candidate = candidate.strip()

            if not (min_length <= len(candidate) <= max_length):
                continue
            if candidate.isdigit():
                # A line of digits only is skipped outright.
                continue

            for regex in patterns:
                hit = re.match(regex, candidate)
                if not hit:
                    continue

                raw_title = hit.group(1).strip()
                page_num = hit.group(2).strip()

                # Drop dot leaders, then collapse whitespace runs.
                without_leaders = re.sub(r'[.·]{2,}', '', raw_title)
                title_clean = re.sub(r'\s{2,}', ' ', without_leaders).strip()

                # An empty or noisy title does not end the search: the
                # remaining patterns still get a chance on this line.
                if not title_clean or self._is_likely_noise(title_clean):
                    continue

                toc_items.append({
                    'original': candidate,
                    'title': title_clean,
                    'page': page_num,
                    'level': self._detect_level(title_clean)
                })
                break

        return toc_items
			
 
				+    
			
 
    def _is_likely_noise(self, text):
        """Tell whether ``text`` looks like noise rather than a TOC entry.

        Args:
            text: Candidate title text.

        Returns:
            True if any regex in ``self.config.noise_patterns`` is found
            anywhere in ``text`` (``re.search``), otherwise False.
        """
        # Noise regexes are supplied by the configuration object.
        return any(
            re.search(noise_regex, text)
            for noise_regex in self.config.noise_patterns
        )
			
 
				+    
			
 
    def _detect_level(self, title):
        """Work out the hierarchy level of a TOC title.

        Checks are applied in this order, first hit wins:

        1. A ``【<digits>】`` prefix is level 1.
        2. A dotted number followed by whitespace (``1.1 ``, ``1.1.1 ``) or a
           dotted number inside ``〖〗`` gives a level of dot count + 1.
        3. The configured level-1, level-2 and level-3 regexes, in that order.
        4. Anything else defaults to level 1.

        Args:
            title: Cleaned TOC title.

        Returns:
            The detected level as an int (1-based).
        """
        if re.match(r'^【\d+】', title):
            return 1

        # Plain dotted numbering first, then the 〖〗-wrapped variant; both
        # derive the depth from the number of dots in the captured number.
        for numbering_regex in (r'^(\d+(?:\.\d+)*)\s', r'^〖(\d+(?:\.\d+)*)〗'):
            numbered = re.match(numbering_regex, title)
            if numbered:
                return numbered.group(1).count('.') + 1

        # Heading regexes per level are supplied by the configuration object.
        by_level = (
            (1, self.config.level1_patterns),
            (2, self.config.level2_patterns),
            (3, self.config.level3_patterns),
        )
        for depth, heading_regexes in by_level:
            if any(re.match(regex, title) for regex in heading_regexes):
                return depth

        return 1
			
 
				+