|
|
@@ -0,0 +1,963 @@
|
|
|
+import re
|
|
|
+import json
|
|
|
+import os
|
|
|
+from base_document import DocumentParser,DocumentChunk
|
|
|
+from typing import List, Dict, Any, Tuple
|
|
|
+from PyPDF2 import PdfReader, PdfWriter
|
|
|
+import fitz # PyMuPDF - 更好的PDF解析
|
|
|
+import docx
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class EnhancedPDFTocExtractor:
    """Enhanced PDF table-of-contents (TOC) extractor.

    Extraction is attempted in two stages:

    1. Read the embedded bookmark outline via PyMuPDF (``fitz``).
    2. Fall back to heuristically recognising TOC-style lines in the text
       of the first pages of the document.

    Every TOC entry is a dict with the keys ``title``, ``level``,
    ``page_start``, ``page_end``, ``parent_titles`` and ``source``.
    """

    def __init__(self):
        # Per-level regex patterns for heading numbering styles.
        # NOTE(review): not referenced by the extraction methods below;
        # kept for backward compatibility with any external users.
        self.level_patterns = {
            1: [
                r'^\d+\s+',  # "1 "
                r'^第[一二三四五六七八九十]+\S*\s+',  # "第一章"
                r'^[A-Z]\s+',  # "A "
            ],
            2: [
                r'^\d+\.\d+\s+',  # "1.1 "
                r'^[一二三四五六七八九十]+、\s*',  # "一、"
                r'^\(\d+\)\s+',  # "(1)"
            ],
            3: [
                r'^\d+\.\d+\.\d+\s+',  # "1.1.1"
                r'^\d+\)\s+',  # "1)"
            ]
        }

    def extract_complete_toc(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract the full TOC of *file_path*, trying bookmarks first.

        Returns an empty list when neither strategy produces a TOC that
        passes ``_validate_toc``.
        """
        print(f"正在提取PDF目录: {os.path.basename(file_path)}")

        # Strategy 1: embedded bookmarks via PyMuPDF.
        pymupdf_toc = self._extract_with_pymupdf(file_path)
        if pymupdf_toc and self._validate_toc(pymupdf_toc):
            print(f"PyMuPDF提取到 {len(pymupdf_toc)} 个有效目录项")
            return pymupdf_toc

        # Strategy 2: recognise TOC-looking lines in the leading pages' text.
        text_toc = self._extract_from_text_content(file_path)
        if text_toc and self._validate_toc(text_toc):
            print(f"文本分析提取到 {len(text_toc)} 个有效目录项")
            return text_toc

        print("未提取到有效的目录结构")
        return []

    def _extract_with_pymupdf(self, file_path: str) -> List[Dict[str, Any]]:
        """Read the embedded bookmark outline with PyMuPDF.

        Reconstructs parent/child relationships from bookmark levels with a
        parent stack.  Returns [] when the PDF has no bookmarks or on any
        PyMuPDF error.
        """
        try:
            doc = fitz.open(file_path)

            toc = doc.get_toc()
            if not toc:
                print("PDF没有书签目录")
                doc.close()
                return []

            processed_toc = []
            parent_stack = []  # ancestors of the entry currently processed

            for item in toc:
                level, title, page_num = item

                clean_title = self._clean_title(title)
                if not clean_title or len(clean_title) < 2:
                    continue  # drop empty / one-character noise titles

                # PyMuPDF page numbers are 1-based; clamp defensive values.
                actual_page = max(1, page_num)

                # Pop ancestors that are not strictly shallower than this entry.
                while parent_stack and parent_stack[-1]['level'] >= level:
                    parent_stack.pop()

                parent_titles = [p['title'] for p in parent_stack]

                toc_item = {
                    'title': clean_title,
                    'level': level,
                    'page_start': actual_page,
                    'page_end': actual_page,
                    'parent_titles': parent_titles,
                    'source': 'pymupdf_toc'
                }

                processed_toc.append(toc_item)

                # Push this entry so deeper entries see it as a parent.
                parent_stack.append({
                    'title': clean_title,
                    'level': level,
                    'page': actual_page
                })

            doc.close()
            return processed_toc

        except Exception as e:
            print(f"PyMuPDF提取目录失败: {e}")
            return []

    def _extract_from_text_content(self, file_path: str) -> List[Dict[str, Any]]:
        """Fallback: mine TOC entries from the text of the leading pages."""
        try:
            doc = fitz.open(file_path)
            full_text = ""

            # A printed TOC usually sits near the front of the document.
            max_pages_for_toc = min(20, len(doc))
            page_texts = []

            for page_num in range(max_pages_for_toc):
                page = doc[page_num]
                text = page.get_text().strip()
                if text:
                    page_texts.append((page_num + 1, text))
                    # Page markers let _identify_toc_from_text track which
                    # physical page a TOC line was found on.
                    full_text += f"--- 第{page_num + 1}页 ---\n{text}\n\n"

            doc.close()

            toc_items = self._identify_toc_from_text(full_text, page_texts)

            # Refine heading levels once candidate entries are known.
            if toc_items:
                toc_items = self._analyze_toc_levels(toc_items)

            return toc_items

        except Exception as e:
            print(f"文本内容提取目录失败: {e}")
            return []

    def _identify_toc_from_text(self, full_text: str, page_texts: List[Tuple[int, str]]) -> List[Dict[str, Any]]:
        """Scan *full_text* line by line for TOC-looking entries.

        ``page_texts`` is accepted for interface compatibility but not
        used; page tracking relies on the "--- 第N页 ---" markers that
        ``_extract_from_text_content`` embeds into *full_text*.
        """
        toc_items = []

        # Recognised TOC line shapes: (numbering, title, page number).
        toc_patterns = [
            # dotted leader + page: "1 总则 ........... 1"
            r'^(\d+(?:\.\d+)*)\s+([^\.]{5,50}?)\s*\.{3,}\s*(\d+)\s*$',
            # Chinese chapter numbering: "第一章 总则 ........... 1"
            r'^(第[一二三四五六七八九十百千]+[章节条])\s+([^\.]{5,50}?)\s*\.{3,}\s*(\d+)\s*$',
            # plain: "1 总则 1"
            r'^(\d+(?:\.\d+)*)\s+([^\d\.]{5,50}?)\s+(\d+)\s*$',
        ]

        lines = full_text.split('\n')
        current_page = 1

        for line in lines:
            line = line.strip()
            if not line or len(line) > 200:
                continue  # blank, or far too long to be a TOC entry

            # Keep track of the physical page we are on.
            page_match = re.match(r'^---\s*第(\d+)页\s*---$', line)
            if page_match:
                current_page = int(page_match.group(1))
                continue

            for pattern in toc_patterns:
                match = re.match(pattern, line)
                if match:
                    numbering = match.group(1)
                    title = match.group(2).strip()
                    # Bug fix: the original compared the groups *tuple*
                    # against 3 (`match.groups() >= 3`), which raises
                    # TypeError on Python 3; compare the group count.
                    page_num = int(match.group(3)) if len(match.groups()) >= 3 else current_page

                    title = self._clean_title(title)
                    if not title or len(title) < 2:
                        continue

                    level = self._determine_level_from_numbering(numbering)

                    toc_items.append({
                        'title': title,
                        'level': level,
                        'page_start': page_num,
                        'page_end': page_num,
                        'parent_titles': [],
                        'source': 'text_analysis'
                    })
                    break

        return toc_items

    def _analyze_toc_levels(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Re-derive each entry's level from its title, then link parents."""
        if not toc_items:
            return []

        # Re-determine levels from the numbering in front of the title.
        for item in toc_items:
            title = item['title']
            level = self._analyze_title_level(title)
            if level > 0:
                item['level'] = level

        # Build parent/child relationships from the (possibly new) levels.
        return self._build_hierarchy(toc_items)

    def _analyze_title_level(self, title: str) -> int:
        """Guess the heading level of *title* from its leading numbering."""
        # Most specific numeric patterns first ("1.1.1" before "1.1").
        if re.match(r'^\d+\.\d+\.\d+', title):
            return 3
        elif re.match(r'^\d+\.\d+', title):
            return 2
        elif re.match(r'^\d+', title):
            return 1
        elif re.match(r'^第[一二三四五六七八九十]+[章节]', title):
            return 1
        elif re.match(r'^[一二三四五六七八九十]+、', title):
            return 2
        elif re.match(r'^\(\d+\)', title):
            return 3

        return 1  # default level when nothing matches

    def _build_hierarchy(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Fill in ``parent_titles`` for each entry using a parent stack."""
        if not toc_items:
            return []

        hierarchical_items = []
        parent_stack = []

        for item in toc_items:
            current_level = item['level']

            # Pop entries at the same or deeper level — they cannot be parents.
            while parent_stack and parent_stack[-1]['level'] >= current_level:
                parent_stack.pop()

            parent_titles = [p['title'] for p in parent_stack]
            item['parent_titles'] = parent_titles

            hierarchical_items.append(item)

            # Push the current entry as a potential parent of what follows.
            parent_stack.append({
                'title': item['title'],
                'level': item['level'],
                'page': item['page_start']
            })

        return hierarchical_items

    def _determine_level_from_numbering(self, numbering: str) -> int:
        """Map a numbering prefix ("1", "1.1", "第一章", "一、") to a level."""
        if '.' in numbering:
            # "1.1" -> 2, "1.1.1" -> 3, capped at 3.
            dot_count = numbering.count('.')
            return min(dot_count + 1, 3)
        elif re.match(r'^第[一二三四五六七八九十]+[章节]', numbering):
            return 1
        elif re.match(r'^[一二三四五六七八九十]+、', numbering):
            return 2
        else:
            return 1

    def _clean_title(self, title: str) -> str:
        """Normalise a TOC title: drop suffixes, leaders and extra spaces."""
        if not title:
            return ""

        # Remove a trailing ".pdf" (bookmarks sometimes carry file names).
        title = re.sub(r'\.pdf$', '', title, flags=re.IGNORECASE)

        # Collapse runs of whitespace.
        title = re.sub(r'\s+', ' ', title).strip()

        # Drop dotted leaders and trailing page numbers ("...... 12").
        title = re.sub(r'\.{3,}.*$', '', title)

        # Bug fix: removing the leader can leave a trailing space behind.
        return title.strip()

    def _validate_toc(self, toc_items: List[Dict[str, Any]]) -> bool:
        """Heuristic sanity check on an extracted TOC.

        Accepts the TOC when at least half the entries (and at least two)
        have plausibly-sized titles.  An all-pages-equal-1 TOC only emits a
        warning — very short documents can legitimately look like that.
        """
        if not toc_items:
            return False

        pages = [item['page_start'] for item in toc_items]
        unique_pages = set(pages)

        # All entries pointing at page 1 is suspicious but not fatal.
        if len(unique_pages) == 1 and 1 in unique_pages:
            print("警告:所有目录项的页码都是1,可能提取不准确")

        # Count titles of a plausible length.
        valid_titles = 0
        for item in toc_items:
            title = item['title']
            if len(title) >= 2 and len(title) <= 100:
                valid_titles += 1

        # Require a minimum share of valid titles.
        return valid_titles >= max(2, len(toc_items) * 0.5)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class PDFTocExtractor:
    """PDF TOC extractor built around the document's bookmark outline.

    Offers three independent strategies: PyMuPDF bookmarks (preferred),
    PyPDF2 outline traversal, and heading detection in raw page text.
    """

    def __init__(self):
        # Keyword markers used to guess a heading's level from its text.
        self.level_markers = {
            1: ['chapter', 'part', '篇', '章', 'section'],
            2: ['section', '节', 'subsection'],
            3: ['subsubsection', '小节', 'topic']
        }

    def extract_toc_with_pymupdf(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract the TOC via PyMuPDF bookmarks (usually the most accurate)."""
        try:
            doc = fitz.open(file_path)
            toc = doc.get_toc()
            doc.close()

            return self._process_pymupdf_toc(toc)
        except Exception as e:
            print(f"PyMuPDF提取目录失败: {e}")
            return []

    def _process_pymupdf_toc(self, toc: List) -> List[Dict[str, Any]]:
        """Convert PyMuPDF's ``[level, title, page]`` triples to TOC dicts.

        Bug fix: removed a leftover debug ``print(item)`` that dumped every
        raw bookmark to stdout.
        """
        processed_toc = []

        for item in toc:
            level, title, page_num = item
            # PyMuPDF page numbers are 1-based; clamp defensive values.
            actual_page = max(1, page_num)

            processed_toc.append({
                'title': self._clean_title(title),
                'level': level,
                'page_start': actual_page,
                'page_end': actual_page,
                'parent_titles': [],
                'source': 'pymupdf_toc'
            })

        return processed_toc

    def extract_toc_with_pypdf2(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract the TOC by walking PyPDF2's outline tree."""
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PdfReader(file)

                if not hasattr(pdf_reader, 'outline') or not pdf_reader.outline:
                    return []

                return self._extract_toc_from_outline(pdf_reader.outline, pdf_reader)
        except Exception as e:
            print(f"PyPDF2提取目录失败: {e}")
            return []

    def _extract_toc_from_outline(self, outline, pdf_reader, level=1, parent_titles=None, parent_pages=None):
        """Recursively flatten a PyPDF2 outline into a list of TOC dicts.

        *outline* may contain bookmark dicts and nested lists; a bookmark's
        ``/First`` entry links to its first child.
        """
        if parent_titles is None:
            parent_titles = []
        if parent_pages is None:
            parent_pages = []

        toc_items = []

        for item in outline:
            if isinstance(item, dict):
                title = self._extract_title(item)
                if not title:
                    continue

                # Resolve the destination page for this bookmark.
                page_num = self._extract_page_number(item, pdf_reader)

                # Level from title keywords/format, falling back to depth.
                actual_level = self._determine_level(title, level)

                toc_item = {
                    'title': title,
                    'level': actual_level,
                    'page_start': page_num,
                    'page_end': page_num,
                    'parent_titles': parent_titles.copy(),
                    'parent_pages': parent_pages.copy(),
                    'source': 'pypdf2_outline'
                }

                toc_items.append(toc_item)

                # Recurse into children linked via /First.
                if '/First' in item:
                    child_parent_titles = parent_titles + [title]
                    child_parent_pages = parent_pages + [page_num]
                    children = self._extract_toc_from_outline(
                        item['/First'], pdf_reader, actual_level + 1,
                        child_parent_titles, child_parent_pages
                    )
                    toc_items.extend(children)

            elif isinstance(item, list):
                # Nested sibling list at the same depth.
                nested_items = self._extract_toc_from_outline(
                    item, pdf_reader, level, parent_titles, parent_pages
                )
                toc_items.extend(nested_items)

        return toc_items

    def _extract_title(self, item) -> str:
        """Extract and clean a bookmark title; always returns a str.

        Bug fix: the original fell through and implicitly returned None
        when ``/Title`` was not a string, despite the ``-> str`` contract;
        callers test ``if not title``, so "" keeps that behaviour intact.
        """
        title = item.get('/Title', '')
        if not isinstance(title, str):
            return ""

        # Strip a leading numbering prefix (e.g. "1.2 ").
        title = re.sub(r'^\s*[\d\.\s]+\s*', '', title)
        title = title.strip()
        return title if title else "未命名标题"

    def _extract_page_number(self, item, pdf_reader) -> int:
        """Resolve a bookmark's destination page number (1-based).

        Tries, in order: the ``/A`` action's ``/D`` destination, a direct
        ``/Dest`` array, then raw ``/Page`` / ``/P`` references.  Falls
        back to page 1 when nothing resolves.
        """
        try:
            # Attempt 1: /A action with a /D destination array.
            if '/A' in item:
                action = item['/A']
                if '/D' in action:
                    dest = action['/D']
                    if isinstance(dest, list) and len(dest) > 0:
                        page_ref = dest[0]
                        return self._get_page_number_from_ref(page_ref, pdf_reader)

            # Attempt 2: direct /Dest destination array.
            if '/Dest' in item:
                dest = item['/Dest']
                if isinstance(dest, list) and len(dest) > 0:
                    page_ref = dest[0]
                    return self._get_page_number_from_ref(page_ref, pdf_reader)

            # Attempt 3: other page-reference attributes.
            for key in ['/Page', '/P']:
                if key in item:
                    page_ref = item[key]
                    return self._get_page_number_from_ref(page_ref, pdf_reader)

        except Exception as e:
            print(f"提取页码失败: {e}")

        # Nothing resolvable — default to the first page.
        return 1

    def _get_page_number_from_ref(self, page_ref, pdf_reader) -> int:
        """Map a page object/reference to its 1-based index in the reader."""
        try:
            if hasattr(page_ref, 'get_object'):
                page_obj = page_ref.get_object()
            else:
                page_obj = page_ref

            # Linear scan over the reader's pages for a matching object.
            for i, page in enumerate(pdf_reader.pages):
                if hasattr(page, 'get_object'):
                    page_obj2 = page.get_object()
                    if page_obj2 == page_obj:
                        return i + 1  # convert to 1-based page number

            # Last resort: derive *something* from the indirect reference id.
            # NOTE(review): `ref_id % 100` is a rough heuristic, not a real
            # mapping from object id to page number.
            if hasattr(page_obj, 'indirect_ref'):
                ref_id = getattr(page_obj.indirect_ref, 'idnum', 0)
                if ref_id > 0:
                    return min(max(1, ref_id % 100), len(pdf_reader.pages))

        except Exception as e:
            print(f"解析页面引用失败: {e}")

        return 1

    def _determine_level(self, title: str, base_level: int) -> int:
        """Infer a heading level from *title*; fall back to *base_level*."""
        title_lower = title.lower()

        # Keyword markers checked in ascending level order; first hit wins.
        for level, markers in self.level_markers.items():
            for marker in markers:
                if marker in title_lower:
                    return level

        # Structural numbering patterns.
        if re.match(r'^(第[一二三四五六七八九十]+[章节篇])', title):
            return 1
        elif re.match(r'^\d+\.\d+', title):
            dots_count = title.count('.')
            return min(dots_count, 3)
        elif re.match(r'^[一二三四五六七八九十]、', title):
            return 2

        # Nothing matched — use the caller-supplied outline depth.
        return base_level

    def _clean_title(self, title: str) -> str:
        """Collapse whitespace and strip a leading numbering prefix."""
        # Collapse runs of whitespace.
        title = re.sub(r'\s+', ' ', title).strip()

        # Strip leading numbering such as "1.2 ".
        title = re.sub(r'^\s*[\d\.\s]+\s*', '', title)

        return title

    def extract_toc_from_text(self, file_path: str) -> List[Dict[str, Any]]:
        """Build a TOC by scanning every page's text for heading patterns."""
        try:
            doc = fitz.open(file_path)
            toc_items = []

            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                # Collect heading-looking lines on this page.
                headings = self._find_headings_in_text(text, page_num + 1)
                toc_items.extend(headings)

            doc.close()
            return toc_items
        except Exception as e:
            print(f"从文本提取目录失败: {e}")
            return []

    def _find_headings_in_text(self, text: str, page_num: int) -> List[Dict[str, Any]]:
        """Match heading patterns against every line of *text*.

        Patterns are tried in order and the first match wins.  A trailing
        duplicate of the ``^(\\d+\\.\\d+)`` pattern (with level 2) was
        removed — it was unreachable dead code because the identical regex
        earlier in the list always matched first.
        """
        headings = []
        lines = text.split('\n')

        heading_patterns = [
            # Chinese heading patterns
            (r'^(第[一二三四五六七八九十零百千]+[章节条款篇])\s+(.+)$', 1),
            (r'^([一二三四五六七八九十]、)\s*(.+)$', 2),
            # numeric heading patterns
            (r'^(\d+)\s+(.+)$', 2),
            (r'^(\d+\.\d+)\s+(.+)$', 3),
            (r'^(\d+\.\d+\.\d+)\s+(.+)$', 4),
            # English heading patterns
            (r'^(Chapter|Section)\s+(\d+)\s+(.+)$', 1),
        ]

        for line in lines:
            line = line.strip()
            if len(line) > 100:  # too long to be a heading
                continue

            for pattern, level in heading_patterns:
                match = re.match(pattern, line)
                if match:
                    # Two-group patterns: title is group 2; the English
                    # pattern has three groups with the title in group 3.
                    if len(match.groups()) == 2:
                        title = match.group(2)
                    else:
                        title = match.group(3) if len(match.groups()) > 2 else line

                    headings.append({
                        'title': title.strip(),
                        'level': level,
                        'page_start': page_num,
                        'page_end': page_num,
                        'parent_titles': [],
                        'source': 'text_analysis'
                    })
                    break

        return headings
|
|
|
+
|
|
|
class PDFParser(DocumentParser):
    """PDF document parser.

    Uses ``EnhancedPDFTocExtractor`` for TOC discovery and PyMuPDF
    (falling back to PyPDF2) for content extraction.  Chunk splitting
    behaviour is inherited from ``DocumentParser``.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        super().__init__(chunk_size, chunk_overlap)
        self.toc_extractor = EnhancedPDFTocExtractor()

    def extract_toc_structure(self, file_path: str) -> List[Dict[str, Any]]:
        """Extract and sanity-check the PDF's TOC.

        Returns [] when no TOC could be found, signalling the caller to
        fall back to plain chunk-based splitting.
        """
        print(f"正在解析PDF目录: {os.path.basename(file_path)}")

        toc_structure = self.toc_extractor.extract_complete_toc(file_path)

        if not toc_structure:
            print("未提取到目录结构,将使用传统分割方式")
            return []

        # Validate page numbers / titles and repair obviously broken ones.
        toc_structure = self._validate_and_fix_toc(toc_structure, file_path)

        print(f"最终提取到 {len(toc_structure)} 个目录项")
        return toc_structure

    def _validate_and_fix_toc(self, toc_structure: List[Dict[str, Any]], file_path: str) -> List[Dict[str, Any]]:
        """Repair invalid page numbers and noisy titles in *toc_structure*.

        A page number counts as invalid when it is non-positive, far beyond
        the document length, or goes backwards relative to the previous
        entry; such entries get an estimated page and ``source='estimated'``.
        Entries are mutated in place.
        """
        if not toc_structure:
            return []

        doc_length = self._get_pdf_length(file_path)
        fixed_toc = []

        print(f"文档总页数: {doc_length}")
        print(f"开始验证 {len(toc_structure)} 个目录项...")

        invalid_count = 0

        for i, item in enumerate(toc_structure):
            original_page = item['page_start']

            # Invalid page: out of range, or ordered before its predecessor
            # (the predecessor may itself already have been repaired).
            if (original_page <= 0 or
                original_page > doc_length + 5 or  # tolerate slight overshoot
                (i > 0 and original_page < toc_structure[i-1]['page_start'])):

                estimated_page = self._estimate_page_number(item, doc_length, toc_structure)
                item['page_start'] = estimated_page
                item['page_end'] = estimated_page
                item['source'] = 'estimated'
                invalid_count += 1

                print(f"  修复第{i+1}项: '{item['title']}' 页码 {original_page} -> {estimated_page}")

            # Normalise the title (drop file suffixes, page refs, ...).
            original_title = item['title']
            item['title'] = self._clean_title_completely(original_title)
            if original_title != item['title']:
                print(f"  清理标题: '{original_title}' -> '{item['title']}'")

            fixed_toc.append(item)

        if invalid_count > 0:
            print(f"共修复 {invalid_count} 个无效页码")

        return fixed_toc

    def _get_pdf_length(self, file_path: str) -> int:
        """Return the page count of the PDF, or 100 when it can't be read."""
        try:
            doc = fitz.open(file_path)
            length = len(doc)
            doc.close()
            return length
        except Exception:
            # Bug fix: was a bare `except:` that also swallowed
            # SystemExit / KeyboardInterrupt.
            return 100  # conservative default

    def _estimate_page_number(self, item: Dict[str, Any], doc_length: int, toc_structure: List[Dict[str, Any]] = None) -> int:
        """Estimate a plausible page number for a TOC *item*.

        Preference order:
          1. Nearest preceding entry with a trusted (non-estimated) page,
             plus a small offset (each entry assumed to span ~2-3 pages).
          2. Position-based guess: early entries near the front, later ones
             distributed proportionally over the first 80% of the document
             (the last 20% is reserved for appendices etc.).
        """
        if not toc_structure:
            return 1

        try:
            # Bug fix: locate the item by identity.  The original used
            # list.index(item), which compares dicts by equality and could
            # resolve duplicated TOC entries to the wrong position.
            index = -1
            for pos, candidate in enumerate(toc_structure):
                if candidate is item:
                    index = pos
                    break
            if index < 0:
                return 1

            # 1) Walk backwards to the nearest trusted page number.
            for i in range(index - 1, -1, -1):
                prev_item = toc_structure[i]
                if (prev_item['page_start'] > 0 and
                        prev_item['page_start'] <= doc_length and
                        prev_item.get('source') != 'estimated'):
                    estimated = prev_item['page_start'] + 2
                    return min(estimated, doc_length)

            # 2) Position-based estimate (TOC entries are front-loaded).
            if index < 10:  # first few entries live in the opening pages
                return min(index + 1, 20)
            progress = index / len(toc_structure)
            estimated = int(progress * doc_length * 0.8) + 1
            return min(max(1, estimated), doc_length)

        except (ValueError, IndexError):
            return 1

    def _clean_title_completely(self, title: str) -> str:
        """Aggressively normalise a TOC title; never returns an empty string."""
        if not title:
            return "未命名标题"

        # Strip file-name suffixes occasionally embedded in bookmarks.
        title = re.sub(r'\.(pdf|docx?|txt)$', '', title, flags=re.IGNORECASE)

        # Strip inline page references like "(页12)".
        title = re.sub(r'[\(\{\[\<]?页码?\s*\d+[\)\}\ \]\>]?', '', title)

        # Collapse whitespace.
        title = re.sub(r'\s+', ' ', title).strip()

        return title if title else "未命名标题"

    def extract_full_content(self, file_path: str) -> str:
        """Extract the whole document text, page by page.

        Uses PyMuPDF first; falls back to PyPDF2 on failure.  Pages are
        separated by "--- 第N页 ---" markers; empty pages are skipped.
        """
        full_content = ""

        try:
            doc = fitz.open(file_path)

            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()

                if text.strip():
                    full_content += f"--- 第{page_num + 1}页 ---\n{text}\n\n"

            doc.close()

        except Exception as e:
            print(f"PyMuPDF提取内容失败: {e}")
            # Fall back to PyPDF2's text extraction.
            try:
                with open(file_path, 'rb') as file:
                    pdf_reader = PdfReader(file)

                    for page_num, page in enumerate(pdf_reader.pages, 1):
                        page_content = page.extract_text()
                        if page_content.strip():
                            full_content += f"--- 第{page_num}页 ---\n{page_content}\n\n"
            except Exception as e2:
                print(f"PyPDF2提取内容也失败: {e2}")

        return full_content

    def extract_content_by_section(self, file_path: str, section_info: Dict[str, Any]) -> str:
        """Extract the text of the page range described by *section_info*.

        *section_info* must carry 'title' and may carry 'page_start' /
        'page_end' (1-based, both defaulting sensibly).
        """
        content = ""
        start_page = section_info.get('page_start', 1)
        end_page = section_info.get('page_end', start_page)

        print(f"提取章节: {section_info['title']}, 页码: {start_page}-{end_page}")

        try:
            doc = fitz.open(file_path)

            # Clamp the 1-based range into valid 0-based page indices.
            start_idx = max(0, start_page - 1)
            end_idx = min(len(doc) - 1, end_page - 1)

            for page_num in range(start_idx, end_idx + 1):
                page = doc[page_num]
                text = page.get_text()
                if text.strip():
                    content += text + "\n\n"

            doc.close()

        except Exception as e:
            print(f"提取章节内容失败: {e}")

        return content
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class WordParser(DocumentParser):
    """Word (.docx) document parser based on python-docx heading styles."""

    def extract_toc_structure(self, file_path: str) -> List[Dict[str, Any]]:
        """Build a TOC from paragraphs styled 'Heading N'.

        Word files carry no direct page information, so ``page_start`` /
        ``page_end`` are rough estimates derived from the paragraph index
        (assuming ~50 paragraphs per page).
        """
        doc = docx.Document(file_path)
        toc_structure = []
        current_parents = []

        # Bug fix: the original called doc.paragraphs.index(paragraph)
        # inside the loop — O(n^2) per document, and `doc.paragraphs`
        # rebuilds its wrapper objects on each access so index() is
        # unreliable.  enumerate() yields the index directly.
        for para_index, paragraph in enumerate(doc.paragraphs):
            if not paragraph.style.name.startswith('Heading'):
                continue

            level_text = paragraph.style.name.replace('Heading', '').strip()
            if not level_text.isdigit():
                # Skip styles like a bare "Heading" with no numeric level
                # instead of crashing in int().
                continue
            level = int(level_text)
            title = paragraph.text.strip()

            # Pop parents at the same or deeper level.
            while current_parents and current_parents[-1]['level'] >= level:
                current_parents.pop()

            parent_titles = [p['title'] for p in current_parents]

            # Approximate page number: ~50 paragraphs per page.
            estimated_page = para_index // 50 + 1

            toc_structure.append({
                'title': title,
                'level': level,
                'page_start': estimated_page,
                'page_end': estimated_page,
                'parent_titles': parent_titles.copy(),
                'paragraph_index': para_index
            })

            current_parents.append({
                'title': title,
                'level': level,
                'index': len(toc_structure) - 1
            })

        return toc_structure

    def extract_content_by_section(self, file_path: str, section_info: Dict[str, Any]) -> str:
        """Collect the paragraph text of one section.

        The section starts at ``section_info['paragraph_index']`` and runs
        up to (exclusive) the next heading of the same or a higher level.
        """
        doc = docx.Document(file_path)
        content = []
        in_section = False
        current_level = section_info['level']

        start_index = section_info.get('paragraph_index', 0)
        next_section_index = self._find_next_section_index(doc, start_index, current_level)

        for i, paragraph in enumerate(doc.paragraphs):
            if i < start_index:
                continue

            if i == start_index:
                # The section's own heading paragraph.
                in_section = True
                content.append(paragraph.text)
                continue

            if in_section:
                # Stop at the next same-or-higher-level heading.
                if (paragraph.style.name.startswith('Heading') and
                        i >= next_section_index):
                    break

                content.append(paragraph.text)

        return '\n'.join(content)

    def _find_next_section_index(self, doc, start_index: int, current_level: int) -> int:
        """Return the index of the next heading at *current_level* or above
        (smaller level number) after *start_index*; falls back to the
        paragraph count when the section runs to the end of the document.
        """
        for i in range(start_index + 1, len(doc.paragraphs)):
            paragraph = doc.paragraphs[i]
            if paragraph.style.name.startswith('Heading'):
                level_text = paragraph.style.name.replace('Heading', '').strip()
                if not level_text.isdigit():
                    # Consistent with extract_toc_structure: ignore styles
                    # without a numeric level.
                    continue
                if int(level_text) <= current_level:
                    return i
        return len(doc.paragraphs)
|
|
|
+
|
|
|
class DocumentSplitter:
    """文档拆分管理器 — dispatches files to the parser for their extension."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # One parser instance per supported file extension.
        self.parsers = {
            '.pdf': PDFParser(chunk_size, chunk_overlap),
            '.docx': WordParser(chunk_size, chunk_overlap),
        }

    def split_document(self, file_path: str) -> List[DocumentChunk]:
        """Split *file_path* into chunks via the parser registered for its
        extension; raises ValueError for unsupported formats."""
        _, file_ext = os.path.splitext(file_path)
        file_ext = file_ext.lower()

        if file_ext not in self.parsers:
            raise ValueError(f"不支持的文件格式: {file_ext}")

        return self.parsers[file_ext].split_document(file_path)

    def save_chunks_to_json(self, chunks: List[DocumentChunk], output_file: str):
        """Serialise *chunks* (content + metadata) to a UTF-8 JSON file."""
        serialised = [
            {
                'content': chunk.content,
                'metadata': {
                    'title': chunk.metadata.title,
                    'level': chunk.metadata.level,
                    'page_start': chunk.metadata.page_start,
                    'page_end': chunk.metadata.page_end,
                    'parent_titles': chunk.metadata.parent_titles,
                    'content_hash': chunk.metadata.content_hash
                },
                'chunk_index': chunk.chunk_index,
                'total_chunks': chunk.total_chunks
            }
            for chunk in chunks
        ]

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(serialised, f, ensure_ascii=False, indent=2)

    def print_chunk_summary(self, chunks: List[DocumentChunk]):
        """Print a human-readable overview of every chunk."""
        print("\n=== 文档拆分摘要 ===")
        print(f"总片段数: {len(chunks)}")

        for position, chunk in enumerate(chunks, start=1):
            meta = chunk.metadata
            print(f"\n片段 {position}:")
            print(f"  标题: {meta.title}")
            print(f"  层级: {meta.level}")
            print(f"  页码: {meta.page_start}-{meta.page_end}")
            print(f"  父级: {' -> '.join(meta.parent_titles)}")
            print(f"  片段: {chunk.chunk_index + 1}/{chunk.total_chunks}")
            print(f"  内容长度: {len(chunk.content)}")
            print(f"  内容预览: {chunk.content[:100]}...")
|
|
|
+
|
|
|
+# 使用示例
|
|
|
def main():
    """Demo entry point: split a sample PDF into chunks."""
    splitter = DocumentSplitter(chunk_size=800, chunk_overlap=50)

    # Sample input locations used by the original demo.
    file_path = "I:/wangxun_dev_workspace/lq_workspace/LQDataGovernance/test/bfp_files/"
    pdf_file = file_path + "公路工程施工安全技术规范.pdf"
    word_file = "example.docx"  # kept from the original demo; not exercised here

    try:
        if os.path.exists(pdf_file):
            print(f"测试PDF目录提取: {pdf_file}")

            # Stand-alone parser instance, as in the original demo flow.
            pdf_parser = PDFParser()

            # Split the whole document (summary printing left disabled).
            chunks = splitter.split_document(pdf_file)

    except Exception as e:
        print(f"处理文档时出错: {e}")
|
|
|
+
|
|
|
# Run the demo only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|