
Markdown data preprocessing

lingmin_package@163.com 1 month ago
parent
commit
62f5eddf2c

+ 6 - 0
README.md

@@ -12,6 +12,12 @@
     - gunicorn -c gunicorn_config.py server.app:app       multi-process startup
 
 
+
+
+### Document processing
+  pip install PyPDF2 python-docx langchain-text-splitters -i https://mirrors.aliyun.com/pypi/simple/
+
+
  ### PostgreSQL database operation tests
     sentence-transformers
 
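A minimal sketch of the newly added document-processing dependencies (file names are placeholders; the in-repo code imports the same splitter through `langchain.text_splitter`, which additionally requires the `langchain` package):

```python
# Extract raw text with PyPDF2 / python-docx, then chunk it with a recursive splitter.
import PyPDF2
import docx
from langchain_text_splitters import RecursiveCharacterTextSplitter

pdf_text = "\n".join(page.extract_text() or "" for page in PyPDF2.PdfReader("sample.pdf").pages)
docx_text = "\n".join(p.text for p in docx.Document("sample.docx").paragraphs)

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)  # matches the processor defaults
chunks = splitter.split_text(pdf_text + "\n" + docx_text)
print(f"{len(chunks)} chunks")
```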

+ 6 - 2
config/config.ini

@@ -59,8 +59,12 @@ PGVECTOR_USER=vector_user
 PGVECTOR_PASSWORD=pg16@123
 
 [milvus]
-MILVUS_HOST=192.168.0.3
+MILVUS_HOST=192.168.0.5
 MILVUS_PORT=19530
 MILVUS_DB=lq_db
 MILVUS_USER=
-MILVUS_PASSWORD=
+MILVUS_PASSWORD=
+
+
+[minerU]
+MINERU_SERVER_URL=http://192.168.0.166:8000
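A minimal sketch of reading the new [minerU] entry (standard-library configparser shown here; the project's own config_handler in foundation.base.config may expose a different API):

```python
import configparser

config = configparser.ConfigParser()
config.read("config/config.ini")

milvus_host = config.get("milvus", "MILVUS_HOST")       # 192.168.0.5
mineru_url = config.get("minerU", "MINERU_SERVER_URL")  # http://192.168.0.166:8000
print(milvus_host, mineru_url)
```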

+ 11 - 0
docker/redis.conf

@@ -0,0 +1,11 @@
+# Listen on all interfaces and allow remote connections
+bind 0.0.0.0
+port 6379
+protected-mode no
+
+# Set a password (replace with your own)
+requirepass Wxcz666@
+
+# Enable AOF persistence
+appendonly yes
+dir /data
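A quick connectivity check for this Redis configuration (a sketch assuming the redis-py client and a container publishing port 6379 on localhost; the password mirrors docker/redis.conf and should be replaced in real deployments):

```python
import redis

client = redis.Redis(host="127.0.0.1", port=6379, password="Wxcz666@")
client.set("healthcheck", "ok")
print(client.get("healthcheck"))  # b'ok' when the AOF-enabled instance is reachable
```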

+ 21 - 0
file_processors/bfp_document_catalog_processor.py

@@ -0,0 +1,21 @@
+"""
+Document processor.
+Handles document parsing, content extraction, and structured processing.
+Integrates the intelligent processing capabilities of the doc_worker module.
+"""
+
+import io
+import os
+import tempfile
+from pathlib import Path
+from typing import Dict, Any, Optional, Callable
+from datetime import datetime
+from foundation.logger.loggering import server_logger as logger
+
+
+
+class DocumentCatalogProcessor:
+    """
+    Document processor.
+    """

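The class body is still a stub in this commit. Its docstring points at doc_worker, so a first implementation might simply delegate batch processing to AdvancedDocumentSplitter; the wrapper below is only a hypothetical sketch (the class and method names are assumptions, and the import path follows the repository layout):

```python
import os
from file_processors.doc_worker.advanced_document_splitter import AdvancedDocumentSplitter

class DocumentCatalogProcessorSketch:
    """Hypothetical delegate wrapper around the doc_worker splitter (illustration only)."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
        self.splitter = AdvancedDocumentSplitter(chunk_size, chunk_overlap)

    def process_folder(self, input_folder: str, output_folder: str) -> None:
        # split every supported file and write a processing report next to the chunks
        self.splitter.batch_process(input_folder, output_folder)
        self.splitter.generate_report(os.path.join(output_folder, "processing_report.json"))
```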
+ 487 - 0
file_processors/bfp_md_processor.py

@@ -0,0 +1,487 @@
+import re
+import os
+import json
+from typing import List, Dict, Any
+from tqdm import tqdm
+import time
+from foundation.logger.loggering import server_logger as logger
+from foundation.utils.common import handler_err
+from foundation.base.config import config_handler
+from foundation.rag.vector.base_vector import BaseVectorDB
+
+class BfpMarkdownProcessor():
+
+    def __init__(self, directory, base_vector: BaseVectorDB, **kwargs):
+        """
+        Initialize the Markdown processor
+        :param directory: directory containing the Markdown files to import
+        :param base_vector: vector store used to persist the resulting chunks
+        :param kwargs: additional parameters
+        """
+        self.base_vector = base_vector
+        self.directory = directory  # directory containing the files to import
+        self.file_group_num = kwargs.get('file_group_num', 20)  # number of files processed per group
+        self.batch_num = kwargs.get('batch_num', 6)  # number of batches per insert
+        self.chunksize = kwargs.get('chunksize', 500)  # chunk size for text splitting
+        self.overlap = kwargs.get('overlap', 100)  # overlap between adjacent chunks
+        self.file_suffix_list = kwargs.get('file_suffix_list', ['.md'])
+        logger.info(f"""
+                       Initialized Markdown file importer:
+                       Configuration:
+                       - file suffix list: {self.file_suffix_list}
+                       - import directory: {self.directory}
+                       - files per group: {self.file_group_num}
+                       - samples per batch: {self.batch_num}
+                       - chunk size: {self.chunksize}
+                       - chunk overlap: {self.overlap}
+                       """)
+
+    def load_files(self):
+        """
+        Load all files in the directory whose suffix is in the configured suffix list
+        """
+        file_path = os.path.join(self.directory)
+        pdf_path_files = []
+        pdf_file_names = []
+        # logger.info(f"file_path: {file_path}")
+        for file_name in os.listdir(file_path):
+            # get the file suffix (with dot)  # file_name.lower().endswith('.docx'):
+            file_suffix = os.path.splitext(file_name)[1]
+            if file_suffix in self.file_suffix_list:
+                pdf_file_names.append(file_name)
+                pdf_path_files.append(os.path.join(file_path, file_name))
+            else:
+                logger.info(f"Skipping {file_name} because its suffix is not in {self.file_suffix_list}.")
+
+        logger.info(f"Found {len(pdf_file_names)} matching files.")
+        logger.info(f"pdf_path_files: {pdf_path_files},pdf_file_names:{pdf_file_names}")
+        return pdf_path_files, pdf_file_names
+
+
+
+    def process_tqdm_pdfs_group(self, key_name: str = "collection_name"):
+        """
+        Process the group of Markdown files and insert them directly into the vector store
+        """
+        total_chunks = 0
+        # load the files to be processed
+        path_files, file_names = self.load_files()
+
+        logger.info(f"process {len(path_files)} documents.")
+        start_time = time.time()
+        total_docs_inserted = 0
+
+        total_batches = len(path_files)
+
+        with tqdm(total=total_batches, desc="process batches", unit="batch") as pbar:
+            for path_file, file_name in zip(path_files, file_names):
+                # initialize the splitter
+                splitter = MarkdownDocumentSplitter(max_chunk_size=self.chunksize)
+                splitter.load_markdown(path_file)
+                # parse the document structure
+                structure = splitter.parse_structure()
+                logger.info("Document structure parsing finished")
+                # split the document into chunks
+                chunks = splitter.split_document()
+                total_chunks += len(chunks)
+                logger.info(f"Generated {len(chunks)} document chunks")
+
+                # save the split results
+                output_dir = 'test/bfp_chunks_files/'+file_name
+                splitter.save_chunks(chunks, output_dir)
+                # generate the structure report
+                splitter.generate_structure_report(os.path.join(output_dir, 'structure_report.md'))
+                logger.info(f"All files saved to directory {output_dir}")
+
+                logger.info(f"Documents file_name:{file_name},docs:{len(chunks)}")
+                # standardize the chunks and insert them through the provided vector store
+                documents = self.base_vector.document_standard(chunks)
+                self.base_vector.add_tqdm_batch_documents(param={key_name: "tv_basis_of_preparation_md"},
+                                                          documents=documents)
+
+                total_docs_inserted += 1
+                # compute and display the current throughput (TPM)
+                elapsed_time = time.time() - start_time
+                if elapsed_time > 0:
+                    tpm = (total_docs_inserted / elapsed_time) * 60
+                    pbar.set_postfix({"TPM": f"{tpm:.2f}"})
+
+                pbar.update(1)
+
+        # TODO: the splitting could attach richer metadata to each chunk
+        logger.info(
+            f"Processed Documents:{self.directory},docs:{len(path_files)},total_chunks:{total_chunks}")
+
+
+
+
+class MarkdownDocumentSplitter:
+    def __init__(self, max_chunk_size: int = 4000):
+        self.max_chunk_size = max_chunk_size
+        self.content = ""
+        self.structure = []
+
+    def load_markdown(self, file_path: str) -> str:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            self.content = f.read()
+        return self.content
+
+
+    def parse_structure(self) -> List[Dict[str, Any]]:
+        lines = self.content.split('\n')
+        structure = []
+        hierarchy_stack = []  # stack used to track the heading hierarchy
+
+        for i, line in enumerate(lines):
+            line = line.strip()
+            if line.startswith('#') and ' ' in line:
+                # extract the heading level and title text
+                level_match = re.match(r'^(#+)\s+(.+)$', line)
+                if level_match:
+                    level = len(level_match.group(1))
+                    title = level_match.group(2).strip()
+
+                    # create the section object
+                    section = {
+                        'level': level,
+                        'title': title,
+                        'start_line': i,
+                        'end_line': None,
+                        'content': '',
+                        'children': [],
+                        'parent': None
+                    }
+
+                    # pop entries at the same or deeper level to find the parent
+                    while hierarchy_stack and hierarchy_stack[-1]['level'] >= level:
+                        hierarchy_stack.pop()
+
+                    # attach to the parent section, or to the top level
+                    if hierarchy_stack:
+                        parent = hierarchy_stack[-1]
+                        section['parent'] = parent
+                        parent['children'].append(section)
+                    else:
+                        structure.append(section)
+
+                    hierarchy_stack.append(section)
+
+        # fill in section content and build the full title paths
+        for section in self._flatten_structure(structure):
+            start = section['start_line'] + 1
+            end = self._find_section_end(lines, start, section['level'])
+            section['end_line'] = end
+
+            raw_content = '\n'.join(lines[start:end + 1])
+            cleaned_content = self._clean_content(raw_content)
+            section['content'] = cleaned_content
+
+            # build the full title path and hierarchy
+            section['full_title'] = self._build_full_title_path(section)
+            section['hierarchy'] = self._get_section_hierarchy(section)
+
+        self.structure = structure
+        return structure
+
+
+
+
+
+    def _build_full_title_path(self, section: Dict) -> str:
+        """Build the full title path from the root heading down to this section."""
+        titles = []
+        current = section
+        while current:
+            titles.insert(0, current['title'])
+            current = current.get('parent')
+        return '->'.join(titles)
+
+    def _get_section_hierarchy(self, section: Dict) -> List[str]:
+        """Return the list of titles from the root heading down to this section."""
+        hierarchy = []
+        current = section
+        while current:
+            hierarchy.insert(0, current['title'])
+            current = current.get('parent')
+        return hierarchy
+
+    def _clean_content(self, content: str) -> str:
+        """Clean content by collapsing redundant blank lines."""
+        lines = content.split('\n')
+        cleaned_lines = []
+
+        for line in lines:
+            stripped_line = line.strip()
+            # skip blank lines but keep lines with content
+            if stripped_line:
+                cleaned_lines.append(line)
+            elif cleaned_lines and cleaned_lines[-1].strip():  # keep at most one blank line after a content line
+                cleaned_lines.append('')
+
+        # drop trailing blank lines
+        while cleaned_lines and not cleaned_lines[-1].strip():
+            cleaned_lines.pop()
+
+        return '\n'.join(cleaned_lines)
+
+    def _has_meaningful_content(self, content: str) -> bool:
+        """Check whether the content has meaningful text (not just headings)."""
+        lines = content.split('\n')
+        for line in lines:
+            line = line.strip()
+            if line and not line.startswith('#') and len(line) > 5:
+                return True
+        return False
+
+    def _flatten_structure(self, structure: List[Dict]) -> List[Dict]:
+        """Flatten the nested section structure into a flat list."""
+        flattened = []
+        for section in structure:
+            flattened.append(section)
+            flattened.extend(self._flatten_structure(section['children']))
+        return flattened
+
+    def _find_section_end(self, lines: List[str], start_idx: int, current_level: int) -> int:
+        """Find the last line belonging to the section starting at start_idx."""
+        for i in range(start_idx, len(lines)):
+            line = lines[i].strip()
+            if line.startswith('#') and ' ' in line:
+                level_match = re.match(r'^(#+)\s+', line)
+                if level_match:
+                    level = len(level_match.group(1))
+                    if level <= current_level:
+                        return i - 1
+        return len(lines) - 1
+
+    def split_document(self) -> List[Dict[str, Any]]:
+        if not self.structure:
+            self.parse_structure()
+
+        chunks = []
+        flattened = self._flatten_structure(self.structure)
+
+        for section in flattened:
+            if not section['content'] or not self._has_meaningful_content(section['content']):
+                continue
+
+            if len(section['content']) <= self.max_chunk_size:
+                chunk = self._create_chunk(section)
+                if chunk:
+                    chunks.append(chunk)
+            else:
+                sub_chunks = self._split_section_content(section)
+                chunks.extend(sub_chunks)
+
+        return chunks
+
+    def _split_section_content(self, section: Dict) -> List[Dict]:
+        """Split a section's content into smaller chunks."""
+        content = section['content']
+        lines = content.split('\n')
+        chunks = []
+        current_chunk = []
+        current_size = 0
+
+        for line in lines:
+            line_size = len(line)
+
+            if (current_size + line_size > self.max_chunk_size and current_chunk) or \
+                    (line.strip() and line.startswith('#') and current_chunk and current_size > 100):
+
+                chunk_content = '\n'.join(current_chunk)
+                chunk_content = self._clean_content(chunk_content)
+                if self._has_meaningful_content(chunk_content):
+                    chunk = self._create_sub_chunk(section, chunk_content, len(chunks))
+                    chunks.append(chunk)
+
+                current_chunk = [line]
+                current_size = line_size
+            else:
+                current_chunk.append(line)
+                current_size += line_size
+
+        if current_chunk:
+            chunk_content = '\n'.join(current_chunk)
+            chunk_content = self._clean_content(chunk_content)
+            if self._has_meaningful_content(chunk_content):
+                chunk = self._create_sub_chunk(section, chunk_content, len(chunks))
+                chunks.append(chunk)
+
+        return chunks
+
+    def _create_chunk(self, section: Dict) -> Dict:
+        """Create a document chunk for a whole section."""
+        content_with_parents = self._build_content_with_parents(section)
+
+        return {
+            'metadata': {
+                'title': section['title'],
+                'level': section['level'],
+                'hierarchy': section['hierarchy'],
+                'full_title_path': section['full_title'],
+                'start_line': section['start_line'],
+                'end_line': section['end_line'],
+                'chunk_type': 'section'
+            },
+            'content': content_with_parents,
+            'content_size': len(content_with_parents)
+        }
+
+    def _create_sub_chunk(self, section: Dict, content: str, sub_index: int) -> Dict:
+        """Create a sub-chunk for part of a section."""
+        content_with_parents = self._build_content_with_parents(section, content, sub_index)
+
+        return {
+            'metadata': {
+                'title': f"{section['title']} - 部分{sub_index + 1}",
+                'level': section['level'],
+                'hierarchy': section['hierarchy'],
+                'full_title_path': section['full_title'],
+                'start_line': section['start_line'],
+                'end_line': section['end_line'],
+                'chunk_type': 'subsection',
+                'parent_title': section['title'],
+                'sub_index': sub_index
+            },
+            'content': content_with_parents,
+            #'page_content': content_with_parents,
+            'content_size': len(content_with_parents)
+        }
+
+    def _build_content_with_parents(self, section: Dict, sub_content: str = None, sub_index: int = None) -> str:
+        """Build the full content, prefixed with all parent headings."""
+        # collect all parent sections
+        parents = []
+        current = section
+        while current:
+            parents.insert(0, current)
+            current = current.get('parent')
+
+        content_lines = []
+
+        # add the headings for every level, starting from level 1
+        for i, parent_section in enumerate(parents):
+            level = i + 1
+            content_lines.append(f"{'#' * level} {parent_section['title']}")
+
+        # if this is a sub-chunk, add a sub-chunk heading
+        if sub_index is not None:
+            content_lines.append(f"{'#' * (len(parents) + 1)} 部分 {sub_index + 1}")
+
+        content_lines.append("")  # blank line between headings and content
+
+        # append the body content
+        if sub_content:
+            content_lines.append(sub_content)
+        else:
+            content_lines.append(section['content'])
+
+        full_content = '\n'.join(content_lines)
+        return self._clean_content(full_content)
+
+
+    def save_chunks(self, chunks: List[Dict], output_dir: str):
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+
+        metadata = {
+            'total_chunks': len(chunks),
+            'max_chunk_size': self.max_chunk_size,
+            'document_info': {
+                'title': '公路工程施工安全技术规范',
+                'code': 'JTG F90-2015'
+            }
+        }
+
+        with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=2)
+
+        for i, chunk in enumerate(chunks):
+            filename = f"chunk_{i:03d}.md"
+            filepath = os.path.join(output_dir, filename)
+
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(chunk['content'])
+
+            chunk_metadata_path = os.path.join(output_dir, f"chunk_{i:03d}_metadata.json")
+            with open(chunk_metadata_path, 'w', encoding='utf-8') as f:
+                json.dump(chunk['metadata'], f, ensure_ascii=False, indent=2)
+
+    def generate_structure_report(self, output_file: str):
+        """Generate a structure analysis report."""
+        if not self.structure:
+            self.parse_structure()
+
+        report = ["# 文档结构分析报告\n"]
+        report.append(f"## 总章节数: {len(self._flatten_structure(self.structure))}")
+        report.append(f"## 最大块大小: {self.max_chunk_size} 字符\n")
+
+        report.append("## 文档层级结构:\n")
+        self._add_structure_to_report(self.structure, report, 0)
+
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(report))
+
+    def _add_structure_to_report(self, structure: List[Dict], report: List[str], indent: int):
+        """Recursively append structure entries to the report."""
+        for section in structure:
+            indent_str = "  " * indent
+            title = section['title']
+            level = section['level']
+            content_size = len(section['content'])
+            children_count = len(section['children'])
+
+            report.append(f"{indent_str}- {'#' * level} {title} "
+                          f"(内容大小: {content_size}字符, 子章节: {children_count})")
+
+            if section['children']:
+                self._add_structure_to_report(section['children'], report, indent + 1)
+
+def main():
+    # initialize the splitter
+    splitter = MarkdownDocumentSplitter(max_chunk_size=2000)
+
+    # example file path
+    file_path = "I:/wangxun_dev_workspace/lq_workspace/output/"
+    pdf_file = file_path + "公路工程施工安全技术规范_2.md"
+
+    # load the document (this assumes it has already been exported as a markdown file)
+    try:
+        splitter.load_markdown(pdf_file)
+    except FileNotFoundError:
+        logger.info(f"Please save the document content as '{pdf_file}' first")
+        return
+
+    # parse the document structure
+    structure = splitter.parse_structure()
+    logger.info("Document structure parsing finished")
+
+    # split the document
+    chunks = splitter.split_document()
+    logger.info(f"Generated {len(chunks)} document chunks")
+
+    # save the results
+    output_dir = 'knowledge_base_chunks'
+    splitter.save_chunks(chunks, output_dir)
+
+    # generate the structure report
+    splitter.generate_structure_report(os.path.join(output_dir, 'structure_report.md'))
+
+    logger.info(f"All files saved to directory {output_dir}")
+
+    # show a sample chunk
+    if chunks:
+        logger.info(f"\nFirst chunk content:")
+        logger.info("=" * 50)
+        logger.info(chunks[0]['content'])
+        logger.info("=" * 50)
+
+        logger.info(f"\nFirst chunk metadata:")
+        logger.info(json.dumps(chunks[0]['metadata'], ensure_ascii=False, indent=2))
+
+
+if __name__ == "__main__":
+    main()
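A wiring sketch for the importer above. The concrete BaseVectorDB implementation is project-specific, so the duck-typed stub below only fakes the two methods the processor actually calls (document_standard and add_tqdm_batch_documents); the directory, import path, and collection name are placeholders taken from this diff:

```python
from file_processors.bfp_md_processor import BfpMarkdownProcessor  # path assumed from the repo layout

class DummyVector:
    """Stand-in for a BaseVectorDB implementation (illustration only)."""

    def document_standard(self, chunks):
        # a real implementation would map chunks to the store's document schema
        return chunks

    def add_tqdm_batch_documents(self, param, documents):
        print(f"would insert {len(documents)} documents into {param}")

processor = BfpMarkdownProcessor(
    directory="I:/wangxun_dev_workspace/lq_workspace/output/",
    base_vector=DummyVector(),
    chunksize=500,
    overlap=100,
)
processor.process_tqdm_pdfs_group(key_name="collection_name")
```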

+ 317 - 0
file_processors/bfp_md_processor_bak.py

@@ -0,0 +1,317 @@
+
+
+import re
+import os
+import json
+from typing import List, Dict, Any, Tuple
+
+class MarkdownDocumentSplitter:
+    def __init__(self, max_chunk_size: int = 4000):
+        """
+        Initialize the document splitter
+        
+        Args:
+            max_chunk_size: maximum chunk size in characters
+        """
+        self.max_chunk_size = max_chunk_size
+        self.content = ""
+        self.structure = []
+        
+    def load_markdown(self, file_path: str) -> str:
+        """
+        加载Markdown文件
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            文件内容
+        """
+        with open(file_path, 'r', encoding='utf-8') as f:
+            self.content = f.read()
+        return self.content
+    
+    def parse_structure(self) -> List[Dict[str, Any]]:
+        """
+        解析文档结构
+        
+        Returns:
+            文档结构列表
+        """
+        lines = self.content.split('\n')
+        structure = []
+        current_section = None
+        
+        for i, line in enumerate(lines):
+            # 检测标题行
+            if line.strip().startswith('#') and ' ' in line:
+                level = len(line.split(' ')[0].strip())
+                title = line.split(' ', 1)[1].strip()
+                
+                section = {
+                    'level': level,
+                    'title': title,
+                    'start_line': i,
+                    'end_line': None,
+                    'content': '',
+                    'children': []
+                }
+                
+                # 如果是顶级标题
+                if level == 1:
+                    structure.append(section)
+                    current_section = section
+                else:
+                    # 找到父级标题
+                    parent = self._find_parent_section(structure, level-1)
+                    if parent:
+                        parent['children'].append(section)
+                        current_section = section
+        
+        # 填充内容
+        for section in self._flatten_structure(structure):
+            start = section['start_line'] + 1
+            end = self._find_section_end(lines, start, section['level'])
+            section['end_line'] = end
+            section['content'] = '\n'.join(lines[start:end+1]).strip()
+            
+        self.structure = structure
+        return structure
+    
+    def _find_parent_section(self, structure: List[Dict], target_level: int) -> Dict:
+        """递归查找父级标题"""
+        if not structure:
+            return None
+            
+        for section in reversed(structure):
+            if section['level'] == target_level:
+                return section
+            if section['children']:
+                result = self._find_parent_section(section['children'], target_level)
+                if result:
+                    return result
+        return None
+    
+    def _flatten_structure(self, structure: List[Dict]) -> List[Dict]:
+        """展平结构"""
+        flattened = []
+        for section in structure:
+            flattened.append(section)
+            flattened.extend(self._flatten_structure(section['children']))
+        return flattened
+    
+    def _find_section_end(self, lines: List[str], start_idx: int, current_level: int) -> int:
+        """查找章节结束位置"""
+        for i in range(start_idx, len(lines)):
+            line = lines[i].strip()
+            if line.startswith('#') and ' ' in line:
+                level = len(line.split(' ')[0].strip())
+                if level <= current_level:
+                    return i - 1
+        return len(lines) - 1
+    
+    def split_document(self) -> List[Dict[str, Any]]:
+        """
+        拆分文档为知识库片段
+        
+        Returns:
+            拆分后的片段列表
+        """
+        if not self.structure:
+            self.parse_structure()
+            
+        chunks = []
+        flattened = self._flatten_structure(self.structure)
+        
+        for section in flattened:
+            # 检查是否需要进一步拆分
+            if len(section['content']) <= self.max_chunk_size:
+                chunk = self._create_chunk(section)
+                chunks.append(chunk)
+            else:
+                # 需要进一步拆分为子块
+                sub_chunks = self._split_section_content(section)
+                chunks.extend(sub_chunks)
+                
+        return chunks
+    
+    def _split_section_content(self, section: Dict) -> List[Dict]:
+        """拆分章节内容为更小的块"""
+        content = section['content']
+        lines = content.split('\n')
+        chunks = []
+        current_chunk = []
+        current_size = 0
+        
+        for line in lines:
+            line_size = len(line)
+            
+            # 如果是新段落或当前块太大,创建新块
+            if (current_size + line_size > self.max_chunk_size and current_chunk) or \
+               (line.strip() and not line.startswith(' ') and current_chunk):
+                
+                chunk_content = '\n'.join(current_chunk)
+                chunk = self._create_sub_chunk(section, chunk_content, len(chunks))
+                chunks.append(chunk)
+                
+                current_chunk = [line]
+                current_size = line_size
+            else:
+                current_chunk.append(line)
+                current_size += line_size
+        
+        # 添加最后一个块
+        if current_chunk:
+            chunk_content = '\n'.join(current_chunk)
+            chunk = self._create_sub_chunk(section, chunk_content, len(chunks))
+            chunks.append(chunk)
+            
+        return chunks
+    
+    def _create_chunk(self, section: Dict) -> Dict:
+        """创建文档块"""
+        hierarchy = self._get_section_hierarchy(section)
+        
+        return {
+            'metadata': {
+                'title': section['title'],
+                'level': section['level'],
+                'hierarchy': hierarchy,
+                'start_line': section['start_line'],
+                'end_line': section['end_line'],
+                'chunk_type': 'section'
+            },
+            'content': f"# {section['title']}\n\n{section['content']}",
+            'content_size': len(section['content'])
+        }
+    
+    def _create_sub_chunk(self, section: Dict, content: str, sub_index: int) -> Dict:
+        """创建子文档块"""
+        hierarchy = self._get_section_hierarchy(section)
+        
+        return {
+            'metadata': {
+                'title': f"{section['title']} - 部分{sub_index + 1}",
+                'level': section['level'],
+                'hierarchy': hierarchy,
+                'start_line': section['start_line'],
+                'end_line': section['end_line'],
+                'chunk_type': 'subsection',
+                'parent_title': section['title'],
+                'sub_index': sub_index
+            },
+            'content': f"# {section['title']}\n\n## 部分 {sub_index + 1}\n\n{content}",
+            'content_size': len(content)
+        }
+    
+    def _get_section_hierarchy(self, section: Dict) -> List[str]:
+        """Get the section's hierarchy path"""
+        # simplified implementation; a full one would recursively look up parent titles
+        return [section['title']]
+    
+    def save_chunks(self, chunks: List[Dict], output_dir: str):
+        """
+        保存拆分后的块到文件
+        
+        Args:
+            chunks: 文档块列表
+            output_dir: 输出目录
+        """
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        
+        # 保存元数据
+        metadata = {
+            'total_chunks': len(chunks),
+            'max_chunk_size': self.max_chunk_size,
+            'document_info': {
+                'title': '公路工程施工安全技术规范',
+                'code': 'JTG F90-2015'
+            }
+        }
+        
+        with open(os.path.join(output_dir, 'metadata.json'), 'w', encoding='utf-8') as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=2)
+        
+        # 保存每个块
+        for i, chunk in enumerate(chunks):
+            filename = f"chunk_{i:03d}.md"
+            filepath = os.path.join(output_dir, filename)
+            
+            # 保存内容
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(chunk['content'])
+            
+            # 保存块元数据
+            chunk_metadata_path = os.path.join(output_dir, f"chunk_{i:03d}_metadata.json")
+            with open(chunk_metadata_path, 'w', encoding='utf-8') as f:
+                json.dump(chunk['metadata'], f, ensure_ascii=False, indent=2)
+    
+    def generate_structure_report(self, output_file: str):
+        """生成结构报告"""
+        if not self.structure:
+            self.parse_structure()
+            
+        report = ["# 文档结构分析报告\n"]
+        report.append(f"## 总章节数: {len(self._flatten_structure(self.structure))}")
+        report.append(f"## 最大块大小: {self.max_chunk_size} 字符\n")
+        
+        report.append("## 文档层级结构:\n")
+        self._add_structure_to_report(self.structure, report, 0)
+        
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(report))
+    
+    def _add_structure_to_report(self, structure: List[Dict], report: List[str], indent: int):
+        """递归添加结构到报告"""
+        for section in structure:
+            indent_str = "  " * indent
+            title = section['title']
+            level = section['level']
+            content_size = len(section['content'])
+            children_count = len(section['children'])
+            
+            report.append(f"{indent_str}- {'#' * level} {title} "
+                         f"(内容大小: {content_size}字符, 子章节: {children_count})")
+            
+            if section['children']:
+                self._add_structure_to_report(section['children'], report, indent + 1)
+
+# Usage example
+def main():
+    # 初始化拆分器
+    splitter = MarkdownDocumentSplitter(max_chunk_size=4000)
+    # 示例文件路径
+    file_path = "I:/wangxun_dev_workspace/lq_workspace/output/"
+    pdf_file = file_path + "公路工程施工安全技术规范.md"
+    
+    # 加载文档(这里假设文档已经保存为markdown文件)
+    # 在实际使用中,您需要先将提供的文档内容保存为markdown文件
+    try:
+        splitter.load_markdown(pdf_file)
+    except FileNotFoundError:
+        print("请先将文档内容保存为 '公路工程施工安全技术规范.md' 文件")
+        return
+    
+    # 解析文档结构
+    structure = splitter.parse_structure()
+    print(f"解析完成,共找到 {len(structure)} 个顶级章节")
+    
+    # 拆分文档
+    chunks = splitter.split_document()
+    print(f"拆分完成,共生成 {len(chunks)} 个文档块")
+    
+    # 保存拆分结果
+    output_dir = 'knowledge_base_chunks'
+    splitter.save_chunks(chunks, output_dir)
+    
+    # 生成结构报告
+    splitter.generate_structure_report(os.path.join(output_dir, 'structure_report.md'))
+    
+    print(f"所有文件已保存到 {output_dir} 目录")
+
+
+if __name__ == "__main__":
+    # 首先创建示例文件(在实际使用中请替换为您的实际文档)
+    # 运行主程序
+    main()

+ 115 - 0
file_processors/doc_worker/advanced_document_splitter.py

@@ -0,0 +1,115 @@
+# advanced_document_splitter.py
+import os
+import json
+from datetime import datetime
+from typing import List, Dict, Any
+from .document_parser import DocumentSplitter, DocumentChunk
+
+class AdvancedDocumentSplitter:
+    """Advanced document splitter supporting batch processing and multiple output formats"""
+    
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
+        self.splitter = DocumentSplitter(chunk_size, chunk_overlap)
+        self.processed_files = []
+    
+    def batch_process(self, input_folder: str, output_folder: str):
+        """Batch-process all documents in a folder"""
+        if not os.path.exists(output_folder):
+            os.makedirs(output_folder)
+        
+        supported_extensions = ['.pdf', '.docx']
+        
+        for filename in os.listdir(input_folder):
+            file_path = os.path.join(input_folder, filename)
+            if os.path.isfile(file_path):
+                file_ext = os.path.splitext(filename)[1].lower()
+                if file_ext in supported_extensions:
+                    try:
+                        print(f"Processing file: {filename}")
+                        chunks = self.splitter.split_document(file_path)
+                        
+                        # save the results
+                        base_name = os.path.splitext(filename)[0]
+                        self._save_multiple_formats(chunks, output_folder, base_name)
+                        
+                        self.processed_files.append({
+                            'filename': filename,
+                            'chunk_count': len(chunks),
+                            'processed_at': datetime.now().isoformat()
+                        })
+                        
+                    except Exception as e:
+                        print(f"Error while processing {filename}: {e}")
+    
+    def _save_multiple_formats(self, chunks: List[DocumentChunk], output_folder: str, base_name: str):
+        """Save in multiple formats"""
+        # JSON format
+        json_file = os.path.join(output_folder, f"{base_name}_chunks.json")
+        self.splitter.save_chunks_to_json(chunks, json_file)
+        
+        # plain-text format
+        txt_file = os.path.join(output_folder, f"{base_name}_chunks.txt")
+        self._save_chunks_to_txt(chunks, txt_file)
+        
+        # metadata index
+        meta_file = os.path.join(output_folder, f"{base_name}_metadata.json")
+        self._save_metadata_index(chunks, meta_file)
+    
+    def _save_chunks_to_txt(self, chunks: List[DocumentChunk], output_file: str):
+        """Save chunks to a text file"""
+        with open(output_file, 'w', encoding='utf-8') as f:
+            for i, chunk in enumerate(chunks):
+                f.write(f"=== 片段 {i+1} ===\n")
+                f.write(f"标题: {chunk.metadata.title}\n")
+                f.write(f"层级: {chunk.metadata.level}\n")
+                f.write(f"页码: {chunk.metadata.page_start}-{chunk.metadata.page_end}\n")
+                f.write(f"父级: {' -> '.join(chunk.metadata.parent_titles)}\n")
+                f.write(f"片段: {chunk.chunk_index + 1}/{chunk.total_chunks}\n")
+                f.write(f"内容哈希: {chunk.metadata.content_hash}\n")
+                f.write("内容:\n")
+                f.write(chunk.content)
+                f.write("\n\n" + "="*50 + "\n\n")
+    
+    def _save_metadata_index(self, chunks: List[DocumentChunk], output_file: str):
+        """Save a metadata index"""
+        metadata = []
+        for chunk in chunks:
+            metadata.append({
+                'title': chunk.metadata.title,
+                'level': chunk.metadata.level,
+                'page_range': f"{chunk.metadata.page_start}-{chunk.metadata.page_end}",
+                'parent_titles': chunk.metadata.parent_titles,
+                'chunk_index': chunk.chunk_index,
+                'total_chunks': chunk.total_chunks,
+                'content_hash': chunk.metadata.content_hash,
+                'content_length': len(chunk.content)
+            })
+        
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=2)
+    
+    def generate_report(self, output_file: str):
+        """Generate a processing report"""
+        report = {
+            'processed_at': datetime.now().isoformat(),
+            'total_files': len(self.processed_files),
+            'total_chunks': sum(f['chunk_count'] for f in self.processed_files),
+            'files': self.processed_files
+        }
+        
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, ensure_ascii=False, indent=2)
+
+# Usage example
+if __name__ == "__main__":
+    # batch-processing example
+    advanced_splitter = AdvancedDocumentSplitter(chunk_size=800, chunk_overlap=50)
+    
+    input_folder = "documents"
+    output_folder = "output_chunks"
+    
+    if os.path.exists(input_folder):
+        advanced_splitter.batch_process(input_folder, output_folder)
+        advanced_splitter.generate_report(os.path.join(output_folder, "processing_report.json"))
+    else:
+        print(f"Input folder does not exist: {input_folder}")

+ 121 - 0
file_processors/doc_worker/base_document.py

@@ -0,0 +1,121 @@
+import re
+import json
+from typing import List, Dict, Any, Tuple
+from dataclasses import dataclass
+import hashlib
+import docx
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+@dataclass
+class DocumentMetadata:
+    """Document metadata"""
+    title: str
+    level: int
+    page_start: int
+    page_end: int
+    parent_titles: List[str]
+    content_hash: str
+
+@dataclass
+class DocumentChunk:
+    """Document chunk"""
+    content: str
+    metadata: DocumentMetadata
+    chunk_index: int
+    total_chunks: int
+
+class DocumentParser:
+    """Base class for document parsers"""
+    
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+        )
+    
+    def extract_toc_structure(self, file_path: str) -> List[Dict[str, Any]]:
+        """Extract the table-of-contents structure - must be implemented by subclasses"""
+        raise NotImplementedError
+    
+    def extract_content_by_section(self, file_path: str, section_info: Dict[str, Any]) -> str:
+        """Extract the content of a given section - must be implemented by subclasses"""
+        raise NotImplementedError
+    
+    def calculate_content_hash(self, content: str) -> str:
+        """Compute a content hash"""
+        return hashlib.md5(content.encode('utf-8')).hexdigest()
+    
+    def split_document(self, file_path: str) -> List[DocumentChunk]:
+        """Split a document into chunks"""
+        print(f"Start parsing document: {file_path}")
+        
+        # 1. extract the table-of-contents structure
+        toc_structure = self.extract_toc_structure(file_path)
+        print(f"Extracted {len(toc_structure)} TOC entries")
+        #print(f"Extracted TOC: {toc_structure}")
+        
+        # 2. split according to the TOC structure
+        all_chunks = []
+        for section in toc_structure:
+            section_chunks = self._process_section(file_path, section)
+            all_chunks.extend(section_chunks)
+        
+        print(f"Document split finished, generated {len(all_chunks)} chunks")
+        return all_chunks
+    
+    def _process_section(self, file_path: str, section: Dict[str, Any]) -> List[DocumentChunk]:
+        """Process a single section"""
+        # extract the section content
+        content = self.extract_content_by_section(file_path, section)
+        if not content.strip():
+            return []
+        
+        # check the content size
+        if len(content) <= self.chunk_size:
+            # content fits within the fixed size, keep it as a single chunk
+            metadata = DocumentMetadata(
+                title=section['title'],
+                level=section['level'],
+                page_start=section.get('page_start', 1),
+                page_end=section.get('page_end', section.get('page_start', 1)),
+                parent_titles=section.get('parent_titles', []),
+                content_hash=self.calculate_content_hash(content)
+            )
+            
+            return [DocumentChunk(
+                content=content,
+                metadata=metadata,
+                chunk_index=0,
+                total_chunks=1
+            )]
+        else:
+            # content is too large, split it further
+            return self._split_large_section(content, section)
+    
+    def _split_large_section(self, content: str, section: Dict[str, Any]) -> List[DocumentChunk]:
+        """Split a large section"""
+        chunks = []
+        split_texts = self.text_splitter.split_text(content)
+        
+        for i, chunk_content in enumerate(split_texts):
+            metadata = DocumentMetadata(
+                title=section['title'],
+                level=section['level'],
+                page_start=section.get('page_start', 1),
+                page_end=section.get('page_end', section.get('page_start', 1)),
+                parent_titles=section.get('parent_titles', []),
+                content_hash=self.calculate_content_hash(chunk_content)
+            )
+            
+            chunks.append(DocumentChunk(
+                content=chunk_content,
+                metadata=metadata,
+                chunk_index=i,
+                total_chunks=len(split_texts)
+            ))
+        
+        return chunks
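A minimal concrete parser showing the two hooks a DocumentParser subclass must provide (hypothetical MarkdownParser, not part of this commit; the import path is assumed from the repository layout):

```python
from file_processors.doc_worker.base_document import DocumentParser

class MarkdownParser(DocumentParser):
    def extract_toc_structure(self, file_path):
        # one pseudo-section covering the whole file; real parsers return actual TOC entries
        return [{"title": "Document", "level": 1, "page_start": 1,
                 "page_end": 1, "parent_titles": []}]

    def extract_content_by_section(self, file_path, section_info):
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()

chunks = MarkdownParser(chunk_size=800, chunk_overlap=50).split_document("sample.md")
print(len(chunks))
```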

+ 963 - 0
file_processors/doc_worker/document_parser.py

@@ -0,0 +1,963 @@
+import re
+import json
+import os
+from .base_document import DocumentParser, DocumentChunk
+from typing import List, Dict, Any, Tuple
+from PyPDF2 import PdfReader, PdfWriter
+import fitz  # PyMuPDF - better PDF parsing
+import docx
+
+
+
+class EnhancedPDFTocExtractor:
+    """Enhanced PDF table-of-contents (TOC) extractor"""
+    
+    def __init__(self):
+        self.level_patterns = {
+            1: [
+                r'^\d+\s+',  # "1 "
+                r'^第[一二三四五六七八九十]+\S*\s+',  # "第一章"
+                r'^[A-Z]\s+',  # "A "
+            ],
+            2: [
+                r'^\d+\.\d+\s+',  # "1.1 "
+                r'^[一二三四五六七八九十]+、\s*',  # "一、"
+                r'^\(\d+\)\s+',  # "(1)"
+            ],
+            3: [
+                r'^\d+\.\d+\.\d+\s+',  # "1.1.1"
+                r'^\d+\)\s+',  # "1)"
+            ]
+        }
+    
+    def extract_complete_toc(self, file_path: str) -> List[Dict[str, Any]]:
+        """Extract the complete TOC structure"""
+        print(f"Extracting PDF TOC: {os.path.basename(file_path)}")
+        
+        # Method 1: extract bookmarks with PyMuPDF
+        pymupdf_toc = self._extract_with_pymupdf(file_path)
+        if pymupdf_toc and self._validate_toc(pymupdf_toc):
+            print(f"PyMuPDF extracted {len(pymupdf_toc)} valid TOC entries")
+            return pymupdf_toc
+        
+        # Method 2: derive the TOC from the text content
+        text_toc = self._extract_from_text_content(file_path)
+        if text_toc and self._validate_toc(text_toc):
+            print(f"Text analysis extracted {len(text_toc)} valid TOC entries")
+            return text_toc
+        
+        print("No valid TOC structure extracted")
+        return []
+    
+    def _extract_with_pymupdf(self, file_path: str) -> List[Dict[str, Any]]:
+        """Extract the TOC using PyMuPDF"""
+        try:
+            doc = fitz.open(file_path)
+            
+            # 获取书签
+            toc = doc.get_toc()
+            if not toc:
+                print("The PDF has no bookmark TOC")
+                doc.close()
+                return []
+            
+            processed_toc = []
+            parent_stack = []  # 用于跟踪父级标题
+            
+            for item in toc:
+                level, title, page_num = item
+                
+                # 清理标题
+                clean_title = self._clean_title(title)
+                if not clean_title or len(clean_title) < 2:
+                    continue
+                
+                # 确定实际页码(PyMuPDF页码从1开始)
+                actual_page = max(1, page_num)
+                
+                # 处理层级关系
+                while parent_stack and parent_stack[-1]['level'] >= level:
+                    parent_stack.pop()
+                
+                parent_titles = [p['title'] for p in parent_stack]
+                
+                # 创建目录项
+                toc_item = {
+                    'title': clean_title,
+                    'level': level,
+                    'page_start': actual_page,
+                    'page_end': actual_page,
+                    'parent_titles': parent_titles,
+                    'source': 'pymupdf_toc'
+                }
+                
+                processed_toc.append(toc_item)
+                
+                # 更新父级栈
+                parent_stack.append({
+                    'title': clean_title,
+                    'level': level,
+                    'page': actual_page
+                })
+            
+            doc.close()
+            return processed_toc
+            
+        except Exception as e:
+            print(f"PyMuPDF TOC extraction failed: {e}")
+            return []
+    
+    def _extract_from_text_content(self, file_path: str) -> List[Dict[str, Any]]:
+        """Derive the TOC structure from the text content"""
+        try:
+            doc = fitz.open(file_path)
+            full_text = ""
+            
+            # 提取前20页的文本(目录通常在文档前面)
+            max_pages_for_toc = min(20, len(doc))
+            page_texts = []
+            
+            for page_num in range(max_pages_for_toc):
+                page = doc[page_num]
+                text = page.get_text().strip()
+                if text:
+                    page_texts.append((page_num + 1, text))
+                    full_text += f"--- 第{page_num + 1}页 ---\n{text}\n\n"
+            
+            doc.close()
+            
+            # 从文本中识别目录
+            toc_items = self._identify_toc_from_text(full_text, page_texts)
+            
+            # 如果找到目录,进一步分析层级
+            if toc_items:
+                toc_items = self._analyze_toc_levels(toc_items)
+            
+            return toc_items
+            
+        except Exception as e:
+            print(f"文本内容提取目录失败: {e}")
+            return []
+    
+    def _identify_toc_from_text(self, full_text: str, page_texts: List[Tuple[int, str]]) -> List[Dict[str, Any]]:
+        """Identify TOC entries in the text"""
+        toc_items = []
+        
+        # 目录特征模式
+        toc_patterns = [
+            # 带页码的目录项: "1 总则 ........... 1"
+            r'^(\d+(?:\.\d+)*)\s+([^\.]{5,50}?)\s*\.{3,}\s*(\d+)\s*$',
+            # 中文编号: "第一章 总则 ........... 1"
+            r'^(第[一二三四五六七八九十百千]+[章节条])\s+([^\.]{5,50}?)\s*\.{3,}\s*(\d+)\s*$',
+            # 简单格式: "1 总则 1"
+            r'^(\d+(?:\.\d+)*)\s+([^\d\.]{5,50}?)\s+(\d+)\s*$',
+        ]
+        
+        lines = full_text.split('\n')
+        current_page = 1
+        
+        for line in lines:
+            line = line.strip()
+            if not line or len(line) > 200:
+                continue
+            
+            # 更新当前页码
+            page_match = re.match(r'^---\s*第(\d+)页\s*---$', line)
+            if page_match:
+                current_page = int(page_match.group(1))
+                continue
+            
+            # 检查是否是目录项
+            for pattern in toc_patterns:
+                match = re.match(pattern, line)
+                if match:
+                    numbering = match.group(1)
+                    title = match.group(2).strip()
+                    page_num = int(match.group(3)) if len(match.groups()) >= 3 else current_page
+                    
+                    # 清理标题
+                    title = self._clean_title(title)
+                    if not title or len(title) < 2:
+                        continue
+                    
+                    # 确定层级
+                    level = self._determine_level_from_numbering(numbering)
+                    
+                    toc_items.append({
+                        'title': title,
+                        'level': level,
+                        'page_start': page_num,
+                        'page_end': page_num,
+                        'parent_titles': [],
+                        'source': 'text_analysis'
+                    })
+                    break
+        
+        return toc_items
+    
+    def _analyze_toc_levels(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """分析目录项的层级关系"""
+        if not toc_items:
+            return []
+        
+        # 根据编号模式重新确定层级
+        for item in toc_items:
+            title = item['title']
+            # 基于标题前的编号确定层级
+            level = self._analyze_title_level(title)
+            if level > 0:
+                item['level'] = level
+        
+        # 构建层级关系
+        return self._build_hierarchy(toc_items)
+    
+    def _analyze_title_level(self, title: str) -> int:
+        """分析标题的层级"""
+        # 检查常见的层级模式
+        if re.match(r'^\d+\.\d+\.\d+', title):
+            return 3
+        elif re.match(r'^\d+\.\d+', title):
+            return 2
+        elif re.match(r'^\d+', title):
+            return 1
+        elif re.match(r'^第[一二三四五六七八九十]+[章节]', title):
+            return 1
+        elif re.match(r'^[一二三四五六七八九十]+、', title):
+            return 2
+        elif re.match(r'^\(\d+\)', title):
+            return 3
+        
+        return 1  # 默认层级
+    
+    def _build_hierarchy(self, toc_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """构建层级关系"""
+        if not toc_items:
+            return []
+        
+        hierarchical_items = []
+        parent_stack = []
+        
+        for item in toc_items:
+            current_level = item['level']
+            
+            # 弹出栈中层级大于等于当前层级的项目
+            while parent_stack and parent_stack[-1]['level'] >= current_level:
+                parent_stack.pop()
+            
+            # 获取父级标题
+            parent_titles = [p['title'] for p in parent_stack]
+            item['parent_titles'] = parent_titles
+            
+            hierarchical_items.append(item)
+            
+            # 将当前项目压入栈
+            parent_stack.append({
+                'title': item['title'],
+                'level': item['level'],
+                'page': item['page_start']
+            })
+        
+        return hierarchical_items
+    
+    def _determine_level_from_numbering(self, numbering: str) -> int:
+        """根据编号确定层级"""
+        if '.' in numbering:
+            dot_count = numbering.count('.')
+            return min(dot_count + 1, 3)
+        elif re.match(r'^第[一二三四五六七八九十]+[章节]', numbering):
+            return 1
+        elif re.match(r'^[一二三四五六七八九十]+、', numbering):
+            return 2
+        else:
+            return 1
+    
+    def _clean_title(self, title: str) -> str:
+        """清理标题"""
+        if not title:
+            return ""
+        
+        # 移除.pdf后缀
+        title = re.sub(r'\.pdf$', '', title, flags=re.IGNORECASE)
+        
+        # 移除常见的噪音字符
+        #title = re.sub(r'^[\s\d\.\-•>*]*', '', title)  # 开头的编号和符号
+        #title = re.sub(r'[\s\d\.\-•>*]*$', '', title)  # 结尾的编号和符号
+        
+        # 移除多余的空白字符
+        title = re.sub(r'\s+', ' ', title).strip()
+        
+        # 移除目录特有的噪音
+        title = re.sub(r'\.{3,}.*$', '', title)  # 移除省略号和页码
+        
+        return title
+    
+    def _validate_toc(self, toc_items: List[Dict[str, Any]]) -> bool:
+        """Validate the extracted TOC structure"""
+        if not toc_items:
+            return False
+        
+        # 检查是否有合理的页码分布
+        pages = [item['page_start'] for item in toc_items]
+        unique_pages = set(pages)
+        
+        # 如果所有页码都是1,可能有问题
+        if len(unique_pages) == 1 and 1 in unique_pages:
+            print("Warning: every TOC entry points to page 1; extraction may be inaccurate")
+            # do not return False immediately; a short document may genuinely sit on page 1
+        
+        # 检查标题质量
+        valid_titles = 0
+        for item in toc_items:
+            title = item['title']
+            if len(title) >= 2 and len(title) <= 100:
+                valid_titles += 1
+        
+        # 至少要有一定比例的有效标题
+        return valid_titles >= max(2, len(toc_items) * 0.5)
+
+
+
+class PDFTocExtractor:
+    """PDF TOC extractor - dedicated to parsing PDF tables of contents"""
+    
+    def __init__(self):
+        self.level_markers = {
+            1: ['chapter', 'part', '篇', '章', 'section'],
+            2: ['section', '节', 'subsection'],
+            3: ['subsubsection', '小节', 'topic']
+        }
+    
+    def extract_toc_with_pymupdf(self, file_path: str) -> List[Dict[str, Any]]:
+        """使用PyMuPDF提取目录(更准确)"""
+        try:
+            doc = fitz.open(file_path)
+            toc = doc.get_toc()
+            doc.close()
+            
+            return self._process_pymupdf_toc(toc)
+        except Exception as e:
+            print(f"PyMuPDF提取目录失败: {e}")
+            return []
+    
+    def _process_pymupdf_toc(self, toc: List) -> List[Dict[str, Any]]:
+        """处理PyMuPDF返回的目录数据"""
+        processed_toc = []
+        
+        for item in toc:
+            print(item)
+            level, title, page_num = item
+            # PyMuPDF的页码是从1开始的,但需要验证
+            actual_page = max(1, page_num)
+            
+            processed_toc.append({
+                'title': self._clean_title(title),
+                'level': level,
+                'page_start': actual_page,
+                'page_end': actual_page,
+                'parent_titles': [],
+                'source': 'pymupdf_toc'
+            })
+        
+        return processed_toc
+    
+    def extract_toc_with_pypdf2(self, file_path: str) -> List[Dict[str, Any]]:
+        """使用PyPDF2提取目录"""
+        try:
+            with open(file_path, 'rb') as file:
+                pdf_reader = PdfReader(file)
+                
+                if not hasattr(pdf_reader, 'outline') or not pdf_reader.outline:
+                    return []
+                
+                return self._extract_toc_from_outline(pdf_reader.outline, pdf_reader)
+        except Exception as e:
+            print(f"PyPDF2提取目录失败: {e}")
+            return []
+    
+    def _extract_toc_from_outline(self, outline, pdf_reader, level=1, parent_titles=None, parent_pages=None):
+        """Extract the TOC structure from PDF bookmarks"""
+        if parent_titles is None:
+            parent_titles = []
+        if parent_pages is None:
+            parent_pages = []
+        
+        toc_items = []
+        
+        for item in outline:
+            if isinstance(item, dict):
+                # 提取标题
+                title = self._extract_title(item)
+                if not title:
+                    continue
+                
+                # 提取页码 - 这是关键修复
+                page_num = self._extract_page_number(item, pdf_reader)
+                
+                # 确定层级
+                actual_level = self._determine_level(title, level)
+                
+                toc_item = {
+                    'title': title,
+                    'level': actual_level,
+                    'page_start': page_num,
+                    'page_end': page_num,
+                    'parent_titles': parent_titles.copy(),
+                    'parent_pages': parent_pages.copy(),
+                    'source': 'pypdf2_outline'
+                }
+                
+                toc_items.append(toc_item)
+                
+                # 处理子项目
+                if '/First' in item:
+                    child_parent_titles = parent_titles + [title]
+                    child_parent_pages = parent_pages + [page_num]
+                    children = self._extract_toc_from_outline(
+                        item['/First'], pdf_reader, actual_level + 1, 
+                        child_parent_titles, child_parent_pages
+                    )
+                    toc_items.extend(children)
+            
+            elif isinstance(item, list):
+                # 处理嵌套结构
+                nested_items = self._extract_toc_from_outline(
+                    item, pdf_reader, level, parent_titles, parent_pages
+                )
+                toc_items.extend(nested_items)
+        
+        return toc_items
+    
+    def _extract_title(self, item) -> str:
+        """提取并清理标题"""
+        title = item.get('/Title', '')
+        if isinstance(title, str):
+            # 清理标题中的编号和特殊字符
+            title = re.sub(r'^\s*[\d\.\s]+\s*', '', title)
+            title = title.strip()
+        return title if title else "未命名标题"
+    
+    def _extract_page_number(self, item, pdf_reader) -> int:
+        """Extract the page number - the key fix"""
+        try:
+            # 方法1: 从/A/Destinations提取
+            if '/A' in item:
+                action = item['/A']
+                if '/D' in action:
+                    dest = action['/D']
+                    if isinstance(dest, list) and len(dest) > 0:
+                        page_ref = dest[0]
+                        return self._get_page_number_from_ref(page_ref, pdf_reader)
+            
+            # 方法2: 从/Dest直接提取
+            if '/Dest' in item:
+                dest = item['/Dest']
+                if isinstance(dest, list) and len(dest) > 0:
+                    page_ref = dest[0]
+                    return self._get_page_number_from_ref(page_ref, pdf_reader)
+            
+            # 方法3: 尝试从其他属性提取
+            for key in ['/Page', '/P']:
+                if key in item:
+                    page_ref = item[key]
+                    return self._get_page_number_from_ref(page_ref, pdf_reader)
+                    
+        except Exception as e:
+            print(f"提取页码失败: {e}")
+        
+        # 默认返回第1页
+        return 1
+    
+    def _get_page_number_from_ref(self, page_ref, pdf_reader) -> int:
+        """从页面引用获取实际页码"""
+        try:
+            if hasattr(page_ref, 'get_object'):
+                page_obj = page_ref.get_object()
+            else:
+                page_obj = page_ref
+            
+            # 在PDF阅读器中查找页面索引
+            for i, page in enumerate(pdf_reader.pages):
+                if hasattr(page, 'get_object'):
+                    page_obj2 = page.get_object()
+                    if page_obj2 == page_obj:
+                        return i + 1  # 转换为从1开始的页码
+            
+            # 如果找不到,尝试其他方法
+            if hasattr(page_obj, 'indirect_ref'):
+                # 使用间接引用ID来估算
+                ref_id = getattr(page_obj.indirect_ref, 'idnum', 0)
+                if ref_id > 0:
+                    return min(max(1, ref_id % 100), len(pdf_reader.pages))
+                    
+        except Exception as e:
+            print(f"解析页面引用失败: {e}")
+        
+        return 1
+    
+    def _determine_level(self, title: str, base_level: int) -> int:
+        """根据标题内容确定层级"""
+        title_lower = title.lower()
+        
+        # 基于关键词判断层级
+        for level, markers in self.level_markers.items():
+            for marker in markers:
+                if marker in title_lower:
+                    return level
+        
+        # 基于标题格式判断
+        if re.match(r'^(第[一二三四五六七八九十]+[章节篇])', title):
+            return 1
+        elif re.match(r'^\d+\.\d+', title):
+            dots_count = title.count('.')
+            return min(dots_count, 3)
+        elif re.match(r'^[一二三四五六七八九十]、', title):
+            return 2
+        
+        # 默认使用基础层级
+        return base_level
+    
+    def _clean_title(self, title: str) -> str:
+        """清理标题"""
+        # 移除多余的空白字符
+        title = re.sub(r'\s+', ' ', title).strip()
+        
+        # 移除开头的编号(如果存在)
+        title = re.sub(r'^\s*[\d\.\s]+\s*', '', title)
+        
+        return title
+    
+    def extract_toc_from_text(self, file_path: str) -> List[Dict[str, Any]]:
+        """从文本内容中提取目录结构"""
+        try:
+            doc = fitz.open(file_path)
+            toc_items = []
+            
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+                
+                # 在文本中查找标题模式
+                headings = self._find_headings_in_text(text, page_num + 1)
+                toc_items.extend(headings)
+            
+            doc.close()
+            return toc_items
+        except Exception as e:
+            print(f"从文本提取目录失败: {e}")
+            return []
+    
+    def _find_headings_in_text(self, text: str, page_num: int) -> List[Dict[str, Any]]:
+        """在文本中查找标题"""
+        headings = []
+        lines = text.split('\n')
+        
+        heading_patterns = [
+            # 中文标题模式
+            (r'^(第[一二三四五六七八九十零百千]+[章节条款篇])\s+(.+)$', 1),
+            (r'^([一二三四五六七八九十]、)\s*(.+)$', 2),
+            # 数字标题模式
+            (r'^(\d+)\s+(.+)$', 2),
+            (r'^(\d+\.\d+)\s+(.+)$', 3),
+            (r'^(\d+\.\d+\.\d+)\s+(.+)$', 4),
+            # 英文标题模式
+            (r'^(Chapter|Section)\s+(\d+)\s+(.+)$', 1),
+        ]
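+        # 匹配示例:"第三章 施工准备"->层级1;"三、基本规定"->层级2;"3 基本规定"->层级2;"3.2 施工便道"->层级3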
+        
+        for line in lines:
+            line = line.strip()
+            if len(line) > 100:  # 太长的行不是标题
+                continue
+                
+            for pattern, level in heading_patterns:
+                match = re.match(pattern, line)
+                if match:
+                    if len(match.groups()) == 2:
+                        title = match.group(2)
+                    else:
+                        title = match.group(3) if len(match.groups()) > 2 else line
+                    
+                    headings.append({
+                        'title': title.strip(),
+                        'level': level,
+                        'page_start': page_num,
+                        'page_end': page_num,
+                        'parent_titles': [],
+                        'source': 'text_analysis'
+                    })
+                    break
+        
+        return headings
+
+class PDFParser(DocumentParser):
+    """PDF文档解析器 - 修复版本"""
+    
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
+        super().__init__(chunk_size, chunk_overlap)
+        self.toc_extractor = EnhancedPDFTocExtractor()
+    
+    def extract_toc_structure(self, file_path: str) -> List[Dict[str, Any]]:
+        """提取PDF目录结构 - 增强版本"""
+        print(f"正在解析PDF目录: {os.path.basename(file_path)}")
+        
+        # 使用增强的目录提取器
+        toc_structure = self.toc_extractor.extract_complete_toc(file_path)
+        
+        if not toc_structure:
+            print("未提取到目录结构,将使用传统分割方式")
+            return []
+        
+        # 验证并修复目录结构
+        toc_structure = self._validate_and_fix_toc(toc_structure, file_path)
+        
+        print(f"最终提取到 {len(toc_structure)} 个目录项")
+        return toc_structure
+    
+    def _validate_and_fix_toc(self, toc_structure: List[Dict[str, Any]], file_path: str) -> List[Dict[str, Any]]:
+        """验证并修复目录结构 - 完整版本"""
+        if not toc_structure:
+            return []
+        
+        doc_length = self._get_pdf_length(file_path)
+        fixed_toc = []
+        
+        print(f"文档总页数: {doc_length}")
+        print(f"开始验证 {len(toc_structure)} 个目录项...")
+        
+        invalid_count = 0
+        
+        for i, item in enumerate(toc_structure):
+            original_page = item['page_start']
+            
+            # 检查页码有效性
+            if (original_page <= 0 or 
+                original_page > doc_length + 5 or  # 允许稍微超出
+                (i > 0 and original_page < toc_structure[i-1]['page_start'])):  # 页码倒序
+                
+                # 标记为估算的页码
+                estimated_page = self._estimate_page_number(item, doc_length, toc_structure)
+                item['page_start'] = estimated_page
+                item['page_end'] = estimated_page
+                item['source'] = 'estimated'
+                invalid_count += 1
+                
+                print(f"  修复第{i+1}项: '{item['title']}' 页码 {original_page} -> {estimated_page}")
+            
+            # 清理标题
+            original_title = item['title']
+            item['title'] = self._clean_title_completely(original_title)
+            if original_title != item['title']:
+                print(f"  清理标题: '{original_title}' -> '{item['title']}'")
+            
+            fixed_toc.append(item)
+        
+        if invalid_count > 0:
+            print(f"共修复 {invalid_count} 个无效页码")
+    
+        return fixed_toc
+    
+    def _get_pdf_length(self, file_path: str) -> int:
+        """获取PDF页数"""
+        try:
+            doc = fitz.open(file_path)
+            length = len(doc)
+            doc.close()
+            return length
+        except Exception:
+            return 100  # 打开失败时返回默认页数
+    
+    def _estimate_page_number(self, item: Dict[str, Any], doc_length: int, toc_structure: List[Dict[str, Any]] = None) -> int:
+        """估算页码 - 改进版本"""
+        if not toc_structure:
+            return 1
+        
+        try:
+            index = toc_structure.index(item)
+            
+            # 方法1: 如果有前面的有效页码,基于前面的页码估算
+            for i in range(index - 1, -1, -1):
+                prev_item = toc_structure[i]
+                if (prev_item['page_start'] > 0 and 
+                    prev_item['page_start'] <= doc_length and
+                    prev_item.get('source') != 'estimated'):
+                    # 基于前一项的页码估算,假设每项占2-3页
+                    estimated = prev_item['page_start'] + 2
+                    return min(estimated, doc_length)
+            
+            # 方法2: 基于索引位置估算(假设目录在文档前部)
+            if index < 10:  # 前10项可能在文档前20页
+                return min(index + 1, 20)
+            else:
+                # 后续项按比例分布
+                progress = index / len(toc_structure)
+                estimated = int(progress * doc_length * 0.8) + 1  # 留20%给附录等
+                return min(max(1, estimated), doc_length)
+                
+        except (ValueError, IndexError):
+            return 1
+    
+    def _clean_title_completely(self, title: str) -> str:
+        """完全清理标题"""
+        if not title:
+            return "未命名标题"
+        
+        # 移除文件后缀
+        title = re.sub(r'\.(pdf|docx?|txt)$', '', title, flags=re.IGNORECASE)
+        
+        # 移除页码引用
+        title = re.sub(r'[\(\{\[\<]?页码?\s*\d+[\)\}\ \]\>]?', '', title)
+        
+        # 清理空白字符
+        title = re.sub(r'\s+', ' ', title).strip()
+        
+        return title if title else "未命名标题"
+    
+    def extract_full_content(self, file_path: str) -> str:
+        """提取完整PDF内容"""
+        full_content = ""
+        
+        try:
+            doc = fitz.open(file_path)
+            
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+                
+                if text.strip():
+                    full_content += f"--- 第{page_num + 1}页 ---\n{text}\n\n"
+            
+            doc.close()
+            
+        except Exception as e:
+            print(f"PyMuPDF提取内容失败: {e}")
+            # 回退到PyPDF2
+            try:
+                with open(file_path, 'rb') as file:
+                    pdf_reader = PdfReader(file)
+                    
+                    for page_num, page in enumerate(pdf_reader.pages, 1):
+                        page_content = page.extract_text()
+                        if page_content.strip():
+                            full_content += f"--- 第{page_num}页 ---\n{page_content}\n\n"
+            except Exception as e2:
+                print(f"PyPDF2提取内容也失败: {e2}")
+        
+        return full_content
+    
+    def extract_content_by_section(self, file_path: str, section_info: Dict[str, Any]) -> str:
+        """提取指定章节内容"""
+        content = ""
+        start_page = section_info.get('page_start', 1)
+        end_page = section_info.get('page_end', start_page)
+        
+        print(f"提取章节: {section_info['title']}, 页码: {start_page}-{end_page}")
+        
+        try:
+            doc = fitz.open(file_path)
+            
+            # 调整页码范围
+            start_idx = max(0, start_page - 1)
+            end_idx = min(len(doc) - 1, end_page - 1)
+            
+            for page_num in range(start_idx, end_idx + 1):
+                page = doc[page_num]
+                text = page.get_text()
+                if text.strip():
+                    content += text + "\n\n"
+            
+            doc.close()
+            
+        except Exception as e:
+            print(f"提取章节内容失败: {e}")
+        
+        return content
+
+
+
+class WordParser(DocumentParser):
+    """Word文档解析器"""
+    
+    def extract_toc_structure(self, file_path: str) -> List[Dict[str, Any]]:
+        """提取Word目录结构"""
+        doc = docx.Document(file_path)
+        toc_structure = []
+        current_parents = []
+        
+        for para_index, paragraph in enumerate(doc.paragraphs):
+            if paragraph.style.name.startswith('Heading'):
+                level = int(paragraph.style.name.replace('Heading', '').strip())
+                title = paragraph.text.strip()
+                
+                # 更新父级标题栈
+                while current_parents and current_parents[-1]['level'] >= level:
+                    current_parents.pop()
+                
+                parent_titles = [p['title'] for p in current_parents]
+                
+                # 估算页码(Word没有直接的页码信息)
+                # 直接使用 enumerate 得到的段落索引作为近似值
+                # (doc.paragraphs 每次访问都会重建 Paragraph 对象,再用 index() 反查会失败)
+                estimated_page = para_index // 50 + 1  # 假设每页50个段落
+                
+                toc_structure.append({
+                    'title': title,
+                    'level': level,
+                    'page_start': estimated_page,
+                    'page_end': estimated_page,
+                    'parent_titles': parent_titles.copy(),
+                    'paragraph_index': para_index
+                })
+                
+                current_parents.append({
+                    'title': title,
+                    'level': level,
+                    'index': len(toc_structure) - 1
+                })
+        
+        return toc_structure
+    
+    def extract_content_by_section(self, file_path: str, section_info: Dict[str, Any]) -> str:
+        """提取指定章节内容"""
+        doc = docx.Document(file_path)
+        content = []
+        in_section = False
+        current_level = section_info['level']
+        
+        start_index = section_info.get('paragraph_index', 0)
+        next_section_index = self._find_next_section_index(doc, start_index, current_level)
+        
+        for i, paragraph in enumerate(doc.paragraphs):
+            if i < start_index:
+                continue
+            
+            if i == start_index:
+                in_section = True
+                content.append(paragraph.text)
+                continue
+            
+            if in_section:
+                # 检查是否到达下一同级或更高级标题
+                if (paragraph.style.name.startswith('Heading') and 
+                    i >= next_section_index):
+                    break
+                
+                content.append(paragraph.text)
+        
+        return '\n'.join(content)
+    
+    def _find_next_section_index(self, doc, start_index: int, current_level: int) -> int:
+        """查找下一个同级或更高级标题的索引"""
+        for i in range(start_index + 1, len(doc.paragraphs)):
+            paragraph = doc.paragraphs[i]
+            if paragraph.style.name.startswith('Heading'):
+                level = int(paragraph.style.name.replace('Heading', '').strip())
+                if level <= current_level:
+                    return i
+        return len(doc.paragraphs)
+
+class DocumentSplitter:
+    """文档拆分管理器"""
+    
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.parsers = {
+            '.pdf': PDFParser(chunk_size, chunk_overlap),
+            '.docx': WordParser(chunk_size, chunk_overlap)
+        }
+    
+    def split_document(self, file_path: str) -> List[DocumentChunk]:
+        """拆分文档"""
+        file_ext = os.path.splitext(file_path)[1].lower()
+        
+        if file_ext not in self.parsers:
+            raise ValueError(f"不支持的文件格式: {file_ext}")
+        
+        parser = self.parsers[file_ext]
+        return parser.split_document(file_path)
+    
+    def save_chunks_to_json(self, chunks: List[DocumentChunk], output_file: str):
+        """保存片段到JSON文件"""
+        chunk_data = []
+        
+        for chunk in chunks:
+            chunk_data.append({
+                'content': chunk.content,
+                'metadata': {
+                    'title': chunk.metadata.title,
+                    'level': chunk.metadata.level,
+                    'page_start': chunk.metadata.page_start,
+                    'page_end': chunk.metadata.page_end,
+                    'parent_titles': chunk.metadata.parent_titles,
+                    'content_hash': chunk.metadata.content_hash
+                },
+                'chunk_index': chunk.chunk_index,
+                'total_chunks': chunk.total_chunks
+            })
+        
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(chunk_data, f, ensure_ascii=False, indent=2)
+    
+    def print_chunk_summary(self, chunks: List[DocumentChunk]):
+        """打印片段摘要"""
+        print("\n=== 文档拆分摘要 ===")
+        print(f"总片段数: {len(chunks)}")
+        
+        for i, chunk in enumerate(chunks):
+            print(f"\n片段 {i+1}:")
+            print(f"  标题: {chunk.metadata.title}")
+            print(f"  层级: {chunk.metadata.level}")
+            print(f"  页码: {chunk.metadata.page_start}-{chunk.metadata.page_end}")
+            print(f"  父级: {' -> '.join(chunk.metadata.parent_titles)}")
+            print(f"  片段: {chunk.chunk_index + 1}/{chunk.total_chunks}")
+            print(f"  内容长度: {len(chunk.content)}")
+            print(f"  内容预览: {chunk.content[:100]}...")
+
+# 使用示例
+def main():
+    # 初始化拆分器
+    splitter = DocumentSplitter(chunk_size=800, chunk_overlap=50)
+    
+    # 示例文件路径
+    file_path = "I:/wangxun_dev_workspace/lq_workspace/LQDataGovernance/test/bfp_files/"
+    pdf_file = file_path + "公路工程施工安全技术规范.pdf"
+    #pdf_file = file_path + "公路桥涵施工技术规范.pdf"
+    word_file = "example.docx"
+    
+    try:
+        if os.path.exists(pdf_file):
+            print(f"测试PDF目录提取: {pdf_file}")
+            
+            # 创建PDF解析器实例进行测试
+            pdf_parser = PDFParser()
+            
+            # 测试目录提取
+            # toc_structure = pdf_parser.extract_toc_structure(pdf_file)
+            # print(f"\n提取到的目录结构 ({len(toc_structure)} 项):")
+            # for i, item in enumerate(toc_structure):
+            #     print(f"{i+1:2d}. 层级:{item['level']:2d} 页码:{item['page_start']:3d}-{item['page_end']:3d} 标题: {item['title']}")
+            
+            # 测试完整文档拆分
+            chunks = splitter.split_document(pdf_file)
+            # splitter.print_chunk_summary(chunks)
+            
+            # 拆分PDF文档
+            # if os.path.exists(pdf_file):
+            #     print(f"处理PDF文档: {pdf_file}")
+            #     pdf_chunks = splitter.split_document(pdf_file)
+            #     splitter.save_chunks_to_json(pdf_chunks, "pdf_chunks.json")
+            #     splitter.print_chunk_summary(pdf_chunks[:5])  # 只显示前5个片段
+            
+            # 拆分Word文档
+            # if os.path.exists(word_file):
+            #     print(f"\n处理Word文档: {word_file}")
+            #     word_chunks = splitter.split_document(word_file)
+            #     splitter.save_chunks_to_json(word_chunks, "word_chunks.json")
+            #     splitter.print_chunk_summary(word_chunks[:5])
+    
+    except Exception as e:
+        print(f"处理文档时出错: {e}")
+
+if __name__ == "__main__":
+    main()

+ 625 - 0
file_processors/doc_worker/document_parser2.py

@@ -0,0 +1,625 @@
+import os
+import re
+import json
+from typing import List, Dict, Any, Tuple, Optional
+from dataclasses import dataclass
+import hashlib
+
+try:
+    from PyPDF2 import PdfReader, PdfWriter
+    import fitz  # PyMuPDF
+    import docx
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+except ImportError:
+    print("请安装所需依赖:pip install PyPDF2 pymupdf python-docx langchain-text-splitters")
+    exit(1)
+
+@dataclass
+class DocumentMetadata:
+    """文档元数据"""
+    title: str
+    level: int
+    page_start: int
+    page_end: int
+    parent_titles: List[str]
+    content_hash: str
+    section_type: str
+
+@dataclass
+class DocumentChunk:
+    """文档片段"""
+    content: str
+    metadata: DocumentMetadata
+    chunk_index: int
+    total_chunks: int
+
+class RobustPDFTocExtractor:
+    """健壮的PDF目录提取器 - 专门处理页码提取失败的情况"""
+    
+    def __init__(self):
+        self.level_patterns = {
+            1: [r'^\d+\s+', r'^第[一二三四五六七八九十]+\S*\s+'],
+            2: [r'^\d+\.\d+\s+', r'^[一二三四五六七八九十]+、\s*'],
+            3: [r'^\d+\.\d+\.\d+\s+', r'^\(\d+\)\s+']
+        }
+    
+    def extract_toc_with_fallback(self, file_path: str) -> List[Dict[str, Any]]:
+        """提取目录,包含多种回退策略"""
+        print(f"正在提取PDF目录: {os.path.basename(file_path)}")
+        
+        # 方法1: 使用PyMuPDF提取书签
+        pymupdf_toc = self._extract_with_pymupdf(file_path)
+        if pymupdf_toc and self._has_valid_pages(pymupdf_toc):
+            print(f"PyMuPDF提取到 {len(pymupdf_toc)} 个有效目录项")
+            return pymupdf_toc
+        
+        # 方法2: 从目录页文本提取
+        text_toc = self._extract_from_toc_page(file_path)
+        if text_toc and self._has_valid_pages(text_toc):
+            print(f"目录页分析提取到 {len(text_toc)} 个有效目录项")
+            return text_toc
+        
+        # 方法3: 智能估算页码
+        estimated_toc = self._extract_and_estimate_pages(file_path)
+        if estimated_toc:
+            print(f"智能估算提取到 {len(estimated_toc)} 个目录项")
+            return estimated_toc
+        
+        print("无法提取有效的目录结构,将使用传统分割方式")
+        return []
+    
+    def _extract_with_pymupdf(self, file_path: str) -> List[Dict[str, Any]]:
+        """使用PyMuPDF提取目录"""
+        try:
+            doc = fitz.open(file_path)
+            toc = doc.get_toc()
+            doc.close()
+            
+            if not toc:
+                return []
+            
+            processed_toc = []
+            parent_stack = []
+            
+            for item in toc:
+                level, title, page_num = item
+                
+                clean_title = self._clean_title(title)
+                if not clean_title or len(clean_title) < 2:
+                    continue
+                
+                # 确定实际页码
+                actual_page = max(1, page_num)
+                
+                # 处理层级关系
+                while parent_stack and parent_stack[-1]['level'] >= level:
+                    parent_stack.pop()
+                
+                parent_titles = [p['title'] for p in parent_stack]
+                
+                toc_item = {
+                    'title': clean_title,
+                    'level': level,
+                    'page_start': actual_page,
+                    'page_end': actual_page,
+                    'parent_titles': parent_titles,
+                    'source': 'pymupdf_toc'
+                }
+                
+                processed_toc.append(toc_item)
+                parent_stack.append({
+                    'title': clean_title,
+                    'level': level,
+                    'page': actual_page
+                })
+            
+            return processed_toc
+            
+        except Exception as e:
+            print(f"PyMuPDF提取目录失败: {e}")
+            return []
+    
+    def _extract_from_toc_page(self, file_path: str) -> List[Dict[str, Any]]:
+        """从目录页提取目录结构"""
+        try:
+            doc = fitz.open(file_path)
+            print(f"已打开文档,共 {len(doc)} 页,在前10页中查找目录页")
+            toc_items = []
+            
+            # 检查前10页,寻找目录页
+            for page_num in range(min(10, len(doc))):
+                page = doc[page_num]
+                text = page.get_text()
+                
+                # 检查是否是目录页(包含"目录"、"目次"等关键词)
+                flag = self._is_toc_page(text)
+                print(f"检查第 {page_num + 1} 页是否为目录页: {flag}")
+                if flag:
+                    items = self._parse_toc_page_text(text, page_num + 1)
+                    toc_items.extend(items)
+                    break  # 找到目录页后停止
+            
+            doc.close()
+            return toc_items
+            
+        except Exception as e:
+            print(f"目录页分析失败: {e}")
+            return []
+    
+    def _is_toc_page(self, text: str) -> bool:
+        """判断是否是目录页 - 修复空格问题"""
+        # 1. 预处理文本:合并连续空格为单个空格
+        normalized_text = re.sub(r'\s+', ' ', text)
+        text_lower = normalized_text.lower()
+        
+        # 2. 关键词检测(处理带空格的关键词)
+        toc_keywords = [
+            '目录', '目次', 'contents', 'content', 'table of contents',
+            '目 录', '目 次'  # 明确包含空格的关键词
+        ]
+        
+        for keyword in toc_keywords:
+            # 移除关键词中的空格进行匹配
+            clean_keyword = re.sub(r'\s+', '', keyword)
+            clean_text = re.sub(r'\s+', '', text_lower)
+            
+            if clean_keyword in clean_text:
+                return True
+            
+            # 同时检查原始关键词(处理正常情况)
+            if keyword in text_lower:
+                return True
+        
+        # 3. 使用正则表达式匹配带空格的关键词
+        toc_patterns = [
+            r'目\s*录',    # 匹配"目  录"、"目 录"等
+            r'目\s*次',    # 匹配"目  次"、"目 次"等  
+            r'content\s*s', # 匹配"content s"等
+        ]
+        
+        for pattern in toc_patterns:
+            if re.search(pattern, text_lower, re.IGNORECASE):
+                return True
+        
+        # 4. 检测典型的目录结构模式(原有逻辑)
+        lines = text.split('\n')
+        toc_line_count = 0
+        total_lines = len(lines)
+        
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
+                
+            # 检测目录项特征
+            if self._is_toc_line(line):
+                toc_line_count += 1
+        
+        # 如果超过30%的行看起来像目录项,认为是目录页
+        if total_lines > 0 and toc_line_count / total_lines > 0.3:
+            return True
+        
+        # 5. 检测目录特有的格式模式
+        toc_patterns = [
+            r'\.{3,}\s*\d+',  # 省略号后跟页码
+            r'\d+\s*\.{3,}',  # 数字后跟省略号
+            r'^\s*\d+(\.\d+)*\s+',  # 数字编号开头
+            r'^\s*第[一二三四五六七八九十]+\S*\s+',  # 中文编号开头
+        ]
+        
+        for pattern in toc_patterns:
+            if re.search(pattern, text):
+                return True
+    
+        return False
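+
+    # 说明:上面第4步用到的目录行判断辅助方法,按目录行的典型格式(编号/章节/附录开头、结尾带页码)
+    # 给出一个最小示意实现,模式可按实际文档调整
+    def _is_toc_line(self, line: str) -> bool:
+        """判断单行文本是否像目录项"""
+        line_patterns = [
+            r'^\d+(\.\d+)*\s+\S.*?\.{2,}\s*\d+\s*$',              # "4.2 施工便道 ..... 23"
+            r'^\d+(\.\d+)*\s+\S.*\s\d+\s*$',                      # "4.2 施工便道 23"
+            r'^第[一二三四五六七八九十百千]+[章节条]\s*\S.*\d+\s*$',  # "第三章 施工准备 12"
+            r'^附录[A-E]\s*\S.*\d+\s*$',                           # "附录A 危险性较大的工程 100"
+        ]
+        return any(re.match(pattern, line) for pattern in line_patterns)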
+    
+    def _parse_toc_page_text(self, text: str, page_num: int) -> List[Dict[str, Any]]:
+        """解析目录页文本"""
+        toc_items = []
+        lines = text.split('\n')
+        
+        # 目录项模式
+        toc_patterns = [
+            # "1 总则 ........... 1"
+            r'^(\d+(?:\.\d+)*)\s+([^\.]{2,50}?)\s*\.{3,}\s*(\d+)\s*$',
+            # "第一章 总则 ........... 1"  
+            r'^(第[一二三四五六七八九十百千]+[章节条])\s+([^\.]{2,50}?)\s*\.{3,}\s*(\d+)\s*$',
+            # "附录A 危险性较大的工程 ........... 100"
+            r'^(附录[ABCDE])\s+([^\.]{2,50}?)\s*\.{3,}\s*(\d+)\s*$',
+        ]
+        
+        for line in lines:
+            line = line.strip()
+            if not line or len(line) > 200:
+                continue
+                
+            for pattern in toc_patterns:
+                match = re.match(pattern, line)
+                if match:
+                    numbering = match.group(1)
+                    title = match.group(2).strip()
+                    # 目录项指向的正文页码;不复用参数名 page_num,避免与当前目录页页码混淆
+                    target_page = int(match.group(3))
+                    
+                    title = self._clean_title(title)
+                    if not title:
+                        continue
+                    
+                    level = self._determine_level(numbering)
+                    
+                    toc_items.append({
+                        'title': title,
+                        'level': level,
+                        'page_start': target_page,
+                        'page_end': target_page,
+                        'parent_titles': [],
+                        'source': 'toc_page'
+                    })
+                    break
+        
+        return toc_items
+    
+    def _extract_and_estimate_pages(self, file_path: str) -> List[Dict[str, Any]]:
+        """提取目录结构并智能估算页码"""
+        try:
+            doc = fitz.open(file_path)
+            doc_length = len(doc)
+            
+            # 首先提取目录结构(不包含准确页码)
+            toc_structure = self._extract_toc_structure_only(file_path)
+            if not toc_structure:
+                doc.close()
+                return []
+            
+            # 智能估算页码
+            estimated_toc = self._smart_estimate_pages(toc_structure, doc_length)
+            doc.close()
+            
+            return estimated_toc
+            
+        except Exception as e:
+            print(f"智能估算页码失败: {e}")
+            return []
+    
+    def _extract_toc_structure_only(self, file_path: str) -> List[Dict[str, Any]]:
+        """仅提取目录结构(不包含页码)"""
+        try:
+            doc = fitz.open(file_path)
+            toc = doc.get_toc()
+            doc.close()
+            
+            if not toc:
+                return []
+            
+            toc_structure = []
+            for item in toc:
+                level, title, _ = item  # 忽略页码
+                clean_title = self._clean_title(title)
+                if clean_title and len(clean_title) >= 2:
+                    toc_structure.append({
+                        'title': clean_title,
+                        'level': level,
+                        'source': 'structure_only'
+                    })
+            
+            return toc_structure
+            
+        except Exception as e:
+            print(f"提取目录结构失败: {e}")
+            return []
+    
+    def _smart_estimate_pages(self, toc_structure: List[Dict[str, Any]], doc_length: int) -> List[Dict[str, Any]]:
+        """智能估算页码"""
+        if not toc_structure:
+            return []
+        
+        estimated_toc = []
+        current_page = 1
+        
+        # 标准文档的典型页数分布
+        page_estimates = {
+            '封面': 1, '扉页': 1, '前言': 2, '目录': 2, '总则': 3, '术语': 4,
+            '附录': 5, '封底': 1
+        }
+        
+        for i, item in enumerate(toc_structure):
+            title = item['title']
+            level = item['level']
+            
+            # 基于标题内容估算页数
+            estimated_pages = 2  # 默认2页
+            
+            for key, pages in page_estimates.items():
+                if key in title:
+                    estimated_pages = pages
+                    break
+            
+            # 调整基于层级的页数
+            if level == 1:
+                estimated_pages = max(estimated_pages, 3)
+            elif level == 2:
+                estimated_pages = max(estimated_pages, 2)
+            else:
+                estimated_pages = max(estimated_pages, 1)
+            
+            # 确保不超过文档总长度
+            if current_page > doc_length:
+                current_page = doc_length
+            
+            estimated_toc.append({
+                'title': title,
+                'level': level,
+                'page_start': current_page,
+                'page_end': min(current_page + estimated_pages - 1, doc_length),
+                'parent_titles': [],
+                'source': 'smart_estimated'
+            })
+            
+            current_page += estimated_pages
+        
+        return estimated_toc
+    
+    def _has_valid_pages(self, toc_structure: List[Dict[str, Any]]) -> bool:
+        """检查目录是否有有效的页码"""
+        if not toc_structure:
+            return False
+        
+        valid_pages = 0
+        for item in toc_structure:
+            if item['page_start'] > 1:  # 至少有一个不是第1页
+                valid_pages += 1
+        
+        # 至少要有10%的目录项有有效页码
+        return valid_pages >= max(2, len(toc_structure) * 0.1)
+    
+    def _determine_level(self, numbering: str) -> int:
+        """根据编号确定层级"""
+        if '.' in numbering:
+            return numbering.count('.') + 1
+        elif '第' in numbering and '章' in numbering:
+            return 1
+        elif '附录' in numbering:
+            return 1
+        else:
+            return 1
+    
+    def _clean_title(self, title: str) -> str:
+        """清理标题"""
+        if not title:
+            return ""
+        
+        # 移除文件后缀和噪音
+        title = re.sub(r'\.(pdf|docx?|txt)$', '', title, flags=re.IGNORECASE)
+        title = re.sub(r'\.{3,}.*$', '', title)  # 移除省略号和页码
+        title = re.sub(r'\s+', ' ', title).strip()
+        
+        return title
+
+class RobustPDFParser:
+    """健壮的PDF解析器 - 处理页码提取失败的情况"""
+    
+    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+        )
+        self.toc_extractor = RobustPDFTocExtractor()
+    
+    def split_document(self, file_path: str) -> List[DocumentChunk]:
+        """拆分文档 - 健壮版本"""
+        print(f"开始解析文档: {os.path.basename(file_path)}")
+        
+        # 提取目录结构
+        toc_structure = self.toc_extractor.extract_toc_with_fallback(file_path)
+        
+        if toc_structure:
+            print(f"按目录结构拆分,共 {len(toc_structure)} 个章节")
+            return self._split_by_toc_structure(file_path, toc_structure)
+        else:
+            print("使用传统方式拆分文档")
+            return self._split_traditional(file_path)
+    
+    def _split_by_toc_structure(self, file_path: str, toc_structure: List[Dict[str, Any]]) -> List[DocumentChunk]:
+        """按目录结构拆分"""
+        all_chunks = []
+        doc_length = self._get_pdf_length(file_path)
+        
+        for i, section in enumerate(toc_structure):
+            print(f"处理章节 {i+1}/{len(toc_structure)}: {section['title']}")
+            
+            # 提取章节内容
+            content = self._extract_section_content(file_path, section, doc_length, toc_structure)
+            if not content.strip():
+                print(f"  警告: 章节 '{section['title']}' 内容为空")
+                continue
+            
+            # 处理内容拆分
+            section_chunks = self._process_section_content(content, section)
+            all_chunks.extend(section_chunks)
+        
+        print(f"文档拆分完成,共生成 {len(all_chunks)} 个片段")
+        return all_chunks
+    
+    def _extract_section_content(self, file_path: str, section: Dict[str, Any], doc_length: int,
+                                 toc_structure: List[Dict[str, Any]] = None) -> str:
+        """提取章节内容 - 健壮版本"""
+        start_page = section['page_start']
+        end_page = section['page_end']
+        
+        # 验证页码范围
+        if start_page < 1 or start_page > doc_length or end_page < start_page:
+            print(f"  警告: 章节 '{section['title']}' 页码范围无效 ({start_page}-{end_page})")
+            # 页码无效时,结合整体目录结构估算一个页码范围
+            start_page, end_page = self._estimate_section_pages(section, doc_length, toc_structure)
+            print(f"  使用估算页码: {start_page}-{end_page}")
+        
+        content = ""
+        try:
+            doc = fitz.open(file_path)
+            
+            start_idx = max(0, start_page - 1)
+            end_idx = min(len(doc) - 1, end_page - 1)
+            
+            for page_num in range(start_idx, end_idx + 1):
+                page = doc[page_num]
+                text = page.get_text()
+                if text.strip():
+                    content += text + "\n\n"
+            
+            doc.close()
+            
+        except Exception as e:
+            print(f"  提取章节内容失败: {e}")
+        
+        return content
+    
+    def _estimate_section_pages(self, section: Dict[str, Any], doc_length: int, toc_structure: List[Dict[str, Any]] = None) -> Tuple[int, int]:
+        """估算章节的页码范围"""
+        # 简单实现:每个章节分配2-5页
+        section_index = toc_structure.index(section) if toc_structure else 0
+        start_page = max(1, section_index * 3 + 1)
+        end_page = min(doc_length, start_page + 4)  # 每个章节5页
+        
+        return start_page, end_page
+    
+    def _process_section_content(self, content: str, section: Dict[str, Any]) -> List[DocumentChunk]:
+        """处理章节内容"""
+        if len(content) <= self.chunk_size:
+            # 内容较小,直接作为一个片段
+            metadata = DocumentMetadata(
+                title=section['title'],
+                level=section['level'],
+                page_start=section['page_start'],
+                page_end=section['page_end'],
+                parent_titles=section.get('parent_titles', []),
+                content_hash=self._calculate_content_hash(content),
+                section_type='toc'
+            )
+            
+            return [DocumentChunk(
+                content=content,
+                metadata=metadata,
+                chunk_index=0,
+                total_chunks=1
+            )]
+        else:
+            # 内容较大,进一步拆分
+            return self._split_large_content(content, section)
+    
+    def _split_large_content(self, content: str, section: Dict[str, Any]) -> List[DocumentChunk]:
+        """拆分大段内容"""
+        chunks = []
+        split_texts = self.text_splitter.split_text(content)
+        
+        for i, chunk_content in enumerate(split_texts):
+            metadata = DocumentMetadata(
+                title=f"{section['title']} - 部分{i+1}",
+                level=section['level'],
+                page_start=section['page_start'],
+                page_end=section['page_end'],
+                parent_titles=section.get('parent_titles', []),
+                content_hash=self._calculate_content_hash(chunk_content),
+                section_type='toc_split'
+            )
+            
+            chunks.append(DocumentChunk(
+                content=chunk_content,
+                metadata=metadata,
+                chunk_index=i,
+                total_chunks=len(split_texts)
+            ))
+        
+        return chunks
+    
+    def _split_traditional(self, file_path: str) -> List[DocumentChunk]:
+        """传统分割方式"""
+        full_content = self._extract_full_content(file_path)
+        
+        if not full_content.strip():
+            print("警告: 文档内容为空")
+            return []
+        
+        # 按固定大小分割
+        chunks = []
+        split_texts = self.text_splitter.split_text(full_content)
+        
+        for i, content in enumerate(split_texts):
+            metadata = DocumentMetadata(
+                title=f"文档内容 - 部分{i+1}",
+                level=1,
+                page_start=1,
+                page_end=self._get_pdf_length(file_path),
+                parent_titles=[],
+                content_hash=self._calculate_content_hash(content),
+                section_type='traditional'
+            )
+            
+            chunks.append(DocumentChunk(
+                content=content,
+                metadata=metadata,
+                chunk_index=i,
+                total_chunks=len(split_texts)
+            ))
+        
+        return chunks
+    
+    def _extract_full_content(self, file_path: str) -> str:
+        """提取完整文档内容"""
+        full_content = ""
+        try:
+            doc = fitz.open(file_path)
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                text = page.get_text()
+                if text.strip():
+                    full_content += text + "\n\n"
+            doc.close()
+        except Exception as e:
+            print(f"提取完整内容失败: {e}")
+        return full_content
+    
+    def _get_pdf_length(self, file_path: str) -> int:
+        """获取PDF页数"""
+        try:
+            doc = fitz.open(file_path)
+            length = len(doc)
+            doc.close()
+            return length
+        except Exception:
+            return 100  # 打开失败时返回默认页数
+    
+    def _calculate_content_hash(self, content: str) -> str:
+        """计算内容哈希"""
+        return hashlib.md5(content.encode('utf-8')).hexdigest()
+
+# 使用示例
+def main():
+    # 初始化解析器
+    pdf_parser = RobustPDFParser(chunk_size=800, chunk_overlap=50)
+    
+    # 测试文件
+    #test_file = "公路工程施工安全技术规范.pdf"
+    # 示例文件路径
+    file_path = "I:/wangxun_dev_workspace/lq_workspace/LQDataGovernance/test/bfp_files/"
+    test_file = file_path + "公路工程施工安全技术规范.pdf"
+    #pdf_file = file_path + "公路桥涵施工技术规范.pdf"
+    
+    if os.path.exists(test_file):
+        print(f"处理PDF文档: {test_file}")
+        chunks = pdf_parser.split_document(test_file)
+        
+        print(f"\n最终生成 {len(chunks)} 个片段")
+        if chunks:
+            print("\n前5个片段:")
+            for i, chunk in enumerate(chunks[:5]):
+                print(f"{i+1}. [{chunk.metadata.section_type}] {chunk.metadata.title}")
+                print(f"   页码: {chunk.metadata.page_start}-{chunk.metadata.page_end}")
+                print(f"   内容长度: {len(chunk.content)}")
+                print(f"   预览: {chunk.content[:100]}...")
+                print()
+    else:
+        print(f"文件不存在: {test_file}")
+
+if __name__ == "__main__":
+    main()

+ 352 - 0
file_processors/doc_worker/test_document.py

@@ -0,0 +1,352 @@
+import fitz  # PyMuPDF
+from docx import Document as DocxDocument
+from docx.document import Document
+from docx.table import Table
+from docx.text.paragraph import Paragraph
+from docx.oxml.table import CT_Tbl
+from docx.oxml.text.paragraph import CT_P
+from docx.text.run import Run
+import re
+from typing import List, Dict, Any, Union, Tuple
+
+# 定义文档片段的数据结构
+class DocFragment:
+    def __init__(self, content: str, metadata: Dict[str, Any]):
+        self.content = content
+        self.metadata = metadata
+
+    def __repr__(self):
+        return f"DocFragment(title='{self.metadata.get('title', 'N/A')}', page='{self.metadata.get('page_start', 'N/A')}-{self.metadata.get('page_end', 'N/A')}')"
+
+class DocumentParser:
+    def __init__(self, max_chars: int = 1000):
+        """
+        初始化解析器。
+
+        :param max_chars: 每个知识库片段的最大字符数。
+        """
+        self.max_chars = max_chars
+        self.fragments: List[DocFragment] = []
+        self.current_doc_type = None
+
+    def parse(self, file_path: str) -> List[DocFragment]:
+        """
+        解析PDF或Word文档。
+
+        :param file_path: 文档文件路径。
+        :return: 解析后的文档片段列表。
+        """
+        if file_path.lower().endswith('.pdf'):
+            self.current_doc_type = 'pdf'
+            self._parse_pdf(file_path)
+        elif file_path.lower().endswith(('.docx', '.doc')):
+            self.current_doc_type = 'word'
+            self._parse_word(file_path)
+        else:
+            raise ValueError(f"Unsupported file format: {file_path}")
+        return self.fragments
+
+    def _parse_pdf(self, file_path: str):
+        """解析PDF文档"""
+        doc = fitz.open(file_path)
+        toc = doc.get_toc()  # 获取目录 (Table of Contents)
+        print("PDF TOC:", toc) # 调试用
+
+        # 构建一个按页码索引的标题列表,方便查找
+        page_titles = self._build_page_title_map_pdf(toc, doc.page_count)
+
+        for i in range(doc.page_count):
+            page = doc.load_page(i)
+            text = page.get_text()
+            if not text.strip():
+                continue
+
+            # 尝试找到当前页对应的标题和路径
+            title_info = self._find_title_for_page_pdf(page_titles, i)
+            title = title_info['title']
+            path = title_info['path']
+            level = title_info['level']
+
+            # 一个页面的内容可能属于多个章节,需要更精细地处理。
+            # 这里我们先获取页面所属的主要章节信息,然后尝试根据换行等自然段落拆分。
+            # 但更理想的是,根据目录标题在文本中的位置进行切分。
+            
+            # 为了更精确,我们尝试在页面文本中查找目录标题
+            relevant_toc_items = self._find_toc_items_in_page_text(toc, text, i)
+            
+            if not relevant_toc_items:
+                # 如果页面文本中没有找到明确的目录标题,则使用最近的上级标题
+                # 并将整个页面内容作为一个片段
+                content = text.strip()
+                if content:
+                    metadata = {
+                        "source_file": file_path,
+                        "title": title or "Unknown Section",
+                        "path": " -> ".join(path) if path else "Root",
+                        "level": level,
+                        "page_start": i + 1,
+                        "page_end": i + 1,
+                    }
+                    self._add_fragments(content, metadata)
+            else:
+                # 如果找到了目录标题,则按标题切分页面内容
+                sorted_items = sorted(relevant_toc_items, key=lambda x: x['start_pos'])
+                for idx, item in enumerate(sorted_items):
+                    start_pos = item['start_pos']
+                    end_pos = len(text) if idx == len(sorted_items) - 1 else sorted_items[idx + 1]['start_pos']
+                    section_text = text[start_pos:end_pos].strip()
+                    
+                    if section_text:
+                        # 查找此section_text对应的完整路径
+                        section_path = self._get_path_from_toc(toc, item['title'], item['page_num'])
+                        section_title = item['title']
+                        section_level = item['level']
+                        
+                        metadata = {
+                            "source_file": file_path,
+                            "title": section_title,
+                            "path": " -> ".join(section_path) if section_path else "Root",
+                            "level": section_level,
+                            "page_start": item['page_num'] + 1, # PyMuPDF页码从0开始
+                            "page_end": item['page_num'] + 1,
+                        }
+                        self._add_fragments(section_text, metadata)
+
+
+    def _build_page_title_map_pdf(self, toc: List, num_pages: int):
+        """为PDF构建一个按页码映射标题的辅助结构"""
+        page_map = [{"title": "Unknown Section", "path": [], "level": 0} for _ in range(num_pages)]
+        stack = []
+
+        for item in toc:
+            level, title, page_num_zero_based = item[0], item[1], item[2] - 1 # TOC页码是1-based, 转为0-based
+            if page_num_zero_based < 0 or page_num_zero_based >= num_pages:
+                 continue # 无效页码
+
+            # 调整栈以匹配当前层级
+            while stack and stack[-1]['level'] >= level:
+                stack.pop()
+            
+            stack.append({"title": title, "level": level})
+            current_path = [item["title"] for item in stack]
+            
+            # 更新从此页到下一标题页之前的所有页的映射
+            start_page = page_num_zero_based
+            # 找到下一个同级或更高级的标题页码,或文档末尾
+            next_start_page = num_pages
+            for next_item in toc:
+                 if next_item[0] <= level and next_item[2] - 1 > page_num_zero_based:
+                     next_start_page = next_item[2] - 1
+                     break
+            
+            for p in range(start_page, min(next_start_page, num_pages)):
+                 page_map[p] = {"title": title, "path": current_path.copy(), "level": level}
+        
+        return page_map
+
+    def _find_title_for_page_pdf(self, page_titles: List[Dict], page_num: int):
+        """查找指定页码对应的标题信息"""
+        if 0 <= page_num < len(page_titles):
+            return page_titles[page_num]
+        return {"title": "Unknown Section", "path": [], "level": 0}
+
+    def _find_toc_items_in_page_text(self, toc: List, page_text: str, page_num: int) -> List[Dict]:
+        """在页面文本中查找目录项的起始位置"""
+        found_items = []
+        # 将TOC中属于当前页的标题按文本长度降序排列,优先匹配长标题,避免短标题误匹配
+        page_toc_items = [item for item in toc if item[2] - 1 == page_num] # 转为0-based
+        page_toc_items.sort(key=lambda x: len(x[1]), reverse=True)
+        
+        text_pos = 0
+        for level, title, p_num in page_toc_items:
+             # 使用更宽松的匹配,可能需要处理换行、空格等问题
+             # 这里简单使用str.find,实际可能需要更复杂的NLP处理
+             clean_title = re.escape(title.strip())
+             match = re.search(clean_title, page_text[text_pos:], re.IGNORECASE)
+             if match:
+                 actual_pos = text_pos + match.start()
+                 found_items.append({
+                     "title": title,
+                     "level": level,
+                     "page_num": p_num - 1, # 0-based
+                     "start_pos": actual_pos
+                 })
+                 # 更新搜索起始位置,避免重复匹配同一位置
+                 text_pos = actual_pos + 1 
+                 
+        # 按起始位置排序
+        found_items.sort(key=lambda x: x['start_pos'])
+        return found_items
+
+    def _get_path_from_toc(self, toc: List, target_title: str, target_page: int) -> List[str]:
+        """根据标题和页码在TOC中查找其完整路径"""
+        stack = []
+        for item in toc:
+            level, title, page_num = item[0], item[1], item[2] - 1 # 1-based to 0-based
+            # 调整栈深度
+            while stack and stack[-1]['level'] >= level:
+                stack.pop()
+            stack.append({"title": title, "level": level})
+            
+            if title == target_title and page_num == target_page:
+                return [s["title"] for s in stack]
+        return [] # 如果未找到,则返回空列表或根路径
+
+
+    def _parse_word(self, file_path: str):
+        """解析Word文档"""
+        docx_doc = DocxDocument(file_path)
+        # Word 没有像PDF那样直接的“目录”API,但我们可以解析段落样式来识别标题
+        # 常见标题样式: "Heading 1", "Heading 2", ..., "Heading 9"
+        # 需要遍历所有段落,识别标题,构建层级结构,然后按此结构切分内容
+        # 这比PDF更复杂,因为Word内容更结构化(段落、表格、图片等)
+
+        # 简化处理:遍历段落,识别标题,累积内容直到遇到下一个同级或更高级标题
+        current_structure = [] # [(level, title), ...]
+        current_content = ""
+        current_page_hint = 1 # Word没有直接的页码API,这里用一个近似值或忽略
+
+        for element in docx_doc.element.body:
+            if isinstance(element, CT_P):
+                paragraph = Paragraph(element, docx_doc)
+                style_name = paragraph.style.name if paragraph.style else "Normal"
+                
+                # 检查是否为标题样式
+                if style_name.startswith('Heading'):
+                    try:
+                        level = int(style_name.split()[-1])
+                    except ValueError:
+                        level = 99 # 非标准标题,视为普通内容
+
+                    if level < 99:
+                        title_text = paragraph.text.strip()
+                        if not title_text: continue # 跳过空标题
+
+                        # 保存当前累积的内容作为一个片段
+                        if current_content.strip():
+                            metadata = self._create_metadata_word(current_structure, file_path, current_page_hint)
+                            self._add_fragments(current_content.strip(), metadata)
+                        
+                        # 更新结构
+                        # 移除更深层次的标题
+                        current_structure = [(l, t) for l, t in current_structure if l < level]
+                        # 添加当前标题
+                        current_structure.append((level, title_text))
+                        # 重置内容
+                        current_content = ""
+                        continue # 标题本身不加入内容,下一个段落开始新内容
+                
+                # 累积非标题段落内容
+                # 处理段落中的runs
+                para_text = ""
+                for run in paragraph.runs:
+                    para_text += run.text
+                current_content += para_text + "\n" # 保持段落分隔
+            
+            elif isinstance(element, CT_Tbl):
+                table = Table(element, docx_doc)
+                # 简单处理表格:将其转换为文本添加到内容中
+                table_text = "\n"
+                for row in table.rows:
+                    row_text = " | ".join([cell.text for cell in row.cells])
+                    table_text += row_text + "\n"
+                current_content += table_text + "\n"
+
+        # 处理文档末尾剩余的内容
+        if current_content.strip():
+            metadata = self._create_metadata_word(current_structure, file_path, current_page_hint)
+            self._add_fragments(current_content.strip(), metadata)
+
+    def _create_metadata_word(self, structure: List[Tuple[int, str]], file_path: str, page_hint: int) -> Dict[str, Any]:
+        """为Word文档片段创建元数据"""
+        if structure:
+            level, title = structure[-1]
+            path = " -> ".join([s[1] for s in structure])
+        else:
+            level, title = 0, "Document Body"
+            path = "Root"
+        return {
+            "source_file": file_path,
+            "title": title,
+            "path": path,
+            "level": level,
+            "page_start": page_hint, # Word页码近似或可忽略
+            "page_end": page_hint,
+        }
+
+
+    def _add_fragments(self, content: str, metadata: Dict[str, Any]):
+        """将内容按大小拆分并添加到片段列表"""
+        if len(content) <= self.max_chars:
+            self.fragments.append(DocFragment(content, metadata.copy()))
+        else:
+            # 简单按字符数拆分,尽量在句子或段落边界处拆分
+            parts = self._split_content_by_size(content)
+            for i, part in enumerate(parts):
+                frag_meta = metadata.copy()
+                frag_meta["part_index"] = i
+                frag_meta["total_parts"] = len(parts)
+                self.fragments.append(DocFragment(part, frag_meta))
+
+    def _split_content_by_size(self, text: str) -> List[str]:
+        """按最大字符数拆分文本,尽量在自然边界处拆分"""
+        if len(text) <= self.max_chars:
+            return [text]
+        
+        parts = []
+        while len(text) > self.max_chars:
+            # 从 max_chars 位置向前查找切分点
+            split_point = self.max_chars
+            # 优先查找句号、问号、感叹号(含中文标点),其次换行、分号、逗号、空格;
+            # 末尾的 '' 作兜底:rfind('') 恒返回 max_chars,保证找不到自然边界时也能在 max_chars 处硬切
+            for sep in ['。', '!', '?', '.', '!', '?', '\n', '\r', ';', ';', ':', ',', ',', ' ', '']:
+                last_sep_index = text.rfind(sep, 0, self.max_chars)
+                if last_sep_index != -1 and last_sep_index > self.max_chars * 0.8:  # 确保不是切得太短
+                    split_point = last_sep_index + len(sep)
+                    break
+            
+            parts.append(text[:split_point])
+            text = text[split_point:]
+        
+        if text: # 添加最后一部分
+            parts.append(text)
+        
+        return parts
+
+# --- 使用示例 ---
+if __name__ == "__main__":
+    # 请将 'your_document.pdf' 或 'your_document.docx' 替换为你的实际文件路径
+    file_path_pdf = "sample_document.pdf"  # 示例PDF路径
+    file_path_word = "sample_document.docx" # 示例Word路径
+
+    # 示例文件路径
+    file_path = "I:/wangxun_dev_workspace/lq_workspace/LQDataGovernance/test/bfp_files/"
+    file_path_pdf = file_path + "公路工程施工安全技术规范.pdf"
+    #file_path_pdf = file_path + "公路桥涵施工技术规范.pdf"
+
+    file_path_word = file_path + ""
+
+    parser = DocumentParser(max_chars=500) # 设置最大字符数为500
+
+    # 解析PDF
+    try:
+        fragments_pdf = parser.parse(file_path_pdf)
+        print(f"--- Parsed {len(fragments_pdf)} fragments from PDF ---")
+        for frag in fragments_pdf:
+            print(f"Title: {frag.metadata['title']}, Path: {frag.metadata['path']}, Pages: {frag.metadata['page_start']}-{frag.metadata['page_end']}, Level: {frag.metadata['level']}")
+            # print(f"Content Preview: {frag.content[:100]}...") # 打印内容预览
+            print("-" * 20)
+    except Exception as e:
+        print(f"Error parsing PDF: {e}")
+
+    # 解析Word
+    # try:
+    #     # 重新实例化以避免与PDF解析混淆
+    #     parser_word = DocumentParser(max_chars=500)
+    #     fragments_word = parser_word.parse(file_path_word)
+    #     print(f"--- Parsed {len(fragments_word)} fragments from Word ---")
+    #     for frag in fragments_word:
+    #         print(f"Title: {frag.metadata['title']}, Path: {frag.metadata['path']}, Level: {frag.metadata['level']}")
+    #         # print(f"Content Preview: {frag.content[:100]}...") # 打印内容预览
+    #         print("-" * 20)
+    # except Exception as e:
+    #     print(f"Error parsing Word: {e}")

+ 8 - 0
file_processors/knowledge_base_chunks/metadata.json

@@ -0,0 +1,8 @@
+{
+  "total_chunks": 17,
+  "max_chunk_size": 2000,
+  "document_info": {
+    "title": "公路工程施工安全技术规范",
+    "code": "JTG F90-2015"
+  }
+}

+ 29 - 0
file_processors/knowledge_base_chunks/structure_report.md

@@ -0,0 +1,29 @@
+# 文档结构分析报告
+
+## 总章节数: 22
+## 最大块大小: 2000 字符
+
+## 文档层级结构:
+
+- # 中华人民共和国行业标准 (内容大小: 0字符, 子章节: 0)
+- # 公路工程施工安全技术规范 (内容大小: 147字符, 子章节: 0)
+- # 人民交通出版社股份有限公司 (内容大小: 0字符, 子章节: 0)
+- # 中华人民共和国交通运输部 (内容大小: 0字符, 子章节: 0)
+- # 公 告 (内容大小: 4字符, 子章节: 0)
+- # 交通运输部关于发布《公路工程施工安全技术规范》的公告 (内容大小: 254字符, 子章节: 0)
+- # 前 言 (内容大小: 1312字符, 子章节: 0)
+- # 1总则 (内容大小: 173字符, 子章节: 0)
+- # 2术语 (内容大小: 0字符, 子章节: 0)
+- # 2.0.1危险源hazards (内容大小: 164字符, 子章节: 0)
+- # 2.0.4应急预案emergency response plan (内容大小: 62字符, 子章节: 0)
+- # 2.0.5风险评估risk assessment (内容大小: 48字符, 子章节: 0)
+- # 2.0.6特种设备special equipment (内容大小: 52字符, 子章节: 0)
+- # 2.0.7特殊作业人员special operator (内容大小: 200字符, 子章节: 0)
+- # 3基本规定 (内容大小: 1049字符, 子章节: 0)
+- # 4施工准备 (内容大小: 0字符, 子章节: 0)
+- # 4.1驻地和场站建设 (内容大小: 871字符, 子章节: 0)
+- # 4.2施工便道 (内容大小: 492字符, 子章节: 0)
+- # 4.3临时码头和栈桥 (内容大小: 768字符, 子章节: 0)
+- # 4.4施工临时用电 (内容大小: 2203字符, 子章节: 0)
+- # 4.5生产生活用水 (内容大小: 86字符, 子章节: 0)
+- # 4.6施工机械设备 (内容大小: 306字符, 子章节: 0)

+ 47 - 0
file_processors/mineru/config.py

@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+配置文件 - PDF转Markdown转换工具
+"""
+
+# MinerU服务器配置
+SERVER_URL = "http://192.168.0.166:8000"
+
+# 目录配置
+INPUT_DIR = "./raw_file"  # 输入PDF文件目录
+OUTPUT_DIR = "./output"   # 输出结果目录
+
+# 转换参数配置
+CONVERT_CONFIG = {
+    # 文档语言列表,支持: ch(中文), en(英文), etc.
+    "lang_list": ["ch"],
+    
+    # 解析后端选择:
+    # - "pipeline": 速度快,无幻觉,精度82+,推荐使用(需要GPU)
+    # - "vlm": 兼容性好,速度较慢,精度90+
+    "backend": "pipeline",
+    
+    # 解析方法,默认"auto"自动选择
+    "parse_method": "auto",
+    
+    # 是否启用公式识别
+    "formula_enable": True,
+    
+    # 是否启用表格识别
+    "table_enable": True,
+    
+    # 返回选项
+    "return_md": True,              # 返回Markdown文本
+    "return_middle_json": False,    # 返回中间JSON结果
+    "return_model_output": False,   # 返回模型原始输出
+    "return_content_list": False,   # 返回内容列表
+    "return_images": True,          # 返回提取的图片
+    
+    # 响应格式
+    "response_format_zip": False,   # 是否以ZIP格式返回
+    
+    # 页码范围(从0开始)
+    "start_page_id": 0,      # 起始页码
+    "end_page_id": 9999,    # 结束页码(9999表示到最后一页)
+}
+
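+# 使用示意(仅供参考,调用方式与本目录下 convert_single.py 一致):
+#   from pdf_to_markdown import PDFToMarkdownConverter
+#   converter = PDFToMarkdownConverter(server_url=SERVER_URL)
+#   result = converter.convert_pdf("raw_file/example.pdf", output_dir=OUTPUT_DIR, **CONVERT_CONFIG)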

+ 65 - 0
file_processors/mineru/convert_single.py

@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+单文件转换脚本 - 用于测试单个PDF文件
+"""
+
+import sys
+from pdf_to_markdown import PDFToMarkdownConverter
+from config import SERVER_URL, OUTPUT_DIR, CONVERT_CONFIG
+
+
+def main():
+    """主函数"""
+    
+    if len(sys.argv) < 2:
+        print("用法: python convert_single.py <PDF文件路径>")
+        print("\n示例:")
+        print("  python convert_single.py raw_file/example.pdf")
+        return 1
+    
+    pdf_path = sys.argv[1]
+    
+    print("=" * 70)
+    print(" 单文件PDF转Markdown")
+    print("=" * 70)
+    print(f"\n文件: {pdf_path}")
+    print(f"服务器: {SERVER_URL}")
+    print(f"输出目录: {OUTPUT_DIR}")
+    print("\n" + "=" * 70 + "\n")
+    
+    # 创建转换器
+    converter = PDFToMarkdownConverter(server_url=SERVER_URL)
+    
+    try:
+        # 转换PDF
+        result = converter.convert_pdf(
+            pdf_path=pdf_path,
+            output_dir=OUTPUT_DIR,
+            **CONVERT_CONFIG
+        )
+        
+        # 保存Markdown
+        from pathlib import Path
+        pdf_name = Path(pdf_path).stem
+        md_path = Path(OUTPUT_DIR) / f"{pdf_name}.md"
+        converter.save_markdown(result, str(md_path))
+        
+        print("\n" + "=" * 70)
+        print("转换成功!")
+        print("=" * 70)
+        
+        return 0
+        
+    except Exception as e:
+        print("\n" + "=" * 70)
+        print(f"转换失败: {e}")
+        print("=" * 70)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == "__main__":
+    exit(main())
+

+ 382 - 0
file_processors/mineru/pdf_to_markdown.py

@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+PDF转Markdown转换脚本
+使用MinerU本地部署版本进行PDF文档解析
+"""
+
+import os
+import requests
+from pathlib import Path
+from typing import List, Optional
+import zipfile
+import io
+
+
+class PDFToMarkdownConverter:
+    """PDF转Markdown转换器"""
+    
+    def __init__(self, server_url: str = "http://localhost:8000"):
+        """
+        初始化转换器
+        
+        Args:
+            server_url: MinerU服务器地址,默认为本地8000端口
+        """
+        self.server_url = server_url.rstrip('/')
+        self.api_endpoint = f"{self.server_url}/file_parse"
+    
+    def convert_pdf(
+        self,
+        pdf_path: str,
+        output_dir: str = "./output",
+        lang_list: List[str] = None,
+        backend: str = "pipeline",
+        parse_method: str = "auto",
+        formula_enable: bool = True,
+        table_enable: bool = True,
+        return_md: bool = True,
+        return_middle_json: bool = False,
+        return_model_output: bool = False,
+        return_content_list: bool = False,
+        return_images: bool = False,
+        response_format_zip: bool = False,
+        start_page_id: int = 0,
+        end_page_id: int = 99999
+    ) -> dict:
+        """
+        转换单个PDF文件为Markdown
+        
+        参数说明(基于MinerU官方文档):
+            pdf_path: PDF文件路径
+            output_dir: 输出目录路径,默认为"./output"
+            lang_list: 文档语言列表,默认["ch"]表示中文。支持: ch(中文), en(英文), etc.
+            backend: 解析后端,可选值:
+                - "pipeline": 速度快,无幻觉,精度82+,需要GPU(推荐)
+                - "vlm": 兼容性好,速度较慢,精度90+
+            parse_method: 解析方法,默认"auto"自动选择
+            formula_enable: 是否启用公式识别,默认True
+            table_enable: 是否启用表格识别,默认True
+            return_md: 是否返回Markdown文本,默认True
+            return_middle_json: 是否返回中间JSON结果,默认False
+            return_model_output: 是否返回模型原始输出,默认False
+            return_content_list: 是否返回内容列表,默认False
+            return_images: 是否返回提取的图片,默认False
+            response_format_zip: 是否以ZIP格式返回结果,默认False
+            start_page_id: 起始页码(从0开始),默认0
+            end_page_id: 结束页码,默认99999(表示到最后一页)
+        
+        Returns:
+            dict: 包含转换结果的字典
+        """
+        if lang_list is None:
+            lang_list = ["ch"]
+        
+        # 检查文件是否存在
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
+        
+        # 准备文件:先读入内存,避免请求异常时文件句柄未关闭
+        with open(pdf_path, 'rb') as pdf_file:
+            pdf_bytes = pdf_file.read()
+        files = {
+            'files': (os.path.basename(pdf_path), pdf_bytes, 'application/pdf')
+        }
+        
+        # 准备表单数据
+        data = {
+            'output_dir': output_dir,
+            'lang_list': lang_list,
+            'backend': backend,
+            'parse_method': parse_method,
+            'formula_enable': formula_enable,
+            'table_enable': table_enable,
+            'return_md': return_md,
+            'return_middle_json': return_middle_json,
+            'return_model_output': return_model_output,
+            'return_content_list': return_content_list,
+            'return_images': return_images,
+            'response_format_zip': response_format_zip,
+            'start_page_id': start_page_id,
+            'end_page_id': end_page_id
+        }
+        
+        try:
+            print(f"正在转换: {pdf_path}")
+            print(f"后端: {backend}, 语言: {lang_list}")
+            
+            # 发送请求
+            response = requests.post(
+                self.api_endpoint,
+                files=files,
+                data=data,
+                timeout=600  # 10分钟超时
+            )
+            
+            
+            # 检查响应状态
+            response.raise_for_status()
+            
+            # 处理响应
+            if response_format_zip:
+                # 如果返回ZIP格式
+                return self._handle_zip_response(response, pdf_path, output_dir)
+            else:
+                # 如果返回JSON格式
+                return response.json()
+                
+        except requests.exceptions.RequestException as e:
+            print(f"请求失败: {e}")
+            raise
+        except Exception as e:
+            print(f"转换出错: {e}")
+            raise
+    
+    def _handle_zip_response(self, response, pdf_path: str, output_dir: str) -> dict:
+        """处理ZIP格式的响应"""
+        pdf_name = Path(pdf_path).stem
+        output_path = Path(output_dir) / pdf_name
+        output_path.mkdir(parents=True, exist_ok=True)
+        
+        # 解压ZIP内容
+        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
+            zip_file.extractall(output_path)
+        
+        print(f"结果已保存到: {output_path}")
+        return {"status": "success", "output_path": str(output_path)}
+    
+    def convert_directory(
+        self,
+        input_dir: str,
+        output_dir: str = "./output",
+        **kwargs
+    ) -> dict:
+        """
+        批量转换目录中的所有PDF文件
+        
+        Args:
+            input_dir: 输入目录路径
+            output_dir: 输出目录路径
+            **kwargs: 其他转换参数,参考convert_pdf方法
+        
+        Returns:
+            dict: 包含转换结果统计的字典
+        """
+        input_path = Path(input_dir)
+        if not input_path.exists():
+            raise FileNotFoundError(f"输入目录不存在: {input_dir}")
+        
+        # 获取所有PDF文件
+        pdf_files = list(input_path.glob("*.pdf"))
+        
+        if not pdf_files:
+            print(f"目录中没有找到PDF文件: {input_dir}")
+            return {"total": 0, "success": 0, "failed": 0}
+        
+        print(f"找到 {len(pdf_files)} 个PDF文件")
+        
+        results = {
+            "total": len(pdf_files),
+            "success": 0,
+            "failed": 0,
+            "details": []
+        }
+        
+        # 逐个转换
+        for pdf_file in pdf_files:
+            try:
+                result = self.convert_pdf(
+                    str(pdf_file),
+                    output_dir=output_dir,
+                    **kwargs
+                )
+                
+                # 立即保存Markdown文件
+                if result:
+                    md_filename = pdf_file.stem + ".md"
+                    md_path = Path(output_dir) / md_filename
+                    self.save_markdown(result, str(md_path))
+                
+                results["success"] += 1
+                results["details"].append({
+                    "file": pdf_file.name,
+                    "status": "success",
+                    "result": result
+                })
+                print(f"成功: {pdf_file.name}")
+            except Exception as e:
+                results["failed"] += 1
+                results["details"].append({
+                    "file": pdf_file.name,
+                    "status": "failed",
+                    "error": str(e)
+                })
+                print(f"✗ 失败: {pdf_file.name} - {e}")
+        
+        return results
+    
+    def save_markdown(self, result: dict, output_path: str):
+        """
+        保存Markdown内容到文件
+        
+        Args:
+            result: convert_pdf返回的结果字典
+            output_path: 输出文件路径
+        """
+        import json
+        
+        # 确保输出目录存在
+        output_file = Path(output_path)
+        output_file.parent.mkdir(parents=True, exist_ok=True)
+        
+        # 尝试多种可能的键名来提取markdown内容
+        markdown_content = None
+        
+        # MinerU API返回格式:{'backend': ..., 'version': ..., 'results': {filename: {...}}}
+        # results是一个字典,键是文件名,值是解析结果
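+        # 返回结构示意(假设性示例,具体字段以实际部署的MinerU版本返回为准):
+        # {
+        #     "backend": "pipeline",
+        #     "version": "...",
+        #     "results": {
+        #         "example.pdf": {"md_content": "# 第一章 ..."}
+        #     }
+        # }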
+        if 'results' in result and isinstance(result['results'], dict):
+            # 获取第一个文件的结果(results是字典)
+            file_results = result['results']
+            if file_results:
+                # 获取第一个文件的数据
+                file_result = next(iter(file_results.values()))
+                
+                # 常见的可能键名
+                possible_keys = ['md_content', 'markdown', 'md', 'content_list', 'text', 'content', 'data']
+                
+                for key in possible_keys:
+                    if key in file_result:
+                        markdown_content = file_result[key]
+                        # 如果是content_list,需要特殊处理
+                        if key == 'content_list' and isinstance(markdown_content, list):
+                            # 将content_list转换为markdown字符串
+                            markdown_content = self._content_list_to_markdown(markdown_content)
+                        if markdown_content:
+                            break
+        
+        # 如果results是列表格式(兼容旧版本)
+        elif 'results' in result and isinstance(result['results'], list) and len(result['results']) > 0:
+            file_result = result['results'][0]
+            
+            possible_keys = ['md_content', 'markdown', 'md', 'content_list', 'text', 'content', 'data']
+            
+            for key in possible_keys:
+                if key in file_result:
+                    markdown_content = file_result[key]
+                    if key == 'content_list' and isinstance(markdown_content, list):
+                        markdown_content = self._content_list_to_markdown(markdown_content)
+                    if markdown_content:
+                        break
+        
+        # 尝试直接在顶层查找
+        if not markdown_content:
+            possible_keys = ['md_content', 'markdown', 'md', 'content', 'text', 'result', 'data']
+            
+            for key in possible_keys:
+                if key in result:
+                    markdown_content = result[key]
+                    if markdown_content:
+                        break
+            
+            # 如果还是没找到,尝试查找嵌套结构
+            if not markdown_content and isinstance(result, dict):
+                if 'data' in result and isinstance(result['data'], dict):
+                    for key in possible_keys:
+                        if key in result['data']:
+                            markdown_content = result['data'][key]
+                            if markdown_content:
+                                break
+        
+        if not markdown_content:
+            # 输出调试信息
+            print("警告: 无法提取markdown内容")
+            print(f"  返回的键: {list(result.keys())}")
+            
+            # 如果有results,显示其内部结构(results可能是字典或列表)
+            if 'results' in result and isinstance(result['results'], dict) and result['results']:
+                first_result = next(iter(result['results'].values()))
+                print(f"  results中第一个文件的键: {list(first_result.keys())}")
+            elif 'results' in result and isinstance(result['results'], list) and len(result['results']) > 0:
+                print(f"  results[0]的键: {list(result['results'][0].keys())}")
+            
+            # 保存原始JSON以便调试
+            json_path = output_path.replace('.md', '_debug.json')
+            with open(json_path, 'w', encoding='utf-8') as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            print(f"  调试JSON已保存到: {json_path}")
+            return
+        
+        # 保存markdown文件
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(markdown_content)
+        
+        file_size = len(markdown_content)
+        print(f"  Markdown已保存: {output_path} ({file_size} 字符)")
+    
+    def _content_list_to_markdown(self, content_list: list) -> str:
+        """
+        将content_list转换为markdown字符串
+        
+        Args:
+            content_list: 内容列表
+            
+        Returns:
+            str: markdown格式的文本
+        """
+        markdown_lines = []
+        
+        for item in content_list:
+            if isinstance(item, dict):
+                # 提取文本内容
+                text = item.get('text', item.get('content', ''))
+                if text:
+                    markdown_lines.append(text)
+            elif isinstance(item, str):
+                markdown_lines.append(item)
+        
+        return '\n\n'.join(markdown_lines)
+
+
+def main():
+    """主函数 - 示例用法"""
+    
+    # 配置参数
+    SERVER_URL = "http://localhost:8000"  # MinerU服务器地址
+    INPUT_DIR = "./raw_file"  # 输入目录
+    OUTPUT_DIR = "./output"  # 输出目录
+    
+    # 创建转换器实例
+    converter = PDFToMarkdownConverter(server_url=SERVER_URL)
+    
+    print("=" * 60)
+    print("PDF转Markdown转换工具 (基于MinerU)")
+    print("=" * 60)
+    
+    try:
+        # 批量转换目录中的所有PDF文件
+        results = converter.convert_directory(
+            input_dir=INPUT_DIR,
+            output_dir=OUTPUT_DIR,
+            lang_list=["ch"],  # 中文文档
+            backend="pipeline",  # 使用pipeline后端(速度快)
+            formula_enable=True,  # 启用公式识别
+            table_enable=True,  # 启用表格识别
+            return_md=True,  # 返回Markdown
+            return_images=True,  # 返回图片
+            response_format_zip=False  # 不使用ZIP格式
+        )
+        
+        # 打印统计结果
+        print("\n" + "=" * 60)
+        print("转换完成!")
+        print(f"总计: {results['total']} 个文件")
+        print(f"成功: {results['success']} 个文件")
+        print(f"失败: {results['failed']} 个文件")
+        print("=" * 60)
+        
+    except Exception as e:
+        print(f"\n错误: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()
+

+ 116 - 0
file_processors/pdf_mineru_md.py

@@ -0,0 +1,116 @@
+
+
+
+import os
+import time
+from tqdm import tqdm
+from foundation.logger.loggering import server_logger
+from foundation.utils.common import handler_err
+from foundation.base.config import config_handler
+
+class BfpPDFMineruMdProcessor:
+    """
+        编制依据文档处理器:通过 MinerU 将 PDF/Word 文档转换为 Markdown
+    """
+    def __init__(self, directory , output_directory ,  **kwargs):
+        """
+        初始化 PDF 处理器
+        :param directory: PDF 文件所在目录
+        :param kwargs: 其他参数
+        """
+        self.directory = directory  # PDF 文件所在目录
+        self.output_directory = output_directory
+        self.file_suffix_list = kwargs.get('file_suffix_list', ['.pdf' , '.docx' , '.doc'])
+        server_logger.info(f"""
+                    初始化PDF文件导入器:
+                    配置参数:
+                    - 文件后缀列表:{self.file_suffix_list}
+                    - 导入的文件路径:{self.directory}
+                    - 导出的文件路径:{self.output_directory}
+                    """)
+
+    def load_pdf_files(self):
+        """
+        加载目录下的所有PDF文件
+        """
+        file_path = os.path.join(self.directory)
+        pdf_path_files = []
+        pdf_file_names = []
+        #server_logger.info(f"file_path: {file_path}")
+        for file_name in os.listdir(file_path):
+            # 获取后缀(带点) # file_name.lower().endswith('.docx'):
+            file_suffix = os.path.splitext(file_name)[1] 
+            if file_suffix in self.file_suffix_list:
+                pdf_file_names.append(file_name)
+                pdf_path_files.append(os.path.join(file_path, file_name))
+            else:
+                server_logger.info(f"Skipping {file_name} because it is not a PDF file.")
+
+        server_logger.info(f"Found {len(pdf_file_names)} PDF files.")
+        server_logger.info(f"pdf_path_files: {pdf_path_files},pdf_file_names:{pdf_file_names}")
+        return pdf_path_files , pdf_file_names
+
+
+    def process_tqdm_pdfs_group(self):
+        """
+        批量处理PDF文件,逐个转换为 Markdown 并保存到输出目录
+        """
+        # 读取PDF文件内容
+        pdf_path_files , pdf_file_names = self.load_pdf_files()
+
+        server_logger.info(f"process {len(pdf_path_files)} documents.")
+        start_time = time.time()
+        total_docs = 0
+
+        total_batches = len(pdf_path_files)
+        
+        with tqdm(total=total_batches, desc="process batches", unit="batch") as pbar:
+            for pdf_path_file , pdf_file_name in zip(pdf_path_files , pdf_file_names):
+                # pdf 转换为 markdown
+                self.pdf_md_process(pdf_path_file=pdf_path_file)
+
+                total_docs += 1
+                # 计算并显示当前的TPM
+                elapsed_time = time.time() - start_time
+                if elapsed_time > 0:
+                    tpm = (total_docs / elapsed_time) * 60
+                    pbar.set_postfix({"TPM": f"{tpm:.2f}"})
+
+                pbar.update(1)
+
+        # TODO 切分的问题 可以增加metadata元数据信息 
+        server_logger.info(f"Processed Documents:{self.directory},docs:{len(pdf_path_files)},total_docs:{total_docs}")
+        
+
+
+    def pdf_md_process(self , pdf_path_file):
+        """
+            pdf 转换为 markdown
+        """
+        from .mineru.pdf_to_markdown import PDFToMarkdownConverter
+        from .mineru.config import CONVERT_CONFIG
+
+        server_url = config_handler.get('minerU', 'MINERU_SERVER_URL')
+
+        pdf_path = pdf_path_file        
+        # 创建转换器
+        converter = PDFToMarkdownConverter(server_url=server_url)
+        try:
+            # 转换PDF
+            result = converter.convert_pdf(
+                pdf_path=pdf_path,
+                output_dir=self.output_directory,
+                **CONVERT_CONFIG
+            )
+            # 保存Markdown
+            from pathlib import Path
+            pdf_name = Path(pdf_path).stem
+            md_path = Path(self.output_directory) / f"{pdf_name}.md"
+            converter.save_markdown(result, str(md_path))
+        except Exception as err:
+            handler_err(logger=server_logger , err=err)
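+
+# 用法示意(与 views/test_views.py 中 /data/bfp/pdf_md 接口的调用方式一致):
+#   processor = BfpPDFMineruMdProcessor(directory="test/test", output_directory="test/bfp_md_files")
+#   processor.process_tqdm_pdfs_group()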

+ 9 - 2
foundation/rag/vector/base_vector.py

@@ -31,8 +31,15 @@ class BaseVectorDB:
         result = []
         for doc in documents:
             tmp = {}
-            tmp['content'] = doc.page_content
-            tmp['metadata'] = doc.metadata if doc.metadata else {}
+            tmp['content'] = doc["content"] if doc["content"] else doc.page_content
+            tmp['metadata'] = doc["metadata"] if doc["metadata"] else doc.metadata if doc.metadata else {}
+            tmp['title'] = doc.get("title")
+            tmp['level'] = doc.get("level")
+            tmp['full_title_path'] = doc.get("full_title_path")
+            tmp['start_line'] = doc.get("start_line")
+            tmp['end_line'] = doc.get("end_line")
+            tmp['hierarchy'] = doc.get("hierarchy")
+            tmp['chunk_type'] = doc.get("section")
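+            # 当输入为字典时,tmp 形如(示意;标题与行号仅为假设样例,实际取值由上游切分结果决定):
+            # {"content": "5.7.1 ...", "metadata": {...}, "title": "5.7高处作业", "level": 1,
+            #  "full_title_path": "...", "start_line": 120, "end_line": 135,
+            #  "hierarchy": [...], "chunk_type": "..."}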
             result.append(tmp)
         return result
 

+ 1 - 0
foundation/rag/vector/milvus_vector.py

@@ -176,6 +176,7 @@ class MilvusVectorManager(BaseVectorDB):
             return insert_result.primary_keys
             
         except Exception as e:
+            logger.info(f"documents:{documents}")
             logger.error(f"Error batch inserting: {e}")
             return None
     

+ 8 - 0
test/bfp_chunks_files/公路工程施工安全技术规范.md/metadata.json

@@ -0,0 +1,8 @@
+{
+  "total_chunks": 253,
+  "max_chunk_size": 500,
+  "document_info": {
+    "title": "公路工程施工安全技术规范",
+    "code": "JTG F90-2015"
+  }
+}

+ 185 - 0
test/bfp_chunks_files/公路工程施工安全技术规范.md/structure_report.md

@@ -0,0 +1,185 @@
+# 文档结构分析报告
+
+## 总章节数: 178
+## 最大块大小: 500 字符
+
+## 文档层级结构:
+
+- # 中华人民共和国行业标准 (内容大小: 0字符, 子章节: 0)
+- # 公路工程施工安全技术规范 (内容大小: 147字符, 子章节: 0)
+- # 人民交通出版社股份有限公司 (内容大小: 0字符, 子章节: 0)
+- # 中华人民共和国交通运输部 (内容大小: 0字符, 子章节: 0)
+- # 公 告 (内容大小: 4字符, 子章节: 0)
+- # 交通运输部关于发布《公路工程施工安全技术规范》的公告 (内容大小: 254字符, 子章节: 0)
+- # 前 言 (内容大小: 1312字符, 子章节: 0)
+- # 目 次 (内容大小: 0字符, 子章节: 0)
+- # 1总则 (内容大小: 12字符, 子章节: 0)
+- # 4施工准备 6 (内容大小: 85字符, 子章节: 0)
+- # 5通用作业 ...........·... 12 (内容大小: 186字符, 子章节: 0)
+- # 6路基工程… 29 (内容大小: 100字符, 子章节: 0)
+- # 6.8特殊路基·… … 34 (内容大小: 0字符, 子章节: 0)
+- # 7路面工程 36 (内容大小: 59字符, 子章节: 0)
+- # 8桥涵工程· 38 (内容大小: 259字符, 子章节: 0)
+- # 9隧道工程…· 62 (内容大小: 201字符, 子章节: 0)
+- # 9.13水下隧道 78 (内容大小: 84字符, 子章节: 0)
+- # 10交通安全设施 83 (内容大小: 86字符, 子章节: 0)
+- # 11改扩建工程 85 (内容大小: 36字符, 子章节: 0)
+- # 12特殊季节与环境施·.………… 88 (内容大小: 210字符, 子章节: 0)
+- # 本规范用词用语说明 98 (内容大小: 0字符, 子章节: 0)
+- # 附件《公路工程施工安全技术规范》(JTG F90—2015)条文说明· 99 (内容大小: 143字符, 子章节: 0)
+- # 1总则 (内容大小: 173字符, 子章节: 0)
+- # 2术语 (内容大小: 0字符, 子章节: 0)
+- # 2.0.1危险源hazards (内容大小: 164字符, 子章节: 0)
+- # 2.0.4应急预案emergency response plan (内容大小: 62字符, 子章节: 0)
+- # 2.0.5风险评估risk assessment (内容大小: 48字符, 子章节: 0)
+- # 2.0.6特种设备special equipment (内容大小: 52字符, 子章节: 0)
+- # 2.0.7特殊作业人员special operator (内容大小: 200字符, 子章节: 0)
+- # 3基本规定 (内容大小: 1049字符, 子章节: 0)
+- # 4施工准备 (内容大小: 0字符, 子章节: 0)
+- # 4.1驻地和场站建设 (内容大小: 871字符, 子章节: 0)
+- # 4.2施工便道 (内容大小: 492字符, 子章节: 0)
+- # 4.3临时码头和栈桥 (内容大小: 768字符, 子章节: 0)
+- # 4.4施工临时用电 (内容大小: 2203字符, 子章节: 0)
+- # 4.5生产生活用水 (内容大小: 86字符, 子章节: 0)
+- # 4.6施工机械设备 (内容大小: 321字符, 子章节: 0)
+- # 5通用作业 (内容大小: 0字符, 子章节: 0)
+- # 5.1测量作业 (内容大小: 482字符, 子章节: 0)
+- # 5.2支架及模板工程 (内容大小: 1671字符, 子章节: 0)
+- # 5.2.15模板存放应符合下列规定: (内容大小: 131字符, 子章节: 0)
+- # 5.3钢筋工程 (内容大小: 385字符, 子章节: 0)
+- # 5.4混凝土工程 (内容大小: 773字符, 子章节: 0)
+- # 5.5电焊与气焊 (内容大小: 1104字符, 子章节: 0)
+- # 5.6起重吊装 (内容大小: 1122字符, 子章节: 0)
+- # 5.6.17起重机严禁吊人。 (内容大小: 193字符, 子章节: 0)
+- # 5.7高处作业 (内容大小: 2804字符, 子章节: 0)
+- # 5.7.32雨雪季节应采取防滑措施。 (内容大小: 0字符, 子章节: 0)
+- # 5.8水上作业 (内容大小: 1447字符, 子章节: 0)
+- # 5.9潜水作业 (内容大小: 969字符, 子章节: 0)
+- # 5.10爆破作业 (内容大小: 912字符, 子章节: 0)
+- # 5.11小型机具 (内容大小: 532字符, 子章节: 0)
+- # 5.12涂装作业 (内容大小: 351字符, 子章节: 0)
+- # 6 路基工程 (内容大小: 0字符, 子章节: 0)
+- # 6.1一般规定 (内容大小: 419字符, 子章节: 0)
+- # 6.2场地清理 (内容大小: 61字符, 子章节: 0)
+- # 6.3土方工程 (内容大小: 503字符, 子章节: 0)
+- # 6.4石方工程 (内容大小: 146字符, 子章节: 0)
+- # 6.5防护工程 (内容大小: 1871字符, 子章节: 0)
+- # 6.6排水工程 (内容大小: 80字符, 子章节: 0)
+- # 6.7软基处理 (内容大小: 461字符, 子章节: 0)
+- # 6.8特殊路基 (内容大小: 962字符, 子章节: 0)
+- # 7路面工程 (内容大小: 0字符, 子章节: 0)
+- # 7.1一般规定 (内容大小: 385字符, 子章节: 0)
+- # 7.2基层与底基层 (内容大小: 314字符, 子章节: 0)
+- # 7.3沥青面层 (内容大小: 390字符, 子章节: 0)
+- # 7.4水泥混凝土面层 (内容大小: 73字符, 子章节: 0)
+- # 8桥涵工程 (内容大小: 0字符, 子章节: 0)
+- # 8.1一般规定 (内容大小: 135字符, 子章节: 0)
+- # 8.2预应力混凝土工程 (内容大小: 381字符, 子章节: 0)
+- # 8.3钻(挖)孔灌注桩 (内容大小: 984字符, 子章节: 0)
+- # 8.4沉入桩 (内容大小: 782字符, 子章节: 0)
+- # 8.5沉井 (内容大小: 1000字符, 子章节: 0)
+- # 8.6地下连续墙 (内容大小: 216字符, 子章节: 0)
+- # 8.7围堰 (内容大小: 369字符, 子章节: 0)
+- # 8.7.5钢吊(套)箱围堰施工应符合下列规定: (内容大小: 199字符, 子章节: 0)
+- # 8.8明挖地基 (内容大小: 1085字符, 子章节: 0)
+- # 8.9承台与墩台 (内容大小: 718字符, 子章节: 0)
+- # 8.10砌体 (内容大小: 623字符, 子章节: 0)
+- # 8.11钢筋混凝土和预应力梁式桥 (内容大小: 1751字符, 子章节: 0)
+- # 8.12拱桥 (内容大小: 2914字符, 子章节: 0)
+- # 8.13斜拉桥 (内容大小: 1836字符, 子章节: 0)
+- # 8.13.5斜拉索施工应符合下列规定: (内容大小: 551字符, 子章节: 0)
+- # 8.14悬索桥 (内容大小: 2045字符, 子章节: 0)
+- # 8.14.7先导索施工应符合下列规定: (内容大小: 1748字符, 子章节: 0)
+- # 8.15钢桥 (内容大小: 1602字符, 子章节: 0)
+- # 8.16桥面及附属工程 (内容大小: 192字符, 子章节: 0)
+- # 8.17涵洞与通道 (内容大小: 644字符, 子章节: 0)
+- # 9 隧道工程 (内容大小: 0字符, 子章节: 0)
+- # 9.1一般规定 (内容大小: 1216字符, 子章节: 0)
+- # 9.2洞口与明洞 (内容大小: 579字符, 子章节: 0)
+- # 9.3开挖 (内容大小: 1453字符, 子章节: 0)
+- # 9.4装渣与运输 (内容大小: 199字符, 子章节: 0)
+- # 9.5支护 (内容大小: 421字符, 子章节: 0)
+- # 9.6衬砌 (内容大小: 290字符, 子章节: 0)
+- # 9.7辅助坑道 (内容大小: 1429字符, 子章节: 0)
+- # 9.8防水和排水 (内容大小: 88字符, 子章节: 0)
+- # 9.8.2隧道排水作业应符合下列规定: (内容大小: 285字符, 子章节: 0)
+- # 9.9通风、防尘及防有害气体 (内容大小: 2294字符, 子章节: 0)
+- # 3空气中粉尘浓度应符合表9.9.2-2的规定。 (内容大小: 5608字符, 子章节: 0)
+- # 9.10风、水、电供应 (内容大小: 846字符, 子章节: 0)
+- # 9.11不良地质和特殊岩土地段 (内容大小: 4036字符, 子章节: 0)
+- # 9.12盾构施工 (内容大小: 2158字符, 子章节: 0)
+- # 9.13水下隧道 (内容大小: 560字符, 子章节: 0)
+- # 9.14特殊地段 (内容大小: 367字符, 子章节: 0)
+- # 9.15小净距及连拱隧道 (内容大小: 419字符, 子章节: 0)
+- # 9.16附属设施工程 (内容大小: 363字符, 子章节: 0)
+- # 9.17超前地质预报和监控量测 (内容大小: 993字符, 子章节: 0)
+- # 9.18逃生与救援 (内容大小: 500字符, 子章节: 0)
+- # 10 交通安全设施 (内容大小: 0字符, 子章节: 0)
+- # 10.1一般规定 (内容大小: 182字符, 子章节: 0)
+- # 10.2护栏 (内容大小: 285字符, 子章节: 0)
+- # 10.3交通标志 (内容大小: 181字符, 子章节: 0)
+- # 10.4交通标线 (内容大小: 151字符, 子章节: 0)
+- # 10.5隔离栅和桥梁护网 (内容大小: 135字符, 子章节: 0)
+- # 10.6防眩设施 (内容大小: 59字符, 子章节: 0)
+- # 11 改扩建工程 (内容大小: 0字符, 子章节: 0)
+- # 11.1改扩建 (内容大小: 382字符, 子章节: 0)
+- # 11.2拆除 (内容大小: 1105字符, 子章节: 0)
+- # 12特殊季节与特殊环境施工 (内容大小: 0字符, 子章节: 0)
+- # 12.1一般规定 (内容大小: 109字符, 子章节: 0)
+- # 12.2冬季施工 (内容大小: 380字符, 子章节: 0)
+- # 12.3雨季施工 (内容大小: 176字符, 子章节: 0)
+- # 12.4夜间施工 (内容大小: 113字符, 子章节: 0)
+- # 12.5高温施工 (内容大小: 76字符, 子章节: 0)
+- # 12.6台风季节施工 (内容大小: 74字符, 子章节: 0)
+- # 12.7汛期施工 (内容大小: 94字符, 子章节: 0)
+- # 12.8能见度不良施工 (内容大小: 175字符, 子章节: 0)
+- # 12.9沙漠地区施工 (内容大小: 187字符, 子章节: 0)
+- # 12.10高海拔地区施工 (内容大小: 333字符, 子章节: 0)
+- # 附录A危险性较大的工程 (内容大小: 2310字符, 子章节: 0)
+- # 附录B 专项施工方案主要内容 (内容大小: 231字符, 子章节: 0)
+- # 附录C风险评估报告的内容 (内容大小: 292字符, 子章节: 0)
+- # 附录D 特殊作业人员范围 (内容大小: 164字符, 子章节: 0)
+- # 附录E 特种设备名录 (内容大小: 927字符, 子章节: 0)
+- # 本规范用词用语说明 (内容大小: 485字符, 子章节: 0)
+- # 《公路工程施工安全技术规范 (内容大小: 35字符, 子章节: 0)
+- # 3基本规定 (内容大小: 293字符, 子章节: 0)
+- # 4施工准备 (内容大小: 0字符, 子章节: 0)
+- # 4.3临时码头和栈桥 (内容大小: 85字符, 子章节: 0)
+- # 4.4施工临时用电 (内容大小: 428字符, 子章节: 0)
+- # 5通用作业 (内容大小: 0字符, 子章节: 0)
+- # 5.2支架及模板工程 (内容大小: 79字符, 子章节: 0)
+- # 5.5电焊与气焊 (内容大小: 47字符, 子章节: 0)
+- # 5.6起重吊装 (内容大小: 126字符, 子章节: 0)
+- # 5.7高处作业 (内容大小: 133字符, 子章节: 0)
+- # 5.8水上作业 (内容大小: 647字符, 子章节: 0)
+- # 5.9潜水作业 (内容大小: 313字符, 子章节: 0)
+- # 5.10爆破作业 (内容大小: 21字符, 子章节: 0)
+- # 5.11小型机具 (内容大小: 39字符, 子章节: 0)
+- # 5.12涂装作业 (内容大小: 78字符, 子章节: 0)
+- # 6 路基工程 (内容大小: 0字符, 子章节: 0)
+- # 6.4石方工程 (内容大小: 24字符, 子章节: 0)
+- # 6.5防护工程 (内容大小: 1265字符, 子章节: 0)
+- # 8桥涵工程 (内容大小: 0字符, 子章节: 0)
+- # 8.4沉入桩 (内容大小: 62字符, 子章节: 0)
+- # 8.12拱桥 (内容大小: 626字符, 子章节: 0)
+- # 8.13斜拉桥 (内容大小: 509字符, 子章节: 0)
+- # 8.14悬索桥 (内容大小: 57字符, 子章节: 0)
+- # 8.17涵洞与通道 (内容大小: 75字符, 子章节: 0)
+- # 9 隧道工程 (内容大小: 0字符, 子章节: 0)
+- # 9.1一般规定 (内容大小: 45字符, 子章节: 0)
+- # 9.3开挖 (内容大小: 28字符, 子章节: 0)
+- # 9.7辅助坑道 (内容大小: 72字符, 子章节: 0)
+- # 9.9通风、防尘及防有害气体 (内容大小: 162字符, 子章节: 0)
+- # 9.10风、水、电供应 (内容大小: 70字符, 子章节: 0)
+- # 10本款参考《公路隧道施工技术细则》(JTG/T F60—2009)第12.3.1条制定。 (内容大小: 0字符, 子章节: 0)
+- # 9.11不良地质和特殊岩土地段 (内容大小: 173字符, 子章节: 0)
+- # 9.12盾构施工 (内容大小: 534字符, 子章节: 0)
+- # 10 交通安全设施 (内容大小: 0字符, 子章节: 0)
+- # 10.4交通标线 (内容大小: 22字符, 子章节: 0)
+- # 11改扩建工程 (内容大小: 0字符, 子章节: 0)
+- # 11.1改扩建 (内容大小: 154字符, 子章节: 0)
+- # 12特殊季节与特殊环境施工 (内容大小: 0字符, 子章节: 0)
+- # 12.2冬季施工 (内容大小: 91字符, 子章节: 0)
+- # 12.8能见度不良施工 (内容大小: 25字符, 子章节: 0)
+- # 12.10高海拔地区施工 (内容大小: 64字符, 子章节: 0)
+- # 附录A危险性较大的工程 (内容大小: 671字符, 子章节: 0)

File diff suppressed because it is too large
+ 2563 - 0
test/bfp_md_files/公路工程施工安全技术规范.md


BIN
test/test/test.pdf


+ 91 - 1
views/test_views.py

@@ -23,6 +23,8 @@ from views import test_router, get_operation_id
 from foundation.agent.workflow.test_workflow_graph import test_workflow_graph
 from file_processors.pdf_processor import PDFProcessor
 from file_processors.bfp_pdf_processor import BfpPDFProcessor
+from file_processors.pdf_mineru_md import BfpPDFMineruMdProcessor
+from file_processors.bfp_md_processor import BfpMarkdownProcessor
 
 from foundation.models.silicon_flow import SiliconFlowAPI
 from foundation.rag.vector.pg_vector_mananger import PGVectorManager
@@ -598,6 +600,52 @@ async def pgvector_test_endpoint(
 
 
 
+@test_router.post("/data/bfp/pdf_md", response_model=TestForm)
+async def bfp_md_indb_endpoint(
+        param: TestForm,
+        trace_id: str = Depends(get_operation_id)):
+    """
+        编制依据文档预处理:通过 MinerU 将 PDF 转换为 Markdown
+    """
+    try:
+        server_logger.info(trace_id=trace_id, msg=f"{param}")
+        print(trace_id)
+        # 从字典中获取input
+        input_query = param.input
+        session_id = param.config.session_id
+        context = param.context
+        header_info = {
+        }
+        task_prompt_info = {"task_prompt": ""}
+        #file_directory= "I:/wangxun_dev_workspace/lq_workspace/LQDataGovernance/test/pdf_files"
+        #file_directory= "test/bfp_files"
+        file_directory= "test/test"
+        output_directory = "test/bfp_md_files"
+        # 初始化 PDF 转 Markdown 处理器
+        pdf_processor = BfpPDFMineruMdProcessor(directory=file_directory , output_directory=output_directory)
+        pdf_processor.process_tqdm_pdfs_group()
+
+        server_logger.info(trace_id=trace_id, msg=f"【result】: ", log_type="bfp/pdf_md")
+        output = None
+
+        
+
+        #server_logger.debug(trace_id=trace_id, msg=f"【result】: {output}", log_type="agent/chat")
+        # 返回字典格式的响应
+        return JSONResponse(
+            return_json(data={"output": output}, data_type="text", trace_id=trace_id))
+
+    except ValueError as err:
+        handler_err(server_logger, trace_id=trace_id, err=err, err_name="bfp/pdf_md")
+        return JSONResponse(return_json(code=100500, msg=f"{err}", trace_id=trace_id))
+
+    except Exception as err:
+        handler_err(server_logger, trace_id=trace_id, err=err, err_name="bfp/pdf_md")
+        return JSONResponse(return_json(code=100500, msg=f"{err}", trace_id=trace_id))
+    
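+# 请求示意(假设 TestForm 的 JSON 结构如下,实际以 schema 定义与路由前缀为准):
+#   POST /data/bfp/pdf_md
+#   {"input": "", "config": {"session_id": "test-session"}, "context": {}}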
+
+
+
 
 @test_router.post("/data/bfp/indb", response_model=TestForm)
 async def bfp_indb_endpoint(
@@ -817,4 +865,46 @@ async def bfp_search_endpoint(
 
     except Exception as err:
         handler_err(server_logger, trace_id=trace_id, err=err, err_name="bfp/milvus/search")
-        return JSONResponse(return_json(code=100500, msg=f"{err}", trace_id=trace_id))
+        return JSONResponse(return_json(code=100500, msg=f"{err}", trace_id=trace_id))
+
+
+@test_router.post("/data/bfp/md/milvus/batch/indb", response_model=TestForm)
+async def bfp_md_batch_indb_endpoint(
+        param: TestForm,
+        trace_id: str = Depends(get_operation_id)):
+    """
+        编制依据文档 批量切分和入库处理
+    """
+    try:
+        server_logger.info(trace_id=trace_id, msg=f"{param}")
+        # 从字典中获取input
+        input_query = param.input
+        session_id = param.config.session_id
+        context = param.context
+        header_info = {
+        }
+        # 初始化客户端(需提前设置环境变量 SILICONFLOW_API_KEY)
+        client = SiliconFlowAPI()
+        # 抽象测试
+        vector_db = MilvusVectorManager(base_api_platform=client)
+        # file_directory= "I:/wangxun_dev_workspace/lq_workspace/LQDataGovernance/test/pdf_files"
+        file_directory = "test/bfp_md_files"
+        # 初始化 Markdown 切分与入库处理器
+        processor = BfpMarkdownProcessor(directory=file_directory, base_vector=vector_db)
+        processor.process_tqdm_pdfs_group(key_name="collection_name")
+        server_logger.info(trace_id=trace_id, msg=f"【result】: ", log_type="bfp/md/milvus/batch/indb")
+        output = "success"
+
+        # server_logger.debug(trace_id=trace_id, msg=f"【result】: {output}", log_type="agent/chat")
+        # 返回字典格式的响应
+        return JSONResponse(
+            return_json(data={"output": output}, data_type="text", trace_id=trace_id))
+
+    except ValueError as err:
+        handler_err(server_logger, trace_id=trace_id, err=err, err_name="bfp/md/milvus/batch/indb")
+        return JSONResponse(return_json(code=100500, msg=f"{err}", trace_id=trace_id))
+
+    except Exception as err:
+        handler_err(server_logger, trace_id=trace_id, err=err, err_name="bfp/md/milvus/batch/indb")
+        return JSONResponse(return_json(code=100500, msg=f"{err}", trace_id=trace_id))
+

Some files were not shown because too many files changed in this diff