|
|
@@ -1,327 +0,0 @@
|
|
|
-"""
|
|
|
-DOCX 文本切分实现
|
|
|
-
|
|
|
-复刻 PDF 处理的切分逻辑:
|
|
|
-1. 跳过目录页,只在正文中定位章节标题
|
|
|
-2. 按最低目录层级进行切分,形成章节块
|
|
|
-3. 对超过最大字符数的块按段落-句子进行再次切分,保持语义完整性
|
|
|
-"""
|
|
|
-
|
|
|
-from __future__ import annotations
|
|
|
-
|
|
|
-from typing import Any, Dict, List
|
|
|
-
|
|
|
-from ..config.provider import default_config_provider
|
|
|
-from ..interfaces import TextSplitter
|
|
|
-from ..utils.title_matcher import TitleMatcher
|
|
|
-from ..utils.text_split_support import HierarchicalChunkMixin
|
|
|
-
|
|
|
-
|
|
|
class DocxTextSplitter(TextSplitter, HierarchicalChunkMixin):
    """Split DOCX body text into chunks that follow the table-of-contents hierarchy.

    The splitter locates the classified titles in the body text (table-of-contents
    pages are passed to the title matcher so they can be skipped), cuts the text
    between consecutive titles, subdivides each block by its sub-titles and
    finally re-splits any block longer than ``max_chunk_size``.

    Several helpers used here (``_build_hierarchy_path``,
    ``_build_hierarchy_path_for_subtitle``, ``_split_large_chunk``,
    ``_build_chunk_metadata``, ``_finalize_chunk_ids``) are not defined in this
    class; they are presumably provided by ``HierarchicalChunkMixin``.
    """

    def __init__(self) -> None:
        """Bind the shared configuration provider and create a title matcher."""
        self._cfg = default_config_provider
        self._title_matcher = TitleMatcher()

    def split_by_hierarchy(
        self,
        classification_items: List[Dict[str, Any]],
        pages_content: List[Dict[str, Any]],
        toc_info: Dict[str, Any],
        target_level: int,
        max_chunk_size: int,
        min_chunk_size: int,
    ) -> List[Dict[str, Any]]:
        """Split the document text by TOC hierarchy and chunk size.

        Args:
            classification_items: Classified title entries. The keys read here
                are ``title``, ``level``, ``category``, ``category_code`` and
                ``page``.
            pages_content: Per-page dicts; the ``text`` values are concatenated
                in order to form the full document text.
            toc_info: Dict with ``toc_pages`` (pages holding the table of
                contents) and ``toc_items`` (TOC entries with ``title`` and
                ``level``). Missing or ``None`` values are treated as empty.
            target_level: TOC level used when building hierarchy paths and as
                the default level of a title that has no ``level`` in the TOC.
            max_chunk_size: Blocks longer than this many characters are
                re-split.
            min_chunk_size: Forwarded to ``_split_by_sub_titles``.

        Returns:
            The finalized chunk list, or an empty list when none of the
            classified titles could be located in the body text.
        """
        toc_pages = toc_info.get("toc_pages", []) or []
        # Guard against an explicit None, which would break the enumerate()
        # calls over the TOC items further down.
        all_toc_items = toc_info.get("toc_items") or []

        # Work on the complete text of the document.
        full_text = "".join(p.get("text", "") for p in pages_content)

        print(f" 正在定位{len(classification_items)}个已分类的标题...")
        print(f" 目录所在页: {toc_pages}")

        # Step 1: locate the classified titles in the body text.
        located = self._title_matcher.find_title_positions(
            classification_items, full_text, pages_content, toc_pages
        )

        # Keep only the titles that were actually found.
        found_titles = [t for t in located if t["found"]]
        if not found_titles:
            print(" 错误: 未能在正文中定位任何标题")
            return []

        print(f" 成功定位 {len(found_titles)}/{len(classification_items)} 个标题")

        # Order the titles by their offset in the full text.
        found_titles.sort(key=lambda x: x["position"])

        # Step 2: map every level-1 title to its classification info.
        chapter_classification_map: Dict[str, Dict[str, Any]] = {}
        for item in classification_items:
            if item.get("level") == 1:
                chapter_classification_map[item.get("title", "")] = {
                    "category": item.get("category", ""),
                    "category_code": item.get("category_code", "other"),
                    "page": item.get("page", ""),
                    "level": item.get("level", 1),
                }

        # Step 3: attach the hierarchy path to every located title.
        for title_info in found_titles:
            title_info["hierarchy_path"] = self._build_hierarchy_path(
                title_info["title"], all_toc_items, target_level
            )

        # Step 4: cut the text between consecutive titles and subdivide it.
        all_chunks: List[Dict[str, Any]] = []
        last_index = len(found_titles) - 1
        for i, title_info in enumerate(found_titles):
            start_pos = title_info["position"]

            # A block ends where the next located title starts; the last block
            # runs to the end of the text.
            if i < last_index:
                end_pos = found_titles[i + 1]["position"]
            else:
                end_pos = len(full_text)

            content_block = full_text[start_pos:end_pos]

            sub_chunks = self._split_by_sub_titles(
                content_block,
                all_toc_items,
                title_info,
                target_level,
                max_chunk_size,
                min_chunk_size,
            )

            # Attach metadata to every sub-chunk (j is 1-based within the block).
            for j, sub_chunk in enumerate(sub_chunks, 1):
                all_chunks.append(
                    self._build_chunk_metadata(
                        sub_chunk,
                        title_info,
                        start_pos,
                        pages_content,
                        i,
                        j,
                        chapter_classification_map,
                    )
                )

        # Step 5: produce the final chunk ids / serial numbers.
        final_chunks = self._finalize_chunk_ids(all_chunks)

        print(f" 初始切分: {len(all_chunks)} 个块")
        print(f" 最终块数: {len(final_chunks)} 个块")

        return final_chunks

    def _chunk_whole_block(
        self,
        content_block: str,
        max_chunk_size: int,
        parent_title: str,
        hierarchy_path: List[str],
    ) -> List[Dict[str, Any]]:
        """Return ``content_block`` as one chunk, re-splitting it only if too long."""
        if len(content_block) > max_chunk_size:
            return self._split_large_chunk(
                content_block, max_chunk_size, parent_title, hierarchy_path
            )
        return [
            {
                "content": content_block,
                "relative_start": 0,
                "sub_title": "",
                "hierarchy_path": hierarchy_path,
            }
        ]

    def _split_by_sub_titles(
        self,
        content_block: str,
        all_toc_items: List[Dict[str, Any]],
        parent_title_info: Dict[str, Any],
        target_level: int,
        max_chunk_size: int,
        min_chunk_size: int,
    ) -> List[Dict[str, Any]]:
        """Subdivide one title's text block along the sub-titles listed in the TOC.

        The TOC entries that follow the parent title, up to the next entry of
        the same or a higher level, are searched for in ``content_block``. The
        block is cut at the direct children (``parent_level + 1``); when none of
        those were found it is cut at the deepest level that was found. A chunk
        is re-split with ``_split_large_chunk`` only when it is longer than
        ``max_chunk_size``. Sub-titles followed by fewer than 10 characters of
        text are skipped.

        If the parent is not in the TOC, no sub-title is found, or every
        sub-title is skipped, the whole block is returned (re-split if too
        long) with the parent's hierarchy path.

        Args:
            content_block: Text from the parent title up to the next located
                title.
            all_toc_items: All TOC entries, in document order.
            parent_title_info: Located-title dict; ``title`` is required and
                ``hierarchy_path`` is used when present.
            target_level: Level assumed for the parent when its TOC entry has
                no ``level``.
            max_chunk_size: Maximum chunk length in characters before a
                re-split.
            min_chunk_size: Accepted for interface compatibility; not used by
                this method.

        Returns:
            Chunk dicts with ``content``, ``relative_start`` (offset inside
            ``content_block``), ``sub_title`` and ``hierarchy_path``.
        """
        parent_title = parent_title_info["title"]
        parent_path = parent_title_info.get("hierarchy_path", [parent_title])

        # Locate the parent title in the TOC (first exact match wins).
        parent_idx = -1
        parent_level = target_level
        for idx, toc_item in enumerate(all_toc_items):
            if toc_item["title"] == parent_title:
                parent_idx = idx
                parent_level = toc_item.get("level", target_level)
                break

        if parent_idx < 0:
            # Parent unknown to the TOC: treat the whole block as one unit and
            # keep the parent's hierarchy path, as every other fallback does.
            return self._chunk_whole_block(
                content_block, max_chunk_size, parent_title, parent_path
            )

        # The parent's TOC range ends at the next entry of the same or a
        # higher level.
        next_sibling_idx = len(all_toc_items)
        for idx in range(parent_idx + 1, len(all_toc_items)):
            if all_toc_items[idx].get("level", 1) <= parent_level:
                next_sibling_idx = idx
                break

        # Collect every deeper-level title of that range that can be found in
        # the block (not only the direct children).
        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
        all_sub_titles: List[Dict[str, Any]] = []
        for idx in range(parent_idx + 1, next_sibling_idx):
            toc_item = all_toc_items[idx]
            item_level = toc_item.get("level", 1)
            if item_level <= parent_level:
                continue

            pos = self._find_title_in_block(
                toc_item["title"], content_block, fuzzy_threshold
            )
            if pos < 0:
                continue

            # Debug output: the matched title with some surrounding text.
            context_start = max(0, pos - 20)
            context_end = min(len(content_block), pos + len(toc_item["title"]) + 50)
            context = content_block[context_start:context_end].replace("\n", " ")
            print(f" 找到子标题: {toc_item['title']} (level={item_level}), 位置={pos}, 上下文: ...{context}...")

            all_sub_titles.append(
                {
                    "title": toc_item["title"],
                    # Reuse the defaulted level instead of indexing "level" again.
                    "level": item_level,
                    "position": pos,
                    "toc_index": idx,
                    "toc_item": toc_item,
                }
            )

        all_sub_titles.sort(key=lambda x: x["position"])

        if not all_sub_titles:
            return self._chunk_whole_block(
                content_block, max_chunk_size, parent_title, parent_path
            )

        # Prefer cutting at the direct children of the parent.
        direct_child_level = parent_level + 1
        split_level = direct_child_level
        split_titles = [sub for sub in all_sub_titles if sub["level"] == direct_child_level]

        # Deepest level that was found in the block.
        max_level = max(sub["level"] for sub in all_sub_titles)

        print(f" 父标题: {parent_title}, 找到 {len(all_sub_titles)} 个子标题, 直接子标题数: {len(split_titles)}, 最低层级: {max_level}")

        if not split_titles:
            # No direct children found: fall back to the deepest level found.
            # The boundary level must follow, otherwise no title could ever
            # end a chunk and every chunk would run to the end of the block.
            split_level = max_level
            split_titles = [sub for sub in all_sub_titles if sub["level"] == max_level]
            print(f" 没有直接子标题,使用最低层级标题切分: {len(split_titles)} 个")

        chunks: List[Dict[str, Any]] = []
        for i, sub_title in enumerate(split_titles):
            start_pos = sub_title["position"]

            # The chunk ends at the next found title whose level is at or
            # above the split level, or at the end of the block.
            end_pos = next(
                (
                    other["position"]
                    for other in all_sub_titles
                    if other["position"] > start_pos and other["level"] <= split_level
                ),
                len(content_block),
            )

            chunk_content = content_block[start_pos:end_pos]

            # Debug output.
            content_preview = chunk_content[:100].replace("\n", " ")
            print(f" 切分块 {i+1}: {sub_title['title']} (level={sub_title['level']}), 位置: {start_pos}-{end_pos}, 长度: {len(chunk_content)}, 预览: {content_preview}...")

            # Skip sub-titles that carry (almost) no text of their own.
            content_after_title = chunk_content[len(sub_title["title"]):].strip()
            if len(content_after_title) < 10:
                print(" 跳过(内容不足)")
                continue

            hierarchy_path = self._build_hierarchy_path_for_subtitle(
                sub_title["toc_item"], all_toc_items, parent_title_info
            )

            if len(chunk_content) <= max_chunk_size:
                # Small enough: keep it as a single chunk.
                chunks.append(
                    {
                        "content": chunk_content,
                        "relative_start": start_pos,
                        "sub_title": sub_title["title"],
                        "hierarchy_path": hierarchy_path,
                    }
                )
                continue

            print(" 块过大,按句子切分")
            split_chunks = self._split_large_chunk(
                chunk_content, max_chunk_size, sub_title["title"], hierarchy_path
            )
            for split_chunk in split_chunks:
                # Offsets returned by the re-split are relative to this chunk;
                # shift them so they are relative to the whole block.
                split_chunk["relative_start"] = start_pos + split_chunk["relative_start"]
                split_chunk["sub_title"] = sub_title["title"]
                split_chunk.setdefault("hierarchy_path", hierarchy_path)
                chunks.append(split_chunk)

        if not chunks:
            # Every sub-title was skipped: return the whole block instead.
            return self._chunk_whole_block(
                content_block, max_chunk_size, parent_title, parent_path
            )

        return chunks

    def _find_title_in_block(self, title: str, block: str, fuzzy_threshold: float) -> int:
        """Return the offset of ``title`` inside ``block``.

        Thin wrapper around ``TitleMatcher._find_title_in_text``. Callers in
        this class treat a negative result as "not found".
        """
        return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)

    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
        """Return the page number whose ``[start_pos, end_pos)`` range contains ``pos``.

        Falls back to page 1 when no page range contains the position.
        """
        for page in pages_content:
            if page["start_pos"] <= pos < page["end_pos"]:
                return int(page["page_num"])
        return 1
|