# Walkthrough: inspect the lines of full_text at offsets 32460 and 43321
# for the title below.
title = "第十章 其他资料"
positions = matcher._find_full_title_positions(title, full_text)
# Returns: [4524, 43321] - both of these positions are correct
best_pos = matcher._select_best_position(positions, full_text, title)
# But in the actual pipeline the value returned is 32460 - a wrong position!
line_start = full_text.rfind('\n', 0, 32460) + 1 # = 32436
line_end = full_text.find('\n', 32460) # = 32473
line_text = full_text[32436:32473]
# Result: 'XXXX 公司 XXX 专项施工方案'
# Key point: this line does not contain "第十章" at all!
"第十章" in line_text # False
# Same check for offset 43321.
line_start = full_text.rfind('\n', 0, 43321) + 1 # = 43308
line_end = full_text.find('\n', 43321) # = 43336
line_text = full_text[43308:43336]
# Result: '第十章 其他资料'
"第十章" in line_text # True
_find_full_title_positions 实现:
def _find_full_title_positions(self, title: str, text: str) -> List[int]:
positions = []
lines = text.split('\n')
current_pos = 0
for i, line in enumerate(lines):
line_clean = self._remove_escape_chars(line)
line_normalized = self._normalize_title(line_clean)
if title_normalized in line_normalized:
pos_in_line = line_normalized.find(title_normalized)
line_pos = self._find_pattern_in_line(
title_normalized, line, pos_in_line
)
if line_pos >= 0:
found_pos = current_pos + line_pos # <-- 问题可能在这里
positions.append(found_pos)
current_pos += len(line) + 1 # <-- 或者这里
位置计算不一致:
- current_pos 是基于原始行 line 的累加
- line_pos 是通过 _find_pattern_in_line(title_normalized, line, pos_in_line) 计算
- pos_in_line 是在 line_normalized 中的位置,但传给 _find_pattern_in_line 的是原始 line
_find_pattern_in_line 可能的 bug:
def _find_pattern_in_line(self, pattern: str, line: str, normalized_pos: int) -> int:
# 这个函数需要根据 normalized_pos 在原始 line 中找到对应位置
# 如果映射逻辑有误,就会返回错误的 line_pos
字符处理导致偏移:
- _normalize_title 可能会增删字符(如合并多个空格)
- _remove_escape_chars 会删除字符
运行以下代码可以验证问题:
import fitz
from core.construction_review.component.doc_worker.utils.title_matcher import TitleMatcher
file_path = 'D:/wx_work/sichuan_luqiao/lu_sgsc_testfile/测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf'


def _line_at(text, pos):
    """Return the complete line of *text* that contains offset *pos*."""
    line_start = text.rfind('\n', 0, pos) + 1
    line_end = text.find('\n', pos)
    if line_end == -1:
        # No newline after pos: find() returns -1, which as a slice end
        # would silently drop the last character of the final line.
        line_end = len(text)
    return text[line_start:line_end]


# Concatenate the text of every page in order. The context manager closes
# the PDF once the text has been read; join avoids repeated string
# re-allocation from "+=" in a loop.
with fitz.open(file_path) as doc:
    full_text = "".join(page.get_text() for page in doc)

matcher = TitleMatcher()
title = "第十章 其他资料"

# 1. Find all candidate positions.
positions = matcher._find_full_title_positions(title, full_text)
print(f"找到的位置: {positions}")  # [4524, 43321]

# 2. Manually inspect offset 32460 (the wrong position reported by the
#    pipeline), then 3. verify the correct position 43321.
for pos in (32460, 43321):
    line_text = _line_at(full_text, pos)
    print(f"位置 {pos} 所在行: {repr(line_text)}")
    print(f"是否包含'第十章': {'第十章' in line_text}")
直接使用正则表达式在原始文本中搜索,避免复杂的位置映射:
def _find_full_title_positions(self, title: str, text: str) -> List[int]:
import re
# 构建兼容空格变体的模式
title_parts = title.split()
pattern = r'\s*'.join(re.escape(part) for part in title_parts)
positions = []
for m in re.finditer(pattern, text):
# 验证上下文是否真的是标题行
line_start = text.rfind('\n', 0, m.start()) + 1
line_end = text.find('\n', m.end())
line = text[line_start:line_end].strip()
# 标题应该独占一行或在行首
if line.startswith(title.strip()) or self._is_line_only_title(line, title):
positions.append(m.start())
return positions
如果需要保留现有逻辑,修复 _find_pattern_in_line 中的位置映射:
def _find_pattern_in_line(self, pattern: str, line: str, normalized_pos: int) -> int:
"""
在原始行中找到标准化后模式的对应位置。
关键:需要建立 normalized_line 和原始 line 之间的字符级映射
"""
# 1. 构建位置映射表
normalized_line = self._normalize_title(self._remove_escape_chars(line))
# 2. 验证 normalized_pos 处的文本确实匹配 pattern
if normalized_line[normalized_pos:normalized_pos+len(pattern)] != pattern:
return -1
# 3. 将 normalized_pos 映射回原始行的位置
# 这需要跟踪每个字符在清理前后的位置变化
original_pos = self._map_normalized_to_original(line, normalized_pos)
return original_pos
利用目录信息缩小标题搜索范围,减少误判:
def find_title_positions(self, items, full_text, pages_content, toc_pages):
    """Sketch: use TOC page numbers to narrow where each title is searched.

    For every item, the expected page is looked up and the title is searched
    only within that page's ``start_pos``..``end_pos`` slice of *full_text*.
    The search falls back to the whole text when no usable page is known or
    the page-scoped search finds nothing.

    Elided parts of the original sketch are kept as ``# ...``.
    """
    # ...
    for item in items:
        title = item['title']
        # Look up the page the TOC says this title should be on.
        # NOTE(review): `toc_info` is not defined in this sketch and the
        # `toc_pages` parameter is unused here - presumably it is built in
        # the elided code above; confirm.
        expected_page = self._get_expected_page_from_toc(title, toc_info)

        positions = []
        # Only trust page numbers that index into pages_content. An
        # out-of-range value would raise IndexError, and a negative one
        # would silently index from the end.
        if expected_page and 1 <= expected_page <= len(pages_content):
            # Search only inside the expected page.
            page = pages_content[expected_page - 1]
            search_start = page['start_pos']
            search_end = page['end_pos']
            search_text = full_text[search_start:search_end]
            # Shift page-relative offsets back to full_text offsets.
            positions = [
                p + search_start
                for p in self._find_full_title_positions(title, search_text)
            ]

        if not positions:
            # No usable page, or the title was not on it (assumes the TOC
            # page number can differ from the PDF page index): search the
            # whole text instead.
            positions = self._find_full_title_positions(title, full_text)
        # ...
core/construction_review/component/doc_worker/utils/title_matcher.py
- _find_full_title_positions
- _find_pattern_in_line
- _select_best_position
core/construction_review/component/doc_worker/pdf_worker/text_splitter.py
- _get_toc_boundary_position (边界保护逻辑)
- split_by_hierarchy