|
|
@@ -1,1049 +0,0 @@
|
|
|
-"""
|
|
|
-标题匹配工具
|
|
|
-
|
|
|
-简化版的 TitleMatcher,只保留与 PDF 处理相关的逻辑,
|
|
|
-用于在全文中查找目录标题对应的正文位置。
|
|
|
-"""
|
|
|
-
|
|
|
-from __future__ import annotations
|
|
|
-
|
|
|
-import re
|
|
|
-from difflib import SequenceMatcher
|
|
|
-from typing import Any, Dict, List
|
|
|
-
|
|
|
-from ..config.provider import default_config_provider
|
|
|
-from foundation.observability.logger.loggering import review_logger as logger
|
|
|
-
|
|
|
-
|
|
|
-class TitleMatcher:
|
|
|
- """标题匹配器。"""
|
|
|
-
|
|
|
    def __init__(self) -> None:
        """Bind the matcher to the shared configuration provider."""
        # Matching thresholds are looked up on this provider at call time
        # through ``self._cfg.get(...)``.
        self._cfg = default_config_provider
|
|
|
-
|
|
|
- def find_title_positions(
|
|
|
- self,
|
|
|
- classified_items: List[Dict[str, Any]],
|
|
|
- full_text: str,
|
|
|
- pages_content: List[Dict[str, Any]],
|
|
|
- toc_pages: List[int],
|
|
|
- ) -> List[Dict[str, Any]]:
|
|
|
- """
|
|
|
- 在正文中定位已分类标题(跳过目录页范围)。
|
|
|
-
|
|
|
- 优化逻辑(参考 doc_worker):
|
|
|
- 1. 先在全文中查找标题位置
|
|
|
- 2. 如果找到的位置在目录页范围内,继续在目录页之后查找
|
|
|
- 3. 如果找到的位置不在目录页范围内,直接使用该位置
|
|
|
-
|
|
|
- 修复:支持多位置匹配,结合 toc_page 进行页码择优,
|
|
|
- 避免将目录中的靠前匹配误当作正文标题,导致后续章节内容被错误合并。
|
|
|
- """
|
|
|
- # 计算目录页的文本范围
|
|
|
- toc_start_pos = float("inf")
|
|
|
- toc_end_pos = 0
|
|
|
- for page in pages_content:
|
|
|
- if page["page_num"] in toc_pages:
|
|
|
- toc_start_pos = min(toc_start_pos, page["start_pos"])
|
|
|
- toc_end_pos = max(toc_end_pos, page["end_pos"])
|
|
|
-
|
|
|
- logger.debug(f" 目录页范围: {toc_start_pos} - {toc_end_pos}")
|
|
|
-
|
|
|
- located: List[Dict[str, Any]] = []
|
|
|
- fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
|
|
|
- page_tolerance = int(self._cfg.get("text_splitting.page_tolerance", 10))
|
|
|
-
|
|
|
- for item in classified_items:
|
|
|
- title = item["title"]
|
|
|
- category = item.get("category", "")
|
|
|
- category_code = item.get("category_code", "other")
|
|
|
- toc_page = item.get("page", "")
|
|
|
-
|
|
|
- # 步骤1: 查找所有匹配位置(完整标题 + 正文部分),并排除目录页
|
|
|
- all_positions = self._find_all_valid_title_positions(
|
|
|
- title, full_text, fuzzy_threshold, toc_start_pos, toc_end_pos
|
|
|
- )
|
|
|
-
|
|
|
- pos = -1
|
|
|
- if all_positions:
|
|
|
- # 步骤2: 如果有多个有效位置,根据 toc_page 选择最接近的位置
|
|
|
- if len(all_positions) > 1 and toc_page:
|
|
|
- try:
|
|
|
- toc_page_num = int(toc_page)
|
|
|
- best_pos = all_positions[0]
|
|
|
- best_diff = abs(self._get_page_number(best_pos, pages_content) - toc_page_num)
|
|
|
- for candidate_pos in all_positions[1:]:
|
|
|
- candidate_page = self._get_page_number(candidate_pos, pages_content)
|
|
|
- diff = abs(candidate_page - toc_page_num)
|
|
|
- if diff < best_diff:
|
|
|
- best_diff = diff
|
|
|
- best_pos = candidate_pos
|
|
|
- pos = best_pos
|
|
|
- except ValueError:
|
|
|
- pos = all_positions[0]
|
|
|
- else:
|
|
|
- pos = all_positions[0]
|
|
|
-
|
|
|
- # 步骤3: 确认位置并添加到结果
|
|
|
- if pos >= 0:
|
|
|
- page_num = self._get_page_number(pos, pages_content)
|
|
|
- # 页码校验:如果实际页码与目录页码差距过大,且存在其他候选,则标记为可疑
|
|
|
- if toc_page:
|
|
|
- try:
|
|
|
- toc_page_num = int(toc_page)
|
|
|
- if abs(page_num - toc_page_num) > page_tolerance:
|
|
|
- logger.warning(f" 标题 '{title}' 匹配位置页码({page_num})与目录页码({toc_page_num})差距过大,可能存在错误匹配")
|
|
|
- except ValueError:
|
|
|
- pass
|
|
|
- located.append(
|
|
|
- {
|
|
|
- "title": title,
|
|
|
- "category": category,
|
|
|
- "category_code": category_code,
|
|
|
- "position": pos,
|
|
|
- "toc_page": toc_page,
|
|
|
- "actual_page": page_num,
|
|
|
- "found": True,
|
|
|
- }
|
|
|
- )
|
|
|
- else:
|
|
|
- located.append(
|
|
|
- {
|
|
|
- "title": title,
|
|
|
- "category": category,
|
|
|
- "category_code": category_code,
|
|
|
- "position": -1,
|
|
|
- "toc_page": toc_page,
|
|
|
- "found": False,
|
|
|
- }
|
|
|
- )
|
|
|
-
|
|
|
- return located
|
|
|
-
|
|
|
- def _find_all_valid_title_positions(
|
|
|
- self,
|
|
|
- title: str,
|
|
|
- text: str,
|
|
|
- fuzzy_threshold: float,
|
|
|
- toc_start_pos: float,
|
|
|
- toc_end_pos: float,
|
|
|
- ) -> List[int]:
|
|
|
- """
|
|
|
- 查找标题在正文中的所有有效位置(排除目录页范围),并按位置排序。
|
|
|
-
|
|
|
- 策略:
|
|
|
- 1. 先找完整标题的所有位置;
|
|
|
- 2. 如果完整标题没找到,再找标题正文部分的所有位置;
|
|
|
- 3. 过滤掉目录页范围内的位置。
|
|
|
- """
|
|
|
- positions: List[int] = []
|
|
|
-
|
|
|
- # 方法1: 完整标题匹配
|
|
|
- full_positions = self._find_full_title_positions(title, text)
|
|
|
- if full_positions:
|
|
|
- positions = full_positions
|
|
|
- else:
|
|
|
- # 方法2: 标题正文部分匹配
|
|
|
- title_content = self._extract_title_content(title)
|
|
|
- if title_content:
|
|
|
- content_positions = self._find_content_positions(title_content, text)
|
|
|
- if content_positions:
|
|
|
- positions = content_positions
|
|
|
- # 如果标题正文也没找到,回退到模糊匹配
|
|
|
- if not positions:
|
|
|
- legacy_pos = self._find_title_in_text_legacy(title, text, fuzzy_threshold)
|
|
|
- if legacy_pos >= 0:
|
|
|
- positions = [legacy_pos]
|
|
|
-
|
|
|
- # 过滤目录页范围
|
|
|
- valid_positions = [
|
|
|
- p for p in positions
|
|
|
- if not (toc_end_pos > 0 and toc_start_pos <= p < toc_end_pos)
|
|
|
- ]
|
|
|
-
|
|
|
- return sorted(valid_positions)
|
|
|
-
|
|
|
- def _find_title_in_text(self, title: str, text: str, fuzzy_threshold: float) -> int:
|
|
|
- """
|
|
|
- 在文本中查找标题的近似位置(返回标题在文本中的精确起始位置)。
|
|
|
-
|
|
|
- 优化后的匹配策略:
|
|
|
- 1. 先用完整标题进行定位
|
|
|
- 2. 如果定位不到,再用标题的正文部分进行定位
|
|
|
- 3. 定位到多个位置的元素,选用元素独占一行的(只有标题正文,没有其他非转义字符)
|
|
|
- """
|
|
|
- # 步骤1: 先用完整标题进行定位
|
|
|
- full_title_positions = self._find_full_title_positions(title, text)
|
|
|
-
|
|
|
- if full_title_positions:
|
|
|
- # 如果找到完整标题的多个位置,优先选择独占一行的
|
|
|
- best_pos = self._select_best_position(full_title_positions, text, title)
|
|
|
- if best_pos >= 0:
|
|
|
- return best_pos
|
|
|
- # 如果找不到独占一行的,返回第一个位置
|
|
|
- return full_title_positions[0]
|
|
|
-
|
|
|
- # 步骤2: 如果完整标题定位不到,再用标题的正文部分进行定位
|
|
|
- title_content = self._extract_title_content(title)
|
|
|
-
|
|
|
- if not title_content:
|
|
|
- # 如果没有正文部分,使用原来的逻辑
|
|
|
- return self._find_title_in_text_legacy(title, text, fuzzy_threshold)
|
|
|
-
|
|
|
- # 查找所有匹配标题正文部分的位置
|
|
|
- content_positions = self._find_content_positions(title_content, text)
|
|
|
-
|
|
|
- if not content_positions:
|
|
|
- # 如果没有找到任何位置,使用模糊匹配
|
|
|
- return self._find_title_in_text_legacy(title, text, fuzzy_threshold)
|
|
|
-
|
|
|
- # 步骤3: 定位到多个位置的元素,选用元素独占一行的
|
|
|
- best_pos = self._select_best_position(content_positions, text, title_content)
|
|
|
- if best_pos >= 0:
|
|
|
- return best_pos
|
|
|
-
|
|
|
- # 如果找不到独占一行的,返回第一个位置
|
|
|
- return content_positions[0]
|
|
|
-
|
|
|
- def _is_likely_title_position(self, line: str, pos: int, title: str) -> bool:
|
|
|
- """
|
|
|
- 判断给定位置是否可能是真正的章节标题位置。
|
|
|
-
|
|
|
- 真正的章节标题通常满足以下条件之一:
|
|
|
- 1. 在行首(pos == 0)
|
|
|
- 2. 前面只有章节编号(如"一、",很短)
|
|
|
- 3. 独占一行(行内容基本就是标题)
|
|
|
-
|
|
|
- 参数:
|
|
|
- line: 行文本(已标准化)
|
|
|
- pos: 标题在行中的位置
|
|
|
- title: 标题文本
|
|
|
-
|
|
|
- 返回:
|
|
|
- bool: 如果可能是真正的标题位置则返回True
|
|
|
- """
|
|
|
- # 如果在行首,肯定是标题
|
|
|
- if pos == 0:
|
|
|
- return True
|
|
|
-
|
|
|
- # 检查标题前面的内容
|
|
|
- prefix = line[:pos].strip()
|
|
|
-
|
|
|
- # 如果前面有内容,检查是否是章节编号(如"一、")
|
|
|
- if prefix:
|
|
|
- # 真正的标题前面应该是章节编号(很短)
|
|
|
- # 如果前缀超过5个字符且包含中文词汇,则不是编号
|
|
|
- if len(prefix) > 5:
|
|
|
- # 检查前缀是否包含常见的中文动词或介词(表明是正文而不是编号)
|
|
|
- common_words = ['于', '在', '至', '向', '从', '把', '被', '将', '和', '与', '及', '或', '放', '置', '见', '如']
|
|
|
- for word in common_words:
|
|
|
- if word in prefix:
|
|
|
- return False
|
|
|
-
|
|
|
- # 检查前缀是否像章节编号(只包含数字、中文数字、标点)
|
|
|
- chapter_pattern = r'^[一二三四五六七八九十\d\s、..]*$'
|
|
|
- if not re.match(chapter_pattern, prefix):
|
|
|
- return False
|
|
|
- elif len(prefix) > 3:
|
|
|
- # 长度在3-5之间,检查是否包含明显的正文词汇
|
|
|
- common_words = ['放置', '置于', '见第', '详见', '参见']
|
|
|
- for word in common_words:
|
|
|
- if word in prefix:
|
|
|
- return False
|
|
|
-
|
|
|
- # 长度小于等于3,可能是编号,接受
|
|
|
- return True
|
|
|
-
|
|
|
- return True
|
|
|
-
|
|
|
- def _estimate_position_in_original(self, line: str, line_normalized: str, pos_in_normalized: int, is_no_space: bool = False) -> int:
|
|
|
- """
|
|
|
- 估算位置在原始行中的对应位置。
|
|
|
-
|
|
|
- 策略:
|
|
|
- 1. 首先尝试在原始行中直接查找标题的关键部分(如"第十章")
|
|
|
- 2. 如果找不到,使用比例映射进行估算
|
|
|
-
|
|
|
- 参数:
|
|
|
- line: 原始行文本
|
|
|
- line_normalized: 标准化后的行文本
|
|
|
- pos_in_normalized: 在标准化行中的位置
|
|
|
- is_no_space: 是否是无空格版本
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 在原始行中的估算位置
|
|
|
- """
|
|
|
- # 提取标题的关键部分(如"第十章")
|
|
|
- # 尝试找到章节号模式
|
|
|
- chapter_pattern = r'第[一二三四五六七八九十\d]+[章节条款部分]'
|
|
|
- match = re.search(chapter_pattern, line_normalized[pos_in_normalized:])
|
|
|
- if match:
|
|
|
- key_part = match.group(0)
|
|
|
- # 在原始行中查找这个关键部分
|
|
|
- if key_part in line:
|
|
|
- return line.index(key_part)
|
|
|
-
|
|
|
- # 如果找不到关键部分,使用比例映射
|
|
|
- if len(line_normalized) > 0:
|
|
|
- ratio = pos_in_normalized / len(line_normalized)
|
|
|
- estimated_pos = int(ratio * len(line))
|
|
|
- return estimated_pos
|
|
|
-
|
|
|
- return 0
|
|
|
-
|
|
|
- def _find_title_in_original_line(self, line: str, title: str, pos_in_normalized: int = None, is_no_space: bool = False) -> int:
|
|
|
- """
|
|
|
- 在原始行中查找标题的位置。
|
|
|
-
|
|
|
- 这是一个简化的方法,直接在原始行中查找标题的几种可能形式:
|
|
|
- 1. 原始标题文本
|
|
|
- 2. 移除空格后的标题文本
|
|
|
- 3. 标准化后的标题文本
|
|
|
-
|
|
|
- 参数:
|
|
|
- line: 原始行文本
|
|
|
- title: 标题文本(可能是标准化后的或无空格版本)
|
|
|
- pos_in_normalized: 标题在标准化行中的位置(可选)
|
|
|
- is_no_space: 是否是无空格版本
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 标题在原始行中的位置,如果未找到则返回-1
|
|
|
- """
|
|
|
- # 策略1: 直接在原始行中查找
|
|
|
- if title in line:
|
|
|
- return line.index(title)
|
|
|
-
|
|
|
- # 策略2: 如果是无空格版本,尝试在原始行中查找(可能原始行有空格)
|
|
|
- if is_no_space:
|
|
|
- # 尝试在原始行中逐字符匹配
|
|
|
- for i in range(len(line) - len(title) + 1):
|
|
|
- window = line[i:i + len(title) * 2] # 取一个稍大的窗口
|
|
|
- window_clean = self._remove_escape_chars(window).replace(' ', '')
|
|
|
- if title in window_clean:
|
|
|
- return i
|
|
|
- return -1
|
|
|
-
|
|
|
- # 策略3: 使用位置信息进行估算
|
|
|
- if pos_in_normalized is not None:
|
|
|
- # 基于位置比例进行估算
|
|
|
- line_clean = self._remove_escape_chars(line)
|
|
|
- line_normalized = self._normalize_title(line_clean)
|
|
|
- if len(line_normalized) > 0 and pos_in_normalized < len(line_normalized):
|
|
|
- ratio = pos_in_normalized / len(line_normalized)
|
|
|
- estimated_pos = int(ratio * len(line))
|
|
|
- # 在估算位置附近查找
|
|
|
- search_start = max(0, estimated_pos - 10)
|
|
|
- search_end = min(len(line), estimated_pos + len(title) + 10)
|
|
|
- for i in range(search_start, search_end):
|
|
|
- if i + len(title) > len(line):
|
|
|
- break
|
|
|
- window = line[i:i + len(title)]
|
|
|
- window_clean = self._remove_escape_chars(window)
|
|
|
- if title in window_clean or window_clean in title:
|
|
|
- return i
|
|
|
- return estimated_pos
|
|
|
-
|
|
|
- return -1
|
|
|
-
|
|
|
    def _find_full_title_positions(self, title: str, text: str) -> List[int]:
        """
        Find every offset in ``text`` where the complete title appears.

        Two layouts are handled:
        1. the whole title on one line, e.g. "第一章 编制依据";
        2. the title split over two consecutive lines, e.g. the chapter number
           on one line and the heading text on the next (common in PDF text).

        Returns:
            List[int]: de-duplicated, ascending offsets into ``text``.
        """
        positions = []

        # Title with escape/control characters removed, in a space-normalized
        # form and in a form with all spaces stripped.
        title_clean = self._remove_escape_chars(title)
        title_normalized = self._normalize_title(title_clean)
        title_no_space = title_normalized.replace(' ', '')

        if not title_normalized:
            return positions

        # Scan line by line; current_pos is the offset of the current line's
        # first character in ``text``.
        lines = text.split('\n')
        current_pos = 0

        for i, line in enumerate(lines):
            line_clean = self._remove_escape_chars(line)
            line_normalized = self._normalize_title(line_clean)
            line_no_space = line_normalized.replace(' ', '')

            # Case 1: the normalized line contains the normalized title.
            if title_normalized in line_normalized:
                pos_in_line = line_normalized.find(title_normalized)
                if pos_in_line >= 0:
                    # Accept a match at the start of the line, or one whose
                    # prefix passes the heading-plausibility check.
                    if pos_in_line == 0 or self._is_likely_title_position(line_normalized, pos_in_line, title_normalized):
                        if pos_in_line == 0:
                            # Start-of-line match: record the line's own offset.
                            positions.append(current_pos)
                        else:
                            # Mid-line match: translate the normalized offset
                            # into an (estimated) offset in the raw line.
                            line_pos = self._estimate_position_in_original(line, line_normalized, pos_in_line)
                            if line_pos >= 0:
                                positions.append(current_pos + line_pos)

            # Case 2: same check with all spaces removed on both sides.
            if title_no_space and title_no_space in line_no_space:
                pos_in_line_no_space = line_no_space.find(title_no_space)
                if pos_in_line_no_space >= 0:
                    if pos_in_line_no_space == 0 or self._is_likely_title_position(line_no_space, pos_in_line_no_space, title_no_space):
                        if pos_in_line_no_space == 0:
                            # Start-of-line match: record the line's own offset
                            # unless case 1 already did.
                            pos = current_pos
                            if pos not in positions:
                                positions.append(pos)
                        else:
                            line_pos = self._estimate_position_in_original(line, line_no_space, pos_in_line_no_space, is_no_space=True)
                            if line_pos >= 0:
                                pos = current_pos + line_pos
                                if pos not in positions:
                                    positions.append(pos)

            # Cross-line case: test the current line joined with the next one.
            if i + 1 < len(lines):
                next_line = lines[i + 1]
                next_line_clean = self._remove_escape_chars(next_line)
                next_line_normalized = self._normalize_title(next_line_clean)
                next_line_no_space = next_line_normalized.replace(' ', '')

                # Join with a single space for the spaced form (the line break
                # is treated like a space) and with nothing for the space-free
                # form.
                combined = line_normalized + ' ' + next_line_normalized
                combined_no_space = line_no_space + next_line_no_space

                is_match = (
                    title_normalized in combined or
                    title_normalized in combined_no_space or
                    title_no_space in combined_no_space
                )

                if is_match:
                    # The joined text matches; decide which line to report.
                    # Preferred: the heading text sits on the next line.
                    title_content = self._extract_title_content(title_normalized)
                    if title_content and title_content in next_line_normalized:
                        content_pos = next_line_normalized.find(title_content)
                        if content_pos == 0 or self._is_likely_title_position(next_line_normalized, content_pos, title_content):
                            # Report the start of the next line.
                            next_line_pos = current_pos + len(line) + 1  # +1 for newline
                            positions.append(next_line_pos)
                    else:
                        # Otherwise the current line must start with the
                        # title's numbering and carry nothing else but
                        # separators/whitespace after it.
                        title_number = self._extract_title_number(title_normalized)
                        if title_number and line_normalized.strip().startswith(title_number):
                            remaining = line_normalized.strip()[len(title_number):].strip()
                            if not remaining or re.match(r'^[、..\s]*$', remaining):
                                # Report the start of the current line.
                                positions.append(current_pos)

            current_pos += len(line) + 1  # +1 for newline

        # De-duplicate and sort.
        return sorted(set(positions))
|
|
|
-
|
|
|
- def _find_content_positions(self, title_content: str, text: str) -> List[int]:
|
|
|
- """
|
|
|
- 查找标题正文部分在文本中的所有位置
|
|
|
-
|
|
|
- 返回:
|
|
|
- List[int]: 所有匹配位置的列表
|
|
|
- """
|
|
|
- positions = []
|
|
|
-
|
|
|
- # 移除转义字符后的文本和标题正文
|
|
|
- text_clean = self._remove_escape_chars(text)
|
|
|
- title_content_clean = self._remove_escape_chars(title_content)
|
|
|
- title_content_normalized = self._normalize_title(title_content_clean)
|
|
|
-
|
|
|
- if not title_content_normalized:
|
|
|
- return positions
|
|
|
-
|
|
|
- # 按行查找(更高效)
|
|
|
- lines = text.split('\n')
|
|
|
- current_pos = 0
|
|
|
-
|
|
|
- for line in lines:
|
|
|
- line_clean = self._remove_escape_chars(line)
|
|
|
- line_normalized = self._normalize_title(line_clean)
|
|
|
-
|
|
|
- # 检查行中是否包含标题正文
|
|
|
- if title_content_normalized in line_normalized:
|
|
|
- pos_in_line = line_normalized.find(title_content_normalized)
|
|
|
- if pos_in_line >= 0:
|
|
|
- line_pos = self._find_pattern_in_line(
|
|
|
- title_content_normalized, line, pos_in_line
|
|
|
- )
|
|
|
- if line_pos >= 0:
|
|
|
- positions.append(current_pos + line_pos)
|
|
|
-
|
|
|
- # 移除空格后查找
|
|
|
- title_no_space = title_content_normalized.replace(' ', '')
|
|
|
- line_no_space = line_normalized.replace(' ', '')
|
|
|
- if title_no_space and title_no_space in line_no_space:
|
|
|
- pos_in_line = line_no_space.find(title_no_space)
|
|
|
- if pos_in_line >= 0:
|
|
|
- line_pos = self._find_pattern_in_line(
|
|
|
- title_no_space, line, pos_in_line
|
|
|
- )
|
|
|
- if line_pos >= 0:
|
|
|
- pos = current_pos + line_pos
|
|
|
- if pos not in positions:
|
|
|
- positions.append(pos)
|
|
|
-
|
|
|
- current_pos += len(line) + 1 # +1 for newline
|
|
|
-
|
|
|
- # 去重并排序
|
|
|
- return sorted(set(positions))
|
|
|
-
|
|
|
- def _select_best_position(self, positions: List[int], text: str, title_or_content: str) -> int:
|
|
|
- """
|
|
|
- 从多个位置中选择最佳位置(优先选择独占一行的)
|
|
|
-
|
|
|
- 参数:
|
|
|
- positions: 候选位置列表
|
|
|
- text: 全文
|
|
|
- title_or_content: 标题或标题正文部分
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 最佳位置,如果找不到独占一行的则返回-1
|
|
|
- """
|
|
|
- if not positions:
|
|
|
- return -1
|
|
|
-
|
|
|
- # 移除转义字符后的标题
|
|
|
- title_clean = self._remove_escape_chars(title_or_content)
|
|
|
- title_normalized = self._normalize_title(title_clean)
|
|
|
-
|
|
|
- if not title_normalized:
|
|
|
- return -1
|
|
|
-
|
|
|
- best_pos = -1
|
|
|
- best_score = -1
|
|
|
-
|
|
|
- for pos in positions:
|
|
|
- # 找到该位置所在的行
|
|
|
- line_start = text.rfind('\n', 0, pos) + 1
|
|
|
- line_end = text.find('\n', pos)
|
|
|
- if line_end == -1:
|
|
|
- line_end = len(text)
|
|
|
-
|
|
|
- line_text = text[line_start:line_end]
|
|
|
- line_clean = self._remove_escape_chars(line_text).strip()
|
|
|
-
|
|
|
- # 检查该行是否只包含标题(没有其他非转义字符)
|
|
|
- if self._is_line_only_title(line_clean, title_normalized):
|
|
|
- # 计算匹配度(行越短、越接近标题,分数越高)
|
|
|
- score = 1000 - len(line_clean)
|
|
|
- if score > best_score:
|
|
|
- best_score = score
|
|
|
- best_pos = pos
|
|
|
-
|
|
|
- return best_pos
|
|
|
-
|
|
|
- def _find_title_in_text_legacy(self, title: str, text: str, fuzzy_threshold: float) -> int:
|
|
|
- """
|
|
|
- 原有的标题查找逻辑(作为回退方案)
|
|
|
- """
|
|
|
- # 移除转义字符后的标题和文本
|
|
|
- title_clean = self._remove_escape_chars(title)
|
|
|
- text_clean = self._remove_escape_chars(text)
|
|
|
-
|
|
|
- # 标准化标题(统一空白字符)
|
|
|
- normalized_title = self._normalize_title(title_clean)
|
|
|
-
|
|
|
- if not normalized_title:
|
|
|
- return -1
|
|
|
-
|
|
|
- # 方法1: 在清理后的文本中精确匹配,然后映射回原始位置
|
|
|
- if normalized_title in text_clean:
|
|
|
- pos_in_clean = text_clean.index(normalized_title)
|
|
|
- # 映射回原始文本的位置
|
|
|
- original_pos = self._map_clean_position_to_original(pos_in_clean, text, text_clean, normalized_title)
|
|
|
- if original_pos >= 0:
|
|
|
- return original_pos
|
|
|
-
|
|
|
- # 方法2: 移除所有空格后匹配
|
|
|
- title_no_space = normalized_title.replace(' ', '')
|
|
|
- text_clean_no_space = text_clean.replace(' ', '')
|
|
|
- if title_no_space and title_no_space in text_clean_no_space:
|
|
|
- pos_in_clean_no_space = text_clean_no_space.index(title_no_space)
|
|
|
- # 映射回原始文本的位置
|
|
|
- original_pos = self._map_clean_position_to_original(pos_in_clean_no_space, text, text_clean_no_space, title_no_space)
|
|
|
- if original_pos >= 0:
|
|
|
- return original_pos
|
|
|
-
|
|
|
- # 方法3: 按行查找,匹配度最高的行
|
|
|
- lines_original = text.split('\n')
|
|
|
- current_pos_original = 0
|
|
|
- best_ratio = 0.0
|
|
|
- best_pos = -1
|
|
|
-
|
|
|
- for line_original in lines_original:
|
|
|
- line_clean = self._remove_escape_chars(line_original)
|
|
|
- line_stripped = line_clean.strip()
|
|
|
-
|
|
|
- if len(line_stripped) < 3:
|
|
|
- current_pos_original += len(line_original) + 1
|
|
|
- continue
|
|
|
-
|
|
|
- # 计算相似度
|
|
|
- ratio = SequenceMatcher(None, normalized_title, line_stripped).ratio()
|
|
|
-
|
|
|
- if ratio > best_ratio:
|
|
|
- best_ratio = ratio
|
|
|
- best_pos = current_pos_original
|
|
|
-
|
|
|
- current_pos_original += len(line_original) + 1
|
|
|
-
|
|
|
- # 如果找到相似度足够高的行
|
|
|
- if best_ratio >= fuzzy_threshold:
|
|
|
- return best_pos
|
|
|
-
|
|
|
- return -1
|
|
|
-
|
|
|
- def _normalize(self, text: str) -> str:
|
|
|
- """移除控制字符并压缩空白。"""
|
|
|
- if not text:
|
|
|
- return ""
|
|
|
- # 去控制字符
|
|
|
- text = re.sub(r"[\x00-\x1F\x7F]", "", text)
|
|
|
- # 去零宽字符等
|
|
|
- text = re.sub(r"[\u2000-\u200D\u2028\u2029\uFEFF]", "", text)
|
|
|
- # 全角空格 -> 普通空格
|
|
|
- text = text.replace("\u3000", " ")
|
|
|
- # 合并空白
|
|
|
- text = re.sub(r"\s+", " ", text)
|
|
|
- return text.strip()
|
|
|
-
|
|
|
- def _normalize_title(self, title: str) -> str:
|
|
|
- """标准化标题用于匹配(统一空白字符)。"""
|
|
|
- normalized = re.sub(r'\s+', ' ', title)
|
|
|
- normalized = normalized.strip()
|
|
|
- return normalized
|
|
|
-
|
|
|
- def _remove_escape_chars(self, text: str) -> str:
|
|
|
- """
|
|
|
- 移除文本中可能的各种转义字符和特殊字符。
|
|
|
- 完全不保留任何转义字符(如换行、制表、回车等),只保留普通空格和可见字符。
|
|
|
-
|
|
|
- 参考 doc_worker 的实现。
|
|
|
- """
|
|
|
- if not text:
|
|
|
- return text
|
|
|
-
|
|
|
- # 第一步:移除所有控制字符(包括换行符\n、制表符\t、回车符\r等)
|
|
|
- # \x00-\x1F: 控制字符(包括\n=0x0A, \r=0x0D, \t=0x09等)
|
|
|
- # \x7F: DEL字符
|
|
|
- text = re.sub(r'[\x00-\x1F\x7F]', '', text)
|
|
|
-
|
|
|
- # 第二步:移除零宽字符和特殊Unicode空白字符
|
|
|
- # \u200B-\u200D: 零宽空格、零宽非断字符、零宽断字符
|
|
|
- # \uFEFF: 零宽无断字符(BOM)
|
|
|
- # \u2028: 行分隔符
|
|
|
- # \u2029: 段落分隔符
|
|
|
- # \u2000-\u200A: 各种Unicode空格字符
|
|
|
- text = re.sub(r'[\u2000-\u200D\u2028\u2029\uFEFF]', '', text)
|
|
|
-
|
|
|
- # 第三步:将全角空格转换为普通空格(保留其他全角字符)
|
|
|
- text = text.replace('\u3000', ' ')
|
|
|
-
|
|
|
- # 第四步:统一处理连续空格(将多个连续空格替换为单个空格)
|
|
|
- # 注意:这里只处理普通空格(U+0020),不处理其他空白字符(因为已经移除了)
|
|
|
- text = re.sub(r' +', ' ', text)
|
|
|
-
|
|
|
- # 第五步:去除首尾空格
|
|
|
- text = text.strip()
|
|
|
-
|
|
|
- return text
|
|
|
-
|
|
|
- def _map_clean_position_to_original(self, clean_pos: int, original_text: str, clean_text: str, search_pattern: str = None) -> int:
|
|
|
- """
|
|
|
- 将清理后文本的位置映射回原始文本的位置。
|
|
|
-
|
|
|
- 参数:
|
|
|
- clean_pos: 清理后文本中的位置
|
|
|
- original_text: 原始文本
|
|
|
- clean_text: 清理后的文本
|
|
|
- search_pattern: 要搜索的模式(用于在原始文本中直接查找)
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 原始文本中的位置,如果未找到则返回-1
|
|
|
- """
|
|
|
- if clean_pos >= len(clean_text):
|
|
|
- return len(original_text)
|
|
|
-
|
|
|
- # 如果提供了搜索模式,先在原始文本中直接查找
|
|
|
- if search_pattern:
|
|
|
- # 尝试在原始文本中直接查找(移除转义字符后)
|
|
|
- pattern_clean = self._remove_escape_chars(search_pattern)
|
|
|
- if not pattern_clean:
|
|
|
- pattern_clean = search_pattern
|
|
|
-
|
|
|
- # 在原始文本中查找匹配的位置
|
|
|
- # 使用一个滑动窗口,对每个位置清理后进行比较
|
|
|
- search_window_size = min(len(original_text), len(original_text))
|
|
|
- step = max(1, len(pattern_clean) // 4) # 步长,避免太慢
|
|
|
-
|
|
|
- for i in range(0, search_window_size, step):
|
|
|
- if i + len(pattern_clean) * 2 > len(original_text):
|
|
|
- break
|
|
|
-
|
|
|
- # 取一个窗口,清理后检查是否包含模式
|
|
|
- window = original_text[i:i + len(pattern_clean) * 3]
|
|
|
- window_clean = self._remove_escape_chars(window)
|
|
|
-
|
|
|
- if pattern_clean in window_clean:
|
|
|
- # 找到模式在窗口中的位置
|
|
|
- pos_in_window = window_clean.index(pattern_clean)
|
|
|
- # 映射回原始窗口的位置
|
|
|
- original_window_pos = self._find_pattern_in_original_window(
|
|
|
- pattern_clean, window, i
|
|
|
- )
|
|
|
- if original_window_pos >= 0:
|
|
|
- return original_window_pos
|
|
|
-
|
|
|
- # 如果直接查找失败,使用基于比例的估算
|
|
|
- if len(clean_text) > 0:
|
|
|
- ratio = clean_pos / len(clean_text)
|
|
|
- estimated_pos = int(ratio * len(original_text))
|
|
|
- # 在估算位置附近查找
|
|
|
- search_range = min(100, len(original_text) // 10)
|
|
|
- start = max(0, estimated_pos - search_range)
|
|
|
- end = min(len(original_text), estimated_pos + search_range)
|
|
|
-
|
|
|
- if search_pattern:
|
|
|
- # 在估算位置附近查找模式
|
|
|
- pattern_clean_local = self._remove_escape_chars(search_pattern)
|
|
|
- for i in range(start, end):
|
|
|
- if i + len(search_pattern) > len(original_text):
|
|
|
- break
|
|
|
- window = original_text[i:i + len(search_pattern) * 2]
|
|
|
- window_clean = self._remove_escape_chars(window)
|
|
|
- if search_pattern in window_clean or (pattern_clean_local and pattern_clean_local in window_clean):
|
|
|
- return i
|
|
|
-
|
|
|
- return estimated_pos
|
|
|
-
|
|
|
- return -1
|
|
|
-
|
|
|
- def _map_no_space_to_original(self, pattern_no_space: str, line: str, pos_in_no_space: int, line_normalized: str, line_no_space: str) -> int:
|
|
|
- """
|
|
|
- 将无空格版本中的位置映射回原始行中的位置。
|
|
|
-
|
|
|
- 参数:
|
|
|
- pattern_no_space: 无空格的模式
|
|
|
- line: 原始行文本
|
|
|
- pos_in_no_space: 模式在无空格行中的位置
|
|
|
- line_normalized: 标准化后的行(有空格)
|
|
|
- line_no_space: 无空格的行
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 模式在原始行中的位置,如果未找到则返回-1
|
|
|
- """
|
|
|
- if pos_in_no_space >= len(line_no_space):
|
|
|
- return -1
|
|
|
-
|
|
|
- # 先尝试在原始行中直接查找
|
|
|
- # 找到 pattern_no_space 在无空格行中的实际文本
|
|
|
- end_pos = pos_in_no_space + len(pattern_no_space)
|
|
|
- if end_pos > len(line_no_space):
|
|
|
- return -1
|
|
|
-
|
|
|
- # 找到对应在 line_normalized 中的位置范围
|
|
|
- # 需要建立 line_normalized 和 line_no_space 之间的字符映射
|
|
|
- norm_to_no_space_idx = []
|
|
|
- no_space_idx = 0
|
|
|
- for i, char in enumerate(line_normalized):
|
|
|
- if char != ' ':
|
|
|
- norm_to_no_space_idx.append(no_space_idx)
|
|
|
- no_space_idx += 1
|
|
|
- else:
|
|
|
- norm_to_no_space_idx.append(-1) # 空格对应 -1
|
|
|
-
|
|
|
- # 找到 pos_in_no_space 对应的 line_normalized 中的位置
|
|
|
- norm_start = -1
|
|
|
- for i, no_space_pos in enumerate(norm_to_no_space_idx):
|
|
|
- if no_space_pos == pos_in_no_space:
|
|
|
- norm_start = i
|
|
|
- break
|
|
|
-
|
|
|
- if norm_start < 0:
|
|
|
- return -1
|
|
|
-
|
|
|
- # 找到 pattern 在 line_normalized 中的结束位置
|
|
|
- norm_end = -1
|
|
|
- target_end_no_space = pos_in_no_space + len(pattern_no_space)
|
|
|
- for i, no_space_pos in enumerate(norm_to_no_space_idx):
|
|
|
- if no_space_pos >= target_end_no_space or (no_space_pos == -1 and i > norm_start and norm_to_no_space_idx[i-1] >= target_end_no_space - 1):
|
|
|
- norm_end = i
|
|
|
- break
|
|
|
-
|
|
|
- if norm_end < 0:
|
|
|
- norm_end = len(line_normalized)
|
|
|
-
|
|
|
- # 现在使用 line_normalized 中的位置范围来映射回原始行
|
|
|
- return self._find_pattern_in_line(line_normalized[norm_start:norm_end], line, norm_start, line_normalized)
|
|
|
-
|
|
|
- def _find_pattern_in_line(self, pattern: str, line: str, pattern_pos_in_normalized: int, normalized_line: str = None) -> int:
|
|
|
- """
|
|
|
- 在原始行中找到模式的位置
|
|
|
-
|
|
|
- 参数:
|
|
|
- pattern: 要查找的模式(已标准化)
|
|
|
- line: 原始行文本
|
|
|
- pattern_pos_in_normalized: 模式在标准化行中的位置
|
|
|
- normalized_line: 标准化后的行文本(可选,用于更精确的位置映射)
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 模式在原始行中的位置,如果未找到则返回-1
|
|
|
- """
|
|
|
- # 先尝试直接查找
|
|
|
- if pattern in line:
|
|
|
- return line.index(pattern)
|
|
|
-
|
|
|
- # 使用提供的标准化行或重新计算
|
|
|
- if normalized_line is None:
|
|
|
- line_clean = self._remove_escape_chars(line)
|
|
|
- line_normalized = self._normalize_title(line_clean)
|
|
|
- else:
|
|
|
- line_normalized = normalized_line
|
|
|
-
|
|
|
- if pattern_pos_in_normalized >= len(line_normalized):
|
|
|
- return -1
|
|
|
-
|
|
|
- # 检查 pattern_pos_in_normalized 处的文本是否匹配 pattern
|
|
|
- end_pos = pattern_pos_in_normalized + len(pattern)
|
|
|
- if end_pos > len(line_normalized):
|
|
|
- return -1
|
|
|
-
|
|
|
- actual_pattern = line_normalized[pattern_pos_in_normalized:end_pos]
|
|
|
- if actual_pattern != pattern:
|
|
|
- # 不完全匹配,尝试查找实际匹配的位置
|
|
|
- if pattern in line_normalized:
|
|
|
- pattern_pos_in_normalized = line_normalized.index(pattern)
|
|
|
- else:
|
|
|
- return -1
|
|
|
-
|
|
|
- # 通过字符对齐找到原始位置
|
|
|
- clean_chars = 0
|
|
|
- original_chars = 0
|
|
|
-
|
|
|
- for orig_char in line:
|
|
|
- if clean_chars >= pattern_pos_in_normalized:
|
|
|
- break
|
|
|
-
|
|
|
- orig_char_clean = self._remove_escape_chars(orig_char)
|
|
|
- if orig_char_clean:
|
|
|
- orig_char_normalized = self._normalize_title(orig_char_clean)
|
|
|
- if orig_char_normalized:
|
|
|
- clean_chars += len(orig_char_normalized)
|
|
|
- original_chars += 1
|
|
|
-
|
|
|
- return original_chars if original_chars < len(line) else -1
|
|
|
-
|
|
|
- def _find_pattern_in_original_window(self, pattern_clean: str, original_window: str, window_start_pos: int) -> int:
|
|
|
- """
|
|
|
- 在原始窗口中找到清理后模式对应的位置。
|
|
|
-
|
|
|
- 参数:
|
|
|
- pattern_clean: 清理后的模式
|
|
|
- original_window: 原始窗口文本
|
|
|
- window_start_pos: 窗口在原始文本中的起始位置
|
|
|
-
|
|
|
- 返回:
|
|
|
- int: 模式在原始文本中的位置,如果未找到则返回-1
|
|
|
- """
|
|
|
- # 尝试在原始窗口中直接查找
|
|
|
- if pattern_clean in original_window:
|
|
|
- return window_start_pos + original_window.index(pattern_clean)
|
|
|
-
|
|
|
- # 如果直接查找失败,使用清理后的窗口
|
|
|
- window_clean = self._remove_escape_chars(original_window)
|
|
|
- if pattern_clean in window_clean:
|
|
|
- pos_in_clean = window_clean.index(pattern_clean)
|
|
|
- # 映射回原始窗口的位置(近似)
|
|
|
- if len(window_clean) > 0:
|
|
|
- ratio = pos_in_clean / len(window_clean)
|
|
|
- return window_start_pos + int(ratio * len(original_window))
|
|
|
-
|
|
|
- return -1
|
|
|
-
|
|
|
- def _get_page_number(self, position: int, pages_content: List[Dict[str, Any]]) -> int:
|
|
|
- for page in pages_content:
|
|
|
- if page["start_pos"] <= position < page["end_pos"]:
|
|
|
- return int(page["page_num"])
|
|
|
- return 1
|
|
|
-
|
|
|
- def _extract_title_number(self, title: str) -> str:
|
|
|
- """
|
|
|
- 从标题中提取编号部分
|
|
|
-
|
|
|
- 例如:
|
|
|
- "第一章 编制依据" -> "第一章"
|
|
|
- "一、工程概况" -> "一"
|
|
|
- "1. 施工计划" -> "1"
|
|
|
- """
|
|
|
- if not title:
|
|
|
- return ""
|
|
|
-
|
|
|
- # 匹配章节格式(如 第一章、第1章等)
|
|
|
- chapter_match = re.match(r'^(第[一二三四五六七八九十\d]+[章节条款部分])', title)
|
|
|
- if chapter_match:
|
|
|
- return chapter_match.group(1)
|
|
|
-
|
|
|
- # 匹配方括号数字格式(如 【1】、【2】等)
|
|
|
- bracket_match = re.match(r'^(【\d+】)', title)
|
|
|
- if bracket_match:
|
|
|
- return bracket_match.group(1)
|
|
|
-
|
|
|
- # 匹配双方括号数字格式(如 〖1.1〗、〖2.3〗等)
|
|
|
- double_bracket_match = re.match(r'^(〖\d+(?:\.\d+)*〗)', title)
|
|
|
- if double_bracket_match:
|
|
|
- return double_bracket_match.group(1)
|
|
|
-
|
|
|
- # 匹配数字编号格式(如 1.5, 1.6, 1.2.3等,可能后跟空格或、)
|
|
|
- number_match = re.match(r'^(\d+(?:\.\d+)*)[\s、..]?', title)
|
|
|
- if number_match:
|
|
|
- return number_match.group(1)
|
|
|
-
|
|
|
- # 匹配中文编号格式(如 一、二、三等)
|
|
|
- chinese_match = re.match(r'^([一二三四五六七八九十]+)[、..]', title)
|
|
|
- if chinese_match:
|
|
|
- return chinese_match.group(1)
|
|
|
-
|
|
|
- # 匹配圆括号编号格式(如 (1)、(一)等)
|
|
|
- paren_match = re.match(r'^([\((][一二三四五六七八九十\d]+[\))])', title)
|
|
|
- if paren_match:
|
|
|
- return paren_match.group(1)
|
|
|
-
|
|
|
- return ""
|
|
|
-
|
|
|
- def _extract_title_content(self, title: str) -> str:
|
|
|
- """
|
|
|
- 从标题中提取正文部分(去除编号)
|
|
|
-
|
|
|
- 例如:
|
|
|
- "第一章 编制依据" -> "编制依据"
|
|
|
- "一、工程概况" -> "工程概况"
|
|
|
- "1. 施工计划" -> "施工计划"
|
|
|
- """
|
|
|
- if not title:
|
|
|
- return title
|
|
|
-
|
|
|
- # 提取编号
|
|
|
- number = self._extract_title_number(title)
|
|
|
- if number:
|
|
|
- # 移除编号部分
|
|
|
- content = title[len(number):].strip()
|
|
|
- # 移除可能的标点符号(如 "、", ".", " " 等)
|
|
|
- content = re.sub(r'^[、..\s]+', '', content)
|
|
|
- return content
|
|
|
-
|
|
|
- return title
|
|
|
-
|
|
|
- def _check_number_in_context(self, number: str, context: str, title_pos_in_context: int) -> bool:
|
|
|
- """
|
|
|
- 检查编号是否在标题位置的上下文中
|
|
|
-
|
|
|
- 参数:
|
|
|
- number: 编号字符串
|
|
|
- context: 上下文文本
|
|
|
- title_pos_in_context: 标题在上下文中的位置
|
|
|
-
|
|
|
- 返回:
|
|
|
- bool: 如果编号在标题附近找到则返回True
|
|
|
- """
|
|
|
- if not number:
|
|
|
- return False
|
|
|
-
|
|
|
- # 在标题位置前后查找编号
|
|
|
- # 编号可能在标题之前或之后
|
|
|
- check_before = max(0, title_pos_in_context - len(number) - 10)
|
|
|
- check_after = min(len(context), title_pos_in_context + 100)
|
|
|
-
|
|
|
- context_around = context[check_before:check_after]
|
|
|
-
|
|
|
- # 清理上下文用于匹配
|
|
|
- context_clean = self._remove_escape_chars(context_around)
|
|
|
- number_clean = self._remove_escape_chars(number)
|
|
|
-
|
|
|
- # 检查编号是否在上下文中
|
|
|
- if number_clean in context_clean:
|
|
|
- return True
|
|
|
-
|
|
|
- # 也检查移除空格后的匹配
|
|
|
- context_no_space = context_clean.replace(' ', '')
|
|
|
- number_no_space = number_clean.replace(' ', '')
|
|
|
- if number_no_space and number_no_space in context_no_space:
|
|
|
- return True
|
|
|
-
|
|
|
- return False
|
|
|
-
|
|
|
- def _is_line_only_title(self, line_clean: str, title_content: str) -> bool:
|
|
|
- """
|
|
|
- 检查行是否只包含标题(没有其他字符,转义字符除外)
|
|
|
-
|
|
|
- 参数:
|
|
|
- line_clean: 清理后的行文本
|
|
|
- title_content: 标题正文部分
|
|
|
-
|
|
|
- 返回:
|
|
|
- bool: 如果行只包含标题则返回True
|
|
|
- """
|
|
|
- if not line_clean or not title_content:
|
|
|
- return False
|
|
|
-
|
|
|
- # 标准化行文本和标题
|
|
|
- line_normalized = self._normalize_title(line_clean)
|
|
|
- title_normalized = self._normalize_title(title_content)
|
|
|
-
|
|
|
- # 如果行完全匹配标题
|
|
|
- if line_normalized == title_normalized:
|
|
|
- return True
|
|
|
-
|
|
|
- # 如果行以标题开头,后面只有空白或标点
|
|
|
- if line_normalized.startswith(title_normalized):
|
|
|
- remaining = line_normalized[len(title_normalized):].strip()
|
|
|
- # 如果剩余部分只包含标点符号或空白,认为是匹配的
|
|
|
- if not remaining or re.match(r'^[,。、;:!?\s]*$', remaining):
|
|
|
- return True
|
|
|
-
|
|
|
- # 移除空格后比较
|
|
|
- line_no_space = line_normalized.replace(' ', '')
|
|
|
- title_no_space = title_normalized.replace(' ', '')
|
|
|
- if line_no_space == title_no_space:
|
|
|
- return True
|
|
|
-
|
|
|
- if line_no_space.startswith(title_no_space):
|
|
|
- remaining = line_no_space[len(title_no_space):]
|
|
|
- if not remaining or re.match(r'^[,。、;:!?]*$', remaining):
|
|
|
- return True
|
|
|
-
|
|
|
- return False
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|