|
|
@@ -19,6 +19,23 @@ class TOCPatternMatcher:
|
|
|
def __init__(self) -> None:
    """Create a matcher that uses the default configuration provider."""
    # Kept on the instance so other methods can look settings up via self._cfg.
    self._cfg = default_config_provider
|
|
|
|
|
|
@staticmethod
def extract_page_number(page_str: str) -> str:
    """Return the bare page number from a possibly decorated page string.

    The first run of consecutive digits found anywhere in ``page_str`` is
    returned exactly as it appears. When the string contains no digit at
    all, the input is returned with surrounding whitespace stripped.

    Examples:
        - '‐ 1 ‐'  -> '1'
        - '19'     -> '19'
        - ' 10 '   -> '10'
        - '‐ 19 ‐' -> '19'

    Args:
        page_str: Raw page-number text, optionally wrapped in decoration
            such as dashes or padding whitespace.

    Returns:
        The first digit sequence, or the stripped input if none exists.
    """
    first_digits = re.search(r'\d+', page_str)
    if first_digits is None:
        # Nothing numeric to extract: fall back to the trimmed original text.
        return page_str.strip()
    return first_digits.group(0)
|
|
|
+
|
|
|
def has_numbering(self, text: str) -> bool:
|
|
|
"""检查文本是否包含编号格式。"""
|
|
|
numbering_patterns: List[str] = self._cfg.get("numbering.formats", [])
|
|
|
@@ -40,7 +57,8 @@ class TOCPatternMatcher:
|
|
|
if re.match(r"^第[一二三四五六七八九十\d]+[章节条款]\s*$", line):
|
|
|
if i + 1 < len(lines):
|
|
|
next_line = lines[i + 1].strip()
|
|
|
- if re.search(r"[.·]{2,}.*\d{1,4}\s*$", next_line):
|
|
|
+ # 支持带修饰符号的页码匹配
|
|
|
+ if re.search(r"[.·]{2,}.*?\d+.*?\s*$", next_line):
|
|
|
merged_line = line + next_line
|
|
|
merged_lines.append(merged_line)
|
|
|
i += 2
|
|
|
@@ -72,7 +90,10 @@ class TOCPatternMatcher:
|
|
|
continue
|
|
|
|
|
|
title = match.group(1).strip()
|
|
|
- page_num = match.group(2).strip()
|
|
|
+ page_num_raw = match.group(2).strip()
|
|
|
+
|
|
|
+ # 从可能带有修饰符号的页码中提取纯数字
|
|
|
+ page_num = self.extract_page_number(page_num_raw)
|
|
|
|
|
|
title_clean = re.sub(r"[.·]{2,}", "", title)
|
|
|
title_clean = re.sub(r"\s{2,}", " ", title_clean)
|