|
@@ -5,6 +5,7 @@ PDF 全文提取实现
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
import io
|
|
import io
|
|
|
|
|
+import re
|
|
|
from typing import Any, Dict, List, Tuple
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
|
|
|
|
import fitz # PyMuPDF
|
|
import fitz # PyMuPDF
|
|
@@ -56,10 +57,11 @@ class PdfFullTextExtractor(FullTextExtractor):
|
|
|
def _filter_header_footer(self, text: str) -> str:
|
|
def _filter_header_footer(self, text: str) -> str:
|
|
|
"""
|
|
"""
|
|
|
过滤页眉页脚
|
|
过滤页眉页脚
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
过滤规则:
|
|
过滤规则:
|
|
|
1. 页眉:检测连续空格,检测到就删掉这行
|
|
1. 页眉:检测连续空格,检测到就删掉这行
|
|
|
- 2. 页脚:每页的最后一行,删掉每页的最后一行
|
|
|
|
|
|
|
+ 2. 页脚:智能判断最后一行是否为页脚(页码、固定模板、分隔线等),
|
|
|
|
|
+ 仅在符合页脚特征时才删除,避免误删正文内容
|
|
|
"""
|
|
"""
|
|
|
# 获取配置
|
|
# 获取配置
|
|
|
header_space_threshold = self._cfg.get(
|
|
header_space_threshold = self._cfg.get(
|
|
@@ -67,11 +69,11 @@ class PdfFullTextExtractor(FullTextExtractor):
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
lines = text.split("\n")
|
|
lines = text.split("\n")
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 如果只有一行或没有行,直接返回
|
|
# 如果只有一行或没有行,直接返回
|
|
|
if len(lines) <= 1:
|
|
if len(lines) <= 1:
|
|
|
return text
|
|
return text
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 第一步:过滤页眉(连续空格超过阈值的行)
|
|
# 第一步:过滤页眉(连续空格超过阈值的行)
|
|
|
filtered_lines: List[str] = []
|
|
filtered_lines: List[str] = []
|
|
|
for line in lines:
|
|
for line in lines:
|
|
@@ -84,20 +86,52 @@ class PdfFullTextExtractor(FullTextExtractor):
|
|
|
max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
|
|
max_consecutive_spaces = max(max_consecutive_spaces, current_spaces)
|
|
|
else:
|
|
else:
|
|
|
current_spaces = 0
|
|
current_spaces = 0
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 如果连续空格数超过阈值,认为是页眉行,跳过
|
|
# 如果连续空格数超过阈值,认为是页眉行,跳过
|
|
|
if max_consecutive_spaces >= header_space_threshold:
|
|
if max_consecutive_spaces >= header_space_threshold:
|
|
|
continue
|
|
continue
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 保留非页眉行
|
|
# 保留非页眉行
|
|
|
filtered_lines.append(line)
|
|
filtered_lines.append(line)
|
|
|
-
|
|
|
|
|
- # 第二步:过滤页脚(删除最后一行)
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 第二步:智能过滤页脚(仅在最后一行看起来像页脚时才删除)
|
|
|
if len(filtered_lines) > 0:
|
|
if len(filtered_lines) > 0:
|
|
|
- filtered_lines.pop() # 删除最后一行
|
|
|
|
|
|
|
+ last_line = filtered_lines[-1].strip()
|
|
|
|
|
+ if self._is_likely_footer(last_line):
|
|
|
|
|
+ filtered_lines.pop()
|
|
|
|
|
|
|
|
return "\n".join(filtered_lines)
|
|
return "\n".join(filtered_lines)
|
|
|
|
|
|
|
|
|
|
+ def _is_likely_footer(self, line: str) -> bool:
|
|
|
|
|
+ """判断一行文本是否可能是页脚(页码、固定模板、分隔线等)"""
|
|
|
|
|
+ if not line:
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ # 纯数字页码
|
|
|
|
|
+ if line.isdigit():
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ # 常见页码格式:第X页、共X页、X / Y
|
|
|
|
|
+ if re.match(r"^[第共]\s*\d+\s*[页页次]?$", line):
|
|
|
|
|
+ return True
|
|
|
|
|
+ if re.match(r"^\d+\s*/\s*\d+$", line):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ # 日期或短标识(如 "2024年3月"、"2024-03")
|
|
|
|
|
+ if re.match(r"^\d{4}[-年/.]\d{1,2}", line):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ # 很短且不含中文字符(通常是页码、英文标识等)
|
|
|
|
|
+ chinese_chars = self._count_chinese_chars(line)
|
|
|
|
|
+ if len(line) <= 8 and chinese_chars == 0:
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ # 全是特殊字符(横线、点、下划线等分隔线)
|
|
|
|
|
+ if re.match(r"^[\-—_.·\s]+$", line):
|
|
|
|
|
+ return True
|
|
|
|
|
+
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
def _count_chinese_chars(self, text: str) -> int:
|
|
def _count_chinese_chars(self, text: str) -> int:
|
|
|
"""
|
|
"""
|
|
|
统计文本中的中文字符数(不含转义字符)
|
|
统计文本中的中文字符数(不含转义字符)
|