|
@@ -28,6 +28,27 @@ from ..config.provider import default_config_provider
|
|
|
from ..interfaces import DocumentSource, FullTextExtractor
|
|
from ..interfaces import DocumentSource, FullTextExtractor
|
|
|
from .fulltext_extractor import PdfFullTextExtractor
|
|
from .fulltext_extractor import PdfFullTextExtractor
|
|
|
|
|
|
|
|
|
|
+
|
|
|
|
|
def _read_ini_config(
    section: str,
    key: str,
    default: Any = None,
    config_path: Optional[str] = None,
) -> Any:
    """Read one value from the project's ``config.ini`` on a best-effort basis.

    The file is re-read on every call. Any failure (missing file, malformed
    INI syntax, interpolation error, unreadable file, ...) yields ``default``
    instead of raising; unlike a silent fallback, the failure is logged so a
    broken config file can be told apart from a missing one.

    Args:
        section: INI section name, e.g. ``"ocr"``.
        key: Option name inside the section. ``configparser`` lower-cases
            option names by default, so the lookup is case-insensitive.
        default: Value returned when the file, section or option is absent,
            or when reading fails.
        config_path: Optional explicit path to the INI file. When omitted,
            the file is looked up at ``config/config.ini`` six directory
            levels above this module.

    Returns:
        The raw option value as a string, or ``default``.
    """
    import configparser
    import logging
    from pathlib import Path

    try:
        if config_path is None:
            # Climb six levels up from this module. Repeated ``.parent`` is
            # used instead of ``parents[5]`` because it never raises when the
            # module lives in a shallow directory.
            base_dir = Path(__file__)
            for _ in range(6):
                base_dir = base_dir.parent
            ini_path = base_dir / "config" / "config.ini"
        else:
            ini_path = Path(config_path)

        if not ini_path.exists():
            return default

        parser = configparser.ConfigParser()
        parser.read(ini_path, encoding="utf-8")

        # ``fallback`` covers both a missing section and a missing option.
        return parser.get(section, key, fallback=default)
    except Exception as exc:
        # Deliberate best-effort boundary: configuration problems must not
        # break extractor construction, but they should be visible in logs.
        logging.getLogger(__name__).warning(
            "Failed to read [%s] %s from config.ini, using default: %s",
            section,
            key,
            exc,
        )
        return default
|
|
|
|
|
+
|
|
|
# 尝试导入 PIL 用于图片压缩
|
|
# 尝试导入 PIL 用于图片压缩
|
|
|
try:
|
|
try:
|
|
|
from PIL import Image
|
|
from PIL import Image
|
|
@@ -59,29 +80,58 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
|
|
def __init__(
|
|
def __init__(
|
|
|
self,
|
|
self,
|
|
|
- layout_dpi: int = 180,
|
|
|
|
|
- ocr_dpi: int = 220,
|
|
|
|
|
- jpg_quality: int = 85, # 降低为 85 配合 GLM-OCR
|
|
|
|
|
|
|
+ layout_dpi: int = 200, # 【优化】统一 DPI 为 200,兼顾版面分析和 OCR 质量
|
|
|
|
|
+ ocr_dpi: int = 200, # 【优化】与 layout_dpi 保持一致,避免重复渲染
|
|
|
|
|
+ jpg_quality: int = 90,
|
|
|
api_url: Optional[str] = None,
|
|
api_url: Optional[str] = None,
|
|
|
timeout: int = 600
|
|
timeout: int = 600
|
|
|
) -> None:
|
|
) -> None:
|
|
|
self._cfg = default_config_provider
|
|
self._cfg = default_config_provider
|
|
|
self.local_extractor = PdfFullTextExtractor()
|
|
self.local_extractor = PdfFullTextExtractor()
|
|
|
|
|
|
|
|
- # GLM-OCR 配置
|
|
|
|
|
- self.api_url = api_url or self._cfg.get(
|
|
|
|
|
- "glm_ocr.api_url",
|
|
|
|
|
|
|
+ # 【新增】OCR 引擎选择配置
|
|
|
|
|
+ # 优先级:config.ini [ocr] ENGINE > 默认 glm_ocr
|
|
|
|
|
+ # 同时支持 "glm_ocr"/"glm-ocr" 和 "mineru"/"mineru-ocr" 等多种写法
|
|
|
|
|
+ raw_engine = _read_ini_config("ocr", "engine", "glm_ocr")
|
|
|
|
|
+ self.ocr_engine = raw_engine.lower().strip() if raw_engine else "glm_ocr"
|
|
|
|
|
+
|
|
|
|
|
+ # 规范化引擎名称(统一转换为标准格式)
|
|
|
|
|
+ if self.ocr_engine in ("glm_ocr", "glm-ocr", "glmocr"):
|
|
|
|
|
+ self.ocr_engine_normalized = "glm_ocr"
|
|
|
|
|
+ elif self.ocr_engine in ("mineru", "mineru-ocr", "mineru_ocr"):
|
|
|
|
|
+ self.ocr_engine_normalized = "mineru"
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.warning(f"[HybridExtractor] 未知的 OCR 引擎 '{self.ocr_engine}',使用默认 glm_ocr")
|
|
|
|
|
+ self.ocr_engine_normalized = "glm_ocr"
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"[HybridExtractor] OCR 引擎配置: '{self.ocr_engine}' -> 使用: '{self.ocr_engine_normalized}'")
|
|
|
|
|
+
|
|
|
|
|
+ # GLM-OCR 配置(从 config.ini 读取,兼容原有逻辑)
|
|
|
|
|
+ self.glm_api_url = api_url or _read_ini_config(
|
|
|
|
|
+ "ocr", "glm_ocr_api_url",
|
|
|
"http://183.220.37.46:25429/v1/chat/completions"
|
|
"http://183.220.37.46:25429/v1/chat/completions"
|
|
|
)
|
|
)
|
|
|
- self.timeout = timeout
|
|
|
|
|
- self.headers = {"Content-Type": "application/json"}
|
|
|
|
|
|
|
+ self.glm_timeout = int(_read_ini_config("ocr", "glm_ocr_timeout", "600"))
|
|
|
|
|
+ self.glm_headers = {"Content-Type": "application/json"}
|
|
|
|
|
+
|
|
|
|
|
+ # 【新增】MinerU 配置
|
|
|
|
|
+ self.mineru_api_url = _read_ini_config(
|
|
|
|
|
+ "ocr", "mineru_api_url",
|
|
|
|
|
+ "http://183.220.37.46:25428/file_parse"
|
|
|
|
|
+ )
|
|
|
|
|
+ self.mineru_timeout = int(_read_ini_config("ocr", "mineru_timeout", "300"))
|
|
|
|
|
|
|
|
- # 飞浆版面分析配置
|
|
|
|
|
|
|
+ # 【优化】飞浆版面分析配置 - DPI 统一为 200
|
|
|
|
|
+ # 原理:版面分析和 OCR 使用相同 DPI,第一阶段渲染的图片可直接复用
|
|
|
self.layout_dpi = layout_dpi
|
|
self.layout_dpi = layout_dpi
|
|
|
self.ocr_dpi = ocr_dpi
|
|
self.ocr_dpi = ocr_dpi
|
|
|
self.jpg_quality = jpg_quality
|
|
self.jpg_quality = jpg_quality
|
|
|
self._layout_engine: Optional[Any] = None
|
|
self._layout_engine: Optional[Any] = None
|
|
|
|
|
|
|
|
|
|
+ # 【优化】图片缓存:版面分析阶段缓存 table 页图片,供 OCR 阶段复用
|
|
|
|
|
+ # 格式: {page_num: (width, height, jpeg_bytes)}
|
|
|
|
|
+ self._image_cache: Dict[int, tuple] = {}
|
|
|
|
|
+
|
|
|
# 外部注入的进度状态字典
|
|
# 外部注入的进度状态字典
|
|
|
self._progress_state: Optional[dict] = None
|
|
self._progress_state: Optional[dict] = None
|
|
|
|
|
|
|
@@ -98,16 +148,21 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
self._layout_engine = RapidLayout()
|
|
self._layout_engine = RapidLayout()
|
|
|
return self._layout_engine
|
|
return self._layout_engine
|
|
|
|
|
|
|
|
- def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
|
|
|
|
|
|
|
+ def _detect_table_pages(self, doc: fitz.Document, dpi: int = 200) -> Set[int]:
|
|
|
"""
|
|
"""
|
|
|
使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
|
|
使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
|
|
|
- 【保持不变】
|
|
|
|
|
|
|
+
|
|
|
|
|
+ 【优化】检测到 table 的页面,将 JPEG 图片缓存到 self._image_cache
|
|
|
|
|
+ 供后续 OCR 阶段直接使用,避免重复渲染 PDF。
|
|
|
"""
|
|
"""
|
|
|
table_pages: Set[int] = set()
|
|
table_pages: Set[int] = set()
|
|
|
layout_engine = self._get_layout_engine()
|
|
layout_engine = self._get_layout_engine()
|
|
|
total_pages = len(doc)
|
|
total_pages = len(doc)
|
|
|
|
|
+
|
|
|
|
|
+ # 清空图片缓存
|
|
|
|
|
+ self._image_cache.clear()
|
|
|
|
|
|
|
|
- logger.debug(f" [飞浆分析] 开始版面分析,共 {total_pages} 页...")
|
|
|
|
|
|
|
+ logger.info(f" [飞浆分析] 开始版面分析,共 {total_pages} 页,DPI={dpi}(图片缓存已启用)")
|
|
|
|
|
|
|
|
for page_num in range(1, total_pages + 1):
|
|
for page_num in range(1, total_pages + 1):
|
|
|
page = doc[page_num - 1]
|
|
page = doc[page_num - 1]
|
|
@@ -133,7 +188,17 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
# 判断是否包含 table
|
|
# 判断是否包含 table
|
|
|
if "table" in labels:
|
|
if "table" in labels:
|
|
|
table_pages.add(page_num)
|
|
table_pages.add(page_num)
|
|
|
- logger.debug(f" 第 {page_num} 页: 检测到 table 区域 -> 将走 GLM-OCR")
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 【优化】缓存 table 页图片为 JPEG,供 OCR 阶段复用
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 直接保存 Pixmap 的 JPEG 数据,无需 PIL 转换
|
|
|
|
|
+ jpeg_bytes = pix.tobytes("jpeg")
|
|
|
|
|
+ self._image_cache[page_num] = (pix.width, pix.height, jpeg_bytes)
|
|
|
|
|
+ logger.debug(f" 第 {page_num} 页: 检测到 table -> 缓存图片 "
|
|
|
|
|
+ f"({pix.width}x{pix.height}, {len(jpeg_bytes)/1024:.1f} KB)")
|
|
|
|
|
+ except Exception as cache_err:
|
|
|
|
|
+ logger.warning(f" 第 {page_num} 页: 图片缓存失败 ({cache_err})")
|
|
|
|
|
+
|
|
|
else:
|
|
else:
|
|
|
region_types = ", ".join(set(labels)) if labels else "无"
|
|
region_types = ", ".join(set(labels)) if labels else "无"
|
|
|
logger.debug(f" 第 {page_num} 页: {region_types}")
|
|
logger.debug(f" 第 {page_num} 页: {region_types}")
|
|
@@ -147,7 +212,9 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
self._progress_state['current'] = int(page_num / total_pages * 50)
|
|
self._progress_state['current'] = int(page_num / total_pages * 50)
|
|
|
self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
|
|
self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
|
|
|
|
|
|
|
|
- logger.debug(f" [飞浆分析] 完成,共 {len(table_pages)} 页包含 table 区域: {sorted(table_pages)}")
|
|
|
|
|
|
|
+ cache_size_mb = sum(len(data[2]) for data in self._image_cache.values()) / 1024 / 1024
|
|
|
|
|
+ logger.info(f" [飞浆分析] 完成: {len(table_pages)} 页 table,"
|
|
|
|
|
+ f"缓存 {len(self._image_cache)} 页图片 ({cache_size_mb:.1f} MB)")
|
|
|
return table_pages
|
|
return table_pages
|
|
|
|
|
|
|
|
def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
|
|
def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
|
|
@@ -156,7 +223,14 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
|
|
1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
|
|
|
2. 含有 table 的页面走 GLM-OCR
|
|
2. 含有 table 的页面走 GLM-OCR
|
|
|
3. 其他页面走本地 PyMuPDF 提取
|
|
3. 其他页面走本地 PyMuPDF 提取
|
|
|
|
|
+
|
|
|
|
|
+ 【统计信息】本方法会统计并输出总提取时间、OCR页数等信息
|
|
|
"""
|
|
"""
|
|
|
|
|
+ # 记录总开始时间
|
|
|
|
|
+ total_start_time = time.time()
|
|
|
|
|
+ layout_analysis_time = 0.0
|
|
|
|
|
+ ocr_total_time = 0.0
|
|
|
|
|
+
|
|
|
# 打开文档
|
|
# 打开文档
|
|
|
if source.content is not None:
|
|
if source.content is not None:
|
|
|
doc = fitz.open(stream=io.BytesIO(source.content))
|
|
doc = fitz.open(stream=io.BytesIO(source.content))
|
|
@@ -175,22 +249,28 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
ocr_page_count = 0 # 统计需要OCR的页数
|
|
ocr_page_count = 0 # 统计需要OCR的页数
|
|
|
|
|
|
|
|
# INFO级别:开始文档提取(方便查看主要流程)
|
|
# INFO级别:开始文档提取(方便查看主要流程)
|
|
|
- logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,使用混合模式(GLM-OCR)")
|
|
|
|
|
- logger.debug(f"开始混合提取(飞浆版面分析 + GLM-OCR),共 {total_pages} 页...")
|
|
|
|
|
|
|
+ current_engine = "GLM-OCR" if self.ocr_engine_normalized == "glm_ocr" else "MinerU"
|
|
|
|
|
+ logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,OCR引擎: {current_engine}")
|
|
|
|
|
+ logger.debug(f"开始混合提取(飞浆版面分析 + {current_engine}),共 {total_pages} 页...")
|
|
|
|
|
|
|
|
if self._progress_state is not None:
|
|
if self._progress_state is not None:
|
|
|
self._progress_state['current'] = 0
|
|
self._progress_state['current'] = 0
|
|
|
self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
|
|
self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
|
|
|
|
|
|
|
|
# ========== 第一阶段:飞浆版面分析 ==========
|
|
# ========== 第一阶段:飞浆版面分析 ==========
|
|
|
|
|
+ layout_start_time = time.time()
|
|
|
table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
|
|
table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
|
|
|
|
|
+ layout_analysis_time = time.time() - layout_start_time
|
|
|
ocr_page_count = len(table_pages)
|
|
ocr_page_count = len(table_pages)
|
|
|
|
|
|
|
|
# INFO级别:版面分析完成,显示OCR页数
|
|
# INFO级别:版面分析完成,显示OCR页数
|
|
|
if ocr_page_count > 0:
|
|
if ocr_page_count > 0:
|
|
|
- logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,{total_pages - ocr_page_count} 页直接提取")
|
|
|
|
|
|
|
+ logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,"
|
|
|
|
|
+ f"{total_pages - ocr_page_count} 页直接提取,"
|
|
|
|
|
+ f"版面分析耗时: {layout_analysis_time:.2f}s")
|
|
|
else:
|
|
else:
|
|
|
- logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取")
|
|
|
|
|
|
|
+ logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取,"
|
|
|
|
|
+ f"版面分析耗时: {layout_analysis_time:.2f}s")
|
|
|
|
|
|
|
|
# ========== 第二阶段:分流处理 ==========
|
|
# ========== 第二阶段:分流处理 ==========
|
|
|
logger.debug(f"\n开始分流处理...")
|
|
logger.debug(f"\n开始分流处理...")
|
|
@@ -199,13 +279,22 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
page_num = i + 1
|
|
page_num = i + 1
|
|
|
|
|
|
|
|
if page_num in table_pages:
|
|
if page_num in table_pages:
|
|
|
- logger.debug(f" [第 {page_num} 页] 检测到 table -> 走 GLM-OCR")
|
|
|
|
|
|
|
+ # 【修改】根据配置选择 OCR 引擎
|
|
|
|
|
+ # 使用规范化后的引擎名称(支持 glm_ocr/glm-ocr 和 mineru/mineru-ocr)
|
|
|
|
|
+ is_glm_ocr = self.ocr_engine_normalized == "glm_ocr"
|
|
|
|
|
+ ocr_name = "GLM-OCR" if is_glm_ocr else "MinerU"
|
|
|
|
|
+ logger.debug(f" [第 {page_num} 页] 检测到 table -> 走 {ocr_name}")
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
- # 调用 GLM-OCR
|
|
|
|
|
- page_text = self._ocr_page_with_glm(page, page_num, source_file)
|
|
|
|
|
|
|
+ # 根据配置调用不同的 OCR 引擎,并统计 OCR 时间
|
|
|
|
|
+ ocr_start_time = time.time()
|
|
|
|
|
+ if is_glm_ocr:
|
|
|
|
|
+ page_text = self._ocr_page_with_glm(page, page_num, source_file)
|
|
|
|
|
+ else:
|
|
|
|
|
+ page_text = self._ocr_page_with_mineru(doc, page_num, source_file)
|
|
|
|
|
+ ocr_total_time += time.time() - ocr_start_time
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- logger.error(f" GLM-OCR 失败,回退到本地提取: {e}")
|
|
|
|
|
|
|
+ logger.error(f" {ocr_name} 失败,回退到本地提取: {e}")
|
|
|
raw_text = page.get_text()
|
|
raw_text = page.get_text()
|
|
|
page_text = self.local_extractor._filter_header_footer(raw_text)
|
|
page_text = self.local_extractor._filter_header_footer(raw_text)
|
|
|
else:
|
|
else:
|
|
@@ -232,10 +321,33 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
|
|
finally:
|
|
finally:
|
|
|
doc.close()
|
|
doc.close()
|
|
|
|
|
+ # 【优化】清理图片缓存,释放内存
|
|
|
|
|
+ if hasattr(self, '_image_cache'):
|
|
|
|
|
+ cache_size = len(self._image_cache)
|
|
|
|
|
+ self._image_cache.clear()
|
|
|
|
|
+ if cache_size > 0:
|
|
|
|
|
+ logger.debug(f" [缓存清理] 已清理 {cache_size} 页图片缓存")
|
|
|
|
|
|
|
|
- # INFO级别:文档提取完成
|
|
|
|
|
|
|
+ # ========== 统计信息输出 ==========
|
|
|
|
|
+ # INFO级别:文档提取完成,输出详细统计
|
|
|
|
|
+ total_time = time.time() - total_start_time
|
|
|
total_chars = sum(len(page['text']) for page in pages)
|
|
total_chars = sum(len(page['text']) for page in pages)
|
|
|
- logger.info(f"[文档提取] 完成,共 {total_pages} 页,总字符数: {total_chars}")
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 计算各类时间占比
|
|
|
|
|
+ ocr_avg_time = ocr_total_time / ocr_page_count if ocr_page_count > 0 else 0
|
|
|
|
|
+ local_pages = total_pages - ocr_page_count
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(
|
|
|
|
|
+ f"[文档提取] 完成统计 | "
|
|
|
|
|
+ f"总页数: {total_pages} | "
|
|
|
|
|
+ f"OCR页数: {ocr_page_count} | "
|
|
|
|
|
+ f"本地提取: {local_pages} | "
|
|
|
|
|
+ f"总耗时: {total_time:.2f}s | "
|
|
|
|
|
+ f"版面分析: {layout_analysis_time:.2f}s | "
|
|
|
|
|
+ f"OCR耗时: {ocr_total_time:.2f}s | "
|
|
|
|
|
+ f"OCR平均: {ocr_avg_time:.2f}s/页 | "
|
|
|
|
|
+ f"总字符数: {total_chars}"
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
return pages
|
|
return pages
|
|
|
|
|
|
|
@@ -243,42 +355,41 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
"""
|
|
"""
|
|
|
将单页转为图片并调用 GLM-OCR 本地 API 识别
|
|
将单页转为图片并调用 GLM-OCR 本地 API 识别
|
|
|
|
|
|
|
|
- 【逻辑来源】glm_ocr_api_extractor.py 最终实现版本
|
|
|
|
|
|
|
+ 【优化】优先使用版面分析阶段缓存的图片,避免重复渲染
|
|
|
|
|
|
|
|
流程:
|
|
流程:
|
|
|
- 1. PyMuPDF 渲染页面为图片(220 DPI)
|
|
|
|
|
- 2. PIL 压缩图片(短边限制 1024px,JPEG 质量 85)
|
|
|
|
|
- 3. Base64 编码
|
|
|
|
|
- 4. 构建 OpenAI 兼容格式请求
|
|
|
|
|
|
|
+ 1. 优先使用缓存图片(如可用)
|
|
|
|
|
+ 2. 否则 PyMuPDF 渲染页面为图片(200 DPI)
|
|
|
|
|
+ 3. PIL 压缩图片(短边限制 1024px,JPEG 质量 90)
|
|
|
|
|
+ 4. Base64 编码
|
|
|
5. POST 请求 GLM-OCR API
|
|
5. POST 请求 GLM-OCR API
|
|
|
6. 解析响应并转换 HTML→Markdown
|
|
6. 解析响应并转换 HTML→Markdown
|
|
|
-
|
|
|
|
|
- 请求格式:
|
|
|
|
|
- {
|
|
|
|
|
- "model": "GLM-OCR",
|
|
|
|
|
- "messages": [{
|
|
|
|
|
- "role": "user",
|
|
|
|
|
- "content": [
|
|
|
|
|
- {"type": "text", "text": "提示词"},
|
|
|
|
|
- {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
|
|
|
|
|
- ]
|
|
|
|
|
- }],
|
|
|
|
|
- "max_tokens": 2048,
|
|
|
|
|
- "temperature": 0.1
|
|
|
|
|
- }
|
|
|
|
|
"""
|
|
"""
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
|
|
|
|
|
|
|
|
+ # 【优化】检查是否有缓存图片
|
|
|
|
|
+ cached = self._image_cache.get(page_num)
|
|
|
|
|
+ use_cache = cached is not None
|
|
|
|
|
+
|
|
|
# INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
|
|
# INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
|
|
|
- logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页(扫描页)")
|
|
|
|
|
|
|
+ cache_info = "(使用缓存图片)" if use_cache else ""
|
|
|
|
|
+ logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页 {cache_info}")
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
- # 1. 渲染为图片
|
|
|
|
|
- pix = page.get_pixmap(dpi=self.ocr_dpi)
|
|
|
|
|
- img_bytes = pix.tobytes("jpeg")
|
|
|
|
|
- original_kb = len(img_bytes) / 1024
|
|
|
|
|
-
|
|
|
|
|
- logger.debug(f" [GLM-OCR] 第 {page_num} 页图片: {original_kb:.1f} KB ({pix.width}x{pix.height})")
|
|
|
|
|
|
|
+ # 1. 获取图片(优先使用缓存)
|
|
|
|
|
+ if use_cache:
|
|
|
|
|
+ # 【优化】使用版面分析阶段缓存的图片
|
|
|
|
|
+ width, height, img_bytes = cached
|
|
|
|
|
+ original_kb = len(img_bytes) / 1024
|
|
|
|
|
+ logger.debug(f" [GLM-OCR] 第 {page_num} 页使用缓存图片: "
|
|
|
|
|
+ f"{original_kb:.1f} KB ({width}x{height})")
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 兜底:重新渲染(理论上不会发生,因为 table 页都应已缓存)
|
|
|
|
|
+ pix = page.get_pixmap(dpi=self.ocr_dpi)
|
|
|
|
|
+ img_bytes = pix.tobytes("jpeg")
|
|
|
|
|
+ original_kb = len(img_bytes) / 1024
|
|
|
|
|
+ logger.warning(f" [GLM-OCR] 第 {page_num} 页无缓存,重新渲染: "
|
|
|
|
|
+ f"{original_kb:.1f} KB ({pix.width}x{pix.height})")
|
|
|
|
|
|
|
|
# 2. 压缩图片
|
|
# 2. 压缩图片
|
|
|
compressed_bytes = self._compress_image(img_bytes)
|
|
compressed_bytes = self._compress_image(img_bytes)
|
|
@@ -313,10 +424,10 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
|
|
# 5. 调用 GLM-OCR API
|
|
# 5. 调用 GLM-OCR API
|
|
|
response = requests.post(
|
|
response = requests.post(
|
|
|
- self.api_url,
|
|
|
|
|
- headers=self.headers,
|
|
|
|
|
|
|
+ self.glm_api_url,
|
|
|
|
|
+ headers=self.glm_headers,
|
|
|
json=payload,
|
|
json=payload,
|
|
|
- timeout=self.timeout
|
|
|
|
|
|
|
+ timeout=self.glm_timeout
|
|
|
)
|
|
)
|
|
|
response.raise_for_status()
|
|
response.raise_for_status()
|
|
|
|
|
|
|
@@ -338,6 +449,116 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
logger.error(f" [GLM-OCR] 第 {page_num} 页识别失败: {e}")
|
|
logger.error(f" [GLM-OCR] 第 {page_num} 页识别失败: {e}")
|
|
|
raise
|
|
raise
|
|
|
|
|
|
|
|
|
|
+ def _ocr_page_with_mineru(self, doc: fitz.Document, page_num: int, original_filename: str) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 【新增】使用 MinerU 本地 API 识别单页
|
|
|
|
|
+
|
|
|
|
|
+ 流程:
|
|
|
|
|
+ 1. 【优化】优先使用版面分析缓存的图片(JPEG)
|
|
|
|
|
+ 2. 无缓存时,提取单页为临时 PDF 文件
|
|
|
|
|
+ 3. 调用 MinerU API 上传识别
|
|
|
|
|
+ 4. 提取 Markdown 内容
|
|
|
|
|
+ 5. 清理临时文件
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ doc: 原始 PDF 文档对象
|
|
|
|
|
+ page_num: 页码(1-based)
|
|
|
|
|
+ original_filename: 原始文件名(用于日志)
|
|
|
|
|
+
|
|
|
|
|
+ Returns:
|
|
|
|
|
+ str: 识别出的 Markdown 文本
|
|
|
|
|
+ """
|
|
|
|
|
+ import tempfile
|
|
|
|
|
+ import os
|
|
|
|
|
+
|
|
|
|
|
+ start_time = time.time()
|
|
|
|
|
+
|
|
|
|
|
+ # 【优化】检查是否有缓存图片
|
|
|
|
|
+ cached = self._image_cache.get(page_num)
|
|
|
|
|
+ use_cache = cached is not None
|
|
|
|
|
+
|
|
|
|
|
+ # INFO级别:开始识别
|
|
|
|
|
+ cache_info = "(使用缓存图片)" if use_cache else ""
|
|
|
|
|
+ logger.info(f"[MinerU] 开始识别第 {page_num} 页 {cache_info}")
|
|
|
|
|
+
|
|
|
|
|
+ tmp_pdf_path = None
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 【优化】优先使用缓存的图片数据
|
|
|
|
|
+ if use_cache:
|
|
|
|
|
+ width, height, img_bytes = cached
|
|
|
|
|
+ logger.debug(f" [MinerU] 第 {page_num} 页使用缓存图片: "
|
|
|
|
|
+ f"{len(img_bytes)/1024:.1f} KB ({width}x{height})")
|
|
|
|
|
+
|
|
|
|
|
+ # 使用图片直接上传(MinerU 支持图片格式)
|
|
|
|
|
+ files = {'files': (f"page_{page_num}.jpg", io.BytesIO(img_bytes))}
|
|
|
|
|
+ response = requests.post(
|
|
|
|
|
+ self.mineru_api_url,
|
|
|
|
|
+ files=files,
|
|
|
|
|
+ timeout=self.mineru_timeout
|
|
|
|
|
+ )
|
|
|
|
|
+ else:
|
|
|
|
|
+ # 兜底:提取单页为临时 PDF
|
|
|
|
|
+ logger.debug(f" [MinerU] 第 {page_num} 页无缓存,创建临时 PDF")
|
|
|
|
|
+
|
|
|
|
|
+ single_page_doc = fitz.open()
|
|
|
|
|
+ single_page_doc.insert_pdf(doc, from_page=page_num-1, to_page=page_num-1)
|
|
|
|
|
+
|
|
|
|
|
+ # 创建临时文件
|
|
|
|
|
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
|
|
|
|
|
+ tmp_pdf_path = tmp_file.name
|
|
|
|
|
+
|
|
|
|
|
+ single_page_doc.save(tmp_pdf_path)
|
|
|
|
|
+ single_page_doc.close()
|
|
|
|
|
+
|
|
|
|
|
+ file_size_kb = os.path.getsize(tmp_pdf_path) / 1024
|
|
|
|
|
+ logger.debug(f" [MinerU] 第 {page_num} 页临时文件: {file_size_kb:.1f} KB")
|
|
|
|
|
+
|
|
|
|
|
+ # 调用 MinerU API
|
|
|
|
|
+ with open(tmp_pdf_path, 'rb') as f:
|
|
|
|
|
+ files = {'files': (f"page_{page_num}.pdf", f)}
|
|
|
|
|
+ response = requests.post(
|
|
|
|
|
+ self.mineru_api_url,
|
|
|
|
|
+ files=files,
|
|
|
|
|
+ timeout=self.mineru_timeout
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ if response.status_code != 200:
|
|
|
|
|
+ raise RuntimeError(f"MinerU API error: {response.status_code} - {response.text[:200]}")
|
|
|
|
|
+
|
|
|
|
|
+ # 3. 解析结果
|
|
|
|
|
+ result = response.json()
|
|
|
|
|
+ content = ""
|
|
|
|
|
+
|
|
|
|
|
+ if "results" in result and isinstance(result["results"], dict):
|
|
|
|
|
+ for filename, file_data in result["results"].items():
|
|
|
|
|
+ if isinstance(file_data, dict) and "md_content" in file_data:
|
|
|
|
|
+ content = file_data["md_content"]
|
|
|
|
|
+ break
|
|
|
|
|
+
|
|
|
|
|
+ # 4. 处理 HTML 转 Markdown(如果包含 HTML 标签)
|
|
|
|
|
+ if "<table" in content.lower() or "<div" in content.lower():
|
|
|
|
|
+ logger.debug(f" [MinerU] 检测到 HTML 标签,转换为 Markdown")
|
|
|
|
|
+ content = self._process_raw_content(content)
|
|
|
|
|
+
|
|
|
|
|
+ elapsed = time.time() - start_time
|
|
|
|
|
+ logger.info(f"[MinerU] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(content)}")
|
|
|
|
|
+
|
|
|
|
|
+ return content
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f" [MinerU] 第 {page_num} 页识别失败: {e}")
|
|
|
|
|
+ raise
|
|
|
|
|
+
|
|
|
|
|
+ finally:
|
|
|
|
|
+ # 清理临时文件
|
|
|
|
|
+ if tmp_pdf_path and os.path.exists(tmp_pdf_path):
|
|
|
|
|
+ try:
|
|
|
|
|
+ os.remove(tmp_pdf_path)
|
|
|
|
|
+ logger.debug(f" [MinerU] 清理临时文件: {tmp_pdf_path}")
|
|
|
|
|
+ except:
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
def _compress_image(self, img_bytes: bytes) -> bytes:
|
|
def _compress_image(self, img_bytes: bytes) -> bytes:
|
|
|
"""
|
|
"""
|
|
|
压缩图片至 GLM-OCR 要求的尺寸限制内
|
|
压缩图片至 GLM-OCR 要求的尺寸限制内
|