|
@@ -1,28 +1,42 @@
|
|
|
"""
|
|
"""
|
|
|
-混合全文提取实现 (HybridFullTextExtractor) - 飞浆版面分析版
|
|
|
|
|
|
|
+混合全文提取实现 (HybridFullTextExtractor) - GLM-OCR 版
|
|
|
|
|
|
|
|
-基于飞浆 RapidLayout 版面分析,检测 table 区域判断扫描件:
|
|
|
|
|
-1. 第一阶段:使用飞浆 RapidLayout 对所有页面进行版面分析
|
|
|
|
|
-2. 第二阶段:含有 table 区域的页面走 MinerU OCR,其余走本地提取
|
|
|
|
|
|
|
+【修改日期】2025-03-27
|
|
|
|
|
+【修改说明】OCR 引擎从 MinerU 替换为 GLM-OCR 本地 API
|
|
|
|
|
+- 版面分析阶段:保持不变(飞浆 RapidLayout)
|
|
|
|
|
+- OCR 阶段:改为 GLM-OCR 单页请求
|
|
|
|
|
+- 删除所有 MinerU 相关代码
|
|
|
|
|
+
|
|
|
|
|
+【请求格式】参考 glm_ocr_api_extractor.py 最终实现版本
|
|
|
|
|
+【API 地址】http://183.220.37.46:25429/v1/chat/completions
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
+import base64
|
|
|
import io
|
|
import io
|
|
|
|
|
+import time
|
|
|
|
|
+from typing import Any, Dict, List, Optional, Set
|
|
|
|
|
+
|
|
|
import fitz # PyMuPDF
|
|
import fitz # PyMuPDF
|
|
|
-import os
|
|
|
|
|
-import tempfile
|
|
|
|
|
import numpy as np
|
|
import numpy as np
|
|
|
-from typing import Any, Dict, List, Optional, Set
|
|
|
|
|
|
|
+import requests
|
|
|
|
|
|
|
|
from foundation.observability.logger.loggering import review_logger as logger
|
|
from foundation.observability.logger.loggering import review_logger as logger
|
|
|
|
|
|
|
|
from ..config.provider import default_config_provider
|
|
from ..config.provider import default_config_provider
|
|
|
from ..interfaces import DocumentSource, FullTextExtractor
|
|
from ..interfaces import DocumentSource, FullTextExtractor
|
|
|
from .fulltext_extractor import PdfFullTextExtractor
|
|
from .fulltext_extractor import PdfFullTextExtractor
|
|
|
-from .mineru_extractor import LocalMinerUFullTextExtractor
|
|
|
|
|
|
|
|
|
|
-# 尝试导入 RapidLayout,如果未安装则给出友好提示
|
|
|
|
|
|
|
+# 尝试导入 PIL 用于图片压缩
|
|
|
|
|
+try:
|
|
|
|
|
+ from PIL import Image
|
|
|
|
|
+ PIL_AVAILABLE = True
|
|
|
|
|
+except ImportError:
|
|
|
|
|
+ PIL_AVAILABLE = False
|
|
|
|
|
+ logger.warning("PIL 未安装,GLM-OCR 图片压缩功能将不可用")
|
|
|
|
|
+
|
|
|
|
|
+# 尝试导入 RapidLayout
|
|
|
try:
|
|
try:
|
|
|
from rapid_layout import RapidLayout
|
|
from rapid_layout import RapidLayout
|
|
|
RAPID_LAYOUT_AVAILABLE = True
|
|
RAPID_LAYOUT_AVAILABLE = True
|
|
@@ -33,32 +47,44 @@ except ImportError:
|
|
|
|
|
|
|
|
class HybridFullTextExtractor(FullTextExtractor):
|
|
class HybridFullTextExtractor(FullTextExtractor):
|
|
|
"""
|
|
"""
|
|
|
- 混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 MinerU OCR。
|
|
|
|
|
|
|
+ 混合提取器:基于飞浆版面分析检测 table 区域,智能路由扫描页到 GLM-OCR。
|
|
|
|
|
+
|
|
|
|
|
+ 【变更记录】
|
|
|
|
|
+ - 2025-03-27: OCR 引擎从 MinerU 切换为 GLM-OCR 本地 API
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
|
|
+ # GLM-OCR 图片尺寸限制
|
|
|
|
|
+ MAX_SHORT_EDGE = 1024 # 短边最大 1024px
|
|
|
|
|
+ JPEG_QUALITY = 90 # 提高质量到 90,平衡识别效果和传输大小
|
|
|
|
|
+
|
|
|
def __init__(
|
|
def __init__(
|
|
|
self,
|
|
self,
|
|
|
layout_dpi: int = 180,
|
|
layout_dpi: int = 180,
|
|
|
ocr_dpi: int = 220,
|
|
ocr_dpi: int = 220,
|
|
|
- jpg_quality: int = 90
|
|
|
|
|
|
|
+ jpg_quality: int = 85, # 降低为 85 配合 GLM-OCR
|
|
|
|
|
+ api_url: Optional[str] = None,
|
|
|
|
|
+ timeout: int = 600
|
|
|
) -> None:
|
|
) -> None:
|
|
|
self._cfg = default_config_provider
|
|
self._cfg = default_config_provider
|
|
|
- # 复用已有的提取器
|
|
|
|
|
self.local_extractor = PdfFullTextExtractor()
|
|
self.local_extractor = PdfFullTextExtractor()
|
|
|
- self.mineru_extractor = LocalMinerUFullTextExtractor() # 使用本地 MinerU
|
|
|
|
|
-
|
|
|
|
|
- # 飞浆版面分析配置(保守版优化参数)
|
|
|
|
|
- self.layout_dpi = layout_dpi # 版面分析 DPI:180(平衡检测精度和速度)
|
|
|
|
|
- self.ocr_dpi = ocr_dpi # OCR阶段 DPI:220(表格识别甜点值)
|
|
|
|
|
- self.jpg_quality = jpg_quality # JPEG质量:90(几乎无损,文件可控)
|
|
|
|
|
- self._layout_engine: Optional[Any] = None # 延迟初始化
|
|
|
|
|
-
|
|
|
|
|
- # 外部注入的进度状态字典(由 DocumentWorkflow 设置,心跳协程读取)
|
|
|
|
|
- # 格式:{'current': int(0-100), 'message': str}
|
|
|
|
|
- # 阶段一(版面分析):current 0→50,阶段二(OCR提取):current 50→100
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # GLM-OCR 配置
|
|
|
|
|
+ self.api_url = api_url or self._cfg.get(
|
|
|
|
|
+ "glm_ocr.api_url",
|
|
|
|
|
+ "http://183.220.37.46:25429/v1/chat/completions"
|
|
|
|
|
+ )
|
|
|
|
|
+ self.timeout = timeout
|
|
|
|
|
+ self.headers = {"Content-Type": "application/json"}
|
|
|
|
|
+
|
|
|
|
|
+ # 飞浆版面分析配置
|
|
|
|
|
+ self.layout_dpi = layout_dpi
|
|
|
|
|
+ self.ocr_dpi = ocr_dpi
|
|
|
|
|
+ self.jpg_quality = jpg_quality
|
|
|
|
|
+ self._layout_engine: Optional[Any] = None
|
|
|
|
|
+
|
|
|
|
|
+ # 外部注入的进度状态字典
|
|
|
self._progress_state: Optional[dict] = None
|
|
self._progress_state: Optional[dict] = None
|
|
|
|
|
|
|
|
- # 检查 RapidLayout 是否可用
|
|
|
|
|
if not RAPID_LAYOUT_AVAILABLE:
|
|
if not RAPID_LAYOUT_AVAILABLE:
|
|
|
raise ImportError(
|
|
raise ImportError(
|
|
|
"RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
|
|
"RapidLayout 未安装。请在 doc_worker_venv 虚拟环境中运行:\n"
|
|
@@ -75,13 +101,7 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
|
|
def _detect_table_pages(self, doc: fitz.Document, dpi: int = 150) -> Set[int]:
|
|
|
"""
|
|
"""
|
|
|
使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
|
|
使用飞浆 RapidLayout 检测所有页面,返回包含 table 区域的页码集合。
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- doc: PyMuPDF 文档对象
|
|
|
|
|
- dpi: PDF 转图片的分辨率
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 包含 table 区域的页码集合 (1-based)
|
|
|
|
|
|
|
+ 【保持不变】
|
|
|
"""
|
|
"""
|
|
|
table_pages: Set[int] = set()
|
|
table_pages: Set[int] = set()
|
|
|
layout_engine = self._get_layout_engine()
|
|
layout_engine = self._get_layout_engine()
|
|
@@ -90,41 +110,39 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
logger.debug(f" [飞浆分析] 开始版面分析,共 {total_pages} 页...")
|
|
logger.debug(f" [飞浆分析] 开始版面分析,共 {total_pages} 页...")
|
|
|
|
|
|
|
|
for page_num in range(1, total_pages + 1):
|
|
for page_num in range(1, total_pages + 1):
|
|
|
- page = doc[page_num - 1] # PyMuPDF 使用 0-based 索引
|
|
|
|
|
|
|
+ page = doc[page_num - 1]
|
|
|
|
|
|
|
|
- # 1. 将页面转换为图片
|
|
|
|
|
|
|
+ # 将页面转换为图片
|
|
|
pix = page.get_pixmap(dpi=dpi)
|
|
pix = page.get_pixmap(dpi=dpi)
|
|
|
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
|
|
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
|
|
|
|
|
|
|
|
- # 2. 飞浆版面分析
|
|
|
|
|
|
|
+ # 飞浆版面分析
|
|
|
try:
|
|
try:
|
|
|
layout_output = layout_engine(img)
|
|
layout_output = layout_engine(img)
|
|
|
|
|
|
|
|
- # 3. 解析版面结果,检查是否有 table 区域
|
|
|
|
|
|
|
+ # 解析版面结果,检查是否有 table 区域
|
|
|
labels = []
|
|
labels = []
|
|
|
if hasattr(layout_output, 'class_names'):
|
|
if hasattr(layout_output, 'class_names'):
|
|
|
labels = list(layout_output.class_names)
|
|
labels = list(layout_output.class_names)
|
|
|
elif hasattr(layout_output, 'boxes'):
|
|
elif hasattr(layout_output, 'boxes'):
|
|
|
- # 兼容不同版本的输出格式
|
|
|
|
|
labels = [
|
|
labels = [
|
|
|
label for _, label, _
|
|
label for _, label, _
|
|
|
in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
|
|
in zip(layout_output.boxes, layout_output.class_names, layout_output.scores)
|
|
|
]
|
|
]
|
|
|
|
|
|
|
|
- # 4. 判断是否包含 table
|
|
|
|
|
|
|
+ # 判断是否包含 table
|
|
|
if "table" in labels:
|
|
if "table" in labels:
|
|
|
table_pages.add(page_num)
|
|
table_pages.add(page_num)
|
|
|
- logger.debug(f" 第 {page_num} 页: 检测到 table 区域 -> 将走 MinerU OCR")
|
|
|
|
|
|
|
+ logger.debug(f" 第 {page_num} 页: 检测到 table 区域 -> 将走 GLM-OCR")
|
|
|
else:
|
|
else:
|
|
|
region_types = ", ".join(set(labels)) if labels else "无"
|
|
region_types = ", ".join(set(labels)) if labels else "无"
|
|
|
logger.debug(f" 第 {page_num} 页: {region_types}")
|
|
logger.debug(f" 第 {page_num} 页: {region_types}")
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
logger.error(f" 第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
|
|
logger.error(f" 第 {page_num} 页: 版面分析失败 ({e}),默认不走 OCR")
|
|
|
- # 分析失败时,保守起见不走 OCR
|
|
|
|
|
pass
|
|
pass
|
|
|
|
|
|
|
|
- # 阶段一进度:已分析页 / 总页数 → 0% ~ 50%
|
|
|
|
|
|
|
+ # 阶段一进度
|
|
|
if self._progress_state is not None:
|
|
if self._progress_state is not None:
|
|
|
self._progress_state['current'] = int(page_num / total_pages * 50)
|
|
self._progress_state['current'] = int(page_num / total_pages * 50)
|
|
|
self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
|
|
self._progress_state['message'] = f"版面分析中:已分析 {page_num}/{total_pages} 页"
|
|
@@ -136,10 +154,10 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
"""
|
|
"""
|
|
|
执行混合提取流程:
|
|
执行混合提取流程:
|
|
|
1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
|
|
1. 首先用飞浆 RapidLayout 检测所有页面的 table 区域
|
|
|
- 2. 含有 table 的页面走 MinerU OCR
|
|
|
|
|
|
|
+ 2. 含有 table 的页面走 GLM-OCR
|
|
|
3. 其他页面走本地 PyMuPDF 提取
|
|
3. 其他页面走本地 PyMuPDF 提取
|
|
|
"""
|
|
"""
|
|
|
- # 1. 打开文档
|
|
|
|
|
|
|
+ # 打开文档
|
|
|
if source.content is not None:
|
|
if source.content is not None:
|
|
|
doc = fitz.open(stream=io.BytesIO(source.content))
|
|
doc = fitz.open(stream=io.BytesIO(source.content))
|
|
|
source_file = "bytes_stream"
|
|
source_file = "bytes_stream"
|
|
@@ -154,14 +172,25 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
total_pages = len(doc)
|
|
total_pages = len(doc)
|
|
|
- logger.debug(f"开始混合提取(飞浆版面分析 + 本地 MinerU),共 {total_pages} 页...")
|
|
|
|
|
|
|
+ ocr_page_count = 0 # 统计需要OCR的页数
|
|
|
|
|
+
|
|
|
|
|
+ # INFO级别:开始文档提取(方便查看主要流程)
|
|
|
|
|
+ logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,使用混合模式(GLM-OCR)")
|
|
|
|
|
+ logger.debug(f"开始混合提取(飞浆版面分析 + GLM-OCR),共 {total_pages} 页...")
|
|
|
|
|
|
|
|
if self._progress_state is not None:
|
|
if self._progress_state is not None:
|
|
|
self._progress_state['current'] = 0
|
|
self._progress_state['current'] = 0
|
|
|
self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
|
|
self._progress_state['message'] = f"版面分析中:已分析 0/{total_pages} 页"
|
|
|
|
|
|
|
|
- # ========== 第一阶段:飞浆版面分析,检测 table 页 ==========
|
|
|
|
|
|
|
+ # ========== 第一阶段:飞浆版面分析 ==========
|
|
|
table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
|
|
table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
|
|
|
|
|
+ ocr_page_count = len(table_pages)
|
|
|
|
|
+
|
|
|
|
|
+ # INFO级别:版面分析完成,显示OCR页数
|
|
|
|
|
+ if ocr_page_count > 0:
|
|
|
|
|
+ logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,{total_pages - ocr_page_count} 页直接提取")
|
|
|
|
|
+ else:
|
|
|
|
|
+ logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取")
|
|
|
|
|
|
|
|
# ========== 第二阶段:分流处理 ==========
|
|
# ========== 第二阶段:分流处理 ==========
|
|
|
logger.debug(f"\n开始分流处理...")
|
|
logger.debug(f"\n开始分流处理...")
|
|
@@ -169,25 +198,23 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
for i, page in enumerate(doc):
|
|
for i, page in enumerate(doc):
|
|
|
page_num = i + 1
|
|
page_num = i + 1
|
|
|
|
|
|
|
|
- # 判断是否为 table 页(即扫描件)
|
|
|
|
|
if page_num in table_pages:
|
|
if page_num in table_pages:
|
|
|
- logger.debug(f" [第 {page_num} 页] 检测到 table -> 走本地 MinerU OCR")
|
|
|
|
|
|
|
+ logger.debug(f" [第 {page_num} 页] 检测到 table -> 走 GLM-OCR")
|
|
|
|
|
|
|
|
- # --- 扫描件处理 (MinerU OCR) ---
|
|
|
|
|
try:
|
|
try:
|
|
|
- page_text = self._ocr_page(page, page_num, source_file)
|
|
|
|
|
|
|
+ # 调用 GLM-OCR
|
|
|
|
|
+ page_text = self._ocr_page_with_glm(page, page_num, source_file)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- logger.error(f" MinerU OCR 失败,回退到本地提取: {e}")
|
|
|
|
|
|
|
+ logger.error(f" GLM-OCR 失败,回退到本地提取: {e}")
|
|
|
raw_text = page.get_text()
|
|
raw_text = page.get_text()
|
|
|
page_text = self.local_extractor._filter_header_footer(raw_text)
|
|
page_text = self.local_extractor._filter_header_footer(raw_text)
|
|
|
else:
|
|
else:
|
|
|
logger.debug(f" [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
|
|
logger.debug(f" [第 {page_num} 页] 无 table -> 走本地 PyMuPDF 提取")
|
|
|
|
|
|
|
|
- # --- 电子版处理 (本地 PyMuPDF) ---
|
|
|
|
|
text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
|
|
text_with_tables = self.local_extractor._extract_text_with_table_placeholders(page)
|
|
|
page_text = self.local_extractor._filter_header_footer(text_with_tables)
|
|
page_text = self.local_extractor._filter_header_footer(text_with_tables)
|
|
|
|
|
|
|
|
- # --- 组装结果 ---
|
|
|
|
|
|
|
+ # 组装结果
|
|
|
pages.append({
|
|
pages.append({
|
|
|
"page_num": page_num,
|
|
"page_num": page_num,
|
|
|
"text": page_text,
|
|
"text": page_text,
|
|
@@ -197,7 +224,7 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
})
|
|
})
|
|
|
current_pos += len(page_text)
|
|
current_pos += len(page_text)
|
|
|
|
|
|
|
|
- # 阶段二进度:已处理页 / 总页数 → 50% ~ 100%
|
|
|
|
|
|
|
+ # 阶段二进度
|
|
|
if self._progress_state is not None:
|
|
if self._progress_state is not None:
|
|
|
self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
|
|
self._progress_state['current'] = 50 + int(page_num / total_pages * 50)
|
|
|
ocr_flag = "(OCR)" if page_num in table_pages else ""
|
|
ocr_flag = "(OCR)" if page_num in table_pages else ""
|
|
@@ -205,53 +232,334 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
|
|
finally:
|
|
finally:
|
|
|
doc.close()
|
|
doc.close()
|
|
|
|
|
+
|
|
|
|
|
+ # INFO级别:文档提取完成
|
|
|
|
|
+ total_chars = sum(len(page['text']) for page in pages)
|
|
|
|
|
+ logger.info(f"[文档提取] 完成,共 {total_pages} 页,总字符数: {total_chars}")
|
|
|
|
|
|
|
|
return pages
|
|
return pages
|
|
|
|
|
|
|
|
- def _ocr_page(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
|
|
|
|
|
|
|
def _ocr_page_with_glm(self, page: fitz.Page, page_num: int, original_filename: str) -> str:
    """
    Render one PDF page to an image and recognise it through the GLM-OCR HTTP API.

    Steps:
        1. Render the page with PyMuPDF at ``self.ocr_dpi`` and encode it as JPEG.
        2. Resize / re-encode the image with ``self._compress_image``.
        3. Base64-encode the bytes and embed them in a ``data:image/jpeg`` URL.
        4. POST an OpenAI-style chat-completions payload to ``self.api_url``.
        5. Extract the message text with ``self._extract_content`` and normalise
           it with ``self._process_raw_content``.

    Args:
        page: PyMuPDF page to recognise.
        page_num: Page number, used only in log messages.
        original_filename: Accepted for interface compatibility; not used here.

    Returns:
        The recognised text after HTML-to-Markdown post-processing.

    Raises:
        Exception: Any failure (rendering, HTTP error status, JSON decoding,
            post-processing) is logged and re-raised unchanged.
    """
    started_at = time.time()

    logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页(扫描页)")

    try:
        # Render the page and keep the raw JPEG bytes for size reporting.
        pixmap = page.get_pixmap(dpi=self.ocr_dpi)
        rendered = pixmap.tobytes("jpeg")
        logger.debug(
            f" [GLM-OCR] 第 {page_num} 页图片: {len(rendered) / 1024:.1f} KB ({pixmap.width}x{pixmap.height})"
        )

        # b64encode never emits line breaks, so the result can be embedded as-is.
        encoded_image = base64.b64encode(self._compress_image(rendered)).decode('utf-8')

        # OpenAI-compatible multimodal message: one text prompt plus one image.
        prompt_part = {
            "type": "text",
            "text": "请详细识别图片中的所有文字内容,保留原始排版格式,以 Markdown 格式输出。",
        }
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
        }
        request_body = {
            "model": "GLM-OCR",
            "messages": [{"role": "user", "content": [prompt_part, image_part]}],
            "max_tokens": 2048,
            "temperature": 0.1,
        }

        http_response = requests.post(
            self.api_url,
            headers=self.headers,
            json=request_body,
            timeout=self.timeout,
        )
        http_response.raise_for_status()

        # Response text -> Markdown.
        recognised = self._process_raw_content(
            self._extract_content(http_response.json())
        )

        elapsed = time.time() - started_at
        logger.info(f"[GLM-OCR] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(recognised)}")
        logger.debug(f" [GLM-OCR] 第 {page_num} 页详细耗时: {elapsed:.2f}s")
        return recognised

    except Exception as e:
        # Log here for page context; the caller decides how to fall back.
        logger.error(f" [GLM-OCR] 第 {page_num} 页识别失败: {e}")
        raise
|
|
|
|
|
|
|
|
- # 输出文件大小信息(用于调试)
|
|
|
|
|
- file_size_kb = os.path.getsize(tmp_path) / 1024
|
|
|
|
|
- logger.debug(f" [INFO] 第 {page_num} 页图片: {file_size_kb:.1f} KB ({pix.width}x{pix.height})")
|
|
|
|
|
|
|
def _compress_image(self, img_bytes: bytes) -> bytes:
    """
    Shrink and re-encode an image so it fits the GLM-OCR size limit.

    The image is scaled proportionally so that its short edge is at most
    ``self.MAX_SHORT_EDGE`` pixels and is then saved as JPEG using the
    class-level ``self.JPEG_QUALITY``.

    Two attempts are made. The first flattens transparency onto a white
    background; if it raises, a simpler attempt converts straight to RGB.
    If both fail, or Pillow is not installed, the input bytes are returned
    unchanged (best effort — the API may then reject the image).

    Args:
        img_bytes: Encoded source image.

    Returns:
        JPEG bytes of the compressed image, or ``img_bytes`` unchanged when
        compression is unavailable or fails.
    """
    if not PIL_AVAILABLE:
        logger.debug(" [压缩] PIL 不可用,使用原始图片")
        return img_bytes

    try:
        compressed = self._resize_and_encode_jpeg(img_bytes, flatten_alpha=True)
        logger.debug(f" [压缩] {len(img_bytes) / 1024:.1f} KB -> {len(compressed) / 1024:.1f} KB")
        return compressed
    except Exception as e:
        logger.warning(f" [压缩] 主流程压缩失败,使用兜底压缩: {e}")

    try:
        compressed = self._resize_and_encode_jpeg(img_bytes, flatten_alpha=False)
        logger.debug(f" [压缩] 兜底压缩成功: {len(compressed)/1024:.1f} KB")
        return compressed
    except Exception as e2:
        logger.error(f" [压缩] 兜底压缩也失败: {e2}")
        # Last resort: send the original image (may cause an API error).
        return img_bytes

def _resize_and_encode_jpeg(self, img_bytes: bytes, flatten_alpha: bool) -> bytes:
    """Decode, convert to RGB, cap the short edge, and return JPEG bytes.

    When ``flatten_alpha`` is true, RGBA / LA / palette images are pasted onto
    a white background using their alpha channel; otherwise a plain
    ``convert('RGB')`` is used.
    """
    img = Image.open(io.BytesIO(img_bytes))

    if flatten_alpha and img.mode in ('RGBA', 'LA', 'P'):
        if img.mode == 'P':
            img = img.convert('RGBA')
        background = Image.new('RGB', img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[-1])
        img = background
    elif img.mode != 'RGB':
        img = img.convert('RGB')

    original_size = img.size
    min_edge = min(img.size)
    if min_edge > self.MAX_SHORT_EDGE:
        ratio = self.MAX_SHORT_EDGE / min_edge
        new_size = (int(img.width * ratio), int(img.height * ratio))
        # Pillow >= 9.1 exposes filters on Image.Resampling; older releases
        # only have them on the Image module itself.
        lanczos = getattr(Image, "Resampling", Image).LANCZOS
        img = img.resize(new_size, lanczos)
        logger.debug(f" [压缩] 图片缩放: {original_size} -> {img.size}")

    buffer = io.BytesIO()
    img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
    return buffer.getvalue()
|
|
|
|
|
+
|
|
|
|
|
+ def _extract_content(self, result: Dict[str, Any]) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 从 OpenAI 兼容响应中提取内容
|
|
|
|
|
+
|
|
|
|
|
+ 响应格式:
|
|
|
|
|
+ {
|
|
|
|
|
+ "choices": [{
|
|
|
|
|
+ "message": {
|
|
|
|
|
+ "content": "识别结果..."
|
|
|
|
|
+ }
|
|
|
|
|
+ }]
|
|
|
|
|
+ }
|
|
|
|
|
+ """
|
|
|
|
|
+ if "choices" in result and isinstance(result["choices"], list):
|
|
|
|
|
+ if len(result["choices"]) > 0:
|
|
|
|
|
+ message = result["choices"][0].get("message", {})
|
|
|
|
|
+ return message.get("content", "")
|
|
|
|
|
+ return ""
|
|
|
|
|
+
|
|
|
|
|
+ def _process_raw_content(self, raw_content: str) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 处理原始内容(HTML 转 Markdown)
|
|
|
|
|
+
|
|
|
|
|
+ 【逻辑来源】glm_ocr_api_extractor.py _process_raw_content 方法
|
|
|
|
|
+
|
|
|
|
|
+ 处理流程:
|
|
|
|
|
+ 1. 检测并转换 HTML 表格
|
|
|
|
|
+ 2. 检测 HTML 格式,使用 markdownify 转换
|
|
|
|
|
+ 3. 失败则返回原始内容
|
|
|
|
|
+ """
|
|
|
|
|
+ if not raw_content:
|
|
|
return ""
|
|
return ""
|
|
|
|
|
+
|
|
|
|
|
+ # 转换 HTML 表格
|
|
|
|
|
+ if "<table" in raw_content.lower():
|
|
|
|
|
+ raw_content = self._convert_html_tables_to_markdown(raw_content)
|
|
|
|
|
+
|
|
|
|
|
+ # HTML 转 Markdown
|
|
|
|
|
+ if self._is_html_content(raw_content):
|
|
|
|
|
+ try:
|
|
|
|
|
+ import markdownify
|
|
|
|
|
+ return markdownify.markdownify(raw_content, heading_style="ATX").strip()
|
|
|
|
|
+ except ImportError:
|
|
|
|
|
+ logger.debug(" [转换] markdownify 未安装,跳过 HTML 转换")
|
|
|
|
|
+
|
|
|
|
|
+ return raw_content.strip()
|
|
|
|
|
+
|
|
|
|
|
+ def _is_html_content(self, content: str) -> bool:
|
|
|
|
|
+ """检查内容是否为 HTML 格式"""
|
|
|
|
|
+ if not content:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ html_indicators = [
|
|
|
|
|
+ "<!DOCTYPE", "<html", "<body", "<div", "<p>", "<table",
|
|
|
|
|
+ "<h1", "<h2", "<span", "<br", " ", """
|
|
|
|
|
+ ]
|
|
|
|
|
+ content_lower = content.lower()
|
|
|
|
|
+ html_tag_count = sum(1 for indicator in html_indicators if indicator.lower() in content_lower)
|
|
|
|
|
+ return html_tag_count >= 2
|
|
|
|
|
+
|
|
|
|
|
+ def _convert_html_tables_to_markdown(self, content: str) -> str:
|
|
|
|
|
+ """
|
|
|
|
|
+ 将 HTML 表格转换为 Markdown 表格格式
|
|
|
|
|
+
|
|
|
|
|
+ 【逻辑来源】glm_ocr_api_extractor.py _convert_html_tables_to_markdown 方法
|
|
|
|
|
+ """
|
|
|
|
|
+ import re
|
|
|
|
|
+
|
|
|
|
|
+ def extract_cell_text(cell_html: str) -> str:
|
|
|
|
|
+ text = re.sub(r'<[^>]+>', '', cell_html)
|
|
|
|
|
+ text = text.replace(' ', ' ').replace('<', '<').replace('>', '>')
|
|
|
|
|
+ text = text.replace('&', '&').replace('"', '"').replace(''', "'")
|
|
|
|
|
+ return text.strip()
|
|
|
|
|
+
|
|
|
|
|
+ def parse_colspan(td_html: str) -> int:
|
|
|
|
|
+ match = re.search(r'colspan=["\']?(\d+)["\']?', td_html, re.IGNORECASE)
|
|
|
|
|
+ return int(match.group(1)) if match else 1
|
|
|
|
|
+
|
|
|
|
|
+ def convert_table_match(match):
|
|
|
|
|
+ table_html = match.group(0)
|
|
|
|
|
|
|
|
- finally:
|
|
|
|
|
- # 清理临时文件
|
|
|
|
|
- if tmp_path and os.path.exists(tmp_path):
|
|
|
|
|
- try:
|
|
|
|
|
- os.remove(tmp_path)
|
|
|
|
|
- except:
|
|
|
|
|
- pass
|
|
|
|
|
|
|
+ # 提取 thead 和 tbody
|
|
|
|
|
+ thead_match = re.search(r'<thead[^>]*>(.*?)</thead>', table_html, re.DOTALL | re.IGNORECASE)
|
|
|
|
|
+ tbody_match = re.search(r'<tbody[^>]*>(.*?)</tbody>', table_html, re.DOTALL | re.IGNORECASE)
|
|
|
|
|
+
|
|
|
|
|
+ all_rows = []
|
|
|
|
|
+
|
|
|
|
|
+ # 处理 thead 中的行
|
|
|
|
|
+ if thead_match:
|
|
|
|
|
+ thead_html = thead_match.group(1)
|
|
|
|
|
+ tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', thead_html, re.DOTALL | re.IGNORECASE)
|
|
|
|
|
+ for tr in tr_matches:
|
|
|
|
|
+ all_rows.append(tr)
|
|
|
|
|
+
|
|
|
|
|
+ # 处理 tbody 中的行
|
|
|
|
|
+ if tbody_match:
|
|
|
|
|
+ tbody_html = tbody_match.group(1)
|
|
|
|
|
+ tr_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', tbody_html, re.DOTALL | re.IGNORECASE)
|
|
|
|
|
+ for tr in tr_matches:
|
|
|
|
|
+ all_rows.append(tr)
|
|
|
|
|
+
|
|
|
|
|
+ # 如果没有 thead/tbody,直接提取所有 tr
|
|
|
|
|
+ if not all_rows:
|
|
|
|
|
+ all_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL | re.IGNORECASE)
|
|
|
|
|
+
|
|
|
|
|
+ # 解析所有行
|
|
|
|
|
+ parsed_rows = []
|
|
|
|
|
+ for tr_html in all_rows:
|
|
|
|
|
+ cells = re.findall(r'<(t[dh])[^>]*>(.*?)</\1>', tr_html, re.DOTALL | re.IGNORECASE)
|
|
|
|
|
+
|
|
|
|
|
+ row_data = []
|
|
|
|
|
+ for tag, cell_content in cells:
|
|
|
|
|
+ full_cell_match = re.search(rf'<{tag}[^>]*>', tr_html[tr_html.find(cell_content)-50:tr_html.find(cell_content)])
|
|
|
|
|
+ cell_start = full_cell_match.group(0) if full_cell_match else f'<{tag}>'
|
|
|
|
|
+
|
|
|
|
|
+ text = extract_cell_text(cell_content)
|
|
|
|
|
+ colspan = parse_colspan(cell_start)
|
|
|
|
|
+ row_data.append((text, colspan))
|
|
|
|
|
+
|
|
|
|
|
+ if row_data:
|
|
|
|
|
+ parsed_rows.append(row_data)
|
|
|
|
|
+
|
|
|
|
|
+ if not parsed_rows:
|
|
|
|
|
+ return ""
|
|
|
|
|
+
|
|
|
|
|
+ # 计算最大列数(考虑 colspan)
|
|
|
|
|
+ max_cols = 0
|
|
|
|
|
+ for row in parsed_rows:
|
|
|
|
|
+ cols = sum(colspan for _, colspan in row)
|
|
|
|
|
+ max_cols = max(max_cols, cols)
|
|
|
|
|
+
|
|
|
|
|
+ # 展开 colspan 并生成 Markdown
|
|
|
|
|
+ md_rows = []
|
|
|
|
|
+ for row in parsed_rows:
|
|
|
|
|
+ expanded_cells = []
|
|
|
|
|
+ for text, colspan in row:
|
|
|
|
|
+ expanded_cells.append(text)
|
|
|
|
|
+ for _ in range(colspan - 1):
|
|
|
|
|
+ expanded_cells.append("")
|
|
|
|
|
+
|
|
|
|
|
+ while len(expanded_cells) < max_cols:
|
|
|
|
|
+ expanded_cells.append("")
|
|
|
|
|
+
|
|
|
|
|
+ md_rows.append("| " + " | ".join(expanded_cells) + " |")
|
|
|
|
|
+
|
|
|
|
|
+ # 添加分隔行
|
|
|
|
|
+ if len(md_rows) > 0:
|
|
|
|
|
+ md_rows.insert(1, "| " + " | ".join(["---"] * max_cols) + " |")
|
|
|
|
|
+
|
|
|
|
|
+ return "\n".join(md_rows)
|
|
|
|
|
+
|
|
|
|
|
+ return re.sub(r'<table[^>]*>.*?</table>', convert_table_match, content,
|
|
|
|
|
+ flags=re.DOTALL | re.IGNORECASE)
|