|
|
@@ -172,6 +172,10 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
try:
|
|
|
total_pages = len(doc)
|
|
|
+ ocr_page_count = 0 # 统计需要OCR的页数
|
|
|
+
|
|
|
+ # INFO级别:开始文档提取(方便查看主要流程)
|
|
|
+ logger.info(f"[文档提取] 开始处理,共 {total_pages} 页,使用混合模式(GLM-OCR)")
|
|
|
logger.debug(f"开始混合提取(飞浆版面分析 + GLM-OCR),共 {total_pages} 页...")
|
|
|
|
|
|
if self._progress_state is not None:
|
|
|
@@ -180,6 +184,13 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
# ========== 第一阶段:飞浆版面分析 ==========
|
|
|
table_pages = self._detect_table_pages(doc, dpi=self.layout_dpi)
|
|
|
+ ocr_page_count = len(table_pages)
|
|
|
+
|
|
|
+ # INFO级别:版面分析完成,显示OCR页数
|
|
|
+ if ocr_page_count > 0:
|
|
|
+ logger.info(f"[文档提取] 版面分析完成,共 {ocr_page_count} 页需要OCR识别,{total_pages - ocr_page_count} 页直接提取")
|
|
|
+ else:
|
|
|
+ logger.info(f"[文档提取] 版面分析完成,无扫描页,全部直接提取")
|
|
|
|
|
|
# ========== 第二阶段:分流处理 ==========
|
|
|
logger.debug(f"\n开始分流处理...")
|
|
|
@@ -221,6 +232,10 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
|
|
|
finally:
|
|
|
doc.close()
|
|
|
+
|
|
|
+ # INFO级别:文档提取完成
|
|
|
+ total_chars = sum(len(page['text']) for page in pages)
|
|
|
+ logger.info(f"[文档提取] 完成,共 {total_pages} 页,总字符数: {total_chars}")
|
|
|
|
|
|
return pages
|
|
|
|
|
|
@@ -254,6 +269,9 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
"""
|
|
|
start_time = time.time()
|
|
|
|
|
|
+ # INFO级别:开始调用GLM-OCR识别(方便查看主要流程)
|
|
|
+ logger.info(f"[GLM-OCR] 开始识别第 {page_num} 页(扫描页)")
|
|
|
+
|
|
|
try:
|
|
|
# 1. 渲染为图片
|
|
|
pix = page.get_pixmap(dpi=self.ocr_dpi)
|
|
|
@@ -310,7 +328,9 @@ class HybridFullTextExtractor(FullTextExtractor):
|
|
|
md_content = self._process_raw_content(content)
|
|
|
|
|
|
elapsed = time.time() - start_time
|
|
|
- logger.debug(f" [GLM-OCR] 第 {page_num} 页完成,耗时: {elapsed:.2f}s,字符数: {len(md_content)}")
|
|
|
+ # INFO级别:识别完成(方便查看主要流程)
|
|
|
+ logger.info(f"[GLM-OCR] 第 {page_num} 页识别完成,耗时: {elapsed:.2f}s,字符数: {len(md_content)}")
|
|
|
+ logger.debug(f" [GLM-OCR] 第 {page_num} 页详细耗时: {elapsed:.2f}s")
|
|
|
|
|
|
return md_content
|
|
|
|