|
|
@@ -89,10 +89,14 @@ class PdfStructureExtractor:
|
|
|
self._layout_engine = RapidLayout()
|
|
|
return self._layout_engine
|
|
|
|
|
|
- def extract(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
+ def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
|
|
|
"""
|
|
|
从 PDF 字节流提取章节结构。
|
|
|
|
|
|
+ Args:
|
|
|
+ file_content: PDF 文件字节流
|
|
|
+ progress_callback: 进度回调函数,接收 (stage, current, message) 参数
|
|
|
+
|
|
|
Returns:
|
|
|
{
|
|
|
"chapters": {
|
|
|
@@ -106,21 +110,30 @@ class PdfStructureExtractor:
|
|
|
"""
|
|
|
doc = fitz.open(stream=file_content)
|
|
|
try:
|
|
|
- structure = self._extract_from_doc(doc)
|
|
|
+ structure = self._extract_from_doc(doc, progress_callback)
|
|
|
structure["total_pages"] = len(doc)
|
|
|
return structure
|
|
|
finally:
|
|
|
doc.close()
|
|
|
|
|
|
- def _extract_from_doc(self, doc: fitz.Document) -> Dict[str, Any]:
|
|
|
+ def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
|
|
|
"""提取文档结构(支持 OCR 异步并发)"""
|
|
|
|
|
|
+ def _emit_progress(stage: str, current: int, message: str):
|
|
|
+ """发送进度回调"""
|
|
|
+ if progress_callback:
|
|
|
+ try:
|
|
|
+ progress_callback(stage, current, message)
|
|
|
+ except Exception:
|
|
|
+ pass
|
|
|
+
|
|
|
# === 阶段1: 收集所有需要 OCR 的表格区域 ===
|
|
|
table_regions: List[TableRegion] = []
|
|
|
|
|
|
if self.use_ocr:
|
|
|
logger.info("[OCR预处理] 扫描所有页面的表格区域...")
|
|
|
- for page_num in range(len(doc)):
|
|
|
+ total_pages = len(doc)
|
|
|
+ for page_num in range(total_pages):
|
|
|
page = doc.load_page(page_num)
|
|
|
rect = page.rect
|
|
|
clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
|
|
|
@@ -132,6 +145,10 @@ class PdfStructureExtractor:
|
|
|
bbox=bbox,
|
|
|
score=score
|
|
|
))
|
|
|
+ # 每5页或最后一页推送一次进度
|
|
|
+ if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
|
|
|
+ progress = int((page_num + 1) / total_pages * 30) # OCR预处理占30%进度
|
|
|
+ _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
|
|
|
logger.info(f"[OCR预处理] 共发现 {len(table_regions)} 个表格区域需要 OCR")
|
|
|
|
|
|
# === 阶段2: 异步并发执行 OCR (5并发) ===
|
|
|
@@ -139,9 +156,11 @@ class PdfStructureExtractor:
|
|
|
|
|
|
if table_regions:
|
|
|
logger.info(f"[OCR执行] 使用 {self.OCR_CONCURRENT_WORKERS} 并发执行 OCR...")
|
|
|
- ocr_results = self._process_ocr_concurrent(table_regions)
|
|
|
+ _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
|
|
|
+ ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
|
|
|
success_count = sum(1 for r in ocr_results if r.success)
|
|
|
logger.info(f"[OCR执行] 完成 {success_count}/{len(table_regions)} 个表格 OCR")
|
|
|
+ _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
|
|
|
|
|
|
# 按页码分组 OCR 结果
|
|
|
ocr_by_page: Dict[int, List[OcrResult]] = {}
|
|
|
@@ -255,9 +274,11 @@ class PdfStructureExtractor:
|
|
|
logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
|
|
|
return result
|
|
|
|
|
|
- def _process_ocr_concurrent(self, regions: List[TableRegion]) -> List[OcrResult]:
|
|
|
+ def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
|
|
|
"""同步并发处理 OCR(使用 ThreadPoolExecutor)"""
|
|
|
results: List[OcrResult] = []
|
|
|
+ total = len(regions)
|
|
|
+ completed = 0
|
|
|
|
|
|
with ThreadPoolExecutor(max_workers=self.OCR_CONCURRENT_WORKERS) as executor:
|
|
|
# 提交所有任务
|
|
|
@@ -269,6 +290,7 @@ class PdfStructureExtractor:
|
|
|
# 处理完成的结果
|
|
|
for future in as_completed(future_to_region):
|
|
|
region = future_to_region[future]
|
|
|
+ completed += 1
|
|
|
try:
|
|
|
text = future.result()
|
|
|
results.append(OcrResult(
|
|
|
@@ -288,6 +310,11 @@ class PdfStructureExtractor:
|
|
|
success=False,
|
|
|
))
|
|
|
|
|
|
+ # 每完成5个或最后一个时推送进度
|
|
|
+ if progress_callback and (completed % 5 == 0 or completed == total):
|
|
|
+ progress = 35 + int(completed / total * 15) # OCR执行占15%进度(35-50)
|
|
|
+ progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
|
|
|
+
|
|
|
return results
|
|
|
|
|
|
def _detect_table_regions(
|
|
|
@@ -334,8 +361,10 @@ class PdfStructureExtractor:
|
|
|
|
|
|
return table_regions
|
|
|
|
|
|
- def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float]) -> str:
|
|
|
- """对指定区域进行 OCR 识别(使用 GLM-OCR)"""
|
|
|
+ def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
|
|
|
+ """对指定区域进行 OCR 识别(使用 GLM-OCR),支持指数退避重试"""
|
|
|
+ import time
|
|
|
+
|
|
|
# 渲染指定区域
|
|
|
rect = fitz.Rect(bbox)
|
|
|
pix = page.get_pixmap(dpi=self.OCR_DPI, clip=rect)
|
|
|
@@ -375,16 +404,33 @@ class PdfStructureExtractor:
|
|
|
if self.ocr_api_key:
|
|
|
headers["Authorization"] = f"Bearer {self.ocr_api_key}"
|
|
|
|
|
|
- response = requests.post(
|
|
|
- self.ocr_api_url,
|
|
|
- headers=headers,
|
|
|
- json=payload,
|
|
|
- timeout=self.ocr_timeout
|
|
|
- )
|
|
|
- response.raise_for_status()
|
|
|
-
|
|
|
- result = response.json()
|
|
|
- return self._extract_ocr_content(result)
|
|
|
+ # 指数退避重试
|
|
|
+ last_error = None
|
|
|
+ for attempt in range(max_retries):
|
|
|
+ try:
|
|
|
+ response = requests.post(
|
|
|
+ self.ocr_api_url,
|
|
|
+ headers=headers,
|
|
|
+ json=payload,
|
|
|
+ timeout=self.ocr_timeout
|
|
|
+ )
|
|
|
+ response.raise_for_status()
|
|
|
+
|
|
|
+ result = response.json()
|
|
|
+ return self._extract_ocr_content(result)
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ last_error = e
|
|
|
+ if attempt < max_retries - 1:
|
|
|
+ # 指数退避: 2, 4, 8 秒
|
|
|
+ wait_time = 2 ** (attempt + 1)
|
|
|
+ logger.warning(f" 第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
|
|
|
+ time.sleep(wait_time)
|
|
|
+ else:
|
|
|
+ logger.error(f" 第 {page.number + 1} 页表格 OCR 最终失败(已重试{max_retries}次): {e}")
|
|
|
+
|
|
|
+ # 所有重试都失败,抛出最后一个错误
|
|
|
+ raise last_error
|
|
|
|
|
|
def _replace_table_regions(
|
|
|
self,
|
|
|
@@ -500,12 +546,22 @@ class PdfStructureExtractor:
|
|
|
return img_bytes
|
|
|
|
|
|
def _extract_ocr_content(self, result: Dict) -> str:
|
|
|
- """从 OCR 响应提取内容"""
|
|
|
+ """从 OCR 响应提取内容,并将 HTML 表格转换为 Markdown"""
|
|
|
+ content = ""
|
|
|
if "choices" in result and isinstance(result["choices"], list):
|
|
|
if len(result["choices"]) > 0:
|
|
|
message = result["choices"][0].get("message", {})
|
|
|
- return message.get("content", "")
|
|
|
- return ""
|
|
|
+ content = message.get("content", "")
|
|
|
+
|
|
|
+ # 如果内容包含 HTML 标签,转换为 Markdown
|
|
|
+ if content and "<" in content and ">" in content:
|
|
|
+ try:
|
|
|
+ from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
|
|
|
+ content = convert_html_to_markdown(content)
|
|
|
+ except Exception as e:
|
|
|
+ logger.debug(f"HTML 转 Markdown 失败,保留原始内容: {e}")
|
|
|
+
|
|
|
+ return content
|
|
|
|
|
|
@staticmethod
|
|
|
def _is_header_footer(line: str) -> bool:
|