|
|
@@ -2,12 +2,14 @@
|
|
|
MinerU 本地部署版本全文提取实现
|
|
|
|
|
|
使用本地部署的 MinerU 服务进行 OCR 识别
|
|
|
+支持返回 HTML 格式自动转换为 Markdown
|
|
|
"""
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
import json
|
|
|
import os
|
|
|
+import re
|
|
|
import requests
|
|
|
from pathlib import Path
|
|
|
from typing import Any, Dict, List, Optional
|
|
|
@@ -15,6 +17,13 @@ from typing import Any, Dict, List, Optional
|
|
|
from ..config.provider import default_config_provider
|
|
|
from ..interfaces import DocumentSource, FullTextExtractor
|
|
|
|
|
|
+# 尝试导入 HTML 到 Markdown 转换器
|
|
|
+try:
|
|
|
+ from .html_to_markdown import convert_html_to_markdown, HTMLToMarkdownConverter
|
|
|
+ HTML_CONVERTER_AVAILABLE = True
|
|
|
+except ImportError:
|
|
|
+ HTML_CONVERTER_AVAILABLE = False
|
|
|
+
|
|
|
|
|
|
class LocalMinerUFullTextExtractor(FullTextExtractor):
|
|
|
"""使用本地部署的 MinerU 提取 PDF 全文内容。"""
|
|
|
@@ -123,6 +132,8 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
|
|
|
def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
|
|
|
"""
|
|
|
从 MinerU 返回结果中提取 markdown 内容。
|
|
|
+
|
|
|
+ 支持自动检测 HTML 格式并转换为 Markdown。
|
|
|
|
|
|
参数:
|
|
|
result: MinerU API 返回的 JSON 数据
|
|
|
@@ -130,33 +141,42 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
|
|
|
返回:
|
|
|
提取的 markdown 文本
|
|
|
"""
|
|
|
+ raw_content = None
|
|
|
+ content_source = None
|
|
|
+
|
|
|
# 尝试多种可能的结果格式
|
|
|
|
|
|
# 格式1: 直接返回 full_text 字段
|
|
|
if "full_text" in result:
|
|
|
- return result["full_text"]
|
|
|
+ raw_content = result["full_text"]
|
|
|
+ content_source = "full_text"
|
|
|
|
|
|
# 格式2: data.full_text
|
|
|
- if "data" in result and isinstance(result["data"], dict):
|
|
|
+ elif "data" in result and isinstance(result["data"], dict):
|
|
|
if "full_text" in result["data"]:
|
|
|
- return result["data"]["full_text"]
|
|
|
+ raw_content = result["data"]["full_text"]
|
|
|
+ content_source = "data.full_text"
|
|
|
# 格式3: data.markdown
|
|
|
- if "markdown" in result["data"]:
|
|
|
- return result["data"]["markdown"]
|
|
|
+ elif "markdown" in result["data"]:
|
|
|
+ raw_content = result["data"]["markdown"]
|
|
|
+ content_source = "data.markdown"
|
|
|
# 格式4: data.content
|
|
|
- if "content" in result["data"]:
|
|
|
- return result["data"]["content"]
|
|
|
+ elif "content" in result["data"]:
|
|
|
+ raw_content = result["data"]["content"]
|
|
|
+ content_source = "data.content"
|
|
|
|
|
|
# 格式5: markdown 字段
|
|
|
- if "markdown" in result:
|
|
|
- return result["markdown"]
|
|
|
+ elif "markdown" in result:
|
|
|
+ raw_content = result["markdown"]
|
|
|
+ content_source = "markdown"
|
|
|
|
|
|
# 格式6: content 字段
|
|
|
- if "content" in result:
|
|
|
- return result["content"]
|
|
|
+ elif "content" in result:
|
|
|
+ raw_content = result["content"]
|
|
|
+ content_source = "content"
|
|
|
|
|
|
# 格式7: 遍历 pages 提取内容
|
|
|
- if "pages" in result:
|
|
|
+ elif "pages" in result:
|
|
|
pages_text = []
|
|
|
for page in result["pages"]:
|
|
|
if isinstance(page, dict):
|
|
|
@@ -167,17 +187,20 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
|
|
|
elif "content" in page:
|
|
|
pages_text.append(page["content"])
|
|
|
if pages_text:
|
|
|
- return "\n\n".join(pages_text)
|
|
|
+ raw_content = "\n\n".join(pages_text)
|
|
|
+ content_source = "pages"
|
|
|
|
|
|
# 格式8: 本地 MinerU API 格式
|
|
|
# {"results": {"filename": {"md_content": "..."}}}
|
|
|
- if "results" in result and isinstance(result["results"], dict):
|
|
|
+ elif "results" in result and isinstance(result["results"], dict):
|
|
|
for filename, file_data in result["results"].items():
|
|
|
if isinstance(file_data, dict) and "md_content" in file_data:
|
|
|
- return file_data["md_content"]
|
|
|
+ raw_content = file_data["md_content"]
|
|
|
+ content_source = "results.md_content"
|
|
|
+ break
|
|
|
|
|
|
# 格式9: results 列表
|
|
|
- if "results" in result and isinstance(result["results"], list):
|
|
|
+ elif "results" in result and isinstance(result["results"], list):
|
|
|
texts = []
|
|
|
for item in result["results"]:
|
|
|
if isinstance(item, dict):
|
|
|
@@ -188,10 +211,91 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
|
|
|
elif "text" in item:
|
|
|
texts.append(item["text"])
|
|
|
if texts:
|
|
|
- return "\n\n".join(texts)
|
|
|
+ raw_content = "\n\n".join(texts)
|
|
|
+ content_source = "results.list"
|
|
|
|
|
|
# 如果都没找到,打印原始结果用于调试
|
|
|
- print("警告: 无法从 MinerU 结果中提取内容,返回空字符串")
|
|
|
- print(f"结果结构: {list(result.keys())}")
|
|
|
-
|
|
|
- return ""
|
|
|
+ if raw_content is None:
|
|
|
+ print("警告: 无法从 MinerU 结果中提取内容,返回空字符串")
|
|
|
+ print(f"结果结构: {list(result.keys())}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # 检测并转换 HTML 格式
|
|
|
+ if raw_content and self._is_html_content(raw_content):
|
|
|
+ print(f"[INFO] 检测到 HTML 格式内容(来源: {content_source}),自动转换为 Markdown")
|
|
|
+ raw_content = self._convert_html_to_markdown(raw_content)
|
|
|
+
|
|
|
+ return raw_content
|
|
|
+
|
|
|
+ def _is_html_content(self, content: str) -> bool:
|
|
|
+ """
|
|
|
+ 检测内容是否为 HTML 格式
|
|
|
+
|
|
|
+ 通过检查是否包含常见的 HTML 标签来判断
|
|
|
+ """
|
|
|
+ if not content or not isinstance(content, str):
|
|
|
+ return False
|
|
|
+
|
|
|
+ # 检查是否包含常见的 HTML 标签
|
|
|
+ html_tags_pattern = r'<(?:html|head|body|div|span|p|br|hr|table|tr|td|th|ul|ol|li|h[1-6]|b|i|em|strong|a|img|meta|title|link|script|style)[^>]*>'
|
|
|
+
|
|
|
+ # 如果找到多个 HTML 标签,认为是 HTML 内容
|
|
|
+ matches = re.findall(html_tags_pattern, content, re.IGNORECASE)
|
|
|
+
|
|
|
+ # 至少找到 2 个 HTML 标签才认为是 HTML(减少误判)
|
|
|
+ return len(matches) >= 2
|
|
|
+
|
|
|
def _convert_html_to_markdown(self, html_content: str) -> str:
    """
    Convert HTML text to Markdown, degrading to plain-text extraction.

    When the optional converter module was imported successfully
    (``HTML_CONVERTER_AVAILABLE``), ``convert_html_to_markdown`` is used.
    If that converter is unavailable, or raises, a warning is printed and
    the result of ``self._simple_html_to_text`` is returned instead.

    Args:
        html_content: The HTML text to convert.

    Returns:
        The converted Markdown, or the fallback plain-text rendering.
    """
    # Guard clause: no converter available, go straight to the fallback.
    if not HTML_CONVERTER_AVAILABLE:
        print("[WARN] HTML 转换器不可用,使用简单文本提取")
        return self._simple_html_to_text(html_content)

    try:
        markdown_text = convert_html_to_markdown(html_content)
    except Exception as exc:
        # Best-effort by design: any converter failure falls back rather
        # than aborting the extraction.
        print(f"[WARN] HTML 转 Markdown 失败: {exc},使用降级方案")
        return self._simple_html_to_text(html_content)
    return markdown_text
|
|
|
+
|
|
|
+ def _simple_html_to_text(self, html_content: str) -> str:
|
|
|
+ """
|
|
|
+ 简单的 HTML 到文本转换(降级方案)
|
|
|
+ """
|
|
|
+ if not html_content:
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # 移除 script 和 style 标签及其内容
|
|
|
+ text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
|
|
|
+ text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
|
|
|
+
|
|
|
+ # 将常见块级标签转为换行
|
|
|
+ text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
|
|
+ text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
|
|
|
+ text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)
|
|
|
+ text = re.sub(r'</tr>', '\n', text, flags=re.IGNORECASE)
|
|
|
+ text = re.sub(r'</td>', ' ', text, flags=re.IGNORECASE)
|
|
|
+ text = re.sub(r'</th>', ' ', text, flags=re.IGNORECASE)
|
|
|
+
|
|
|
+ # 处理标题标签
|
|
|
+ for i in range(6, 0, -1):
|
|
|
+ text = re.sub(rf'<h{i}[^>]*>(.*?)</h{i}>', rf'{"#" * i} \1\n\n', text, flags=re.IGNORECASE | re.DOTALL)
|
|
|
+
|
|
|
+ # 剥离所有剩余的 HTML 标签
|
|
|
+ text = re.sub(r'<[^>]+>', '', text)
|
|
|
+
|
|
|
+ # 清理 HTML 实体
|
|
|
+ text = text.replace(' ', ' ')
|
|
|
+ text = text.replace('<', '<')
|
|
|
+ text = text.replace('>', '>')
|
|
|
+ text = text.replace('&', '&')
|
|
|
+ text = text.replace('"', '"')
|
|
|
+ text = text.replace(''', "'")
|
|
|
+
|
|
|
+ # 清理多余空行
|
|
|
+ text = re.sub(r'\n{3,}', '\n\n', text)
|
|
|
+
|
|
|
+ return text.strip()
|