|
|
@@ -0,0 +1,121 @@
|
|
|
+import json
|
|
|
+import os
|
|
|
+import subprocess
|
|
|
+from datetime import datetime
|
|
|
+from pathlib import Path
|
|
|
+from typing import Dict, List
|
|
|
+
|
|
|
# Input and output locations for the batch conversion; edit to match your
# environment.
input_dir = "/home/ubuntu/minerU/input"
output_dir = "/home/ubuntu/minerU/output"

# Conversion records, stored as JSON Lines. Every run appends to these files;
# nothing is ever overwritten.
record_dir = Path(output_dir) / "conversion_records"
attempt_log_file = record_dir / "attempts.jsonl"  # every attempt, in order
success_log_file = record_dir / "success_docs.jsonl"  # successful attempts only
failed_log_file = record_dir / "failed_docs.jsonl"  # failed attempts only
|
|
|
+
|
|
|
+
|
|
|
def now_iso() -> str:
    """Return the current local time as an ISO 8601 string.

    The value is truncated to whole seconds and carries no UTC offset,
    because it is produced from a naive ``datetime.now()``.
    """
    return datetime.now().isoformat(timespec="seconds")
|
|
|
+
|
|
|
+
|
|
|
def append_jsonl(path: Path, payload: Dict) -> None:
    """Append *payload* to *path* as a single JSON Lines record.

    Args:
        path: Target file. Missing parent directories are created.
        payload: JSON-serializable mapping written as one line.

    Raises:
        TypeError: If *payload* contains a value ``json.dumps`` cannot encode.
    """
    # Serialize first so that an unencodable payload fails before the log
    # file (or its directory) is created or touched.
    line = json.dumps(payload, ensure_ascii=False) + "\n"
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(line)
|
|
|
+
|
|
|
+
|
|
|
def load_latest_status_map(paths: List[Path]) -> Dict[str, str]:
    """Return the most recent status recorded for each document.

    Each file in *paths* is read as JSON Lines. A record counts only when it
    is a JSON object with a non-empty string ``filename`` and a ``status`` of
    ``"success"`` or ``"failed"``. Blank lines, malformed JSON and any other
    record are skipped, and paths that are not regular files are ignored.

    "Most recent" is decided by the record's ``timestamp`` string, compared
    lexicographically. This assumes all timestamps share one fixed-width ISO
    format, as the records written by this script do. Records with equal or
    missing timestamps fall back to read order: later files and later lines
    win. Comparing timestamps keeps an older failure in one log from
    overriding a newer success in a log that happens to be read earlier.

    Args:
        paths: Log files to read, in order.

    Returns:
        Mapping of filename to ``"success"`` or ``"failed"``.
    """
    latest: Dict[str, str] = {}
    latest_ts: Dict[str, str] = {}
    for log_path in paths:
        # is_file() also skips a directory that happens to sit at the path.
        if not log_path.is_file():
            continue
        with log_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Valid JSON that is not an object (list, string, number)
                # has no .get(); treat it like any other unusable line.
                if not isinstance(item, dict):
                    continue
                filename = item.get("filename")
                status = item.get("status")
                if not isinstance(filename, str) or not filename:
                    continue
                if status not in {"success", "failed"}:
                    continue
                timestamp = item.get("timestamp")
                if not isinstance(timestamp, str):
                    timestamp = ""
                # ">=" lets a later record with the same timestamp win.
                if timestamp >= latest_ts.get(filename, ""):
                    latest[filename] = status
                    latest_ts[filename] = timestamp
    return latest
|
|
|
+
|
|
|
+
|
|
|
def run_single_pdf(pdf_path: Path) -> tuple[bool, int]:
    """Convert one PDF by running the ``mineru`` command-line tool.

    The tool is invoked with the PDF as ``-p``, the module-level
    ``output_dir`` as ``-o`` and ``--source modelscope``. Its output streams
    are not captured, so they go wherever this process's streams go.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        ``(ok, return_code)``, where ``ok`` is true exactly when the process
        exited with code 0.

    Raises:
        OSError: Propagated from ``subprocess.run`` when the ``mineru``
            executable cannot be started (for example, it is not on PATH).
    """
    cmd = [
        "mineru",
        "-p", str(pdf_path),
        "-o", output_dir,
        "--source", "modelscope",
    ]
    # check=False: a non-zero exit is reported through the return value
    # rather than raised, so the caller can record it and continue.
    result = subprocess.run(cmd, check=False)
    return result.returncode == 0, result.returncode
|
|
|
+
|
|
|
+
|
|
|
def main() -> None:
    """Convert every PDF in ``input_dir`` that has no recorded success.

    Steps:
      1. Read the record files to find each document's latest status.
      2. List the ``*.pdf`` files directly inside ``input_dir`` (not
         recursive, suffix matched case-insensitively), sorted by path.
      3. Run the converter on each file whose latest status is not
         ``"success"``.
      4. After each attempt, append a record to the attempts log and to
         either the success log or the failure log.
      5. Print a summary.

    Raises:
        FileNotFoundError: If ``input_dir`` is not an existing directory.
    """
    input_path = Path(input_dir)
    # is_dir() rather than exists(): a regular file at this path would
    # otherwise only fail later, inside iterdir().
    if not input_path.is_dir():
        raise FileNotFoundError(f"输入目录不存在: {input_dir}")

    latest_status = load_latest_status_map([success_log_file, failed_log_file, attempt_log_file])

    all_pdfs = sorted(
        p for p in input_path.iterdir() if p.is_file() and p.suffix.lower() == ".pdf"
    )
    # Documents are matched by bare file name, which is what the records store.
    pending_pdfs = [p for p in all_pdfs if latest_status.get(p.name) != "success"]

    print(f"📂 输入目录: {input_dir}")
    print(f"📦 输出目录: {output_dir}")
    print(f"📒 记录目录: {record_dir}")
    print(f"📄 PDF总数: {len(all_pdfs)}")
    print(f"⏭️ 已成功跳过: {len(all_pdfs) - len(pending_pdfs)}")
    print(f"🔄 本次待转换: {len(pending_pdfs)}")

    success_count = 0
    failed_count = 0

    for index, pdf_file in enumerate(pending_pdfs, start=1):
        filename = pdf_file.name
        print("\n==================================")
        print(f"[{index}/{len(pending_pdfs)}] 正在转换:{filename}")

        ok, return_code = run_single_pdf(pdf_file)
        record = {
            "timestamp": now_iso(),
            "filename": filename,
            "pdf_path": str(pdf_file),
            "status": "success" if ok else "failed",
            "return_code": return_code,
        }

        # Full attempt history: appended on every attempt, never overwritten.
        append_jsonl(attempt_log_file, record)

        if ok:
            success_count += 1
            append_jsonl(success_log_file, record)
            print(f"✅ 转换成功:{filename}")
        else:
            failed_count += 1
            append_jsonl(failed_log_file, record)
            print(f"❌ 转换失败:{filename} (return_code={return_code})")

    print("\n==================================")
    print("🏁 本次转换结束")
    print(f"✅ 成功: {success_count}")
    print(f"❌ 失败: {failed_count}")
    print(f"📒 尝试记录: {attempt_log_file}")
    print(f"📒 成功记录: {success_log_file}")
    print(f"📒 失败记录: {failed_log_file}")
|
|
|
+
|
|
|
+
|
|
|
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|