Explorar el Código

feat: mineru转化脚本(在服务器上运行)

ai02 hace 4 semanas
padre
commit
95cf17a6bb
Se ha modificado 1 fichero con 121 adiciones y 0 borrados
  1. 121 0
      src/app/scripts/run_all.py

+ 121 - 0
src/app/scripts/run_all.py

@@ -0,0 +1,121 @@
+import json
+import os
+import subprocess
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List
+
+# Input and output locations used by this script (adjust to your own paths).
+input_dir = "/home/ubuntu/minerU/input"
+output_dir = "/home/ubuntu/minerU/output"
+
+# Conversion records, kept under the output directory.
+# Records are appended, never overwritten.
+record_dir = Path(output_dir, "conversion_records")
+attempt_log_file = record_dir.joinpath("attempts.jsonl")
+success_log_file = record_dir.joinpath("success_docs.jsonl")
+failed_log_file = record_dir.joinpath("failed_docs.jsonl")
+
+
+def now_iso() -> str:
+    """Return the current local time as an ISO 8601 string, to the second.
+
+    The value comes from ``datetime.now()`` with no timezone argument, so the
+    string carries no UTC offset.
+    """
+    # With the microseconds zeroed, ``isoformat()`` stops at the seconds field.
+    current = datetime.now().replace(microsecond=0)
+    return current.isoformat()
+
+
+def append_jsonl(path: Path, payload: Dict) -> None:
+    """Append ``payload`` to ``path`` as a single JSON line.
+
+    Missing parent directories are created first. The file is opened in
+    append mode with UTF-8 encoding, and non-ASCII characters are written
+    as-is rather than as ``\\uXXXX`` escapes.
+
+    Args:
+        path: JSONL file to append to.
+        payload: JSON-serializable mapping to write.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, mode="a", encoding="utf-8") as sink:
+        serialized = json.dumps(payload, ensure_ascii=False)
+        sink.write(f"{serialized}\n")
+
+
+def load_latest_status_map(paths: List[Path]) -> Dict[str, str]:
+    """Build a ``filename -> status`` map from JSONL record files.
+
+    Files are read in the order given and each file top to bottom. A later
+    record for the same filename overwrites an earlier one, so the last
+    record seen wins. Only the statuses ``"success"`` and ``"failed"`` are
+    kept.
+
+    Missing files, blank lines, lines that are not valid JSON, and JSON
+    values that are not objects with string ``filename`` and ``status``
+    fields are skipped, so a single damaged line cannot abort the caller.
+
+    Args:
+        paths: Record files to read; later files take precedence.
+
+    Returns:
+        Mapping from file name to the last status seen for it.
+    """
+    accepted = {"success", "failed"}
+    latest: Dict[str, str] = {}
+    for log_path in paths:
+        if not log_path.exists():
+            continue
+        with log_path.open("r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    item = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                # Valid JSON is not necessarily an object: ``null``, ``123``
+                # or ``[]`` have no ``.get`` and would raise AttributeError.
+                if not isinstance(item, dict):
+                    continue
+                filename = item.get("filename")
+                status = item.get("status")
+                # Require strings: an unhashable value (list/dict) would raise
+                # TypeError in the set membership test or as a dict key.
+                if not isinstance(filename, str) or not isinstance(status, str):
+                    continue
+                if filename and status in accepted:
+                    latest[filename] = status
+    return latest
+
+
+def run_single_pdf(pdf_path: Path) -> tuple[bool, int]:
+    """Run the ``mineru`` command on one PDF and report how it exited.
+
+    The command is passed to ``subprocess.run`` as an argument list and its
+    ``-o`` argument is the module-level ``output_dir``. Because ``check`` is
+    false, a non-zero exit status is returned rather than raised.
+
+    Args:
+        pdf_path: The PDF file to pass to ``mineru -p``.
+
+    Returns:
+        ``(ok, return_code)`` where ``ok`` is true only for exit status 0.
+    """
+    argv = ["mineru", "-p", str(pdf_path), "-o", output_dir]
+    argv += ["--source", "modelscope"]
+    completed = subprocess.run(argv, check=False)
+    exit_code = completed.returncode
+    return exit_code == 0, exit_code
+
+
+def main() -> None:
+    """Convert every PDF in ``input_dir`` that is not already recorded as a success.
+
+    Steps:
+      1. Load the last known status per file name from the record files.
+      2. List the ``*.pdf`` files directly inside ``input_dir`` (sorted,
+         suffix matched case-insensitively) and drop those whose last
+         status is ``"success"``.
+      3. Run the conversion for each remaining file, appending one record to
+         the attempts log and one to the success or failure log.
+      4. Print a summary of this run.
+
+    Raises:
+        FileNotFoundError: If ``input_dir`` does not exist.
+    """
+    source_root = Path(input_dir)
+    if not source_root.exists():
+        raise FileNotFoundError(f"输入目录不存在: {input_dir}")
+
+    known_status = load_latest_status_map([success_log_file, failed_log_file, attempt_log_file])
+
+    discovered = sorted(
+        entry
+        for entry in source_root.iterdir()
+        if entry.is_file() and entry.suffix.lower() == ".pdf"
+    )
+    queue = [pdf for pdf in discovered if known_status.get(pdf.name) != "success"]
+    total = len(discovered)
+    remaining = len(queue)
+    divider = "\n=================================="
+
+    print(f"📂 输入目录: {input_dir}")
+    print(f"📦 输出目录: {output_dir}")
+    print(f"📒 记录目录: {record_dir}")
+    print(f"📄 PDF总数: {total}")
+    print(f"⏭️ 已成功跳过: {total - remaining}")
+    print(f"🔄 本次待转换: {remaining}")
+
+    tally = {"success": 0, "failed": 0}
+
+    for position, pdf in enumerate(queue, start=1):
+        name = pdf.name
+        print(divider)
+        print(f"[{position}/{remaining}] 正在转换:{name}")
+
+        ok, exit_code = run_single_pdf(pdf)
+        outcome = "success" if ok else "failed"
+        # The timestamp is taken after the conversion has finished.
+        entry = {
+            "timestamp": now_iso(),
+            "filename": name,
+            "pdf_path": str(pdf),
+            "status": outcome,
+            "return_code": exit_code,
+        }
+
+        # Every attempt is appended to the attempts log, whatever the outcome.
+        append_jsonl(attempt_log_file, entry)
+        tally[outcome] += 1
+
+        if not ok:
+            append_jsonl(failed_log_file, entry)
+            print(f"❌ 转换失败:{name} (return_code={exit_code})")
+            continue
+
+        append_jsonl(success_log_file, entry)
+        print(f"✅ 转换成功:{name}")
+
+    print(divider)
+    print("🏁 本次转换结束")
+    print(f"✅ 成功: {tally['success']}")
+    print(f"❌ 失败: {tally['failed']}")
+    print(f"📒 尝试记录: {attempt_log_file}")
+    print(f"📒 成功记录: {success_log_file}")
+    print(f"📒 失败记录: {failed_log_file}")
+
+
+if __name__ == "__main__":
+    main()