|
|
@@ -0,0 +1,157 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+import os
|
|
|
+import re
|
|
|
+import requests
|
|
|
+import zipfile
|
|
|
+import shutil
|
|
|
+from pathlib import Path
|
|
|
+from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
+
|
|
|
# MinerU file_parse endpoint. Set the MINERU_API_URL environment variable to
# target another deployment without editing this file; an unset or empty
# variable falls back to the address below.
API_URL = os.environ.get("MINERU_API_URL") or "http://183.220.37.46:25428/file_parse"
# Alternative endpoint, kept for reference:
#API_URL = "http://183.220.37.46:23428/mineru/file_parse"

# Default directories (can be overridden with command-line arguments).
DEFAULT_INPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\input")
DEFAULT_OUTPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\output")
|
|
|
+
|
|
|
+
|
|
|
def clean_filename(name):
    """Replace bracket-like punctuation in *name* with underscores.

    ASCII and full-width parentheses, square brackets, lenticular brackets,
    book-title marks and angle brackets each become a single ``_``; leading
    and trailing underscores are then stripped.  Every other character
    (letters, digits, CJK text, existing underscores, ...) is kept as is.

    Args:
        name: File name stem to sanitise.

    Returns:
        The sanitised name; it may be empty when *name* consists only of
        replaced characters.
    """
    # Full-width parentheses are spelled as escapes so the pattern survives
    # editors or tools that normalise them to their ASCII look-alikes.
    return re.sub(r'[()\uFF08\uFF09\[\]【】《》<>]', '_', name).strip('_')
|
|
|
+
|
|
|
def _find_extracted_root(temp_dir, candidate_names):
    """Return the directory under *temp_dir* that holds the unpacked result."""
    for name in candidate_names:
        candidate = temp_dir / name
        if candidate.exists() and candidate.is_dir():
            return candidate
    # Fall back to the first sub-directory; sorted so the choice is
    # deterministic when the archive contains more than one.
    subdirs = sorted(d for d in temp_dir.iterdir() if d.is_dir())
    return subdirs[0] if subdirs else temp_dir


def _move_replacing(src, dst):
    """Move *src* to *dst*, discarding whatever already exists at *dst*."""
    # Without this, shutil.move would nest a directory inside an existing
    # one of the same name (e.g. images/images) when a PDF is re-processed.
    if dst.is_dir() and not dst.is_symlink():
        shutil.rmtree(dst)
    elif dst.exists() or dst.is_symlink():
        dst.unlink()
    shutil.move(str(src), str(dst))


def parse_file(file_path, output_dir, request_timeout=(30, None)):
    """Upload one PDF to the parsing API and unpack the returned archive.

    Resulting layout inside *output_dir*::

        <pdf stem>/                      copy of the PDF and of the .md files
        <pdf stem>/<cleaned pdf stem>/   everything extracted from the zip

    Args:
        file_path: ``Path`` of the PDF to upload.
        output_dir: ``Path`` of the existing directory that receives results.
        request_timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.post``.  The default bounds only the connect phase,
            so a long-running parse is still waited for indefinitely.

    Returns:
        ``(filename, True, None)`` on success, otherwise
        ``(filename, False, error_message)``.  Exceptions derived from
        ``Exception`` are caught and reported through the return value.
    """
    filename = file_path.name                # e.g. "1《中华人民共和国水土保持法》(主席令第39号).pdf"
    pdf_stem = file_path.stem                # e.g. "1《中华人民共和国水土保持法》(主席令第39号)"
    cleaned_stem = clean_filename(pdf_stem)  # e.g. "1_中华人民共和国水土保持法__主席令第39号"

    outer_dir = output_dir / pdf_stem        # outer level: original PDF stem
    inner_dir = outer_dir / cleaned_stem     # inner level: sanitised stem
    zip_path = output_dir / f"{pdf_stem}_result.zip"
    temp_dir = output_dir / f"__temp_{pdf_stem}"

    data = {
        'return_md': 'true',
        'response_format_zip': 'true',
        'return_original_file': 'true',
        'return_middle_json': 'true',
        'return_content_list': 'true',
        'return_images': 'true',
    }

    try:
        print(f"Processing: {filename}")
        # Keep the PDF open only for the duration of the upload.
        with open(file_path, 'rb') as f:
            files = {'files': (filename, f, 'application/pdf')}
            response = requests.post(
                API_URL, files=files, data=data, timeout=request_timeout
            )

        if response.status_code != 200:
            error_msg = f"HTTP {response.status_code}: {response.text}"
            print(f" Error: {error_msg}")
            return (filename, False, error_msg)

        try:
            with open(zip_path, 'wb') as out_f:
                out_f.write(response.content)
            print(f" Saved zip to: {zip_path}")

            # Start from an empty scratch directory so leftovers from an
            # interrupted earlier run cannot leak into this result.
            shutil.rmtree(temp_dir, ignore_errors=True)
            temp_dir.mkdir(exist_ok=True)
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                zipf.extractall(temp_dir)

            outer_dir.mkdir(exist_ok=True)
            inner_dir.mkdir(exist_ok=True)

            # The archive's top-level folder may be named after the original
            # stem or after the sanitised stem; otherwise take what is there.
            extract_source = _find_extracted_root(
                temp_dir, (pdf_stem, cleaned_stem)
            )

            # Snapshot the listing: entries are moved out while we loop.
            for item in list(extract_source.iterdir()):
                _move_replacing(item, inner_dir / item.name)
        finally:
            # Remove the scratch directory and the zip on failure as well.
            shutil.rmtree(temp_dir, ignore_errors=True)
            if zip_path.exists():
                os.remove(zip_path)

        # Copy the original PDF next to the inner directory.
        shutil.copy2(file_path, outer_dir / filename)

        # Duplicate the Markdown output at the outer level.
        md_files = list(inner_dir.glob("*.md"))
        if md_files:
            for md_file in md_files:
                shutil.copy2(md_file, outer_dir / md_file.name)
            print(" PDF + MD copied to outer dir")

        print(f" Extracted to: {inner_dir}")
        return (filename, True, None)
    except Exception as e:
        # Worker boundary: report the failure to the caller instead of
        # letting one bad file abort the whole batch.
        print(f" Exception: {str(e)}")
        return (filename, False, str(e))
|
|
|
+
|
|
|
def main(input_dir, output_dir, max_workers=10):
    """Parse every PDF directly inside *input_dir* using a thread pool.

    Both directories are created when missing, including parent
    directories.  Each PDF is handed to ``parse_file`` and a success/failure
    summary is printed once all workers have finished.

    Args:
        input_dir: Directory that contains the PDF files (str or Path).
        output_dir: Directory that receives the parsed results (str or Path).
        max_workers: Size of the thread pool.

    Returns:
        None.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    # parents=True so nested paths work on a fresh machine.
    input_dir.mkdir(parents=True, exist_ok=True)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Compare the suffix case-insensitively so "*.PDF" is also found on
    # case-sensitive filesystems; sort for a stable submission order.
    pdf_files = sorted(
        p for p in input_dir.iterdir()
        if p.is_file() and p.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        print("No PDF files found in input directory")
        return

    print(f"Found {len(pdf_files)} PDF file(s)")
    print(f"Input: {input_dir}")
    print(f"Output: {output_dir}")
    print(f"Processing with {max_workers} concurrent workers\n")

    success_count = 0
    fail_count = 0
    failed_files = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(parse_file, pdf_file, output_dir): pdf_file
            for pdf_file in pdf_files
        }

        for future in as_completed(futures):
            try:
                filename, success, error = future.result()
            except Exception as exc:
                # Should a worker raise anyway, record it against its file
                # so the remaining results and the summary are still shown.
                filename, success, error = futures[future].name, False, str(exc)

            if success:
                success_count += 1
            else:
                fail_count += 1
                failed_files.append((filename, error))

    print(f"\nDone! Success: {success_count}, Failed: {fail_count}")

    if failed_files:
        print("\nFailed files:")
        for filename, error in failed_files:
            print(f" - {filename}: {error}")
|
|
|
+
|
|
|
if __name__ == "__main__":
    import argparse

    # Command-line entry point: collect the directories and the pool size,
    # then hand them to main().
    cli = argparse.ArgumentParser(description='Parse PDF files using MinerU API')

    # (flags, keyword arguments) for each supported option.
    option_specs = (
        (('-i', '--input'),
         dict(type=str, default=str(DEFAULT_INPUT_DIR),
              help='Input directory containing PDF files')),
        (('-o', '--output'),
         dict(type=str, default=str(DEFAULT_OUTPUT_DIR),
              help='Output directory for parsed results')),
        (('-w', '--workers'),
         dict(type=int, default=10,
              help='Number of concurrent workers (default: 10)')),
    )
    for flags, spec in option_specs:
        cli.add_argument(*flags, **spec)

    opts = cli.parse_args()
    main(input_dir=opts.input, output_dir=opts.output, max_workers=opts.workers)
|