|
@@ -7,12 +7,14 @@ from pathlib import Path
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
|
|
|
API_URL = "http://183.220.37.46:25428/file_parse"
|
|
API_URL = "http://183.220.37.46:25428/file_parse"
|
|
|
-INPUT_DIR = Path(r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明\公司集团评审意见_input")
|
|
|
|
|
-OUTPUT_DIR = Path(r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明\公司集团评审意见_output")
|
|
|
|
|
|
|
|
|
|
-def parse_file(file_path):
|
|
|
|
|
|
|
+# 默认路径(可通过命令行参数覆盖)
|
|
|
|
|
+DEFAULT_INPUT_DIR = Path(r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明\公司集团评审意见_input")
|
|
|
|
|
+DEFAULT_OUTPUT_DIR = Path(r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明\公司集团评审意见_output")
|
|
|
|
|
+
|
|
|
|
|
+def parse_file(file_path, output_dir):
|
|
|
filename = file_path.name
|
|
filename = file_path.name
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
try:
|
|
try:
|
|
|
with open(file_path, 'rb') as f:
|
|
with open(file_path, 'rb') as f:
|
|
|
files = {
|
|
files = {
|
|
@@ -26,14 +28,14 @@ def parse_file(file_path):
|
|
|
'return_content_list': 'true',
|
|
'return_content_list': 'true',
|
|
|
'return_images': 'true'
|
|
'return_images': 'true'
|
|
|
}
|
|
}
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
print(f"Processing: {filename}")
|
|
print(f"Processing: {filename}")
|
|
|
response = requests.post(API_URL, files=files, data=data)
|
|
response = requests.post(API_URL, files=files, data=data)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
if response.status_code == 200:
|
|
if response.status_code == 200:
|
|
|
zip_filename = f"{file_path.stem}_result.zip"
|
|
zip_filename = f"{file_path.stem}_result.zip"
|
|
|
- zip_path = OUTPUT_DIR / zip_filename
|
|
|
|
|
- extract_dir = OUTPUT_DIR / file_path.stem
|
|
|
|
|
|
|
+ zip_path = output_dir / zip_filename
|
|
|
|
|
+ extract_dir = output_dir / file_path.stem
|
|
|
|
|
|
|
|
with open(zip_path, 'wb') as out_f:
|
|
with open(zip_path, 'wb') as out_f:
|
|
|
out_f.write(response.content)
|
|
out_f.write(response.content)
|
|
@@ -61,21 +63,26 @@ def parse_file(file_path):
|
|
|
print(f" Exception: {str(e)}")
|
|
print(f" Exception: {str(e)}")
|
|
|
return (filename, False, str(e))
|
|
return (filename, False, str(e))
|
|
|
|
|
|
|
|
-def main(max_workers=10):
|
|
|
|
|
- INPUT_DIR.mkdir(exist_ok=True)
|
|
|
|
|
- OUTPUT_DIR.mkdir(exist_ok=True)
|
|
|
|
|
-
|
|
|
|
|
- pdf_files = list(INPUT_DIR.glob("*.pdf"))
|
|
|
|
|
|
|
+def main(input_dir, output_dir, max_workers=10):
|
|
|
|
|
+ input_dir = Path(input_dir)
|
|
|
|
|
+ output_dir = Path(output_dir)
|
|
|
|
|
+
|
|
|
|
|
+ input_dir.mkdir(exist_ok=True)
|
|
|
|
|
+ output_dir.mkdir(exist_ok=True)
|
|
|
|
|
+
|
|
|
|
|
+ pdf_files = list(input_dir.glob("*.pdf"))
|
|
|
|
|
|
|
|
if not pdf_files:
|
|
if not pdf_files:
|
|
|
print("No PDF files found in input directory")
|
|
print("No PDF files found in input directory")
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
print(f"Found {len(pdf_files)} PDF file(s)")
|
|
print(f"Found {len(pdf_files)} PDF file(s)")
|
|
|
|
|
+ print(f"Input: {input_dir}")
|
|
|
|
|
+ print(f"Output: {output_dir}")
|
|
|
print(f"Processing with {max_workers} concurrent workers\n")
|
|
print(f"Processing with {max_workers} concurrent workers\n")
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
|
|
- futures = {executor.submit(parse_file, pdf_file): pdf_file for pdf_file in pdf_files}
|
|
|
|
|
|
|
+ futures = {executor.submit(parse_file, pdf_file, output_dir): pdf_file for pdf_file in pdf_files}
|
|
|
|
|
|
|
|
success_count = 0
|
|
success_count = 0
|
|
|
fail_count = 0
|
|
fail_count = 0
|
|
@@ -98,10 +105,14 @@ def main(max_workers=10):
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
|
import argparse
|
|
import argparse
|
|
|
-
|
|
|
|
|
- parser = argparse.ArgumentParser(description='Parse PDF files using MineRU API')
|
|
|
|
|
|
|
+
|
|
|
|
|
+ parser = argparse.ArgumentParser(description='Parse PDF files using MinerU API')
|
|
|
|
|
+ parser.add_argument('-i', '--input', type=str, default=str(DEFAULT_INPUT_DIR),
|
|
|
|
|
+ help='Input directory containing PDF files')
|
|
|
|
|
+ parser.add_argument('-o', '--output', type=str, default=str(DEFAULT_OUTPUT_DIR),
|
|
|
|
|
+ help='Output directory for parsed results')
|
|
|
parser.add_argument('-w', '--workers', type=int, default=10,
|
|
parser.add_argument('-w', '--workers', type=int, default=10,
|
|
|
help='Number of concurrent workers (default: 10)')
|
|
help='Number of concurrent workers (default: 10)')
|
|
|
args = parser.parse_args()
|
|
args = parser.parse_args()
|
|
|
-
|
|
|
|
|
- main(max_workers=args.workers)
|
|
|
|
|
|
|
+
|
|
|
|
|
+ main(input_dir=args.input, output_dir=args.output, max_workers=args.workers)
|