Sfoglia il codice sorgente

增加API Key认证处理

lingmin_package@163.com 1 settimana fa
parent
commit
05ed964531
3 ha cambiato i file con 219 aggiunte e 23 eliminazioni
  1. 57 20
      src/app/minerU/dev_minerU.py
  2. 5 3
      src/app/minerU/minerU.py
  3. 157 0
      src/app/minerU/minerU_2.py

+ 57 - 20
src/app/minerU/dev_minerU.py

@@ -1,23 +1,33 @@
 #!/usr/bin/env python3
 import os
+import re
 import requests
 import zipfile
 import shutil
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-#API_URL = "http://183.220.37.46:25428/file_parse"
+#API_URL = "http://183.220.37.46:25428/mineru/file_parse"
 API_URL = "http://183.220.37.46:23428/mineru/file_parse"
 API_HEADERS = {
-    "Authorization": "Bearer sk_dev_aC_2gg8BS5ImUScrpaHIKS5x6gdLO9Js_ba854894"
+    "Authorization": "Bearer sk_test_aC_2gg8BS5ImUScrpaHIKS5x6gdLO9Js_ba854894"
 }
 
 # 默认路径(可通过命令行参数覆盖)
 DEFAULT_INPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\input")
 DEFAULT_OUTPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\output")
 
+def clean_filename(name):
+    """Replace bracket-like punctuation in *name* with underscores.
+
+    Every ASCII or full-width parenthesis, square bracket, lenticular
+    bracket, book-title mark and angle bracket becomes ``_``; underscores
+    left at either end of the result are then stripped. All other
+    characters, including other punctuation, are kept unchanged.
+
+    Example: ``"1《法》(第39号)"`` -> ``"1_法__第39号"``.
+    """
+    # Full-width （） are listed next to ASCII () because Chinese file names
+    # normally use the full-width form.
+    return re.sub(r'[()（）\[\]【】《》<>]', '_', name).strip('_')
+
 def parse_file(file_path, output_dir):
-    filename = file_path.name
+    filename = file_path.name  # e.g. "1《中华人民共和国水土保持法》(主席令第39号).pdf"
+    pdf_stem = file_path.stem  # e.g. "1《中华人民共和国水土保持法》(主席令第39号)"
+    cleaned_stem = clean_filename(pdf_stem)  # e.g. "1_中华人民共和国水土保持法__主席令第39号_"
+
+    outer_dir = output_dir / pdf_stem        # 外层:原始PDF文件名
+    inner_dir = outer_dir / cleaned_stem     # 内层:清理后的文件名
 
     try:
         with open(file_path, 'rb') as f:
@@ -37,27 +47,54 @@ def parse_file(file_path, output_dir):
             response = requests.post(API_URL, files=files, data=data, headers=API_HEADERS)
 
             if response.status_code == 200:
-                zip_filename = f"{file_path.stem}_result.zip"
-                zip_path = output_dir / zip_filename
-                extract_dir = output_dir / file_path.stem
-                
+                zip_path = output_dir / f"{pdf_stem}_result.zip"
+
                 with open(zip_path, 'wb') as out_f:
                     out_f.write(response.content)
-                
+
                 print(f"  Saved zip to: {zip_path}")
-                
-                extract_dir.mkdir(exist_ok=True)
+
+                # 解压到临时目录
+                temp_dir = output_dir / f"__temp_{pdf_stem}"
+                temp_dir.mkdir(exist_ok=True)
                 with zipfile.ZipFile(zip_path, 'r') as zipf:
-                    zipf.extractall(extract_dir)
-                
-                nested_dir = extract_dir / file_path.stem
-                if nested_dir.exists() and nested_dir.is_dir():
-                    for item in nested_dir.iterdir():
-                        shutil.move(str(item), str(extract_dir / item.name))
-                    nested_dir.rmdir()
-                
-                os.remove(zip_path)
-                print(f"  Extracted to: {extract_dir}")
+                    zipf.extractall(temp_dir)
+
+                # 创建内外层目录
+                outer_dir.mkdir(exist_ok=True)
+                inner_dir.mkdir(exist_ok=True)
+
+                # 定位解压后的实际内容目录
+                source = temp_dir / pdf_stem
+                if source.exists() and source.is_dir():
+                    extract_source = source
+                elif (temp_dir / cleaned_stem).exists() and (temp_dir / cleaned_stem).is_dir():
+                    extract_source = temp_dir / cleaned_stem
+                else:
+                    # 取第一个子目录
+                    dirs = [d for d in temp_dir.iterdir() if d.is_dir()]
+                    extract_source = dirs[0] if dirs else temp_dir
+
+                # 移动所有文件到内层目录
+                for item in extract_source.iterdir():
+                    shutil.move(str(item), str(inner_dir / item.name))
+
+                # 清理临时目录和zip
+                shutil.rmtree(temp_dir, ignore_errors=True)
+                if zip_path.exists():
+                    os.remove(zip_path)
+
+                # 复制原始 PDF 到外层
+                shutil.copy2(file_path, outer_dir / filename)
+
+                # 复制 .md 文件到外层(副本)
+                md_files = list(inner_dir.glob("*.md"))
+                if md_files:
+                    for md_file in md_files:
+                        shutil.copy2(md_file, outer_dir / md_file.name)
+                    print(f"  PDF + MD copied to outer dir")
+
+                print(f"  Extracted to: {inner_dir}")
                 return (filename, True, None)
             else:
                 error_msg = f"HTTP {response.status_code}: {response.text}"

+ 5 - 3
src/app/minerU/minerU.py

@@ -7,14 +7,16 @@ import shutil
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-API_URL = "http://183.220.37.46:25428/file_parse"
+API_URL = "http://183.220.37.46:25428/mineru/file_parse"
 #API_URL = "http://183.220.37.46:23428/mineru/file_parse"
+API_HEADERS = {
+    "Authorization": "Bearer sk_test_aC_2gg8BS5ImUScrpaHIKS5x6gdLO9Js_ba854894"
+}
 
 # 默认路径(可通过命令行参数覆盖)
 DEFAULT_INPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\input")
 DEFAULT_OUTPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\output")
 
-
 def clean_filename(name):
     """Replace bracket-like punctuation in *name* with underscores.
 
     Every ASCII or full-width parenthesis, square bracket, lenticular
     bracket, book-title mark and angle bracket becomes ``_``; underscores
     left at either end of the result are then stripped. All other
     characters, including other punctuation, are kept unchanged.
 
     Example: ``"1《法》(第39号)"`` -> ``"1_法__第39号"``.
     """
     # Full-width （） are listed next to ASCII () because Chinese file names
     # normally use the full-width form.
     return re.sub(r'[()（）\[\]【】《》<>]', '_', name).strip('_')
@@ -42,7 +44,7 @@ def parse_file(file_path, output_dir):
             }
 
             print(f"Processing: {filename}")
-            response = requests.post(API_URL, files=files, data=data)
+            response = requests.post(API_URL, files=files, data=data, headers=API_HEADERS)
 
             if response.status_code == 200:
                 zip_path = output_dir / f"{pdf_stem}_result.zip"

+ 157 - 0
src/app/minerU/minerU_2.py

@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+import os
+import re
+import requests
+import zipfile
+import shutil
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# MinerU endpoint that parse_file() posts each PDF to.
+API_URL = "http://183.220.37.46:25428/file_parse"
+# Alternative endpoint, currently disabled:
+#API_URL = "http://183.220.37.46:23428/mineru/file_parse"
+
+# Default paths (can be overridden with command-line arguments)
+DEFAULT_INPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\input")
+DEFAULT_OUTPUT_DIR = Path(r"I:\wangxun_dev_workspace\lq_data_wrokspace\bpf_pdf\output")
+
+
+def clean_filename(name):
+    """Replace bracket-like punctuation in *name* with underscores.
+
+    Every ASCII or full-width parenthesis, square bracket, lenticular
+    bracket, book-title mark and angle bracket becomes ``_``; underscores
+    left at either end of the result are then stripped. All other
+    characters, including other punctuation, are kept unchanged.
+
+    Example: ``"1《法》(第39号)"`` -> ``"1_法__第39号"``.
+    """
+    # Full-width （） are listed next to ASCII () because Chinese file names
+    # normally use the full-width form.
+    return re.sub(r'[()（）\[\]【】《》<>]', '_', name).strip('_')
+
+def _find_payload_dir(temp_dir, candidate_names):
+    """Return the directory under *temp_dir* that holds the parsed output."""
+    for name in candidate_names:
+        # An empty name would resolve to temp_dir itself, so skip it.
+        if name and (temp_dir / name).is_dir():
+            return temp_dir / name
+    entries = list(temp_dir.iterdir())
+    # Descend only into a sole wrapper directory. With several top-level
+    # entries (e.g. a .md file next to an images folder) the archive root is
+    # the payload; picking "the first sub-directory" would drop the rest.
+    if len(entries) == 1 and entries[0].is_dir():
+        return entries[0]
+    return temp_dir
+
+
+def _move_replacing(src, dst):
+    """Move *src* to *dst*, replacing anything an earlier run left at *dst*."""
+    # shutil.move() onto an existing directory nests the source inside it
+    # (images/images), so clear the destination first.
+    if dst.is_dir():
+        shutil.rmtree(dst)
+    elif dst.exists():
+        dst.unlink()
+    shutil.move(str(src), str(dst))
+
+
+def parse_file(file_path, output_dir, timeout=(10, 1800)):
+    """Send one PDF to the MinerU API and unpack the returned zip.
+
+    Resulting layout under *output_dir*::
+
+        <pdf stem>/                  outer dir: original PDF + copies of *.md
+        <pdf stem>/<cleaned stem>/   inner dir: everything from the zip
+
+    Args:
+        file_path: ``pathlib.Path`` of the PDF to upload.
+        output_dir: ``pathlib.Path`` of the directory that receives results.
+        timeout: ``(connect, read)`` timeout in seconds passed to
+            ``requests.post`` so a stalled server cannot block a worker
+            thread forever.
+
+    Returns:
+        ``(filename, success, error)`` where *error* is ``None`` on success
+        and a message string otherwise. Exceptions are caught and reported
+        through this tuple instead of being raised.
+    """
+    filename = file_path.name  # e.g. "1《中华人民共和国水土保持法》(主席令第39号).pdf"
+    pdf_stem = file_path.stem  # e.g. "1《中华人民共和国水土保持法》(主席令第39号)"
+    # e.g. "1_中华人民共和国水土保持法__主席令第39号" (edge underscores are
+    # stripped). Fall back to the raw stem when cleaning leaves nothing,
+    # otherwise inner_dir would collapse onto outer_dir.
+    cleaned_stem = clean_filename(pdf_stem) or pdf_stem
+
+    outer_dir = output_dir / pdf_stem        # outer level: original PDF name
+    inner_dir = outer_dir / cleaned_stem     # inner level: cleaned name
+    zip_path = output_dir / f"{pdf_stem}_result.zip"
+    temp_dir = output_dir / f"__temp_{pdf_stem}"
+
+    try:
+        data = {
+            'return_md': 'true',
+            'response_format_zip': 'true',
+            'return_original_file': 'true',
+            'return_middle_json': 'true',
+            'return_content_list': 'true',
+            'return_images': 'true'
+        }
+
+        print(f"Processing: {filename}")
+        # Keep the PDF open only for the upload itself.
+        with open(file_path, 'rb') as f:
+            files = {
+                'files': (filename, f, 'application/pdf')
+            }
+            response = requests.post(API_URL, files=files, data=data, timeout=timeout)
+
+        if response.status_code != 200:
+            error_msg = f"HTTP {response.status_code}: {response.text}"
+            print(f"  Error: {error_msg}")
+            return (filename, False, error_msg)
+
+        with open(zip_path, 'wb') as out_f:
+            out_f.write(response.content)
+        print(f"  Saved zip to: {zip_path}")
+
+        try:
+            # Start from an empty temp dir so leftovers of a failed earlier
+            # run cannot leak into this result.
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            temp_dir.mkdir(parents=True, exist_ok=True)
+            with zipfile.ZipFile(zip_path, 'r') as zipf:
+                zipf.extractall(temp_dir)
+
+            # Creates outer_dir as well.
+            inner_dir.mkdir(parents=True, exist_ok=True)
+
+            extract_source = _find_payload_dir(temp_dir, (pdf_stem, cleaned_stem))
+            for item in extract_source.iterdir():
+                _move_replacing(item, inner_dir / item.name)
+        finally:
+            # Remove the temp dir even when extraction or moving failed.
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+        if zip_path.exists():
+            os.remove(zip_path)
+
+        # Keep the original PDF next to the parsed output.
+        shutil.copy2(file_path, outer_dir / filename)
+
+        # Put a copy of every top-level .md file into the outer dir.
+        md_files = list(inner_dir.glob("*.md"))
+        if md_files:
+            for md_file in md_files:
+                shutil.copy2(md_file, outer_dir / md_file.name)
+            print("  PDF + MD copied to outer dir")
+
+        print(f"  Extracted to: {inner_dir}")
+        return (filename, True, None)
+    except Exception as e:
+        # Worker boundary: report the failure to main() instead of raising.
+        print(f"  Exception: {str(e)}")
+        return (filename, False, str(e))
+
+def main(input_dir, output_dir, max_workers=10):
+    """Parse every ``*.pdf`` directly inside *input_dir* concurrently.
+
+    Args:
+        input_dir: Directory scanned (non-recursively) for PDF files.
+        output_dir: Directory handed to ``parse_file`` for the results.
+        max_workers: Size of the thread pool.
+
+    Both directories are created when missing. A success/failure summary
+    is printed at the end; nothing is returned.
+    """
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+
+    # parents=True: the paths may be several levels deep, and a missing
+    # parent would otherwise raise FileNotFoundError before any work starts.
+    input_dir.mkdir(parents=True, exist_ok=True)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    pdf_files = list(input_dir.glob("*.pdf"))
+
+    if not pdf_files:
+        print("No PDF files found in input directory")
+        return
+
+    print(f"Found {len(pdf_files)} PDF file(s)")
+    print(f"Input: {input_dir}")
+    print(f"Output: {output_dir}")
+    print(f"Processing with {max_workers} concurrent workers\n")
+
+    success_count = 0
+    failed_files = []
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(parse_file, pdf_file, output_dir): pdf_file for pdf_file in pdf_files}
+
+        for future in as_completed(futures):
+            try:
+                filename, success, error = future.result()
+            except Exception as exc:
+                # A worker that raised must not abort the summary for the
+                # remaining files; record it as a failure instead.
+                filename, success, error = futures[future].name, False, str(exc)
+            if success:
+                success_count += 1
+            else:
+                failed_files.append((filename, error))
+
+    fail_count = len(failed_files)
+    print(f"\nDone! Success: {success_count}, Failed: {fail_count}")
+
+    if failed_files:
+        print("\nFailed files:")
+        for filename, error in failed_files:
+            print(f"  - {filename}: {error}")
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: collect the input/output directories and the
+    # worker count, then pass them to main().
+    cli = argparse.ArgumentParser(description='Parse PDF files using MinerU API')
+    cli.add_argument(
+        '-i', '--input',
+        type=str,
+        default=str(DEFAULT_INPUT_DIR),
+        help='Input directory containing PDF files',
+    )
+    cli.add_argument(
+        '-o', '--output',
+        type=str,
+        default=str(DEFAULT_OUTPUT_DIR),
+        help='Output directory for parsed results',
+    )
+    cli.add_argument(
+        '-w', '--workers',
+        type=int,
+        default=10,
+        help='Number of concurrent workers (default: 10)',
+    )
+    options = cli.parse_args()
+
+    main(
+        input_dir=options.input,
+        output_dir=options.output,
+        max_workers=options.workers,
+    )