# mineru_test.py
# -*- coding: utf-8 -*-
  2. import sys
  3. import os
  4. import json
  5. import base64
  6. import urllib.request
  7. # 获取当前脚本的绝对路径
  8. current_file = os.path.abspath(__file__)
  9. # 获取脚本所在目录(MinerU_Test)
  10. current_dir = os.path.dirname(current_file)
  11. # 获取上一级目录(utils_test)
  12. parent_dir = os.path.dirname(current_dir)
  13. # 获取项目根目录(LQAgentPlatform,即 foundation 所在的目录)
  14. root_dir = os.path.dirname(parent_dir)
  15. # 将项目根目录添加到 sys.path
  16. sys.path.append(root_dir)
  17. import requests
  18. from concurrent.futures import ThreadPoolExecutor, as_completed
  19. from foundation.observability.monitoring.time_statistics import track_execution_time
  20. @track_execution_time
  21. def save_images_from_response(response_data, output_dir="test/mineru_temp"):
  22. """从API响应中提取并保存图片"""
  23. # 创建输出目录
  24. os.makedirs(output_dir, exist_ok=True)
  25. # 检查是否有图片数据
  26. if "results" not in response_data or "scan" not in response_data["results"]:
  27. print("❌ 响应中未找到图片数据")
  28. return
  29. scan_result = response_data["results"]["scan"]
  30. saved_count = 0
  31. # 保存base64图片
  32. if "images" in scan_result:
  33. print(f"发现 {len(scan_result['images'])} 个base64图片")
  34. for filename, base64_data in scan_result["images"].items():
  35. try:
  36. # 解析base64数据格式
  37. if base64_data.startswith("data:image/"):
  38. # 提取实际的base64数据
  39. header, base64_string = base64_data.split(",", 1)
  40. file_extension = header.split("/")[1].split(";")[0]
  41. # 解码base64
  42. image_data = base64.b64decode(base64_string)
  43. # 保存图片文件
  44. output_filename = f"table_{saved_count}.{file_extension}"
  45. output_path = os.path.join(output_dir, output_filename)
  46. with open(output_path, "wb") as f:
  47. f.write(image_data)
  48. print(f"✅ 保存图片: {output_filename} ({len(image_data)} bytes)")
  49. saved_count += 1
  50. else:
  51. # 直接base64数据
  52. image_data = base64.b64decode(base64_data)
  53. output_path = os.path.join(output_dir, f"table_{saved_count}.jpg")
  54. with open(output_path, "wb") as f:
  55. f.write(image_data)
  56. print(f"✅ 保存图片: table_{saved_count}.jpg ({len(image_data)} bytes)")
  57. saved_count += 1
  58. except Exception as e:
  59. print(f"❌ 保存图片 {filename} 失败: {e}")
  60. # 检查middle_json中的图片信息(如果有额外的)
  61. if "middle_json" in scan_result:
  62. try:
  63. middle_json = json.loads(scan_result["middle_json"])
  64. # 查找图片路径和HTML信息
  65. for pdf_info in middle_json.get("pdf_info", []):
  66. for preproc_block in pdf_info.get("preproc_blocks", []):
  67. if preproc_block.get("type") == "table":
  68. for block in preproc_block.get("blocks", []):
  69. if block.get("type") == "table_body":
  70. for line in block.get("lines", []):
  71. for span in line.get("spans", []):
  72. if span.get("type") == "table":
  73. # 保存HTML内容
  74. html_content = span.get('html', 'N/A')
  75. if html_content != 'N/A':
  76. html_file = os.path.join(output_dir, f"table_html_{saved_count}.html")
  77. with open(html_file, "w", encoding="utf-8") as f:
  78. f.write(html_content)
  79. print(f"✅ 保存表格HTML: table_html_{saved_count}.html")
  80. except json.JSONDecodeError as e:
  81. print(f"❌ 解析middle_json失败: {e}")
  82. print(f"✅ 图片保存完成,共保存 {saved_count} 个图片文件到 {output_dir}")
  83. return saved_count
  84. @track_execution_time
  85. def save_middle_json(response_data, output_dir="test/mineru_temp"):
  86. """提取并保存middle_json为单独文件"""
  87. os.makedirs(output_dir, exist_ok=True)
  88. if "results" not in response_data or "scan" not in response_data["results"]:
  89. print("❌ 响应中未找到middle_json数据")
  90. return False
  91. scan_result = response_data["results"]["scan"]
  92. if "middle_json" not in scan_result:
  93. print("❌ scan结果中未找到middle_json")
  94. return False
  95. try:
  96. # 解析middle_json字符串
  97. middle_json_data = json.loads(scan_result["middle_json"])
  98. # 保存格式化的middle_json
  99. middle_json_file = os.path.join(output_dir, "middle_json_pretty.json")
  100. with open(middle_json_file, "w", encoding="utf-8") as f:
  101. json.dump(middle_json_data, f, ensure_ascii=False, indent=4)
  102. print(f"✅ Middle JSON已保存到 {middle_json_file}")
  103. # 打印middle_json的基本信息
  104. if "pdf_info" in middle_json_data:
  105. pdf_info = middle_json_data["pdf_info"]
  106. print(f"📄 PDF信息: {len(pdf_info)} 页")
  107. # 统计各类型块的数量
  108. total_blocks = 0
  109. table_count = 0
  110. text_count = 0
  111. title_count = 0
  112. for page_info in pdf_info:
  113. preproc_blocks = page_info.get("preproc_blocks", [])
  114. total_blocks += len(preproc_blocks)
  115. for block in preproc_blocks:
  116. block_type = block.get("type", "")
  117. if block_type == "table":
  118. table_count += 1
  119. elif block_type == "text":
  120. text_count += 1
  121. elif block_type == "title":
  122. title_count += 1
  123. print(f"📊 统计信息:")
  124. print(f" 总块数: {total_blocks}")
  125. print(f" 表格块: {table_count}")
  126. print(f" 文本块: {text_count}")
  127. print(f" 标题块: {title_count}")
  128. return True
  129. except json.JSONDecodeError as e:
  130. print(f"❌ 解析middle_json失败: {e}")
  131. return False
  132. except Exception as e:
  133. print(f"❌ 保存middle_json失败: {e}")
  134. return False
  135. @track_execution_time
  136. def parse_file():
  137. """使用 API 解析文件"""
  138. API_URL = "http://aiclu.small-app.wang:8000/file_parse"
  139. files = [
  140. ("files", ("scan.png", open("utils_test\MinerU_Test\sgfa_mineru_testimage.png", "rb"), "image/png")),
  141. ]
  142. data = {
  143. "backend": "pipeline",
  144. "parse_method": "auto",
  145. "formula_enable": "true",
  146. "table_enable": "true",
  147. "return_md": "true",
  148. "start_page_id": 0,
  149. "end_page_id": None,
  150. "return_middle_json": "true",
  151. "return_images": "true",
  152. "response_format_zip": "true", # 保持为 true
  153. }
  154. response = requests.post(API_URL, files=files, data=data)
  155. # 检查响应是否为 ZIP 包(通过响应头或内容判断)
  156. if "application/zip" in response.headers.get("Content-Type", "") or response.content.startswith(b"PK"):
  157. # 保存 ZIP 包到本地
  158. zip_output_dir = "utils_test\MinerU_Test\mineru_temp"
  159. os.makedirs(zip_output_dir, exist_ok=True)
  160. zip_file_path = os.path.join(zip_output_dir, "api_response.zip")
  161. # 保存二进制 ZIP 数据
  162. with open(zip_file_path, "wb") as f:
  163. f.write(response.content)
  164. print(f"✅ ZIP 包已保存到: {zip_file_path} ({len(response.content)} bytes)")
  165. # 自动解压 ZIP 包
  166. import zipfile
  167. with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
  168. zip_ref.extractall(os.path.join(zip_output_dir, "unzipped"))
  169. print(f"✅ ZIP 包已解压到: {os.path.join(zip_output_dir, 'unzipped')}")
  170. # 解压后可以读取里面的 JSON/图片文件(示例:读取 middle_json)
  171. unzipped_dir = os.path.join(zip_output_dir, "unzipped")
  172. middle_json_path = os.path.join(unzipped_dir, "middle_json.json") # 假设 ZIP 内有这个文件(需根据实际 ZIP 结构调整)
  173. if os.path.exists(middle_json_path):
  174. with open(middle_json_path, "r", encoding="utf-8") as f:
  175. middle_json_data = json.load(f)
  176. print(f"📊 解压后读取到 middle_json,包含 {len(middle_json_data.get('pdf_info', []))} 页 PDF 信息")
  177. else:
  178. # 若 API 未返回 ZIP(兼容情况),尝试按 JSON 解析
  179. try:
  180. result = response.json()
  181. print("=== API响应概要(JSON格式)===")
  182. # 后续逻辑和原代码一致...
  183. except json.JSONDecodeError as e:
  184. print(f"❌ 解析失败:既不是 ZIP 也不是 JSON - {e}")
  185. parse_file()