mineru_test.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. import sys
  2. import os
  3. import json
  4. import base64
  5. import urllib.request
  6. sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  7. import requests
  8. from concurrent.futures import ThreadPoolExecutor, as_completed
  9. from foundation.observability.monitoring.time_statistics import track_execution_time
  10. @track_execution_time
  11. def save_images_from_response(response_data, output_dir="test/mineru_temp"):
  12. """从API响应中提取并保存图片"""
  13. # 创建输出目录
  14. os.makedirs(output_dir, exist_ok=True)
  15. # 检查是否有图片数据
  16. if "results" not in response_data or "scan" not in response_data["results"]:
  17. print("❌ 响应中未找到图片数据")
  18. return
  19. scan_result = response_data["results"]["scan"]
  20. saved_count = 0
  21. # 保存base64图片
  22. if "images" in scan_result:
  23. print(f"发现 {len(scan_result['images'])} 个base64图片")
  24. for filename, base64_data in scan_result["images"].items():
  25. try:
  26. # 解析base64数据格式
  27. if base64_data.startswith("data:image/"):
  28. # 提取实际的base64数据
  29. header, base64_string = base64_data.split(",", 1)
  30. file_extension = header.split("/")[1].split(";")[0]
  31. # 解码base64
  32. image_data = base64.b64decode(base64_string)
  33. # 保存图片文件
  34. output_filename = f"table_{saved_count}.{file_extension}"
  35. output_path = os.path.join(output_dir, output_filename)
  36. with open(output_path, "wb") as f:
  37. f.write(image_data)
  38. print(f"✅ 保存图片: {output_filename} ({len(image_data)} bytes)")
  39. saved_count += 1
  40. else:
  41. # 直接base64数据
  42. image_data = base64.b64decode(base64_data)
  43. output_path = os.path.join(output_dir, f"table_{saved_count}.jpg")
  44. with open(output_path, "wb") as f:
  45. f.write(image_data)
  46. print(f"✅ 保存图片: table_{saved_count}.jpg ({len(image_data)} bytes)")
  47. saved_count += 1
  48. except Exception as e:
  49. print(f"❌ 保存图片 {filename} 失败: {e}")
  50. # 检查middle_json中的图片信息(如果有额外的)
  51. if "middle_json" in scan_result:
  52. try:
  53. middle_json = json.loads(scan_result["middle_json"])
  54. # 查找图片路径和HTML信息
  55. for pdf_info in middle_json.get("pdf_info", []):
  56. for preproc_block in pdf_info.get("preproc_blocks", []):
  57. if preproc_block.get("type") == "table":
  58. for block in preproc_block.get("blocks", []):
  59. if block.get("type") == "table_body":
  60. for line in block.get("lines", []):
  61. for span in line.get("spans", []):
  62. if span.get("type") == "table":
  63. # 保存HTML内容
  64. html_content = span.get('html', 'N/A')
  65. if html_content != 'N/A':
  66. html_file = os.path.join(output_dir, f"table_html_{saved_count}.html")
  67. with open(html_file, "w", encoding="utf-8") as f:
  68. f.write(html_content)
  69. print(f"✅ 保存表格HTML: table_html_{saved_count}.html")
  70. except json.JSONDecodeError as e:
  71. print(f"❌ 解析middle_json失败: {e}")
  72. print(f"✅ 图片保存完成,共保存 {saved_count} 个图片文件到 {output_dir}")
  73. return saved_count
  74. @track_execution_time
  75. def save_middle_json(response_data, output_dir="test/mineru_temp"):
  76. """提取并保存middle_json为单独文件"""
  77. os.makedirs(output_dir, exist_ok=True)
  78. if "results" not in response_data or "scan" not in response_data["results"]:
  79. print("❌ 响应中未找到middle_json数据")
  80. return False
  81. scan_result = response_data["results"]["scan"]
  82. if "middle_json" not in scan_result:
  83. print("❌ scan结果中未找到middle_json")
  84. return False
  85. try:
  86. # 解析middle_json字符串
  87. middle_json_data = json.loads(scan_result["middle_json"])
  88. # 保存格式化的middle_json
  89. middle_json_file = os.path.join(output_dir, "middle_json_pretty.json")
  90. with open(middle_json_file, "w", encoding="utf-8") as f:
  91. json.dump(middle_json_data, f, ensure_ascii=False, indent=4)
  92. print(f"✅ Middle JSON已保存到 {middle_json_file}")
  93. # 打印middle_json的基本信息
  94. if "pdf_info" in middle_json_data:
  95. pdf_info = middle_json_data["pdf_info"]
  96. print(f"📄 PDF信息: {len(pdf_info)} 页")
  97. # 统计各类型块的数量
  98. total_blocks = 0
  99. table_count = 0
  100. text_count = 0
  101. title_count = 0
  102. for page_info in pdf_info:
  103. preproc_blocks = page_info.get("preproc_blocks", [])
  104. total_blocks += len(preproc_blocks)
  105. for block in preproc_blocks:
  106. block_type = block.get("type", "")
  107. if block_type == "table":
  108. table_count += 1
  109. elif block_type == "text":
  110. text_count += 1
  111. elif block_type == "title":
  112. title_count += 1
  113. print(f"📊 统计信息:")
  114. print(f" 总块数: {total_blocks}")
  115. print(f" 表格块: {table_count}")
  116. print(f" 文本块: {text_count}")
  117. print(f" 标题块: {title_count}")
  118. return True
  119. except json.JSONDecodeError as e:
  120. print(f"❌ 解析middle_json失败: {e}")
  121. return False
  122. except Exception as e:
  123. print(f"❌ 保存middle_json失败: {e}")
  124. return False
  125. def parse_file():
  126. """使用 API 解析文件"""
  127. # 配置 API URL
  128. API_URL = "http://aiclu.small-app.wang:8000/file_parse"
  129. # 1. 准备文件(本地 PDF/图像)
  130. files = [
  131. ("files", ("scan.png", open("test/sgfa_mineru_testimage.png", "rb"), "image/png")), # 支持多文件
  132. ]
  133. # 2. 准备参数
  134. data = {
  135. "backend": "pipeline",
  136. "parse_method": "auto",
  137. "formula_enable": "true",
  138. "table_enable": "true",
  139. "return_md": "true", # 返回 Markdown 内容
  140. "start_page_id": 0,
  141. "end_page_id": None, # 解析所有页
  142. "return_middle_json": "true", # 确保返回middle_json以获取图片信息
  143. "return_images": "true",
  144. "return_middle_json": "true",
  145. "response_format_zip": "true",
  146. }
  147. # 3. 发送请求
  148. response = requests.post(API_URL, files=files, data=data)
  149. try:
  150. result = response.json()
  151. # 打印主要响应内容
  152. print("=== API响应概要 ===")
  153. if "results" in result and "scan" in result["results"]:
  154. scan_result = result["results"]["scan"]
  155. print(f"MD内容长度: {len(scan_result.get('md_content', ''))}")
  156. print(f"包含middle_json: {'middle_json' in scan_result}")
  157. print(f"包含images: {'images' in scan_result}")
  158. print(f"Backend: {result.get('backend', 'N/A')}")
  159. print(f"Version: {result.get('version', 'N/A')}")
  160. # 提取并保存 middle_json
  161. middle_json_saved = save_middle_json(result)
  162. # 保存图片信息
  163. saved_images = save_images_from_response(result)
  164. # 保存完整响应到JSON文件
  165. with open("test/mineru_temp/api_response.json", "w", encoding="utf-8") as f:
  166. json.dump(result, f, ensure_ascii=False, indent=2)
  167. print("✅ 完整API响应已保存到 test/mineru_temp/api_response.json")
  168. # 显示保存结果统计
  169. print(f"\n=== 保存统计 ===")
  170. print(f"Middle JSON保存: {'✅ 成功' if middle_json_saved else '❌ 失败'}")
  171. print(f"图片文件保存: {saved_images} 个")
  172. except json.JSONDecodeError as e:
  173. print(f"❌ 解析JSON响应失败: {e}")
  174. print("原始响应:", response.text)
  175. parse_file()