# miner_u.py (3.1 KB)
  1. import os
  2. import sys
  3. import logging
  4. from urllib.parse import urlparse
  5. # 配置日志
  6. logging.basicConfig(
  7. level=logging.INFO,
  8. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  9. )
  10. logger = logging.getLogger("MinerU")
  11. # 导入配置和管理器
  12. sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
  13. from app.base.mineru_connection import get_mineru_manager
  14. from app.base.async_mysql_connection import get_db_connection
  15. def main_cli(doc_id):
  16. """
  17. MinerU 命令行入口,供后台进程调用
  18. """
  19. manager = get_mineru_manager()
  20. conn = get_db_connection()
  21. if not conn:
  22. logger.error("数据库连接失败")
  23. return
  24. try:
  25. with conn.cursor() as cursor:
  26. # 1. 获取文档基本信息
  27. cursor.execute("SELECT title, file_url, source_type, source_id FROM t_samp_document_main WHERE id = %s", (doc_id,))
  28. row = cursor.fetchone()
  29. if not row:
  30. logger.warning(f"文档不存在: {doc_id}")
  31. return
  32. title = row['title']
  33. file_url = row['file_url']
  34. source_type = row['source_type']
  35. source_id = row['source_id']
  36. # 2. 如果主表没有 file_url,尝试从子表获取 (兼容逻辑)
  37. if not file_url and source_type and source_id:
  38. TABLE_MAP = {
  39. "basis": "t_samp_standard_base_info",
  40. "work": "t_samp_construction_plan_base_info",
  41. "job": "t_samp_office_regulations"
  42. }
  43. table_name = TABLE_MAP.get(source_type)
  44. if table_name:
  45. url_fields = ['file_url', 'source_url', 'url']
  46. for field in url_fields:
  47. try:
  48. cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = %s", (source_id,))
  49. url_row = cursor.fetchone()
  50. if url_row and url_row[field]:
  51. file_url = url_row[field]
  52. break
  53. except:
  54. continue
  55. if not file_url:
  56. logger.error(f"未找到文件链接: {doc_id}")
  57. manager.update_db_status(doc_id, status=3, error="未找到文件链接(file_url)")
  58. return
  59. # 3. 调用管理器执行转换
  60. logger.info(f"开始处理文档 [{doc_id}]: {title}")
  61. manager.process_document(doc_id, title, file_url)
  62. except Exception as e:
  63. logger.exception(f"处理文档 [{doc_id}] 时发生未捕获异常: {e}")
  64. manager.update_db_status(doc_id, status=3, error=str(e))
  65. finally:
  66. conn.close()
  67. if __name__ == "__main__":
  68. if len(sys.argv) > 1:
  69. # 兼容旧格式: python miner_u.py <table_type> <doc_id>
  70. # 兼容新格式: python miner_u.py <doc_id>
  71. target_id = sys.argv[-1]
  72. main_cli(target_id)
  73. else:
  74. print("Usage: python miner_u.py <doc_id>")