| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- import os
- import sys
- import logging
- from urllib.parse import urlparse
# Configure process-wide logging for the command-line run.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("MinerU")

# Put the sibling ``../src`` directory first on the import path so the
# project imports that follow (configuration and managers) can be resolved.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
- from app.base.mineru_connection import get_mineru_manager
- from app.base.async_mysql_connection import get_db_connection
def main_cli(doc_id):
    """Run the MinerU conversion for a single document.

    Command-line entry point, intended to be invoked by a background process.
    Reads the document's ``title`` and ``file_url`` from
    ``t_samp_document_main`` and passes them to the manager's
    ``process_document``.

    Args:
        doc_id: Value matched against the ``id`` column of
            ``t_samp_document_main``.

    Returns:
        None. Outcomes are reported through log messages; when the file URL
        is missing or an exception is raised, the manager is additionally
        asked to record ``status=3`` together with an error message.
    """
    manager = get_mineru_manager()
    conn = get_db_connection()
    if not conn:
        logger.error("数据库连接失败")
        return

    try:
        # Fetch the document's basic info (the main table is the single
        # asset registry). The cursor is scoped to this query only, so it is
        # released before the potentially long-running conversion starts.
        with conn.cursor() as cursor:
            cursor.execute(
                "SELECT title, file_url FROM t_samp_document_main WHERE id = %s",
                (doc_id,),
            )
            row = cursor.fetchone()

        if not row:
            logger.warning(f"文档不存在: {doc_id}")
            return

        # NOTE(review): rows are accessed by column name, which assumes a
        # dict-style cursor -- confirm against get_db_connection().
        title = row['title']
        file_url = row['file_url']
        if not file_url:
            logger.error(f"未找到文件链接: {doc_id}")
            manager.update_db_status(doc_id, status=3, error="未找到文件链接(file_url)")
            return

        # Hand the document over to the manager for conversion.
        logger.info(f"开始处理文档 [{doc_id}]: {title}")
        manager.process_document(doc_id, title, file_url)
    except Exception as e:
        # Top-level boundary: log with traceback and record the error on the
        # document instead of letting the background process crash.
        logger.exception(f"处理文档 [{doc_id}] 时发生未捕获异常: {e}")
        manager.update_db_status(doc_id, status=3, error=str(e))
    finally:
        conn.close()
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python miner_u.py <doc_id>")
    else:
        # Both invocation forms are accepted; the document id is always the
        # last argument:
        #   legacy:  python miner_u.py <table_type> <doc_id>
        #   current: python miner_u.py <doc_id>
        main_cli(cli_args[-1])
|