import os
import sys
import logging
from urllib.parse import urlparse

# Configure logging for the whole process.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("MinerU")

# Make the sibling ``src`` directory importable before loading project modules.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
from app.base.mineru_connection import get_mineru_manager
from app.base.async_mysql_connection import get_db_connection

# Maps a document's ``source_type`` to the sub-table that may hold its file link.
_SOURCE_TABLES = {
    "basis": "t_samp_standard_base_info",
    "work": "t_samp_construction_plan_base_info",
    "job": "t_samp_office_regulations",
}

# Candidate column names for the file link, probed in this order.
_URL_FIELDS = ('file_url', 'source_url', 'url')


def _lookup_fallback_url(cursor, source_type, source_id):
    """Return the first non-empty file link found in the sub-table, or None.

    Args:
        cursor: An open DB cursor; rows are read by column name, so this
            presumably is a dict-style cursor (confirm against
            ``get_db_connection``).
        source_type: Key into ``_SOURCE_TABLES`` selecting the sub-table.
        source_id: Primary key of the row in that sub-table.

    Returns:
        The first truthy value among the candidate URL columns, or ``None``
        when the source type is unknown or no column yields a value.
    """
    table_name = _SOURCE_TABLES.get(source_type)
    if not table_name:
        return None

    for field in _URL_FIELDS:
        try:
            # Identifiers come only from the hard-coded constants above, so
            # interpolating them is safe; the id is still bound as a parameter.
            cursor.execute(
                f"SELECT {field} FROM {table_name} WHERE id = %s",
                (source_id,),
            )
            url_row = cursor.fetchone()
            if url_row and url_row[field]:
                return url_row[field]
        except Exception:
            # Best-effort probing: not every sub-table has every candidate
            # column, so a failed query just means "try the next one".
            # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit
            # working.
            logger.debug(
                "fallback URL lookup failed for %s.%s",
                table_name, field, exc_info=True,
            )
            continue
    return None


def main_cli(doc_id):
    """MinerU command-line entry point, intended to be run by a background process.

    Looks up the document row in ``t_samp_document_main``, resolves its file
    link (falling back to the source-specific sub-table when the main row has
    none), and hands the document to the MinerU manager for processing.

    Failures are reported through ``manager.update_db_status`` with
    ``status=3`` (presumably the "failed" state; confirm against the manager)
    and are logged; nothing is raised to the caller for processing errors.

    Args:
        doc_id: Id of the document row in ``t_samp_document_main``.

    Returns:
        None.
    """
    manager = get_mineru_manager()
    conn = get_db_connection()
    if not conn:
        logger.error("数据库连接失败")
        return

    try:
        with conn.cursor() as cursor:
            # 1. Fetch the document's basic information.
            cursor.execute(
                "SELECT title, file_url, source_type, source_id FROM t_samp_document_main WHERE id = %s",
                (doc_id,),
            )
            row = cursor.fetchone()
            if not row:
                logger.warning(f"文档不存在: {doc_id}")
                return

            title = row['title']
            file_url = row['file_url']
            source_type = row['source_type']
            source_id = row['source_id']

            # 2. If the main table has no file_url, try the sub-table
            #    (compatibility path).
            if not file_url and source_type and source_id:
                file_url = _lookup_fallback_url(cursor, source_type, source_id)

            if not file_url:
                logger.error(f"未找到文件链接: {doc_id}")
                manager.update_db_status(doc_id, status=3, error="未找到文件链接(file_url)")
                return

            # 3. Hand the document over to the manager for conversion.
            logger.info(f"开始处理文档 [{doc_id}]: {title}")
            manager.process_document(doc_id, title, file_url)
    except Exception as e:
        # Top-level boundary: record the failure instead of crashing the
        # background process.
        logger.exception(f"处理文档 [{doc_id}] 时发生未捕获异常: {e}")
        manager.update_db_status(doc_id, status=3, error=str(e))
    finally:
        conn.close()


if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Take the last argument so both the legacy and the current invocation
        # formats work (the exact legacy argument layout is not visible here).
        target_id = sys.argv[-1]
        main_cli(target_id)
    else:
        print("Usage: python miner_u.py <doc_id>")