# CRBC-MaaS-Platform-Project / LQAdminPlatform
							import os
import sys
import logging
from urllib.parse import urlparse

# Logging setup: configure the root logger once at import time (INFO level,
# timestamp / logger name / level / message layout) and expose the
# module-wide "MinerU" logger used by everything below.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("MinerU")

# Import the project configuration and manager helpers.
# The sibling ``../src`` directory (relative to this file) is pushed to the
# front of sys.path BEFORE the imports below run - presumably that is where
# the ``app`` package lives (confirm against the repository layout). The
# order of these three lines therefore matters.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
from app.base.mineru_connection import get_mineru_manager
from app.base.async_mysql_connection import get_db_connection

def _lookup_fallback_file_url(cursor, source_type, source_id):
    """Probe the source-specific detail table for a file URL.

    Used as a compatibility path when the main document row carries no
    ``file_url``. The detail table is chosen by ``source_type`` and the
    candidate columns ``file_url``, ``source_url`` and ``url`` are tried in
    that order.

    Args:
        cursor: Open database cursor. Rows are read by column name, so it
            must yield mapping-like rows.
        source_type: Selects the detail table ("basis", "work" or "job").
        source_id: Value matched against the ``id`` column of that table.

    Returns:
        The first non-empty URL found, or ``None`` when the source type is
        unknown or no candidate column yields a value.
    """
    table_by_source_type = {
        "basis": "t_samp_standard_base_info",
        "work": "t_samp_construction_plan_base_info",
        "job": "t_samp_office_regulations",
    }
    table_name = table_by_source_type.get(source_type)
    if not table_name:
        return None

    # Table and column names come exclusively from the hard-coded values in
    # this function (never from caller input), so interpolating them into the
    # statement is safe; the row id is still passed as a bound parameter.
    for field in ('file_url', 'source_url', 'url'):
        try:
            cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = %s", (source_id,))
            url_row = cursor.fetchone()
            if url_row and url_row[field]:
                return url_row[field]
        except Exception as exc:
            # Best-effort probe: a failure for one candidate column (for
            # example a column this table does not have) must not abort the
            # search. Catch Exception rather than using a bare ``except`` so
            # KeyboardInterrupt / SystemExit still propagate, and keep a
            # debug trace so real database errors are not lost silently.
            logger.debug("从 %s.%s 读取文件链接失败: %s", table_name, field, exc)
    return None


def main_cli(doc_id) -> None:
    """
    MinerU command-line entry point, intended to be called by a background process.

    Loads the document row ``doc_id`` from ``t_samp_document_main``, resolves
    its file URL (falling back to the source-specific detail table when the
    main row has none) and hands the document to the MinerU manager for
    conversion.

    Args:
        doc_id: Value matched against ``t_samp_document_main.id``.

    Returns:
        None. Failures are logged; when no file URL can be found or an
        unexpected exception occurs, the manager's ``update_db_status`` is
        called with ``status=3`` and an error message (presumably the
        "failed" state - confirm against the manager implementation).
    """
    manager = get_mineru_manager()
    conn = get_db_connection()
    if not conn:
        logger.error("数据库连接失败")
        return

    try:
        with conn.cursor() as cursor:
            # 1. Fetch the document's basic information.
            cursor.execute("SELECT title, file_url, source_type, source_id FROM t_samp_document_main WHERE id = %s", (doc_id,))
            row = cursor.fetchone()
            if not row:
                logger.warning(f"文档不存在: {doc_id}")
                return

            title = row['title']
            file_url = row['file_url']
            source_type = row['source_type']
            source_id = row['source_id']

            # 2. The main table has no file_url: try the detail table
            #    (compatibility logic).
            if not file_url and source_type and source_id:
                file_url = _lookup_fallback_file_url(cursor, source_type, source_id)

            if not file_url:
                logger.error(f"未找到文件链接: {doc_id}")
                manager.update_db_status(doc_id, status=3, error="未找到文件链接(file_url)")
                return

            # 3. Hand the document over to the manager for conversion.
            logger.info(f"开始处理文档 [{doc_id}]: {title}")
            manager.process_document(doc_id, title, file_url)

    except Exception as e:
        # Top-level boundary: log with traceback and record the failure.
        logger.exception(f"处理文档 [{doc_id}] 时发生未捕获异常: {e}")
        manager.update_db_status(doc_id, status=3, error=str(e))
    finally:
        # Always release the connection, whichever path was taken above.
        conn.close()

if __name__ == "__main__":
    # Script entry point. The document id is always read from the LAST
    # command-line argument, which keeps both invocation forms working:
    #   legacy:  python miner_u.py <table_type> <doc_id>
    #   current: python miner_u.py <doc_id>
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python miner_u.py <doc_id>")
    else:
        main_cli(cli_args[-1])