Maas2-group
/
Fine-tuning


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
							import os
import json
from pathlib import Path
from typing import Any

from app.config import get_settings
from app.core.db import async_session, ModelCache
from app.core.logging import logger
from sqlalchemy import select

settings = get_settings()


async def resolve_model_path(model_id: str) -> str | None:
    """Resolve the on-disk directory of a model across HuggingFace and ModelScope layouts.

    A directory is accepted only when it contains a ``config.json`` file.
    Candidates are tried in this order:

    1. The path recorded in the database for ``model_id``.
    2. HuggingFace-style flattened directory: ``models_dir/<namespace>_<name>``.
    3. ModelScope-style nested directory: ``models_dir/<namespace>/<name>``.
    4. A recursive scan of ``models_dir``. A directory whose name equals the
       model name wins; otherwise the first directory (in traversal order)
       whose path merely contains the model name is used.

    Args:
        model_id: Model identifier, typically of the form ``namespace/name``.

    Returns:
        The model directory as a string, or ``None`` when no candidate matches.
    """
    # Strategy 1: use the path stored in the database if it still holds a config.
    info = await get_model_info(model_id)
    if info and info.get("path"):
        db_path = Path(info["path"])
        if (db_path / "config.json").exists():
            return str(db_path)

    # Strategy 2: HuggingFace style (namespace and name flattened with "_").
    hf_path = settings.models_dir / model_id.replace("/", "_")
    if (hf_path / "config.json").exists():
        return str(hf_path)

    # Strategy 3: ModelScope style (nested namespace/name, possibly via symlinks).
    ms_path = settings.models_dir / model_id
    if (ms_path / "config.json").exists():
        return str(ms_path)

    # Strategy 4: scan everything under models_dir and match on the model name.
    model_name = model_id.split("/")[-1]
    if not model_name:
        # An empty name would substring-match every directory and return an
        # arbitrary, unrelated model.
        return None

    loose_match: Path | None = None
    for config_file in settings.models_dir.rglob("config.json"):
        candidate = config_file.parent
        if candidate.name == model_name:
            # An exact directory-name match always beats a substring match,
            # so "Qwen2" can no longer resolve to ".../Qwen2-7B" just because
            # that directory happened to be visited first.
            return str(candidate)
        if loose_match is None and model_name in str(candidate):
            loose_match = candidate

    return str(loose_match) if loose_match is not None else None


async def download_model(model_id: str, use_modelscope: bool = False) -> dict[str, Any]:
    """Download a model from HuggingFace or ModelScope and register it in the database.

    The model is stored under ``models_dir/<model_id with "/" replaced by "_">``.
    After the download, ``config.json`` (if present) is read to fill in the
    model type and context length, and the ``ModelCache`` row for ``model_id``
    is created or updated.

    Args:
        model_id: Repository identifier, typically ``namespace/name``.
        use_modelscope: When true, download through the ``modelscope`` CLI;
            otherwise use ``huggingface_hub.snapshot_download``.

    Returns:
        On success ``{"model_id", "status": "completed", "path"}``.
        On any ``Exception`` the error is logged and
        ``{"model_id", "status": "failed", "error"}`` is returned instead of
        raising.
    """
    import asyncio

    try:
        download_dir = str(settings.models_dir / model_id.replace("/", "_"))
        if use_modelscope:
            import subprocess

            # Call the CLI in a separate process so the download is isolated
            # from the FastAPI event loop (avoids the __aenter__ error), and
            # wait for it in a worker thread so the loop is not blocked for
            # up to an hour while the download runs.
            proc = await asyncio.to_thread(
                subprocess.run,
                [
                    "modelscope", "download",
                    "--model", model_id,
                    "--local_dir", download_dir,
                ],
                capture_output=True,
                text=True,
                timeout=3600,
            )
            if proc.returncode != 0:
                raise RuntimeError(f"modelscope CLI failed: {proc.stderr}")
            local_path = download_dir
        else:
            from huggingface_hub import snapshot_download

            # snapshot_download is a blocking call; run it off the event loop.
            local_path = await asyncio.to_thread(
                snapshot_download,
                repo_id=model_id,
                local_dir=download_dir,
                local_dir_use_symlinks=False,
            )

        # Read config.json to pick up model metadata; defaults apply otherwise.
        config_path = Path(local_path) / "config.json"
        model_type = "text"
        context_length = 2048
        # The same fixed list of PEFT methods is recorded for every model.
        peft_methods = "lora,qlora,ia3,adalora,prefix_tuning"

        if config_path.exists():
            with open(config_path, encoding="utf-8") as f:
                cfg = json.load(f)
            model_type = cfg.get("model_type", "text")
            # Use `or` so that an explicit null/0 in the config falls through
            # to the next key and finally to the default.
            context_length = (
                cfg.get("max_position_embeddings")
                or cfg.get("max_sequence_length")
                or 2048
            )

        # Write to the database (update the row if it already exists).
        async with async_session() as session:
            result = await session.execute(select(ModelCache).where(ModelCache.id == model_id))
            existing = result.scalar_one_or_none()
            if existing:
                existing.name = model_id.split("/")[-1]
                existing.model_type = model_type
                existing.path = local_path
                existing.is_downloaded = 1
                existing.context_length = context_length
                existing.supported_peft_methods = peft_methods
            else:
                record = ModelCache(
                    id=model_id,
                    name=model_id.split("/")[-1],
                    model_type=model_type,
                    path=local_path,
                    is_downloaded=1,
                    context_length=context_length,
                    supported_peft_methods=peft_methods,
                )
                session.add(record)
            await session.commit()

        logger.info(f"Model downloaded: {model_id} -> {local_path}")
        return {"model_id": model_id, "status": "completed", "path": local_path}
    except Exception as e:
        import traceback

        tb = traceback.format_exc()
        # Build the message once; it was previously referenced without ever
        # being defined, turning every failure into a NameError.
        error_msg = f"{type(e).__name__}: {e}"
        logger.error(f"Model download failed: {error_msg}")
        logger.error(f"Traceback:\n{tb}")
        return {"model_id": model_id, "status": "failed", "error": error_msg}


async def list_cached_models() -> list[dict[str, Any]]:
    """Return metadata for every model recorded in the database, newest first.

    Only the database is consulted for the list of models; the models
    directory is not scanned, so HuggingFace cache sub-directories cannot
    show up as models. Each entry's ``is_downloaded`` flag reflects whether a
    directory for the model actually exists on disk right now.
    """
    async with async_session() as session:
        query = select(ModelCache).order_by(ModelCache.created_at.desc())
        rows = (await session.execute(query)).scalars().all()

    def _describe(row) -> dict[str, Any]:
        # Check that the recorded directory really exists.
        present = row.path and Path(row.path).exists()
        if not present:
            # Fall back to the flattened directory name under models_dir and
            # report that path instead when it is there.
            fallback = settings.models_dir / row.id.replace("/", "_")
            present = fallback.exists()
            if present:
                row.path = str(fallback)

        peft = row.supported_peft_methods
        return {
            "id": row.id,
            "name": row.name,
            "model_type": row.model_type,
            "path": row.path,
            "is_downloaded": present,
            "context_length": row.context_length,
            "supported_peft_methods": peft.split(",") if peft else [],
        }

    return [_describe(row) for row in rows]


async def get_model_info(model_id: str) -> dict[str, Any] | None:
    """Look up the cached metadata of a single model.

    Args:
        model_id: Primary key of the ``ModelCache`` row to fetch.

    Returns:
        A dict describing the model, or ``None`` when no row exists.
        ``is_downloaded`` is true only when the row is flagged as downloaded
        and its recorded path exists on disk.
    """
    async with async_session() as session:
        lookup = select(ModelCache).where(ModelCache.id == model_id)
        row = (await session.execute(lookup)).scalar_one_or_none()
        if not row:
            return None

        # Without a recorded path the model cannot count as downloaded.
        if row.path:
            downloaded = bool(row.is_downloaded) and Path(row.path).exists()
        else:
            downloaded = False

        peft = row.supported_peft_methods
        return {
            "id": row.id,
            "name": row.name,
            "model_type": row.model_type,
            "path": row.path,
            "is_downloaded": downloaded,
            "context_length": row.context_length,
            "supported_peft_methods": peft.split(",") if peft else [],
        }


async def delete_model(model_id: str) -> dict[str, Any]:
    """Delete a cached model: its local files and its database record.

    The model directory is the path stored on the record, or
    ``models_dir/<id with "/" replaced by "_">`` when no path is stored.
    File removal is best-effort; the database record is deleted even when
    the files could not be removed.

    Args:
        model_id: Primary key of the ``ModelCache`` row to delete.

    Returns:
        ``{"status": "not_found", "message"}`` when no record exists,
        otherwise ``{"status": "deleted", "model_id", "files_deleted"}`` where
        ``files_deleted`` tells whether the model directory is gone from disk.
    """
    import shutil

    async with async_session() as session:
        result = await session.execute(select(ModelCache).where(ModelCache.id == model_id))
        record = result.scalar_one_or_none()
        if not record:
            return {"status": "not_found", "message": f"Model not found: {model_id}"}

        model_dir = Path(record.path) if record.path else settings.models_dir / record.id.replace("/", "_")
        deleted_files = False
        if model_dir.is_symlink():
            # Models downloaded via ModelScope may be symlinks: remove the
            # real directory the link points to.
            real_dir = model_dir.resolve()
            if real_dir.is_dir():
                shutil.rmtree(real_dir, ignore_errors=True)
            try:
                # shutil.rmtree refuses to operate on symlinks, so links must
                # be unlinked explicitly or they are left dangling.
                model_dir.unlink(missing_ok=True)
                # Also drop a symlinked parent (e.g. a namespace link), but
                # only when nothing else is reachable through it any more.
                parent_link = model_dir.parent
                if parent_link.is_symlink() and not (
                    parent_link.exists() and any(parent_link.iterdir())
                ):
                    parent_link.unlink(missing_ok=True)
            except OSError as exc:
                logger.warning(f"Failed to clean up symlink for {model_id}: {exc}")
            deleted_files = not real_dir.exists()
        elif model_dir.is_dir():
            shutil.rmtree(model_dir, ignore_errors=True)
            # rmtree errors are ignored, so verify the outcome instead of
            # assuming success.
            deleted_files = not model_dir.exists()

        # Remove the database record.
        await session.delete(record)
        await session.commit()

        logger.info(f"Model deleted: {model_id} (files={deleted_files})")
        return {"status": "deleted", "model_id": model_id, "files_deleted": deleted_files}