#!/usr/bin/env python3
"""Smoke-test the full download flow of ``MsDataset.load()``.

Usage::

    python <this_script> [dataset_id]

``dataset_id`` is either ``namespace/name`` or a bare ``name``; it defaults to
``DEFAULT_DATASET_ID``. The script tries each split in ``SPLITS`` until one
loads, prints a short description of the first ``PREVIEW_ROWS`` records and
exits with status 1 if no split could be loaded.
"""
import itertools
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import Any, Callable, Iterable, Optional, Tuple

DEFAULT_DATASET_ID = "tany0699/carBrands50"
SPLITS = ("train", "validation", "test")
# Image modes that are converted to RGB before the trial JPEG save.
MODES_CONVERTED_TO_RGB = ("RGBA", "P", "LA")
PREVIEW_ROWS = 2
# Strings longer than this are summarised by length instead of being printed.
LONG_STR_THRESHOLD = 100


def split_dataset_id(dataset_id: str) -> Tuple[str, str]:
    """Split ``dataset_id`` into ``(namespace, name)`` at the first ``/``.

    An id without ``/`` yields an empty namespace and the id itself as the
    name, instead of raising ``ValueError`` on tuple unpacking.
    """
    namespace, sep, name = dataset_id.partition("/")
    if not sep:
        return "", dataset_id
    return namespace, name


def load_first_split(
    load: Callable[..., Any],
    dataset_id: str,
    splits: Iterable[str] = SPLITS,
) -> Optional[Any]:
    """Return the first truthy dataset that ``load`` yields for ``splits``.

    Args:
        load: Callable invoked as ``load(name, namespace=..., split=...)`` when
            the id has a namespace, otherwise as ``load(dataset_id, split=...)``.
        dataset_id: ``namespace/name`` or bare ``name``.
        splits: Split names to try, in order.

    Returns:
        The loaded dataset, or ``None`` when every split failed or produced a
        falsy result.
    """
    namespace, ds_name = split_dataset_id(dataset_id)
    for split in splits:
        # Deliberately broad: this is a smoke test, so any failure of one
        # split is reported and the next split is tried.
        try:
            if namespace:
                ds = load(ds_name, namespace=namespace, split=split)
            else:
                ds = load(dataset_id, split=split)
            # NOTE(review): truthiness is used as in the original script, so a
            # dataset whose __len__ is 0 is skipped -- confirm this is intended.
            if ds:
                total = len(ds) if hasattr(ds, "__len__") else "?"
                print(f"split='{split}' 成功, 共 {total} 条")
                return ds
        except Exception as e:
            print(f"split='{split}' 失败: {e}")
    return None


def measure_jpeg_size(image: Any) -> Tuple[Any, int]:
    """Save ``image`` as a temporary JPEG and return ``(image, size_in_bytes)``.

    Images whose mode is in ``MODES_CONVERTED_TO_RGB`` are converted to RGB
    first; the returned image is the (possibly converted) one that was saved.
    The temporary file is always removed, even when saving fails.
    """
    if image.mode in MODES_CONVERTED_TO_RGB:
        image = image.convert("RGB")
    # mkstemp + immediate close: no descriptor is leaked and the path can be
    # reopened by save() on platforms that forbid opening a file twice.
    fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        image.save(tmp_path, format="JPEG", quality=90)
        size = os.path.getsize(tmp_path)
    finally:
        os.unlink(tmp_path)
    return image, size


def describe_value(key: Any, value: Any, image_type: Optional[type] = None) -> str:
    """Return the one-line description printed for a record field.

    Args:
        key: Field name.
        value: Field value.
        image_type: Class treated as an image (``PIL.Image.Image`` when called
            from ``main``); ``None`` disables the image branch.
    """
    if image_type is not None and isinstance(value, image_type):
        image, size = measure_jpeg_size(value)
        return (
            f" {key}: PIL.Image ({image.size[0]}x{image.size[1]}, "
            f"mode={image.mode}) -> saved as {size} bytes"
        )
    if isinstance(value, str) and len(value) > LONG_STR_THRESHOLD:
        return f" {key}: str (len={len(value)})"
    return f" {key}: {type(value).__name__} = {value}"


def preview_records(
    ds: Iterable[Any],
    image_type: Optional[type] = None,
    limit: int = PREVIEW_ROWS,
) -> None:
    """Print a description of each field of the first ``limit`` rows of ``ds``.

    Rows are accessed through ``row.items()``.
    """
    # islice stops after `limit` rows without pulling an extra one from ds.
    for index, row in enumerate(itertools.islice(ds, limit)):
        print(f"\n--- Record {index} ---")
        for key, value in row.items():
            print(describe_value(key, value, image_type))


def main(argv: Optional[list] = None) -> None:
    """Run the smoke test; ``argv`` defaults to ``sys.argv``."""
    args = sys.argv if argv is None else argv
    dataset_id = args[1] if len(args) > 1 else DEFAULT_DATASET_ID
    print(f"数据集: {dataset_id}\n")

    # Exercise MsDataset.load()
    print("=== 用 MsDataset.load() 下载 ===")
    # Imported here, as in the original flow, so the banners above are printed
    # before the third-party imports run.
    from modelscope.msdatasets import MsDataset
    from PIL import Image

    ds = load_first_split(MsDataset.load, dataset_id)
    if not ds:
        print("所有 split 都失败")
        sys.exit(1)

    # Inspect the first records
    print(f"\n=== 前 {PREVIEW_ROWS} 条数据 ===")
    preview_records(ds, Image.Image)
    print("\n=== 测试通过! ===")


if __name__ == "__main__":
    main()