#!/usr/bin/env python3
"""Check whether MsDataset.load() can download an image dataset.

Usage:
    script.py [DATASET_ID]

DATASET_ID is either "namespace/name" or a bare dataset name; when omitted,
DEFAULT_DATASET_ID is used. The script tries the common split names in turn,
falls back to loading without a split, and then prints a short preview of
the first few records.

Exit status is 0 on success and 1 on any failure.
"""
import itertools
import json
import sys
import traceback

DEFAULT_DATASET_ID = "tany0699/carBrands50"
CANDIDATE_SPLITS = ("train", "validation", "test")
# Number of records shown in the preview; the preview header string below
# spells out the same number.
PREVIEW_ROWS = 3
# Strings longer than this are truncated in the preview.
MAX_STR_PREVIEW = 100


def split_dataset_id(dataset_id):
    """Split a dataset id into ``(namespace, name)``.

    Only the first ``/`` separates the namespace; an id without ``/`` yields
    an empty namespace and the id itself as the name.
    """
    namespace, sep, ds_name = dataset_id.partition("/")
    if sep:
        return namespace, ds_name
    return "", dataset_id


def is_image(value):
    """Return True when *value* looks like a PIL image.

    The check is by class name over the whole MRO so that it needs no Pillow
    import and also matches subclasses of ``Image`` (a bare
    ``type(v).__name__ == "Image"`` test misses those). ``size`` and ``mode``
    are required because the preview prints both.
    """
    has_image_base = any(cls.__name__ == "Image" for cls in type(value).__mro__)
    return has_image_base and hasattr(value, "size") and hasattr(value, "mode")


def format_field(key, value):
    """Return the one-line preview text for a single record field."""
    if is_image(value):
        return f" {key}: PIL.Image (size={value.size}, mode={value.mode})"
    if isinstance(value, str) and len(value) > MAX_STR_PREVIEW:
        return f" {key}: str (len={len(value)}) '{value[:MAX_STR_PREVIEW]}...'"
    return f" {key}: {type(value).__name__} = {value}"


def load_dataset(ms_dataset, dataset_id):
    """Load *dataset_id* through ``ms_dataset.load`` and return the result.

    Each name in CANDIDATE_SPLITS is tried in order; the first truthy result
    wins. If none succeeds, one more attempt is made without ``split``.

    Args:
        ms_dataset: object exposing ``load(name, namespace=..., split=...)``
            (the script passes modelscope's ``MsDataset``).
        dataset_id: "namespace/name" or a bare dataset name.

    Raises:
        SystemExit: with status 1 when the load without a split also fails.
    """
    namespace, ds_name = split_dataset_id(dataset_id)
    # The namespace keyword is only passed when the id actually carries one.
    extra = {"namespace": namespace} if namespace else {}

    for split in CANDIDATE_SPLITS:
        try:
            ds = ms_dataset.load(ds_name, split=split, **extra)
            # Truthiness, not "is not None": a falsy result (None, or an
            # object reporting itself as empty) moves on to the next split.
            if ds:
                size = len(ds) if hasattr(ds, "__len__") else "?"
                print(f"加载 split='{split}' 成功, 共 {size} 条")
                return ds
        except Exception as e:
            print(f"split='{split}' 失败: {e}")

    try:
        ds = ms_dataset.load(ds_name, **extra)
    except Exception as e:
        print(f"不带 split 也失败: {e}")
        raise SystemExit(1)
    print(f"不带 split 加载成功, 类型: {type(ds)}")
    return ds


def preview_rows(ds):
    """Print the first PREVIEW_ROWS records of *ds*, one field per line."""
    # Show the first 3 records.
    print("\n=== 前 3 条数据 ===")
    # islice stops after PREVIEW_ROWS items instead of fetching one more
    # record just to discover the limit was reached.
    for index, row in enumerate(itertools.islice(ds, PREVIEW_ROWS)):
        print(f"\n--- Record {index} ---")
        if not hasattr(row, "items"):
            # Rows are not guaranteed to be mappings; show such a row as a
            # single value rather than failing on row.items().
            print(f" {type(row).__name__} = {row}")
            continue
        for key, value in row.items():
            print(format_field(key, value))


def main(argv=None):
    """Run the download check and return the process exit status."""
    argv = sys.argv if argv is None else argv
    dataset_id = argv[1] if len(argv) > 1 else DEFAULT_DATASET_ID
    namespace, ds_name = split_dataset_id(dataset_id)

    print(f"测试数据集: {dataset_id}")
    print(f"namespace: {namespace}, name: {ds_name}\n")
    print("=== 用 MsDataset.load() 下载 ===")

    try:
        # Imported here so that a missing/broken modelscope install is
        # reported through the same failure path as a failed download.
        from modelscope.msdatasets import MsDataset

        ds = load_dataset(MsDataset, dataset_id)
        if not hasattr(ds, "__iter__"):
            print(f"数据集不可迭代, 类型: {type(ds)}")
            return 1

        preview_rows(ds)
        print("\n=== 完成 ===")
    except Exception as e:
        print(f"失败: {e}")
        traceback.print_exc()
        # Signal the failure to the caller instead of exiting with 0.
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())