| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566 |
#!/usr/bin/env python3
"""Test whether MsDataset.load() can correctly download an image dataset.

Usage:
    python <this script> [namespace/dataset_name]

The dataset id defaults to ``tany0699/carBrands50``. The script tries the
``train``, ``validation`` and ``test`` splits in order, falls back to loading
without a split, and prints a short preview of the first few records.

The process exits with status 0 on success and 1 on any failure.
"""
import itertools
import json
import sys
import traceback

DEFAULT_DATASET_ID = "tany0699/carBrands50"
SPLITS = ("train", "validation", "test")
PREVIEW_LIMIT = 3
MAX_STR_PREVIEW = 100


class DatasetLoadError(RuntimeError):
    """Raised when the dataset cannot be loaded with or without a split."""


def parse_dataset_id(dataset_id: str):
    """Split ``"namespace/name"`` into ``(namespace, name)``.

    Only the first ``/`` separates the two parts. An id without ``/`` yields
    an empty namespace and the id itself as the name.
    """
    namespace, sep, name = dataset_id.partition("/")
    if sep:
        return namespace, name
    return "", dataset_id


def load_dataset(load, dataset_id: str):
    """Load *dataset_id* through the callable *load* and return the dataset.

    Args:
        load: Callable with the calling convention used here for
            ``MsDataset.load``: ``load(name, namespace=..., split=...)``.
        dataset_id: ``"namespace/name"`` or a bare dataset name.

    Returns:
        The first dataset object returned for one of ``SPLITS``; if none of
        the splits yields one, whatever ``load`` returns without a split.

    Raises:
        DatasetLoadError: if every split failed and loading without a split
            raised as well (the cause is chained).
    """
    namespace, ds_name = parse_dataset_id(dataset_id)
    if namespace:
        target, extra = ds_name, {"namespace": namespace}
    else:
        target, extra = dataset_id, {}

    for split in SPLITS:
        try:
            ds = load(target, split=split, **extra)
        except Exception as e:
            print(f"split='{split}' 失败: {e}")
            continue
        # Compare against None explicitly: truthiness would call __len__ and
        # misreport a successfully loaded but empty split as a failure.
        if ds is not None:
            size = len(ds) if hasattr(ds, "__len__") else "?"
            print(f"加载 split='{split}' 成功, 共 {size} 条")
            return ds

    try:
        ds = load(target, **extra)
    except Exception as e:
        print(f"不带 split 也失败: {e}")
        raise DatasetLoadError(dataset_id) from e
    print(f"不带 split 加载成功, 类型: {type(ds)}")
    return ds


def describe_value(key, value) -> str:
    """Return the one-line preview text for a single record field."""
    type_name = type(value).__name__
    # Walk the MRO so subclasses of a class named "Image" (for example
    # format-specific image file classes) are recognised, not only a class
    # whose own name is exactly "Image".
    if any(cls.__name__ == "Image" for cls in type(value).__mro__):
        return f" {key}: PIL.Image (size={value.size}, mode={value.mode})"
    if isinstance(value, str) and len(value) > MAX_STR_PREVIEW:
        return f" {key}: str (len={len(value)}) '{value[:MAX_STR_PREVIEW]}...'"
    return f" {key}: {type_name} = {value}"


def preview_records(ds, limit: int = PREVIEW_LIMIT) -> int:
    """Print up to *limit* records from *ds* and return how many were shown."""
    print(f"\n=== 前 {limit} 条数据 ===")
    shown = 0
    # islice stops after `limit` items, so no extra record is fetched.
    for index, row in enumerate(itertools.islice(ds, limit)):
        print(f"\n--- Record {index} ---")
        if hasattr(row, "items"):
            for key, value in row.items():
                print(describe_value(key, value))
        else:
            # NOTE(review): a split-less load may yield non-mapping items
            # (e.g. split names) - confirm against the MsDataset docs.
            print(f" {type(row).__name__} = {row}")
        shown += 1
    return shown


def main(argv=None) -> int:
    """Run the download test and return the process exit code.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if any, is the dataset id.

    Returns:
        0 on success, 1 if loading, iterating or previewing failed.
    """
    args = sys.argv[1:] if argv is None else argv
    dataset_id = args[0] if args else DEFAULT_DATASET_ID
    namespace, ds_name = parse_dataset_id(dataset_id)

    print(f"测试数据集: {dataset_id}")
    print(f"namespace: {namespace}, name: {ds_name}\n")
    print("=== 用 MsDataset.load() 下载 ===")

    try:
        # Imported lazily so a missing modelscope install is reported like
        # any other failure instead of crashing at import time.
        from modelscope.msdatasets import MsDataset

        ds = load_dataset(MsDataset.load, dataset_id)
        if not hasattr(ds, "__iter__"):
            print(f"数据集不可迭代, 类型: {type(ds)}")
            return 1

        preview_records(ds)
        print("\n=== 完成 ===")
    except DatasetLoadError:
        # Already reported by load_dataset.
        return 1
    except Exception as e:
        print(f"失败: {e}")
        traceback.print_exc()
        # Signal the failure to the caller instead of exiting with status 0.
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
|