#!/usr/bin/env python3
"""Exercise the full MsDataset.load() flow, including copying images locally.

Usage:
    python <this_script> [DATASET_ID]

DATASET_ID is either "namespace/name" or a bare dataset name; it defaults to
DEFAULT_DATASET_ID. The script wipes OUTPUT_DIR, loads the first split that
works, exports the first SAMPLE_SIZE rows (saving or copying any images into
OUTPUT_DIR/images), writes them to OUTPUT_DIR/data.jsonl and prints the result.
"""

import itertools
import json
import os
import shutil
import sys
from pathlib import Path

DEFAULT_DATASET_ID = "tany0699/carBrands50"
OUTPUT_DIR = Path("/tmp/ms_test_download")
SPLITS = ("train", "validation", "test")
SAMPLE_SIZE = 5
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
# Modes Pillow's JPEG writer accepts as-is; any other mode (RGBA, P, LA,
# I;16, F, ...) must be converted before saving or save() raises.
JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def split_dataset_id(dataset_id):
    """Split a dataset id into ``(namespace, name)``.

    An id without a "/" yields an empty namespace and the id itself as the
    name, instead of raising like a two-target ``split("/", 1)`` unpack would.
    """
    namespace, sep, name = dataset_id.partition("/")
    if not sep:
        return "", dataset_id
    return namespace, name


def load_first_split(dataset_id, load, splits=SPLITS):
    """Return the first non-empty dataset among *splits*, or ``None``.

    Args:
        dataset_id: "namespace/name" or a bare dataset name.
        load: Callable with the ``MsDataset.load`` calling convention
            (``load(name, namespace=..., split=...)``). Injected so this
            helper does not import modelscope itself.
        splits: Split names to try, in order.
    """
    namespace, ds_name = split_dataset_id(dataset_id)
    for split in splits:
        # Best-effort probing: any loader error just means "try the next split".
        try:
            if namespace:
                ds = load(ds_name, namespace=namespace, split=split)
            else:
                ds = load(dataset_id, split=split)
            if ds:
                count = len(ds) if hasattr(ds, "__len__") else "?"
                print(f"split='{split}' 成功, 共 {count} 条")
                return ds
        except Exception as e:
            print(f"split='{split}' 失败: {e}")
    return None


def save_as_jpeg(image, dest_path):
    """Save a PIL image to *dest_path* as JPEG, converting the mode if needed."""
    if image.mode not in JPEG_WRITABLE_MODES:
        image = image.convert("RGB")
    image.save(str(dest_path), format="JPEG", quality=90)


def localize_row(row, index, images_dir, img_counter, image_type):
    """Build a JSON-friendly copy of *row* with its images stored in *images_dir*.

    Args:
        row: Mapping of column name to value for one dataset record.
        index: Record number, used only in progress messages.
        images_dir: Directory (``pathlib.Path``) receiving image files; created
            on first use.
        img_counter: Next sequence number to use for an image file name.
        image_type: Class identifying in-memory images (``PIL.Image.Image``).

    Returns:
        ``(record, img_counter)`` where image values in ``record`` are replaced
        by ``"images/<file>"`` and ``img_counter`` is the updated counter.
    """
    record = {}
    for key, value in row.items():
        # In-memory image object: write it out as a JPEG.
        if isinstance(value, image_type):
            images_dir.mkdir(parents=True, exist_ok=True)
            img_name = f"{img_counter:06d}.jpg"
            save_as_jpeg(value, images_dir / img_name)
            record[key] = f"images/{img_name}"
            print(f"Record {index}: {key} -> PIL.Image saved as {img_name}")
            img_counter += 1
        # String that looks like an image file path: copy the file if we can.
        elif isinstance(value, str) and value.lower().endswith(IMAGE_EXTENSIONS):
            if os.path.isabs(value) and os.path.exists(value):
                images_dir.mkdir(parents=True, exist_ok=True)
                ext = os.path.splitext(value)[1]
                img_name = f"{img_counter:06d}{ext}"
                dest_path = images_dir / img_name
                try:
                    shutil.copy2(value, dest_path)
                    size = os.path.getsize(dest_path)
                except OSError as e:
                    # Keep the original path so the record is still exported.
                    print(f"Record {index}: {key} -> failed to copy: {e}")
                    record[key] = value
                else:
                    record[key] = f"images/{img_name}"
                    print(
                        f"Record {index}: {key} -> copied "
                        f"{os.path.basename(value)} ({size} bytes) as {img_name}"
                    )
                    img_counter += 1
            else:
                record[key] = value
                # Distinguish a relative path from an absolute one that is missing.
                reason = "file not found" if os.path.isabs(value) else "relative path"
                print(f"Record {index}: {key} -> {value} ({reason})")
        else:
            record[key] = value
    return record, img_counter


def write_jsonl(records, jsonl_path):
    """Write *records* to *jsonl_path*, one UTF-8 JSON object per line."""
    with open(jsonl_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in records
        )


def main(argv=None):
    """Run the download/export test and return a process exit code.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when no split could be loaded.
    """
    args = sys.argv[1:] if argv is None else argv
    dataset_id = args[0] if args else DEFAULT_DATASET_ID
    print(f"数据集: {dataset_id}\n")

    # Remove output left over from a previous run.
    ds_dir = OUTPUT_DIR
    if ds_dir.exists():
        shutil.rmtree(ds_dir)
    ds_dir.mkdir(parents=True)
    images_dir = ds_dir / "images"

    print("=== 用 MsDataset.load() 下载 ===")
    # Imported here, after the cleanup and banner, as in the original flow.
    from modelscope.msdatasets import MsDataset
    from PIL import Image

    ds = load_first_split(dataset_id, MsDataset.load)
    if not ds:
        print("所有 split 都失败")
        return 1

    # Export the first few rows, copying their images.
    print("\n=== 处理前 5 条数据(复制图片) ===")
    records = []
    img_counter = 0
    for index, row in enumerate(itertools.islice(ds, SAMPLE_SIZE)):
        record, img_counter = localize_row(
            row, index, images_dir, img_counter, Image.Image
        )
        records.append(record)

    # Write the JSONL file.
    print("\n=== 写入 JSONL ===")
    jsonl_path = ds_dir / "data.jsonl"
    write_jsonl(records, jsonl_path)
    print(f"写入 {len(records)} 条记录到 {jsonl_path}")

    # Show what was written.
    print("\n=== JSONL 内容 ===")
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            print(f"{i}: {line.strip()}")

    # List the images directory.
    print("\n=== images 目录 ===")
    if images_dir.exists():
        for img_file in sorted(images_dir.iterdir()):
            size = img_file.stat().st_size
            print(f" {img_file.name} ({size} bytes)")
    else:
        print(" (空)")

    print("\n=== 测试完成! ===")
    print(f"数据集目录: {ds_dir}")
    return 0


if __name__ == "__main__":
    sys.exit(main())