#!/usr/bin/env python3
"""Smoke-test the full MsDataset.load() flow, including image export.

The script loads the first usable split of a ModelScope dataset, takes the
first few rows, stores every image they carry under ``OUTPUT_DIR/images``
and writes the rows to ``OUTPUT_DIR/data.jsonl``.  ``OUTPUT_DIR`` is deleted
and recreated on every run.

The dataset id is read from the first command-line argument and defaults to
``DEFAULT_DATASET_ID``.
"""
import json
import sys
import os
import shutil
from itertools import islice
from pathlib import Path

DEFAULT_DATASET_ID = "tany0699/carBrands50"
OUTPUT_DIR = Path("/tmp/ms_test_download")
SPLITS = ("train", "validation", "test")
MAX_RECORDS = 5
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
# Modes Pillow's JPEG writer accepts as-is; every other mode is converted to
# RGB before saving.
JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})


def split_dataset_id(dataset_id: str):
    """Split ``"namespace/name"`` into ``(namespace, name)``.

    An id without a slash yields an empty namespace and the id unchanged,
    instead of raising on tuple unpacking.
    """
    namespace, separator, name = dataset_id.partition("/")
    if not separator:
        return "", dataset_id
    return namespace, name


def load_first_split(dataset_id: str, load):
    """Return the first truthy dataset among ``SPLITS``.

    Args:
        dataset_id: Dataset id, with or without a ``namespace/`` prefix.
        load: Callable with the calling convention of ``MsDataset.load``;
            it is called as ``load(name, namespace=..., split=...)`` or, when
            the id has no namespace, as ``load(dataset_id, split=...)``.

    Returns:
        The first truthy result, otherwise the last value ``load`` returned
        (``None`` when every call raised).
    """
    namespace, ds_name = split_dataset_id(dataset_id)
    ds = None
    for split in SPLITS:
        # Any failure simply means "try the next split", so the boundary
        # stays broad on purpose; the error is reported, not swallowed.
        try:
            if namespace:
                ds = load(ds_name, namespace=namespace, split=split)
            else:
                ds = load(dataset_id, split=split)
            if ds:
                count = len(ds) if hasattr(ds, "__len__") else "?"
                print(f"split='{split}' 成功, 共 {count} 条")
                break
        except Exception as e:
            print(f"split='{split}' 失败: {e}")
    return ds


def export_row(row, index: int, images_dir: Path, next_image_index: int,
               image_type):
    """Convert one dataset row into a JSON-friendly record.

    Values that are instances of ``image_type`` are saved as JPEG files, and
    string values that name an existing absolute image file are copied; in
    both cases the record stores the ``images/<file>`` path instead.  All
    other values are kept unchanged.

    Args:
        row: Mapping of column name to value.
        index: Row number, used only in progress messages.
        images_dir: Directory receiving the image files (created on demand).
        next_image_index: Number used for the next image file name.
        image_type: Class (or tuple of classes) treated as in-memory images,
            normally ``PIL.Image.Image``.

    Returns:
        ``(record, next_image_index)`` with the counter advanced once per
        stored image.
    """
    record = {}
    for key, value in row.items():
        if isinstance(value, image_type):
            images_dir.mkdir(parents=True, exist_ok=True)
            img_name = f"{next_image_index:06d}.jpg"
            if value.mode not in JPEG_WRITABLE_MODES:
                value = value.convert("RGB")
            value.save(str(images_dir / img_name), format="JPEG", quality=90)
            record[key] = f"images/{img_name}"
            print(f"Record {index}: {key} -> PIL.Image saved as {img_name}")
            next_image_index += 1
        elif isinstance(value, str) and value.lower().endswith(IMAGE_EXTENSIONS):
            if os.path.isabs(value) and os.path.exists(value):
                images_dir.mkdir(parents=True, exist_ok=True)
                ext = os.path.splitext(value)[1]
                img_name = f"{next_image_index:06d}{ext}"
                dest_path = images_dir / img_name
                try:
                    shutil.copy2(value, dest_path)
                    size = os.path.getsize(dest_path)
                except OSError as e:
                    # Best effort: keep the original path in the record.
                    print(f"Record {index}: {key} -> failed to copy: {e}")
                    record[key] = value
                else:
                    record[key] = f"images/{img_name}"
                    print(f"Record {index}: {key} -> copied {os.path.basename(value)} ({size} bytes) as {img_name}")
                    next_image_index += 1
            else:
                record[key] = value
                # Tell apart the two reasons the file was not copied.
                reason = "file not found" if os.path.isabs(value) else "relative path"
                print(f"Record {index}: {key} -> {value} ({reason})")
        else:
            record[key] = value
    return record, next_image_index


def write_jsonl(records, path):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in records
        )


def main(argv=None):
    """Run the download/export smoke test; exits with status 1 if no split loads."""
    argv = sys.argv if argv is None else argv
    dataset_id = argv[1] if len(argv) > 1 else DEFAULT_DATASET_ID
    print(f"数据集: {dataset_id}\n")

    # Remove the output of any previous run.
    if OUTPUT_DIR.exists():
        shutil.rmtree(OUTPUT_DIR)
    OUTPUT_DIR.mkdir(parents=True)
    images_dir = OUTPUT_DIR / "images"

    print("=== 用 MsDataset.load() 下载 ===")
    # Imported here rather than at module level so the banner above appears
    # before these third-party packages are loaded.
    from modelscope.msdatasets import MsDataset
    from PIL import Image

    ds = load_first_split(dataset_id, MsDataset.load)
    if not ds:
        print("所有 split 都失败")
        sys.exit(1)

    # Export the first MAX_RECORDS rows.
    print(f"\n=== 处理前 {MAX_RECORDS} 条数据(复制图片) ===")
    records = []
    img_counter = 0
    for index, row in enumerate(islice(ds, MAX_RECORDS)):
        record, img_counter = export_row(
            row, index, images_dir, img_counter, Image.Image
        )
        records.append(record)

    # Write the JSONL file.
    print("\n=== 写入 JSONL ===")
    jsonl_path = OUTPUT_DIR / "data.jsonl"
    write_jsonl(records, jsonl_path)
    print(f"写入 {len(records)} 条记录到 {jsonl_path}")

    # Echo the file that was just written.
    print("\n=== JSONL 内容 ===")
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            print(f"{i}: {line.strip()}")

    # List the exported images.
    print("\n=== images 目录 ===")
    if images_dir.exists():
        for img_file in sorted(images_dir.iterdir()):
            size = img_file.stat().st_size
            print(f" {img_file.name} ({size} bytes)")
    else:
        print(" (空)")

    print("\n=== 测试完成! ===")
    print(f"数据集目录: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()