#!/usr/bin/env python3
# Maas2-group / Fine-tuning
"""Test the full MsDataset.load() flow (including image copying)."""
import json
import os
import shutil
import sys
from itertools import islice
from pathlib import Path

# Dataset to load, given as "<namespace>/<name>" or a bare "<name>"; a default
# id is used when no command-line argument is passed.
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
# Split on the first "/" only. An id without a namespace must not crash the
# tuple unpacking: leave ``namespace`` empty so the load step further down,
# which checks ``namespace``, passes the full id instead.
if "/" in dataset_id:
    namespace, ds_name = dataset_id.split("/", 1)
else:
    namespace, ds_name = "", dataset_id

print(f"数据集: {dataset_id}\n")

# Start every run from an empty working directory: remove whatever a previous
# run left behind, then recreate the directory.
ds_dir = Path("/tmp") / "ms_test_download"
if ds_dir.exists():
    shutil.rmtree(ds_dir)
ds_dir.mkdir(parents=True)
# Not created here; the processing loop creates it on demand.
images_dir = ds_dir.joinpath("images")

print("=== 用 MsDataset.load() 下载 ===")
from modelscope.msdatasets import MsDataset
from PIL import Image

# Probe the usual split names in order and stop at the first one whose load
# returns a truthy dataset.
ds = None
for split in ("train", "validation", "test"):
    try:
        # With a namespace, pass it separately; otherwise hand over the full id.
        ds = (
            MsDataset.load(ds_name, namespace=namespace, split=split)
            if namespace
            else MsDataset.load(dataset_id, split=split)
        )
        if not ds:
            continue
        count = "?" if not hasattr(ds, "__len__") else len(ds)
        print(f"split='{split}' 成功, 共 {count} 条")
        break
    except Exception as err:
        print(f"split='{split}' 失败: {err}")

# No split produced a usable dataset: report it and exit with a failure status.
if not ds:
    print("所有 split 都失败")
    sys.exit(1)

# Process the first MAX_RECORDS rows: write every image value into images_dir
# and build a record whose image fields point at the saved copy.
print("\n=== 处理前 5 条数据（复制图片） ===")
MAX_RECORDS = 5
IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
# Modes the JPEG writer is expected to accept unchanged; any other mode
# (RGBA, P, LA, PA, I, F, ...) is converted to RGB before saving.
JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

records = []
img_counter = 0  # running index used to name saved/copied images
count = 0  # index of the record currently being processed

# islice stops after MAX_RECORDS rows without pulling an extra one.
for row in islice(ds, MAX_RECORDS):
    record = {}
    for k, v in row.items():
        # Case 1: the value is an in-memory PIL image -> save it as JPEG.
        if isinstance(v, Image.Image):
            images_dir.mkdir(parents=True, exist_ok=True)
            img_name = f"{img_counter:06d}.jpg"
            img_path = images_dir / img_name
            try:
                if v.mode not in JPEG_MODES:
                    v = v.convert("RGB")
                v.save(str(img_path), format="JPEG", quality=90)
            except (OSError, ValueError) as e:
                # Same best-effort policy as the file-copy branch: report and
                # keep going. The image object itself cannot be written to
                # JSON, so the field is recorded as None.
                print(f"Record {count}: {k} -> failed to save: {e}")
                record[k] = None
                continue
            record[k] = f"images/{img_name}"
            print(f"Record {count}: {k} -> PIL.Image saved as {img_name}")
            img_counter += 1
        # Case 2: the value looks like an image file path.
        elif isinstance(v, str) and v.lower().endswith(IMAGE_SUFFIXES):
            if os.path.isabs(v) and os.path.exists(v):
                images_dir.mkdir(parents=True, exist_ok=True)
                ext = os.path.splitext(v)[1]
                img_name = f"{img_counter:06d}{ext}"
                dest_path = images_dir / img_name
                try:
                    shutil.copy2(v, dest_path)
                    record[k] = f"images/{img_name}"
                    size = os.path.getsize(dest_path)
                    print(f"Record {count}: {k} -> copied {os.path.basename(v)} ({size} bytes) as {img_name}")
                    img_counter += 1
                except OSError as e:
                    print(f"Record {count}: {k} -> failed to copy: {e}")
                    record[k] = v
            else:
                # Keep the original value, and say why it was not copied: the
                # path is either relative or absolute but missing on disk.
                record[k] = v
                reason = "file not found" if os.path.isabs(v) else "relative path"
                print(f"Record {count}: {k} -> {v} ({reason})")
        # Case 3: any other value is passed through unchanged.
        else:
            record[k] = v

    records.append(record)
    count += 1

# Write the collected records as JSON Lines: one JSON object per line,
# non-ASCII characters kept as-is.
print("\n=== 写入 JSONL ===")
jsonl_path = ds_dir / "data.jsonl"
with jsonl_path.open("w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + "\n" for item in records
    )

print(f"写入 {len(records)} 条记录到 {jsonl_path}")

# Echo the file back, prefixing each line with its zero-based index
print("\n=== JSONL 内容 ===")
with open(jsonl_path, encoding="utf-8") as fh:
    for lineno, raw in enumerate(fh):
        stripped = raw.strip()
        print(f"{lineno}: {stripped}")

# List the files in the images directory (name and size), or report that the
# directory was never created.
print("\n=== images 目录 ===")
if not images_dir.exists():
    print("  (空)")
else:
    for img_file in sorted(images_dir.iterdir()):
        print(f"  {img_file.name} ({img_file.stat().st_size} bytes)")

print("\n=== 测试完成! ===")
print(f"数据集目录: {ds_dir}")