|
|
@@ -1,9 +1,9 @@
|
|
|
#!/usr/bin/env python3
|
|
|
-"""测试 MsDataset.load() 完整下载流程。"""
|
|
|
+"""Test the full MsDataset.load() flow (including image copying)."""
|
|
|
import json
|
|
|
import sys
|
|
|
import os
|
|
|
-import tempfile
|
|
|
+import shutil
|
|
|
from pathlib import Path
|
|
|
|
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
|
@@ -11,7 +11,13 @@ namespace, ds_name = dataset_id.split("/", 1)
|
|
|
|
|
|
print(f"数据集: {dataset_id}\n")
|
|
|
|
|
|
-# 测试 MsDataset.load()
|
|
|
+# Remove output left over from a previous run so the test starts from an empty directory
|
|
|
+ds_dir = Path("/tmp/ms_test_download")
|
|
|
+if ds_dir.exists():
|
|
|
+ shutil.rmtree(ds_dir)
|
|
|
+ds_dir.mkdir(parents=True)
|
|
|
+images_dir = ds_dir / "images"
|
|
|
+
|
|
|
print("=== 用 MsDataset.load() 下载 ===")
|
|
|
from modelscope.msdatasets import MsDataset
|
|
|
from PIL import Image
|
|
|
@@ -34,28 +40,77 @@ if not ds:
|
|
|
print("所有 split 都失败")
|
|
|
sys.exit(1)
|
|
|
|
|
|
-# 查看前 2 条数据
|
|
|
-print("\n=== 前 2 条数据 ===")
|
|
|
+# Process the first 5 records, saving or copying any images they contain
|
|
|
+print("\n=== 处理前 5 条数据(复制图片) ===")
|
|
|
+records = []
|
|
|
+img_counter = 0
|
|
|
count = 0
|
|
|
+
|
|
|
for row in ds:
|
|
|
- if count >= 2:
|
|
|
+ if count >= 5:
|
|
|
break
|
|
|
- print(f"\n--- Record {count} ---")
|
|
|
+
|
|
|
+ record = {}
|
|
|
for k, v in row.items():
|
|
|
- vtype = type(v).__name__
|
|
|
+ # PIL.Image value: save it as a numbered JPEG under images/ and record the relative path
|
|
|
if isinstance(v, Image.Image):
|
|
|
- # 模拟保存
|
|
|
- tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
|
|
|
+ images_dir.mkdir(parents=True, exist_ok=True)
|
|
|
+ img_name = f"{img_counter:06d}.jpg"
|
|
|
+ img_path = images_dir / img_name
|
|
|
if v.mode in ("RGBA", "P", "LA"):
|
|
|
v = v.convert("RGB")
|
|
|
- v.save(tmp.name, format="JPEG", quality=90)
|
|
|
- size = os.path.getsize(tmp.name)
|
|
|
- os.unlink(tmp.name)
|
|
|
- print(f" {k}: PIL.Image ({v.size[0]}x{v.size[1]}, mode={v.mode}) -> saved as {size} bytes")
|
|
|
- elif isinstance(v, str) and len(v) > 100:
|
|
|
- print(f" {k}: str (len={len(v)})")
|
|
|
+ v.save(str(img_path), format="JPEG", quality=90)
|
|
|
+ record[k] = f"images/{img_name}"
|
|
|
+ print(f"Record {count}: {k} -> PIL.Image saved as {img_name}")
|
|
|
+ img_counter += 1
|
|
|
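+ # String ending in an image extension: copy it into images/ only if it is an existing absolute path; otherwise keep the value unchanged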
|
|
|
+ elif isinstance(v, str) and v.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp')):
|
|
|
+ if os.path.isabs(v) and os.path.exists(v):
|
|
|
+ images_dir.mkdir(parents=True, exist_ok=True)
|
|
|
+ ext = os.path.splitext(v)[1]
|
|
|
+ img_name = f"{img_counter:06d}{ext}"
|
|
|
+ dest_path = images_dir / img_name
|
|
|
+ try:
|
|
|
+ shutil.copy2(v, dest_path)
|
|
|
+ record[k] = f"images/{img_name}"
|
|
|
+ size = os.path.getsize(dest_path)
|
|
|
+ print(f"Record {count}: {k} -> copied {os.path.basename(v)} ({size} bytes) as {img_name}")
|
|
|
+ img_counter += 1
|
|
|
+ except Exception as e:
|
|
|
+ print(f"Record {count}: {k} -> failed to copy: {e}")
|
|
|
+ record[k] = v
|
|
|
+ else:
|
|
|
+ record[k] = v
|
|
|
+ print(f"Record {count}: {k} -> {v} (relative path)")
|
|
|
else:
|
|
|
- print(f" {k}: {vtype} = {v}")
|
|
|
+ record[k] = v
|
|
|
+
|
|
|
+ records.append(record)
|
|
|
count += 1
|
|
|
|
|
|
-print(f"\n=== 测试通过! ===")
|
|
|
+# Write the collected records as JSONL (one JSON object per line)
|
|
|
+print("\n=== 写入 JSONL ===")
|
|
|
+jsonl_path = ds_dir / "data.jsonl"
|
|
|
+with open(jsonl_path, "w", encoding="utf-8") as f:
|
|
|
+ for item in records:
|
|
|
+ f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
+
|
|
|
+print(f"写入 {len(records)} 条记录到 {jsonl_path}")
|
|
|
+
|
|
|
+# Print the JSONL file back, one numbered line per record
|
|
|
+print("\n=== JSONL 内容 ===")
|
|
|
+with open(jsonl_path, "r", encoding="utf-8") as f:
|
|
|
+ for i, line in enumerate(f):
|
|
|
+ print(f"{i}: {line.strip()}")
|
|
|
+
|
|
|
+# List the files in the images directory with their sizes
|
|
|
+print(f"\n=== images 目录 ===")
|
|
|
+if images_dir.exists():
|
|
|
+ for img_file in sorted(images_dir.iterdir()):
|
|
|
+ size = img_file.stat().st_size
|
|
|
+ print(f" {img_file.name} ({size} bytes)")
|
|
|
+else:
|
|
|
+ print(" (空)")
|
|
|
+
|
|
|
+print(f"\n=== 测试完成! ===")
|
|
|
+print(f"数据集目录: {ds_dir}")
|