lxylxy123321 3 часа назад
Родитель
Commit
14e4b78f45
3 изменённых файла: 111 добавлено и 39 удалено
  1. 21 1
      backend/app/services/dataset_service.py
  2. 73 18
      backend/scripts/test_ms_api.py
  3. 17 20
      result.txt

+ 21 - 1
backend/app/services/dataset_service.py

@@ -427,9 +427,10 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
 
 
 def _download_via_msdataset(dataset_id: str, ds_dir: Path) -> tuple[list[dict], int]:
-    """用 MsDataset.load() 下载数据集,处理图片列(PIL.Image → 保存到磁盘)。"""
+    """用 MsDataset.load() 下载数据集,处理图片列(复制图片文件到数据集目录)。"""
     from modelscope.msdatasets import MsDataset
     from PIL import Image
+    import shutil
 
     namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
     images_dir = ds_dir / "images"
@@ -478,6 +479,7 @@ def _download_via_msdataset(dataset_id: str, ds_dir: Path) -> tuple[list[dict],
 
         record = {}
         for k, v in row.items():
+            # 检查是否是 PIL.Image 对象
             if isinstance(v, Image.Image):
                 # 图片对象:保存到磁盘,记录相对路径
                 images_dir.mkdir(parents=True, exist_ok=True)
@@ -488,6 +490,24 @@ def _download_via_msdataset(dataset_id: str, ds_dir: Path) -> tuple[list[dict],
                 v.save(str(img_path), format="JPEG", quality=90)
                 record[k] = f"images/{img_name}"
                 img_counter += 1
+            # 检查是否是图片文件路径
+            elif isinstance(v, str) and v.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp')):
+                # 如果是绝对路径,复制文件到 images 目录
+                if os.path.isabs(v) and os.path.exists(v):
+                    images_dir.mkdir(parents=True, exist_ok=True)
+                    ext = os.path.splitext(v)[1]
+                    img_name = f"{img_counter:06d}{ext}"
+                    dest_path = images_dir / img_name
+                    try:
+                        shutil.copy2(v, dest_path)
+                        record[k] = f"images/{img_name}"
+                        img_counter += 1
+                    except Exception as e:
+                        logger.warning(f"Failed to copy image {v}: {e}")
+                        record[k] = v
+                else:
+                    # 相对路径或其他情况,保持原样
+                    record[k] = v
             else:
                 record[k] = v
 

+ 73 - 18
backend/scripts/test_ms_api.py

@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
-"""测试 MsDataset.load() 完整下载流程。"""
+"""测试 MsDataset.load() 完整流程(含图片复制)。"""
 import json
 import sys
 import os
-import tempfile
+import shutil
 from pathlib import Path
 
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
@@ -11,7 +11,13 @@ namespace, ds_name = dataset_id.split("/", 1)
 
 print(f"数据集: {dataset_id}\n")
 
-# 测试 MsDataset.load()
+# 清理旧数据
+ds_dir = Path("/tmp/ms_test_download")
+if ds_dir.exists():
+    shutil.rmtree(ds_dir)
+ds_dir.mkdir(parents=True)
+images_dir = ds_dir / "images"
+
 print("=== 用 MsDataset.load() 下载 ===")
 from modelscope.msdatasets import MsDataset
 from PIL import Image
@@ -34,28 +40,77 @@ if not ds:
     print("所有 split 都失败")
     sys.exit(1)
 
-# 查看前 2 条数据
-print("\n=== 前 2 条数据 ===")
+# 处理前 5 条数据
+print("\n=== 处理前 5 条数据(复制图片) ===")
+records = []
+img_counter = 0
 count = 0
+
 for row in ds:
-    if count >= 2:
+    if count >= 5:
         break
-    print(f"\n--- Record {count} ---")
+
+    record = {}
     for k, v in row.items():
-        vtype = type(v).__name__
+        # 检查是否是 PIL.Image 对象
         if isinstance(v, Image.Image):
-            # 模拟保存
-            tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
+            images_dir.mkdir(parents=True, exist_ok=True)
+            img_name = f"{img_counter:06d}.jpg"
+            img_path = images_dir / img_name
             if v.mode in ("RGBA", "P", "LA"):
                 v = v.convert("RGB")
-            v.save(tmp.name, format="JPEG", quality=90)
-            size = os.path.getsize(tmp.name)
-            os.unlink(tmp.name)
-            print(f"  {k}: PIL.Image ({v.size[0]}x{v.size[1]}, mode={v.mode}) -> saved as {size} bytes")
-        elif isinstance(v, str) and len(v) > 100:
-            print(f"  {k}: str (len={len(v)})")
+            v.save(str(img_path), format="JPEG", quality=90)
+            record[k] = f"images/{img_name}"
+            print(f"Record {count}: {k} -> PIL.Image saved as {img_name}")
+            img_counter += 1
+        # 检查是否是图片文件路径
+        elif isinstance(v, str) and v.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp')):
+            if os.path.isabs(v) and os.path.exists(v):
+                images_dir.mkdir(parents=True, exist_ok=True)
+                ext = os.path.splitext(v)[1]
+                img_name = f"{img_counter:06d}{ext}"
+                dest_path = images_dir / img_name
+                try:
+                    shutil.copy2(v, dest_path)
+                    record[k] = f"images/{img_name}"
+                    size = os.path.getsize(dest_path)
+                    print(f"Record {count}: {k} -> copied {os.path.basename(v)} ({size} bytes) as {img_name}")
+                    img_counter += 1
+                except Exception as e:
+                    print(f"Record {count}: {k} -> failed to copy: {e}")
+                    record[k] = v
+            else:
+                record[k] = v
+                print(f"Record {count}: {k} -> {v} (relative path)")
         else:
-            print(f"  {k}: {vtype} = {v}")
+            record[k] = v
+
+    records.append(record)
     count += 1
 
-print(f"\n=== 测试通过! ===")
+# 写入 JSONL
+print("\n=== 写入 JSONL ===")
+jsonl_path = ds_dir / "data.jsonl"
+with open(jsonl_path, "w", encoding="utf-8") as f:
+    for item in records:
+        f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+print(f"写入 {len(records)} 条记录到 {jsonl_path}")
+
+# 显示结果
+print("\n=== JSONL 内容 ===")
+with open(jsonl_path, "r", encoding="utf-8") as f:
+    for i, line in enumerate(f):
+        print(f"{i}: {line.strip()}")
+
+# 显示 images 目录
+print(f"\n=== images 目录 ===")
+if images_dir.exists():
+    for img_file in sorted(images_dir.iterdir()):
+        size = img_file.stat().st_size
+        print(f"  {img_file.name} ({size} bytes)")
+else:
+    print("  (空)")
+
+print(f"\n=== 测试完成! ===")
+print(f"数据集目录: {ds_dir}")

+ 17 - 20
result.txt

@@ -1,25 +1,22 @@
-=== 获取 OSS 凭证 ===
-Host: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com
-Zip 目录: public-zip/tany0699/carBrands50/master/
-解压目录: public-unzip-dataset/tany0699/carBrands50/master/
-过期时间: 2026-05-29T03:15:38+08:00
+lq@lq:~/Fine-tuning$ cp backend/scripts/test_ms_api.py backend/data/ && sudo docker exec -it finetune-backend python3 /root/Fine-tuning/backend/data/test_ms_api.py tany0699/carBrands50
+数据集: tany0699/carBrands50
 
-=== 测试1: 直接访问 public-zip/train.zip ===
-URL: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com/public-zip/tany0699/carBrands50/master/train.zip
-HEAD: HTTP Error 403: Forbidden
+=== 用 MsDataset.load() 下载 ===
+2026-05-28 07:23:48,514 - modelscope - INFO - No subset_name specified, defaulting to the default
+2026-05-28 07:23:50,442 - modelscope - WARNING - Reusing dataset dataset_builder (/root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files)
+2026-05-28 07:23:50,443 - modelscope - INFO - Generating dataset dataset_builder (/root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files)
+2026-05-28 07:23:50,443 - modelscope - INFO - Loading meta-data file ...
+4398it [00:00, 39702.70it/s]
+100% split='train' 成功, 共 4397 条
 
-=== 测试2: 带 STS 签名访问 train.zip ===
-URL: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com/public-zip/tany0699/carBrands50/master/train.zip?OSSAccessKeyId=STS.NZXiXzw8m5QY1fgPrjU8jJPhF&Expires=1779956139&Signature=rEs4WwwW3YJOirB14gUoz/zlaFo%...
-HEAD: HTTP Error 403: Forbidden
+=== 前 2 条数据 ===
 
-=== 测试3: GET 下载 train.zip 前 1MB ===
-GET: HTTP Error 403: Forbidden
+--- Record 0 ---
+  image:FILE: str (len=180)
+  category: int = 4
 
-=== 测试4: 直接访问 public-unzip-dataset(已解压图片) ===
-URL: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com/public-unzip-dataset/tany0699/carBrands50/master/
-失败: HTTP Error 403: Forbidden
+--- Record 1 ---
+  image:FILE: str (len=180)
+  category: int = 4
 
-=== 测试5: 带签名访问 public-unzip-dataset ===
-失败: HTTP Error 403: Forbidden
-
-=== 完成 ===
+=== 测试通过! ===