lxylxy123321 4 часов назад
Родитель
Коммит
5c1cd966ae
Изменено 3 файла: 165 добавлений и 136 удалений
  1. 95 1
      backend/app/services/dataset_service.py
  2. 52 106
      backend/scripts/test_ms_api.py
  3. 18 29
      result.txt

+ 95 - 1
backend/app/services/dataset_service.py

@@ -403,12 +403,106 @@ async def recover_stale_downloads() -> None:
 
 
 
 
 def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
 def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
-    """用 modelscope CLI 下载数据集仓库文件,再通过 API 下载数据文件区的压缩包。"""
+    """用 MsDataset.load() 下载数据集,支持图片数据集(自动从 CDN 下载图片)。
+    如果 MsDataset.load() 失败,fallback 到 CLI 方式。"""
+    namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
     ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
     ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
     ds_dir.mkdir(parents=True, exist_ok=True)
     ds_dir.mkdir(parents=True, exist_ok=True)
+
+    # 优先用 MsDataset.load(),它能自动下载"数据文件"区的图片
+    try:
+        records, record_count = _download_via_msdataset(dataset_id, ds_dir)
+        if records:
+            jsonl_path = ds_dir / "data.jsonl"
+            with open(jsonl_path, "w", encoding="utf-8") as f:
+                for item in records:
+                    f.write(json.dumps(item, ensure_ascii=False) + "\n")
+            logger.info(f"MsDataset.load() 成功: {dataset_id} ({record_count} records)")
+            return ds_dir, jsonl_path, record_count
+    except Exception as e:
+        logger.warning(f"MsDataset.load() failed for {dataset_id}: {e}, falling back to CLI")
+
+    # fallback: CLI 方式(只下载 git 仓库文件,不含数据文件区图片)
     return _download_modelscope_dataset_cli(dataset_id, ds_dir)
     return _download_modelscope_dataset_cli(dataset_id, ds_dir)
 
 
 
 
+def _download_via_msdataset(dataset_id: str, ds_dir: Path) -> tuple[list[dict], int]:
+    """Load a ModelScope dataset with ``MsDataset.load()`` and return its rows.
+
+    The splits "train", "validation" and "test" are tried in that order and
+    the first one that loads to a truthy dataset is used; if none does, one
+    more attempt is made without a ``split`` argument.  Every ``PIL.Image``
+    value found in a row is written to ``ds_dir / "images"`` as a JPEG and
+    replaced in the record by its relative path (``images/NNNNNN.jpg``).
+
+    Args:
+        dataset_id: Dataset identifier, either ``"namespace/name"`` or a bare
+            name.
+        ds_dir: Directory that receives the ``images/`` sub-directory.
+
+    Returns:
+        ``(records, count)`` where ``records`` is the list of row dicts and
+        ``count == len(records)``.  ``([], 0)`` is returned when nothing could
+        be loaded or the loaded object is not iterable.
+
+    Errors raised while iterating the dataset or while converting/saving an
+    image are not caught here and propagate to the caller.
+    """
+    from modelscope.msdatasets import MsDataset
+    from PIL import Image
+
+    # Modes Pillow's JPEG encoder can write directly.  Any other mode
+    # (e.g. RGBA, P, LA, PA) is converted to RGB first; previously only
+    # RGBA/P/LA were converted and every other unsupported mode made
+    # ``save`` raise, aborting the whole download.
+    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+
+    namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
+    images_dir = ds_dir / "images"
+
+    def _load(**extra):
+        # "ns/name" ids are passed as name + namespace keyword; a bare id is
+        # passed through unchanged.
+        if namespace:
+            return MsDataset.load(ds_name, namespace=namespace, **extra)
+        return MsDataset.load(dataset_id, **extra)
+
+    # Try the usual split names in order; stop at the first truthy result.
+    ds = None
+    for split in ("train", "validation", "test"):
+        try:
+            ds = _load(split=split)
+            if ds:
+                logger.info(f"MsDataset.load() loaded split '{split}': {len(ds) if hasattr(ds, '__len__') else '?'} records")
+                break
+        except Exception as e:
+            logger.debug(f"split '{split}' failed: {e}")
+
+    if not ds:
+        # Last resort: let the library pick its default split.
+        try:
+            ds = _load()
+        except Exception as e:
+            logger.warning(f"MsDataset.load() without split also failed: {e}")
+            return [], 0
+
+    # Nothing usable was loaded, or the result cannot be iterated row by row.
+    if not ds or not hasattr(ds, '__iter__'):
+        return [], 0
+
+    records = []
+    img_counter = 0
+
+    for row in ds:
+        # Only dict-shaped rows can be turned into JSON records.
+        if not isinstance(row, dict):
+            continue
+
+        record = {}
+        for k, v in row.items():
+            if not isinstance(v, Image.Image):
+                record[k] = v
+                continue
+
+            # Image object: persist it and store the relative path instead.
+            if img_counter == 0:
+                # Create the directory lazily so it only exists when the
+                # dataset actually contains images.
+                images_dir.mkdir(parents=True, exist_ok=True)
+            img_name = f"{img_counter:06d}.jpg"
+            if v.mode not in jpeg_modes:
+                v = v.convert("RGB")
+            v.save(str(images_dir / img_name), format="JPEG", quality=90)
+            record[k] = f"images/{img_name}"
+            img_counter += 1
+
+        records.append(record)
+
+        # Progress log every 500 records.
+        if len(records) % 500 == 0:
+            logger.info(f"  处理中... {len(records)} records, {img_counter} images saved")
+
+    if img_counter > 0:
+        logger.info(f"共保存 {img_counter} 张图片到 {images_dir}")
+
+    return records, len(records)
+
+
+
+
 def _download_modelscope_dataset_cli(dataset_id: str, ds_dir: Path) -> tuple[Path, Path, int]:
 def _download_modelscope_dataset_cli(dataset_id: str, ds_dir: Path) -> tuple[Path, Path, int]:
     """CLI 方式下载数据集(fallback,只下载 git 仓库文件)。"""
     """CLI 方式下载数据集(fallback,只下载 git 仓库文件)。"""
     import subprocess
     import subprocess

+ 52 - 106
backend/scripts/test_ms_api.py

@@ -1,115 +1,61 @@
 #!/usr/bin/env python3
 #!/usr/bin/env python3
-"""测试通过 OSS 凭证下载数据文件区的图片。"""
+"""测试 MsDataset.load() 完整下载流程。"""
 import json
 import json
-import urllib.request
-import urllib.parse
 import sys
 import sys
+import os
+import tempfile
+from pathlib import Path
 
 
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
 namespace, ds_name = dataset_id.split("/", 1)
 namespace, ds_name = dataset_id.split("/", 1)
 
 
 print(f"数据集: {dataset_id}\n")
 print(f"数据集: {dataset_id}\n")
 
 
-from modelscope.hub.api import HubApi
-api = HubApi()
-
-# 获取 OSS 凭证
-print("=== 获取 OSS 凭证 ===")
-config = api.get_dataset_access_config(
-    dataset_name=ds_name,
-    namespace=namespace,
-    revision="master",
-)
-host = config["Host"]
-backup_dir = config["BackupDir"]  # zip 文件
-unzip_dir = config["Dir"]  # 已解压的文件
-access_id = config["AccessId"]
-access_secret = config["AccessSecret"]
-security_token = config["SecurityToken"]
-print(f"Host: {host}")
-print(f"Zip 目录: {backup_dir}")
-print(f"解压目录: {unzip_dir}")
-print(f"过期时间: {config['Expiration']}")
-
-# 测试1: 直接下载 zip(公开访问?)
-print("\n=== 测试1: 直接访问 public-zip/train.zip ===")
-url = f"{host}/{backup_dir}train.zip"
-print(f"URL: {url}")
-try:
-    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
-except Exception as e:
-    print(f"HEAD: {e}")
-
-# 测试2: 带 STS 签名下载 zip
-print("\n=== 测试2: 带 STS 签名访问 train.zip ===")
-# OSS STS 签名 URL 格式: ?OSSAccessKeyId=xxx&Expires=xxx&Signature=xxx&security-token=xxx
-import time
-import hmac
-import hashlib
-import base64
-expires = str(int(time.time()) + 3600)
-string_to_sign = f"HEAD\n\n\n{expires}\n/{config['Bucket']}/{backup_dir}train.zip"
-h = hmac.new(access_secret.encode(), string_to_sign.encode(), hashlib.sha1)
-signature = urllib.parse.quote(base64.b64encode(h.digest()))
-url = (f"{host}/{backup_dir}train.zip"
-       f"?OSSAccessKeyId={urllib.parse.quote(access_id)}"
-       f"&Expires={expires}"
-       f"&Signature={signature}"
-       f"&security-token={urllib.parse.quote(security_token)}")
-print(f"URL: {url[:200]}...")
-try:
-    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
-        print(f">>> 成功! <<<")
-except Exception as e:
-    print(f"HEAD: {e}")
-
-# 测试3: GET 下载前 1MB
-print("\n=== 测试3: GET 下载 train.zip 前 1MB ===")
-try:
-    req = urllib.request.Request(url, headers={"User-Agent": "Test", "Range": "bytes=0-1048575"})
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        data = resp.read(1048576)
-        is_zip = data[:4] == b'PK\x03\x04'
-        print(f"GET: {resp.status} | {len(data)} bytes | is_zip={is_zip}")
-        if is_zip:
-            print(f">>> 成功! 是 ZIP 文件! <<<")
-except Exception as e:
-    print(f"GET: {e}")
-
-# 测试4: 访问 public-unzip-dataset(已解压的图片)
-print("\n=== 测试4: 直接访问 public-unzip-dataset(已解压图片) ===")
-url = f"{host}/{unzip_dir}"
-print(f"URL: {url}")
-try:
-    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        content = resp.read().decode("utf-8", errors="replace")
-        print(f"状态: {resp.status}")
-        print(f"内容前 500 字符: {content[:500]}")
-except Exception as e:
-    print(f"失败: {e}")
-
-# 测试5: 带签名访问 public-unzip-dataset
-print("\n=== 测试5: 带签名访问 public-unzip-dataset ===")
-string_to_sign = f"GET\n\n\n{expires}\n/{config['Bucket']}/{unzip_dir}"
-h = hmac.new(access_secret.encode(), string_to_sign.encode(), hashlib.sha1)
-signature = urllib.parse.quote(base64.b64encode(h.digest()))
-url = (f"{host}/{unzip_dir}"
-       f"?OSSAccessKeyId={urllib.parse.quote(access_id)}"
-       f"&Expires={expires}"
-       f"&Signature={signature}"
-       f"&security-token={urllib.parse.quote(security_token)}")
-try:
-    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        content = resp.read().decode("utf-8", errors="replace")
-        print(f"状态: {resp.status}")
-        print(f"内容前 500 字符: {content[:500]}")
-except Exception as e:
-    print(f"失败: {e}")
-
-print("\n=== 完成 ===")
+# Exercise MsDataset.load(): try the common split names in order and keep the
+# first one that loads to a truthy dataset.
+print("=== 用 MsDataset.load() 下载 ===")
+from modelscope.msdatasets import MsDataset
+from PIL import Image
+
+# Modes Pillow's JPEG encoder can write directly; anything else (e.g. RGBA,
+# P, LA, PA) is converted to RGB before the trial save.
+JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
+
+ds = None
+for split in ("train", "validation", "test"):
+    try:
+        if namespace:
+            ds = MsDataset.load(ds_name, namespace=namespace, split=split)
+        else:
+            ds = MsDataset.load(dataset_id, split=split)
+        if ds:
+            count = len(ds) if hasattr(ds, "__len__") else "?"
+            print(f"split='{split}' 成功, 共 {count} 条")
+            break
+    except Exception as e:
+        print(f"split='{split}' 失败: {e}")
+
+if not ds:
+    print("所有 split 都失败")
+    sys.exit(1)
+
+# Show the first 2 records.
+print("\n=== 前 2 条数据 ===")
+count = 0
+for row in ds:
+    if count >= 2:
+        break
+    print(f"\n--- Record {count} ---")
+    for k, v in row.items():
+        vtype = type(v).__name__
+        if isinstance(v, Image.Image):
+            # Dry-run the JPEG export.  A temporary directory is used so no
+            # file handle stays open and the file is removed even when
+            # saving raises.
+            if v.mode not in JPEG_MODES:
+                v = v.convert("RGB")
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = os.path.join(tmp_dir, "sample.jpg")
+                v.save(tmp_path, format="JPEG", quality=90)
+                size = os.path.getsize(tmp_path)
+            print(f"  {k}: PIL.Image ({v.size[0]}x{v.size[1]}, mode={v.mode}) -> saved as {size} bytes")
+        elif isinstance(v, str) and len(v) > 100:
+            # Long strings are summarised by length to keep the output short.
+            print(f"  {k}: str (len={len(v)})")
+        else:
+            print(f"  {k}: {vtype} = {v}")
+    count += 1
+
+print("\n=== 测试通过! ===")

+ 18 - 29
result.txt

@@ -1,36 +1,25 @@
-lq@lq:~/Fine-tuning$ cp backend/scripts/test_ms_api.py backend/data/ && sudo docker exec -it finetune-backend python3 /root/Fine-tuning/backend/data/test_ms_api.py tany0699/carBrands50
-数据集: tany0699/carBrands50
+=== 获取 OSS 凭证 ===
+Host: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com
+Zip 目录: public-zip/tany0699/carBrands50/master/
+解压目录: public-unzip-dataset/tany0699/carBrands50/master/
+过期时间: 2026-05-29T03:15:38+08:00
 
 
-=== 获取 dataset_id ===
-dataset_id=2119, type=2
+=== 测试1: 直接访问 public-zip/train.zip ===
+URL: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com/public-zip/tany0699/carBrands50/master/train.zip
+HEAD: HTTP Error 403: Forbidden
 
 
-=== get_dataset_access_config ===
-签名: (dataset_name: str, namespace: str, revision: Optional[str] = 'master')
-结果: {
-  "AccessId": "STS.NXj6GJFxy94HMWrqjL1UHkFMR",
-  "AccessSecret": "AxCCmYH4s381axveFQWYd48uzDTfGjb43fRmDnQPvbU2",
-  "SecurityToken": "CAISiAR1q6Ft5B2yfSjIr5vfff3+q6dYjvajT3HDlWoZPdpkhIPmsDz2IHlEdXZvAekbt/U/mGBY6/YblrtOU5tCTEDoZNd59ZlL3wKlbpGZJmElejJf2vOfAmG2J0PRPqWwCryLoLm/F96pb1fb7GURpZLxaTSlWXG8LJSNkuQJR98LXw6+H1gkZNBNPVlNpdNYT8W4V5CXPwXtn3DbAWdxpwN4khkf06mkxdCG4ResXTSY6OYevNb2OYP2LZsubpt2T96u0fAzdrfLlzZN4RxL/+Jrir5F4XLLt82UGFRNpAmPNfuMrIw+awRwfIIgEqhIt6Wk0vhxofDejcO1qVdENvoHVD/EFsLygpnIEf+gLdslMLbmMTPVz9qLN5KwrgU2YGoDOQ8NObgJI3RrWxs3UWOYeO319FnWaxyuV+2FzatxyYd/y1TjuMiLPx+TSrOIiXhJa9g3ZlkCLRcQ0Xe8KugDehdQfkkgRZTtFNQsNEAD9f+45F2JDXU+k0s65aOuO6nk3YkEcpj6U5581o4QWY9LqWNCTS6sEuL/1RhMLTE8EOYKgPmwacaliaWMxeyYauPdEeCGCpG4rNd5xMDkawSzTUWZjrVLATITa1bLUS/DXbnUsRr8S6G+HID0kCdX1ShCyYGcweq9HiEdnP5k0KRGtPem8WctjWkgSpOO54k21nSVkirOCcFRGoABbd4cbYFtd40HoLkSnCKfKLZQo0/kRyaw8TMwRMGKOpT3INcqyzwr6VPAUnJesCVdF+u+SsqnMxUfy53qBFro7oU/ciHgS1FhOTPLaXWgDm5gl51s+MVEJ+SiXqY33Wh1XsY+/CHkMt/orNHNUMSio4iFnsHgt316kKACYVHKIVAgAA==",
-  "Dir": "public-unzip-dataset/tany0699/carBrands50/master/",
-  "Host": "https://dataset-hub.oss-cn-hangzhou.aliyuncs.com",
-  "Bucket": "dataset-hub",
-  "Region": "oss-cn-hangzhou",
-  "Expiration": "2026-05-29T03:09:46+08:00",
-  "BackupDir": "public-zip/tany0699/carBrands50/master/"
-}
+=== 测试2: 带 STS 签名访问 train.zip ===
+URL: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com/public-zip/tany0699/carBrands50/master/train.zip?OSSAccessKeyId=STS.NZXiXzw8m5QY1fgPrjU8jJPhF&Expires=1779956139&Signature=rEs4WwwW3YJOirB14gUoz/zlaFo%...
+HEAD: HTTP Error 403: Forbidden
 
 
-=== get_dataset_access_config_for_unzipped ===
-签名: (dataset_name: str, namespace: str, revision: str, zip_file_name: str)
-失败: HubApi.get_dataset_access_config_for_unzipped() missing 1 required positional argument: 'zip_file_name'
+=== 测试3: GET 下载 train.zip 前 1MB ===
+GET: HTTP Error 403: Forbidden
 
 
-=== get_dataset_infos ===
-签名: (dataset_hub_id: str, revision: str, files_metadata: bool = False, timeout: float = 100, recursive: str = 'True')
-失败: HubApi.get_dataset_infos() got an unexpected keyword argument 'dataset_name'
+=== 测试4: 直接访问 public-unzip-dataset(已解压图片) ===
+URL: https://dataset-hub.oss-cn-hangzhou.aliyuncs.com/public-unzip-dataset/tany0699/carBrands50/master/
+失败: HTTP Error 403: Forbidden
 
 
-=== get_dataset_file_url (train.csv) ===
-URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Source=SDK&Revision=master&FilePath=train.csv&View=False
-
-=== get_dataset_file_url (train.zip) ===
-URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Source=SDK&Revision=master&FilePath=train.zip&View=False
-HEAD: HTTP Error 404: 
+=== 测试5: 带签名访问 public-unzip-dataset ===
+失败: HTTP Error 403: Forbidden
 
 
 === 完成 ===
 === 完成 ===