lxylxy123321 5 часов назад
Родитель
Коммит
09bcc77edb
2 изменённых файла: 136 добавлений и 150 удалений
  1. 88 67
      backend/scripts/test_ms_api.py
  2. 48 83
      result.txt

+ 88 - 67
backend/scripts/test_ms_api.py

@@ -1,83 +1,104 @@
 #!/usr/bin/env python3
-"""测试 ModelScope 数据文件区 zip 的不同下载 URL 格式。"""
+"""测试不同方式下载 ModelScope 数据文件区图片。"""
 import json
 import urllib.request
 import urllib.parse
 import sys
+import subprocess
+import os
 
 api_base = "https://www.modelscope.cn"
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
 namespace, ds_name = dataset_id.split("/", 1)
-zip_file = "train.zip"
 
-print(f"数据集: {dataset_id}, 目标文件: {zip_file}\n")
+print(f"数据集: {dataset_id}\n")
 
-# 先获取 hub_id
-hub_id = None
+# 先查看当前版本
+print("=== 当前版本 ===")
+for pkg in ["modelscope", "datasets"]:
+    try:
+        result = subprocess.run(
+            ["pip", "show", pkg], capture_output=True, text=True, timeout=10
+        )
+        for line in result.stdout.splitlines():
+            if line.startswith("Version:") or line.startswith("Name:"):
+                print(f"  {line}")
+    except Exception as e:
+        print(f"  {pkg}: {e}")
+
+# 方式1: 直接用 hub.api(跳过 msdatasets 的 import 问题)
+print("\n=== 方式1: HubApi snapshot_download ===")
 try:
-    url = f"{api_base}/api/v1/datasets/{dataset_id}"
-    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        info = json.loads(resp.read().decode())
-    hub_id = info.get("Data", {}).get("Id") or info.get("Data", {}).get("id")
-    print(f"hub_id = {hub_id}\n")
+    from modelscope.hub.snapshot_download import dataset_snapshot_download
+    print("dataset_snapshot_download 可用!")
+    from modelscope.utils.constant import DownloadMode
+    cache_dir = "/tmp/ms_test_cache"
+    os.makedirs(cache_dir, exist_ok=True)
+    result = dataset_snapshot_download(
+        dataset_id=dataset_id,
+        cache_dir=cache_dir,
+        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
+    )
+    print(f"成功! 缓存目录: {result}")
+    # 列出文件
+    for root, dirs, files in os.walk(result):
+        for f in files:
+            fp = os.path.join(root, f)
+            size = os.path.getsize(fp)
+            print(f"  {os.path.relpath(fp, result)}  ({size} bytes)")
 except Exception as e:
-    print(f"获取 hub_id 失败: {e}\n")
+    print(f"失败: {e}")
 
-# 测试不同 URL 格式
-urls_to_test = [
-    # 格式1: repo + Source=SDK (已确认 404)
-    f"{api_base}/api/v1/datasets/{dataset_id}/repo?Source=SDK&Revision=master&FilePath={zip_file}&View=false",
-    # 格式2: repo 不带 Source
-    f"{api_base}/api/v1/datasets/{dataset_id}/repo?Revision=master&FilePath={zip_file}",
-    # 格式3: resolve 格式 (类似 HuggingFace)
-    f"{api_base}/api/v1/datasets/{dataset_id}/resolve/master/{zip_file}",
-    # 格式4: 用数字 hub_id
-    f"{api_base}/api/v1/datasets/{hub_id}/repo?Revision=master&FilePath={zip_file}",
-    # 格式5: repo/files 格式
-    f"{api_base}/api/v1/datasets/{dataset_id}/repo/files?Revision=master&FilePath={zip_file}",
-    # 格式6: download 格式
-    f"{api_base}/api/v1/datasets/{dataset_id}/download?Revision=master&FilePath={zip_file}",
-    # 格式7: 直接 URL(类似网页点击下载)
-    f"{api_base}/datasets/{dataset_id}/resolve/master/{zip_file}",
-    # 格式8: data-files 专用端点
-    f"{api_base}/api/v1/datasets/{dataset_id}/data-files?Revision=master&FilePath={zip_file}",
-    # 格式9: dataset_id 用数字 + resolve
-    f"{api_base}/api/v1/datasets/{hub_id}/resolve/master/{zip_file}",
-    # 格式10: 不带 View 参数
-    f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo?Source=SDK&Revision=master&FilePath={zip_file}",
-]
+# 方式2: 尝试 dataset_snapshot_download 不带 DownloadMode
+print("\n=== 方式2: dataset_snapshot_download (简化调用) ===")
+try:
+    from modelscope.hub.snapshot_download import dataset_snapshot_download
+    cache_dir = "/tmp/ms_test_cache2"
+    os.makedirs(cache_dir, exist_ok=True)
+    result = dataset_snapshot_download(
+        dataset_id=dataset_id,
+        cache_dir=cache_dir,
+    )
+    print(f"成功! 缓存目录: {result}")
+    for root, dirs, files in os.walk(result):
+        for f in files:
+            fp = os.path.join(root, f)
+            size = os.path.getsize(fp)
+            print(f"  {os.path.relpath(fp, result)}  ({size} bytes)")
+except Exception as e:
+    print(f"失败: {e}")
 
-for i, url in enumerate(urls_to_test, 1):
-    print(f"--- 格式{i} ---")
-    print(f"  URL: {url}")
-    # HEAD 请求测试
-    try:
-        req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
-        with urllib.request.urlopen(req, timeout=15) as resp:
-            size = resp.headers.get("Content-Length", "?")
-            ctype = resp.headers.get("Content-Type", "?")
-            print(f"  HEAD: {resp.status} | size={size} | type={ctype}")
-            if resp.status == 200:
-                print(f"  >>> 成功! <<<")
-                continue
-    except urllib.error.HTTPError as e:
-        print(f"  HEAD: {e.code}")
-    except Exception as e:
-        print(f"  HEAD: {e}")
+# 方式3: 尝试直接用 HubApi 的 get_dataset_file_url_with_token 或类似方法
+print("\n=== 方式3: HubApi 获取下载 URL ===")
+try:
+    from modelscope.hub.api import HubApi
+    api = HubApi()
+    # 列出所有可用方法
+    methods = [m for m in dir(api) if 'dataset' in m.lower() or 'download' in m.lower()]
+    print(f"可用方法: {methods}")
 
-    # GET 请求测试(只读前 1024 字节)
-    try:
-        req = urllib.request.Request(url, headers={"User-Agent": "Test"})
-        with urllib.request.urlopen(req, timeout=15) as resp:
-            data = resp.read(1024)
-            ctype = resp.headers.get("Content-Type", "?")
-            is_zip = data[:4] == b'PK\x03\x04'
-            print(f"  GET: {resp.status} | {len(data)} bytes | type={ctype} | is_zip={is_zip}")
-            if is_zip:
-                print(f"  >>> 成功! 是 ZIP 文件! <<<")
-    except urllib.error.HTTPError as e:
-        print(f"  GET: {e.code}")
-    except Exception as e:
-        print(f"  GET: {e}")
-    print()
+    # 尝试 list_repo_tree
+    for method_name in ['list_repo_tree', 'get_dataset_meta_file_list']:
+        if hasattr(api, method_name):
+            print(f"\n尝试 {method_name}...")
+            try:
+                method = getattr(api, method_name)
+                result = method(ds_name, namespace=namespace, revision="master")
+                print(f"  结果: {result}")
+            except Exception as e:
+                print(f"  失败: {e}")
+
+except Exception as e:
+    print(f"失败: {e}")
+
+# 方式4: pip 查看 modelscope 可用版本
+print("\n=== 方式4: pip 检查 ===")
+try:
+    result = subprocess.run(
+        ["pip", "index", "versions", "modelscope"], capture_output=True, text=True, timeout=15
+    )
+    print(result.stdout[:500])
+except Exception as e:
+    print(f"失败: {e}")
+
+print("\n=== 完成 ===")

+ 48 - 83
result.txt

@@ -1,89 +1,54 @@
 lq@lq:~/Fine-tuning$ cp backend/scripts/test_ms_api.py backend/data/ && sudo docker exec -it finetune-backend python3 /root/Fine-tuning/backend/data/test_ms_api.py tany0699/carBrands50
-测试数据集: tany0699/carBrands50
+数据集: tany0699/carBrands50, 目标文件: train.zip
 
-=== Test1: 数据集 info API 完整响应 ===
-请求: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50
-{
-  "RequestId": "ec91c118-ce21-47eb-a861-8663b5483315",
-  "Code": 200,
-  "Message": "success",
-  "Data": {
-    "isTop": null,
-    "relatedPaperId": null,
-    "nexa": null,
-    "Id": 2119,
-    "Namespace": "tany0699",
-    "Name": "carBrands50",
-    "CreatedBy": "tany0699",
-    "ChineseName": "车型分类",
-    "License": "Apache License 2.0",
-    "Description": "50种汽车类型分类数据集",
-    "Visibility": 3,
-    "Type": 2,
-    "Owner": "tany0699",
-    "UserDefineTags": "car,car brands,classification",
-    "Likes": 19,
-    "Downloads": 7664,
-    "Size": null,
-    "ReadmeContent": "\n## 数据集描述\n### 数据集简介\n数据集包含50种类型汽车的图像,其中训练集4260张图片,验证集99张图片,数据大小共46MB,支持识别车型包括:BMW、Audi、Jeep、Mini、Suzuki、Bentley等汽车品牌。  \n<img src=\"./sample.jpg\" alt=\"数据示例\"/>  \n\n### 数据集支持的任务\n可用于快速模型验证、性能评估、小数据分类集训练等。\n\n## 数据集的格式和结构\n### 数据格式\n数据集包括训练集train和验证集val,train和val文件夹之下按文件夹进行分类,共有2个子文件夹,同类别标签的图片在同一个文件夹下,图片格式为JPG。同时包含与标注文件中label id相对应的类名文件classname.txt。\n\n### 数据集加载方式\n```python\nfrom modelscope.msdatasets import MsDataset\nfrom modelscope.utils.constant import DownloadMode\n\nms_train_dataset = MsDataset.load(\n            'carBrands50', namespace='tany0699',\n            subset_name='default', split='train') # 加载训练集\nprint(next(iter(ms_train_dataset)))\n\nms_val_dataset = MsDataset.load(\n            'carBrands50', namespace='tany0699',\n            subset_name='default', split='validation') # 加载验证集\nprint(next(iter(ms_val_dataset)))\n```\n### 数据分片\n本数据集包含train和val数据集。\n| 子数据集    |        train | val |     test |\n|---------|-------------:|-----------:|---------:|\n| default |  训练集 |  验证集  | / |\n\n## 数据集生成的相关信息\n### 原始数据\nCar Brands Images:https://www.kaggle.com/datasets/yamaerenay/100-images-of-top-50-car-brands\n\n### Clone with HTTP\n```bash\ngit clone https://www.modelscope.cn/datasets/tany0699/carBrands50.git\n```",
-    "AlreadyStar": false,
-    "GmtCreate": 1675332057,
-    "GmtModified": 1779949098,
-    "Tags": [
-      {
-        "id": 3681,
-        "datasetId": 2119,
-        "domain": "image",
-        "task": "图像分类",
-        "filter": "样本规模",
-        "label": "100-10k",
-        "dataType": 2,
-        "level1Tag": "图像分类",
-        "level1TagName": "image-classification",
-        "level2Tag": "样本规模",
-        "level2TagName": "size_scale",
-        "level3Tag": "100-10k",
-        "level3TagName": "100-10k",
-        "gmtCreate": 1675426534,
-        "gmtModified": 1675426534,
-        "tagId": null
-      }
-    ],
-    "Status": 1,
-    "FullName": null,
-    "Organization": null,
-    "UsedFor": null,
-    "CertificationMark": 0,
-    "LastUpdatedTime": 1675426534,
-    "FromSite": "maas",
-    "SourcePlatform": null,
-    "topIndex": null,
-    "IsFlex": 0,
-    "StorageSize": 220619,
-    "RelateArxivId": null,
-    "Avatar": null,
-    "ProtectedMode": 2,
-    "ApprovalMode": null,
-    "ApprovalNotifyEmail": null,
-    "ApplyMeta": null,
-    "NEXA": null
-  },
-  "PageNumber": null,
-  "PageSize": null,
-  "TotalCount": null
-}
+hub_id = 2119
 
-=== Test2: HubApi 直接调用 ===
-尝试 get_dataset_files...
-get_dataset_files 失败: 'HubApi' object has no attribute 'get_dataset_files'
+--- 格式1 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Source=SDK&Revision=master&FilePath=train.zip&View=false
+  HEAD: 404
+  GET: 404
 
-尝试 get_dataset_file_url...
-train.csv 下载 URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Source=SDK&Revision=master&FilePath=train.csv&View=False
+--- 格式2 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Revision=master&FilePath=train.zip
+  HEAD: 404
+  GET: 404
 
-=== Test3: carBrands50.json 配置文件 ===
-请求: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Revision=master&FilePath=carBrands50.json&View=false
-{"default":{"train":{"meta":"train.csv","file":"train.zip"},"test":{"meta":"","file":""},"validation":{"meta":"val.csv","file":"val.zip"}}}
+--- 格式3 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/resolve/master/train.zip
+  HEAD: 404
+  GET: 404
 
-=== Test4: dataset_infos.json ===
-请求: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Revision=master&FilePath=dataset_infos.json&View=false
-{"default":{"features":{"image":{"_type":"Image"},"category":{"_type":"Value"}},"splits":{"validation":{"name":"validation","dataset_name":"carBrands50"},"train":{"name":"train","dataset_name":"carBrands50"}}}}
+--- 格式4 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/2119/repo?Revision=master&FilePath=train.zip
+  HEAD: 404
+  GET: 404
+
+--- 格式5 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo/files?Revision=master&FilePath=train.zip
+  HEAD: 400
+  GET: 400
+
+--- 格式6 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/download?Revision=master&FilePath=train.zip
+  HEAD: 404
+  GET: 404
+
+--- 格式7 ---
+  URL: https://www.modelscope.cn/datasets/tany0699/carBrands50/resolve/master/train.zip
+  HEAD: 404
+  GET: 404
+
+--- 格式8 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/data-files?Revision=master&FilePath=train.zip
+  HEAD: 404
+  GET: 404
+
+--- 格式9 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/2119/resolve/master/train.zip
+  HEAD: 404
+  GET: 404
+
+--- 格式10 ---
+  URL: https://www.modelscope.cn/api/v1/datasets/tany0699/carBrands50/repo?Source=SDK&Revision=master&FilePath=train.zip
+  HEAD: 404
+  GET: 404