|
|
@@ -1,66 +1,81 @@
|
|
|
#!/usr/bin/env python3
|
|
|
-"""测试 MsDataset.load() 能否正确下载图片数据集。"""
|
|
|
-import sys
|
|
|
+"""Probe the ModelScope dataset APIs to look for CDN links to a dataset's data files."""
|
|
|
import json
|
|
|
+import urllib.request
|
|
|
+import urllib.parse
|
|
|
+import sys
|
|
|
|
|
|
+api_base = "https://www.modelscope.cn"
|
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
|
-namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
|
|
|
|
|
|
-print(f"测试数据集: {dataset_id}")
|
|
|
-print(f"namespace: {namespace}, name: {ds_name}\n")
|
|
|
+print(f"测试数据集: {dataset_id}\n")
|
|
|
|
|
|
-print("=== 用 MsDataset.load() 下载 ===")
|
|
|
+# Test 1: dump the full response of the dataset info API (printed output is truncated to 3000 characters)
|
|
|
+print("=== Test1: 数据集 info API 完整响应 ===")
|
|
|
try:
|
|
|
- from modelscope.msdatasets import MsDataset
|
|
|
+ url = f"{api_base}/api/v1/datasets/{dataset_id}"
|
|
|
+ print(f"请求: {url}")
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
+ info = json.loads(resp.read().decode())
|
|
|
+ print(json.dumps(info, indent=2, ensure_ascii=False)[:3000])
|
|
|
+except Exception as e:
|
|
|
+ print(f"失败: {e}")
|
|
|
|
|
|
- ds = None
|
|
|
- for split in ("train", "validation", "test"):
|
|
|
- try:
|
|
|
- if namespace:
|
|
|
- ds = MsDataset.load(ds_name, namespace=namespace, split=split)
|
|
|
- else:
|
|
|
- ds = MsDataset.load(dataset_id, split=split)
|
|
|
- if ds:
|
|
|
- print(f"加载 split='{split}' 成功, 共 {len(ds) if hasattr(ds, '__len__') else '?'} 条")
|
|
|
- break
|
|
|
- except Exception as e:
|
|
|
- print(f"split='{split}' 失败: {e}")
|
|
|
+# Test 2: call HubApi directly (bypasses the import problems of modelscope.msdatasets)
|
|
|
+print("\n=== Test2: HubApi 直接调用 ===")
|
|
|
+try:
|
|
|
+ from modelscope.hub.api import HubApi
|
|
|
+ api = HubApi()
|
|
|
|
|
|
- if not ds:
|
|
|
- try:
|
|
|
- if namespace:
|
|
|
- ds = MsDataset.load(ds_name, namespace=namespace)
|
|
|
- else:
|
|
|
- ds = MsDataset.load(dataset_id)
|
|
|
- print(f"不带 split 加载成功, 类型: {type(ds)}")
|
|
|
- except Exception as e:
|
|
|
- print(f"不带 split 也失败: {e}")
|
|
|
- sys.exit(1)
|
|
|
+ # Fetch the dataset's file list (recursive)
|
|
|
+ print("尝试 get_dataset_files...")
|
|
|
+ try:
|
|
|
+ namespace, ds_name = dataset_id.split("/", 1)
|
|
|
+ files = api.get_dataset_files(ds_name, namespace=namespace, recursive=True)
|
|
|
+ print(f"get_dataset_files 返回 {len(files)} 个文件:")
|
|
|
+ for f in files:
|
|
|
+ print(f" {f}")
|
|
|
+ except Exception as e:
|
|
|
+ print(f"get_dataset_files 失败: {e}")
|
|
|
|
|
|
- if not hasattr(ds, "__iter__"):
|
|
|
- print(f"数据集不可迭代, 类型: {type(ds)}")
|
|
|
- sys.exit(1)
|
|
|
+ # Try to resolve the download URL of a single file (train.csv is used as the probe)
|
|
|
+ print("\n尝试 get_dataset_file_url...")
|
|
|
+ try:
|
|
|
+ namespace, ds_name = dataset_id.split("/", 1)
|
|
|
+ url = api.get_dataset_file_url("train.csv", ds_name, namespace, revision="master")
|
|
|
+ print(f"train.csv 下载 URL: {url}")
|
|
|
+ except Exception as e:
|
|
|
+ print(f"get_dataset_file_url 失败: {e}")
|
|
|
|
|
|
- # 查看前 3 条数据
|
|
|
- print("\n=== 前 3 条数据 ===")
|
|
|
- count = 0
|
|
|
- for row in ds:
|
|
|
- if count >= 3:
|
|
|
- break
|
|
|
- print(f"\n--- Record {count} ---")
|
|
|
- for k, v in row.items():
|
|
|
- vtype = type(v).__name__
|
|
|
- if vtype == "Image":
|
|
|
- print(f" {k}: PIL.Image (size={v.size}, mode={v.mode})")
|
|
|
- elif isinstance(v, str) and len(v) > 100:
|
|
|
- print(f" {k}: str (len={len(v)}) '{v[:100]}...'")
|
|
|
- else:
|
|
|
- print(f" {k}: {vtype} = {v}")
|
|
|
- count += 1
|
|
|
+except ImportError as e:
|
|
|
+ print(f"import 失败: {e}")
|
|
|
+except Exception as e:
|
|
|
+ print(f"失败: {e}")
|
|
|
|
|
|
- print(f"\n=== 完成 ===")
|
|
|
+# Test 3: fetch the carBrands50.json config file (may contain data-file URLs); the file name is hard-coded and only matches the default dataset
|
|
|
+print("\n=== Test3: carBrands50.json 配置文件 ===")
|
|
|
+try:
|
|
|
+ namespace, ds_name = dataset_id.split("/", 1)
|
|
|
+ url = (f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo"
|
|
|
+ f"?Revision=master&FilePath=carBrands50.json&View=false")
|
|
|
+ print(f"请求: {url}")
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
+ config = resp.read().decode()
|
|
|
+ print(config[:2000])
|
|
|
+except Exception as e:
|
|
|
+ print(f"失败: {e}")
|
|
|
|
|
|
+# Test 4: fetch dataset_infos.json (relies on namespace/ds_name already set by the earlier tests)
|
|
|
+print("\n=== Test4: dataset_infos.json ===")
|
|
|
+try:
|
|
|
+ url = (f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo"
|
|
|
+ f"?Revision=master&FilePath=dataset_infos.json&View=false")
|
|
|
+ print(f"请求: {url}")
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
+ config = resp.read().decode()
|
|
|
+ print(config[:2000])
|
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
|
- import traceback
|
|
|
- traceback.print_exc()
|