| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
#!/usr/bin/env python3
"""Probe the ModelScope API for a dataset's file list and archive download URLs.

Usage: pass the dataset ID (``<namespace>/<name>``) as the first command-line
argument; ``tany0699/carBrands50`` is used when the argument is omitted.

The script runs three steps and prints the outcome of each one:

1. Look up the hub ID of the dataset.
2. List the repository files, trying the hub ID first and the dataset ID second.
3. Print a download URL for every listed file whose name has an archive
   extension.  Nothing is downloaded; only the URLs are printed.
"""
import json
import sys
import urllib.parse
import urllib.request

API_BASE = "https://www.modelscope.cn"
DEFAULT_DATASET_ID = "tany0699/carBrands50"
ARCHIVE_EXTS = (".zip", ".tar.gz", ".tgz", ".tar.bz2", ".tbz2", ".tar")
REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request


def dataset_url(dataset_ref):
    """Return the API URL of a dataset given its hub ID or ``namespace/name`` ID.

    The reference is percent-encoded (keeping ``/``) so that spaces or
    non-ASCII characters typed on the command line cannot yield an invalid URL.
    """
    quoted = urllib.parse.quote(str(dataset_ref), safe="/")
    return f"{API_BASE}/api/v1/datasets/{quoted}"


def tree_url(dataset_ref):
    """Return the URL that lists the files of *dataset_ref* recursively."""
    return (f"{dataset_url(dataset_ref)}/repo/tree"
            f"?Revision=master&Root=/&Recursive=True&PageNumber=1&PageSize=10000")


def build_download_url(dataset_id, file_path):
    """Return the download URL of *file_path* inside dataset *dataset_id*.

    The dataset ID is used as-is for the URL path, so an ID without a ``/``
    no longer raises; for ``namespace/name`` IDs the result is unchanged.
    """
    params = urllib.parse.urlencode({
        "Source": "SDK", "Revision": "master",
        "FilePath": file_path, "View": "false",
    })
    return f"{dataset_url(dataset_id)}/repo?{params}"


def fetch_json(url):
    """Print *url*, send a GET request to it and return the decoded JSON body.

    Raises whatever ``urllib`` or ``json`` raise on network, HTTP or decoding
    errors; callers decide how to report them.
    """
    print(f"请求: {url}")
    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
        return json.loads(resp.read().decode())


def response_data(response):
    """Return the ``Data`` object of an API response, or ``{}`` if it is not a dict."""
    data = response.get("Data") if isinstance(response, dict) else None
    return data if isinstance(data, dict) else {}


def extract_hub_id(response):
    """Return the hub ID (``Id`` or ``id``) from a dataset-info response, else ``None``."""
    data = response_data(response)
    return data.get("Id") or data.get("id")


def extract_files(response):
    """Return the list of file entries (dicts) from a repo-tree response.

    A missing or null ``Data``/``Files`` yields an empty list instead of
    ``None``, so later steps can always iterate over the result.
    """
    files = response_data(response).get("Files")
    if not isinstance(files, list):
        return []
    return [entry for entry in files if isinstance(entry, dict)]


def entry_name(entry):
    """Return the file name of a tree entry (``Name`` or ``name``), or ``""``."""
    return str(entry.get("Name") or entry.get("name") or "")


def is_archive(name):
    """Return True if *name* ends with one of ``ARCHIVE_EXTS`` (case-insensitive)."""
    return name.lower().endswith(ARCHIVE_EXTS)


def main(argv=None):
    """Run the three probe steps for the dataset named in *argv*.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The first item, if any, is the dataset ID.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    dataset_id = args[0] if args else DEFAULT_DATASET_ID
    print(f"测试数据集: {dataset_id}\n")

    # Step 1: look up the hub ID.
    print("=== Step1: 获取 hub ID ===")
    try:
        hub_id = extract_hub_id(fetch_json(dataset_url(dataset_id)))
        print(f"hub_id = {hub_id}\n")
    except Exception as e:
        # Deliberately broad: this is a best-effort probe, so any failure is
        # reported and the script carries on with the dataset ID alone.
        print(f"失败: {e}\n")
        hub_id = None

    # Step 2: list the files, stopping at the first ID that returns any.
    print("=== Step2: 列出文件 ===")
    files = []
    for test_id in filter(None, [hub_id, dataset_id]):
        try:
            files = extract_files(fetch_json(tree_url(test_id)))
            print(f"成功! 共 {len(files)} 个文件:")
            for entry in files:
                size = entry.get("Size", entry.get("size", ""))
                print(f" {entry_name(entry)} (size={size})")
            if files:
                break
        except Exception as e:
            # Best-effort again: report the failure and try the next ID.
            print(f"失败: {e}")
    print()

    # Step 3: print a download URL for every archive among the listed files.
    print("=== Step3: 压缩包文件 ===")
    found = False
    for entry in files:
        name = entry_name(entry)
        if not is_archive(name):
            continue
        # Fall back to the bare name when the entry carries no usable path.
        path = entry.get("Path") or entry.get("path") or name
        dl_url = build_download_url(dataset_id, path)
        print(f" {name}")
        print(f" 路径: {path}")
        print(f" 下载URL: {dl_url}")
        found = True
    if not found:
        print(" 未找到压缩包文件")


if __name__ == "__main__":
    main()
|