#!/usr/bin/env python3
"""Check that the ModelScope API can list a dataset's files and locate archives.

Usage::

    python <this script> [namespace/dataset_name]

When no dataset ID is given, ``DEFAULT_DATASET_ID`` is used. The script runs
three steps and prints the outcome of each:

1. Request the dataset info endpoint and read the numeric hub ID from it.
2. Request the repository file tree, first with the hub ID and then with the
   ID given on the command line, stopping at the first request that returns
   a non-empty file list.
3. Print the download URL of every listed file whose name ends with one of
   ``ARCHIVE_EXTS``. Nothing is actually downloaded.
"""
import json
import sys
import urllib.parse
import urllib.request

API_BASE = "https://www.modelscope.cn"
DEFAULT_DATASET_ID = "tany0699/carBrands50"
ARCHIVE_EXTS = (".zip", ".tar.gz", ".tgz", ".tar.bz2", ".tbz2", ".tar")
REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP request
TREE_SUFFIX = (
    "/repo/tree"
    "?Revision=master&Root=/&Recursive=True&PageNumber=1&PageSize=10000"
)


def dataset_url(dataset_ref, suffix=""):
    """Return the dataset API URL for *dataset_ref* followed by *suffix*.

    *dataset_ref* may be a ``namespace/name`` string or a numeric hub ID. It
    is percent-encoded (keeping ``/``) so that an ID containing spaces or
    non-ASCII characters still yields a URL that ``urlopen`` accepts.
    """
    quoted = urllib.parse.quote(str(dataset_ref), safe="/")
    return f"{API_BASE}/api/v1/datasets/{quoted}{suffix}"


def fetch_json(url):
    """GET *url* and return its body parsed as JSON.

    Raises whatever ``urlopen`` or ``json.loads`` raise; callers decide how to
    report the failure.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
    with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
        return json.loads(resp.read().decode())


def entry_field(entry, key, default=""):
    """Read *key* from *entry*, falling back to its lower-case spelling."""
    return entry.get(key, entry.get(key.lower(), default))


def extract_hub_id(info):
    """Return ``Data.Id`` (or ``Data.id``) from an info response, else None."""
    data = info.get("Data") or {}
    return data.get("Id") or data.get("id")


def extract_files(result):
    """Return the file entries of a tree response as a list of dicts.

    A missing or null ``Data`` / ``Files`` value yields an empty list, and
    entries that are not dicts are dropped, so callers can always iterate the
    result and call ``.get`` on each entry.
    """
    data = result.get("Data") or {}
    entries = data.get("Files") or []
    return [entry for entry in entries if isinstance(entry, dict)]


def is_archive(name):
    """Return True when *name* is a string ending with an archive extension."""
    return isinstance(name, str) and name.lower().endswith(ARCHIVE_EXTS)


def build_download_url(dataset_id, path):
    """Return the repo download URL for the file at *path* in *dataset_id*.

    The ID is used as given rather than being split into namespace and name:
    re-joining the two halves reproduces the ID, and the split raised
    ``ValueError`` for an ID without a ``/``.
    """
    params = urllib.parse.urlencode({
        "Source": "SDK",
        "Revision": "master",
        "FilePath": path,
        "View": "false",
    })
    return f"{dataset_url(dataset_id)}/repo?{params}"


def fetch_hub_id(dataset_id):
    """Step 1: print and return the hub ID of *dataset_id*, or None on failure."""
    info_url = dataset_url(dataset_id)
    print(f"请求: {info_url}")
    try:
        hub_id = extract_hub_id(fetch_json(info_url))
    except Exception as e:
        # Deliberately broad: this is a diagnostic script and a failed lookup
        # must not prevent step 2 from trying the namespace/name ID.
        print(f"失败: {e}\n")
        return None
    print(f"hub_id = {hub_id}\n")
    return hub_id


def list_files(dataset_refs):
    """Step 2: return the file list from the first ref that yields files.

    Each ref in *dataset_refs* is tried in order; every request and its
    outcome is printed. The result is always a list (empty when every attempt
    failed or returned no files).
    """
    files = []
    for ref in dataset_refs:
        tree_url = dataset_url(ref, TREE_SUFFIX)
        print(f"请求: {tree_url}")
        try:
            files = extract_files(fetch_json(tree_url))
        except Exception as e:
            # Best effort: report the failure and fall through to the next ref.
            print(f"失败: {e}")
            continue
        print(f"成功! 共 {len(files)} 个文件:")
        for entry in files:
            name = entry_field(entry, "Name")
            size = entry_field(entry, "Size")
            print(f" {name} (size={size})")
        if files:
            break
    return files


def report_archives(dataset_id, files):
    """Step 3: print name, path and download URL of each archive in *files*.

    Returns True when at least one archive was found, otherwise prints a
    "not found" line and returns False.
    """
    found = False
    for entry in files:
        name = entry_field(entry, "Name")
        if not is_archive(name):
            continue
        # Entries without a path are addressed by their name.
        path = entry_field(entry, "Path", name)
        print(f" {name}")
        print(f" 路径: {path}")
        print(f" 下载URL: {build_download_url(dataset_id, path)}")
        found = True
    if not found:
        print(" 未找到压缩包文件")
    return found


def main(argv=None):
    """Run the three checks for the dataset named in *argv* (default: CLI args)."""
    args = sys.argv[1:] if argv is None else argv
    dataset_id = args[0] if args else DEFAULT_DATASET_ID
    print(f"测试数据集: {dataset_id}\n")

    # Step 1: look up the numeric hub ID.
    print("=== Step1: 获取 hub ID ===")
    hub_id = fetch_hub_id(dataset_id)

    # Step 2: list files, preferring the hub ID when step 1 produced one.
    print("=== Step2: 列出文件 ===")
    files = list_files(filter(None, [hub_id, dataset_id]))
    print()

    # Step 3: pick out archive files and show where to download them.
    print("=== Step3: 压缩包文件 ===")
    report_archives(dataset_id, files)


if __name__ == "__main__":
    main()