#!/usr/bin/env python3
"""Smoke-test the ModelScope image dataset download flow.

Usage: pass a dataset id (``namespace/name``) as the first command-line
argument; without one, ``DEFAULT_DATASET_ID`` is used.

The script performs three steps and prints its progress:

1. Download the dataset's JSON config file (``<name>.json``).
2. Collect the archive file names referenced by the config.
3. Probe one archive: issue a HEAD request and, only if that fails, fall
   back to a GET that reads at most the first 1 MiB and checks for the
   ZIP magic number.

The exit status is 1 when the config cannot be downloaded, 0 otherwise.
"""
import http.client
import json
import os
import sys
import tempfile
import urllib.parse
import urllib.request
import zipfile

API_BASE = "https://www.modelscope.cn"
DEFAULT_DATASET_ID = "tany0699/carBrands50"
REQUEST_HEADERS = {"User-Agent": "Test"}
TIMEOUT_SECONDS = 30
PROBE_BYTES = 1024 * 1024
ZIP_MAGIC = b"PK\x03\x04"

# Failures a request can reasonably produce: URLError/HTTPError, timeouts and
# TLS errors are OSError; bad JSON or undecodable bytes are ValueError;
# protocol-level problems surface as http.client.HTTPException.
_REQUEST_ERRORS = (OSError, ValueError, http.client.HTTPException)


def dataset_short_name(dataset_id):
    """Return the part of *dataset_id* after the first ``/``.

    An id without a ``/`` is returned unchanged.
    """
    return dataset_id.split("/", 1)[-1]


def build_repo_url(dataset_id, file_path, api_base=API_BASE):
    """Return the repo URL that serves *file_path* of *dataset_id*.

    The query string is always built with ``urlencode`` so that file names
    containing spaces, ``&``, ``/`` and similar characters cannot corrupt it;
    the dataset id is percent-encoded as a path (``/`` is kept).
    """
    query = urllib.parse.urlencode({
        "Source": "SDK",
        "Revision": "master",
        "FilePath": file_path,
        "View": "false",
    })
    repo_path = urllib.parse.quote(dataset_id, safe="/")
    return f"{api_base}/api/v1/datasets/{repo_path}/repo?{query}"


def collect_archive_files(config):
    """Return the set of archive names referenced by a parsed config.

    The config is walked two levels deep (top-level value -> nested value);
    every nested dict that has a non-empty string under the ``"file"`` key
    contributes that string. Anything with a different shape, including a
    config whose root is not a dict, is ignored rather than raising.
    """
    archives = set()
    if not isinstance(config, dict):
        return archives
    for subset in config.values():
        if not isinstance(subset, dict):
            continue
        for split_info in subset.values():
            if not isinstance(split_info, dict):
                continue
            fname = split_info.get("file", "")
            if isinstance(fname, str) and fname:
                archives.add(fname)
    return archives


def is_zip_header(data):
    """Return True when *data* starts with the ZIP local-file-header magic."""
    return data[:4] == ZIP_MAGIC


def fetch_json(url):
    """GET *url* and return the decoded JSON body.

    Raises one of ``_REQUEST_ERRORS`` when the request fails or the body is
    not valid JSON.
    """
    request = urllib.request.Request(url, headers=REQUEST_HEADERS)
    with urllib.request.urlopen(request, timeout=TIMEOUT_SECONDS) as resp:
        return json.loads(resp.read().decode())


def probe_archive(url):
    """Check that *url* is downloadable and print what was found.

    A HEAD request is tried first. Only when it fails is a GET issued, which
    reads at most ``PROBE_BYTES`` and reports whether the payload looks like
    a ZIP file. Request failures are printed, never raised.
    """
    try:
        head = urllib.request.Request(
            url, headers=REQUEST_HEADERS, method="HEAD"
        )
        with urllib.request.urlopen(head, timeout=TIMEOUT_SECONDS) as resp:
            size = resp.headers.get("Content-Length", "unknown")
            content_type = resp.headers.get("Content-Type", "unknown")
            print(f" Content-Length: {size}")
            print(f" Content-Type: {content_type}")
            print(f" 状态: {resp.status}")
        return
    except _REQUEST_ERRORS as exc:
        print(f" HEAD 请求失败: {exc}")

    # HEAD failed: fall back to a GET limited to the first PROBE_BYTES.
    try:
        get = urllib.request.Request(url, headers=REQUEST_HEADERS)
        with urllib.request.urlopen(get, timeout=TIMEOUT_SECONDS) as resp:
            data = resp.read(PROBE_BYTES)
    except _REQUEST_ERRORS as exc:
        print(f" GET 也失败: {exc}")
        return

    print(f" GET 成功, 前 1MB 读取 {len(data)} bytes")
    if is_zip_header(data):
        print(" 确认是 ZIP 格式!")
    else:
        print(f" 前 4 字节: {data[:4]}")


def main(argv=None):
    """Run the three-step download check and return the process exit code.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first item, if any, is the dataset id.

    Returns:
        1 when the config file cannot be downloaded or parsed, else 0.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    dataset_id = args[0] if args else DEFAULT_DATASET_ID
    print(f"测试数据集: {dataset_id}\n")

    # Step 1: download the config file.
    print("=== Step1: 下载配置文件 ===")
    config_url = build_repo_url(
        dataset_id, f"{dataset_short_name(dataset_id)}.json"
    )
    print(f"URL: {config_url}")
    try:
        config = fetch_json(config_url)
    except _REQUEST_ERRORS as exc:
        print(f"失败: {exc}")
        return 1
    print(f"配置内容: {json.dumps(config, ensure_ascii=False, indent=2)}")

    # Step 2: collect the archive file names.
    print("\n=== Step2: 收集压缩包文件名 ===")
    archive_files = collect_archive_files(config)
    print(f"找到压缩包: {archive_files}")

    # Step 3: probe a single archive. Sets have no stable order, so take the
    # smallest name to make the choice reproducible between runs.
    print("\n=== Step3: 测试下载压缩包 ===")
    if archive_files:
        fname = min(archive_files)
        dl_url = build_repo_url(dataset_id, fname)
        print(f"文件: {fname}")
        print(f"URL: {dl_url}")
        probe_archive(dl_url)

    print("\n=== 完成 ===")
    return 0


if __name__ == "__main__":
    sys.exit(main())