#!/usr/bin/env python3
"""Smoke-test the ModelScope image-dataset download flow.

Usage: pass a dataset id as the first command-line argument; when omitted,
"tany0699/carBrands50" is used.

The script runs three steps and prints the outcome of each:

1. Download ``<dataset name>.json`` from the dataset repo and parse it as
   JSON. Any failure here ends the script with exit status 1.
2. Collect every non-empty ``"file"`` value found two levels deep in that
   config (top-level value -> nested value -> ``"file"``).
3. Probe the first collected archive with a HEAD request; if HEAD fails,
   fall back to a GET that reads at most 1 MiB and checks for the ZIP
   local-file-header signature.
"""
import json
import urllib.request
import urllib.parse
import sys
import os
import tempfile
import zipfile

api_base = "https://www.modelscope.cn"
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
# Everything after the first "/" is the dataset name; an id without "/" is
# used as the name unchanged.
ds_name = dataset_id.split("/", 1)[-1]


def repo_file_url(file_path):
    """Return the repo download URL for *file_path* within ``dataset_id``.

    The query string goes through ``urlencode`` so that file paths holding
    spaces, ``&``, ``#`` or non-ASCII characters are escaped instead of
    corrupting the request.
    """
    query = urllib.parse.urlencode({
        "Source": "SDK",
        "Revision": "master",
        "FilePath": file_path,
        "View": "false",
    })
    return f"{api_base}/api/v1/datasets/{dataset_id}/repo?{query}"


print(f"测试数据集: {dataset_id}\n")

# Step 1: download the dataset config file.
print("=== Step1: 下载配置文件 ===")
config_url = repo_file_url(f"{ds_name}.json")
print(f"URL: {config_url}")
try:
    req = urllib.request.Request(config_url, headers={"User-Agent": "Test"})
    with urllib.request.urlopen(req, timeout=30) as resp:
        config = json.loads(resp.read().decode())
    print(f"配置内容: {json.dumps(config, ensure_ascii=False, indent=2)}")
except Exception as e:
    # Script boundary: whatever went wrong (network, HTTP status, decoding,
    # JSON), report it and stop, because the later steps need the config.
    print(f"失败: {e}")
    sys.exit(1)

# Step 2: collect the archive file names referenced by the config.
print("\n=== Step2: 收集压缩包文件名 ===")
archive_files = set()
for subset in config.values():
    if not isinstance(subset, dict):
        continue
    for split_info in subset.values():
        if not isinstance(split_info, dict):
            continue
        fname = split_info.get("file", "")
        if fname:
            archive_files.add(fname)
print(f"找到压缩包: {archive_files}")

# Step 3: probe the first archive only (read at most the first 1 MiB).
print("\n=== Step3: 测试下载压缩包 ===")
# A set has no stable order, so sort before taking the first entry; this
# keeps the probed archive the same from run to run. The slice also makes
# an empty set a clean no-op.
for fname in sorted(archive_files, key=str)[:1]:
    dl_url = repo_file_url(fname)
    print(f"文件: {fname}")
    print(f"URL: {dl_url}")
    # HEAD first: the response headers are enough to show the file is
    # downloadable without transferring its body.
    try:
        req = urllib.request.Request(
            dl_url, headers={"User-Agent": "Test"}, method="HEAD"
        )
        with urllib.request.urlopen(req, timeout=30) as resp:
            size = resp.headers.get("Content-Length", "unknown")
            content_type = resp.headers.get("Content-Type", "unknown")
            print(f" Content-Length: {size}")
            print(f" Content-Type: {content_type}")
            print(f" 状态: {resp.status}")
    except Exception as e:
        print(f" HEAD 请求失败: {e}")
        # Fallback, reached only when HEAD failed: GET the first 1 MiB.
        try:
            req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read(1024 * 1024)
            print(f" GET 成功, 前 1MB 读取 {len(data)} bytes")
            # "PK\x03\x04" is the ZIP local-file-header signature.
            if data[:4] == b'PK\x03\x04':
                print(" 确认是 ZIP 格式!")
            else:
                print(f" 前 4 字节: {data[:4]}")
        except Exception as e2:
            print(f" GET 也失败: {e2}")

print("\n=== 完成 ===")