| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
#!/usr/bin/env python3
"""Probe candidate download URL formats for a zip file in a ModelScope dataset.

Usage::

    python <this script> [namespace/dataset_name]

The script first asks the dataset-info endpoint for the dataset's hub id, then
walks a fixed, numbered list of candidate URLs. Each candidate gets a HEAD
request; unless that answered HTTP 200, a GET request follows that reads only
the first 1024 bytes and checks them for the zip local-file-header signature.
One line per request is printed to stdout.
"""
import json
import sys
import urllib.error
import urllib.parse
import urllib.request

# Lower-case names are kept from the original flat script for compatibility.
api_base = "https://www.modelscope.cn"
zip_file = "train.zip"

DEFAULT_DATASET_ID = "tany0699/carBrands50"
REQUEST_HEADERS = {"User-Agent": "Test"}
ZIP_MAGIC = b"PK\x03\x04"
HEAD_BYTES = 1024


def split_dataset_id(dataset_id):
    """Split ``namespace/name`` into ``(namespace, name)``.

    Only the first ``/`` separates the two parts.

    Raises:
        ValueError: if either part is missing, instead of the bare unpacking
            error the original ``split("/", 1)`` produced.
    """
    namespace, sep, ds_name = dataset_id.partition("/")
    if not (namespace and sep and ds_name):
        raise ValueError(
            f"无效的数据集 ID {dataset_id!r}: 应为 'namespace/name' 格式"
        )
    return namespace, ds_name


def looks_like_zip(data):
    """Return True if *data* starts with the zip local-file-header signature."""
    return data.startswith(ZIP_MAGIC)


def fetch_hub_id(dataset_id, base=api_base, timeout=30):
    """Return the hub id reported by the dataset-info endpoint, or ``None``.

    Reads ``Data.Id`` (falling back to ``Data.id``) from the JSON response.
    Network, HTTP and JSON-decoding errors propagate to the caller.
    """
    quoted_id = urllib.parse.quote(dataset_id, safe="/")
    req = urllib.request.Request(
        f"{base}/api/v1/datasets/{quoted_id}", headers=REQUEST_HEADERS
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        info = json.loads(resp.read().decode())
    # "Data" may be absent or JSON null; treat both as an empty mapping.
    data = info.get("Data") or {}
    # Normalize any falsy id to None so callers have a single "missing" value.
    return data.get("Id") or data.get("id") or None


def build_candidate_urls(dataset_id, hub_id, file_path=zip_file, base=api_base):
    """Return the ten candidate URLs in their fixed, numbered order.

    Args:
        dataset_id: dataset identifier in ``namespace/name`` form.
        hub_id: id returned by :func:`fetch_hub_id`, or ``None`` if unknown.
        file_path: file to request; percent-encoded before interpolation.
        base: scheme and host prefix of the site.

    Returns:
        A list of ten entries. The two formats that need *hub_id* are ``None``
        when *hub_id* is ``None``, so the numbering of the remaining formats
        stays stable and no URL containing the text "None" is ever probed.

    Raises:
        ValueError: if *dataset_id* is not of the form ``namespace/name``.
    """
    namespace, ds_name = split_dataset_id(dataset_id)
    quote = urllib.parse.quote
    ds = quote(dataset_id, safe="/")
    ns_and_name = quote(namespace, safe="") + "/" + quote(ds_name, safe="/")
    name = quote(file_path, safe="/")
    api = f"{base}/api/v1/datasets"
    hub = None if hub_id is None else quote(str(hub_id), safe="")

    return [
        # Format 1: repo + Source=SDK (confirmed to return 404)
        f"{api}/{ds}/repo?Source=SDK&Revision=master&FilePath={name}&View=false",
        # Format 2: repo without Source
        f"{api}/{ds}/repo?Revision=master&FilePath={name}",
        # Format 3: resolve style (similar to HuggingFace)
        f"{api}/{ds}/resolve/master/{name}",
        # Format 4: numeric hub_id instead of namespace/name
        None if hub is None else f"{api}/{hub}/repo?Revision=master&FilePath={name}",
        # Format 5: repo/files style
        f"{api}/{ds}/repo/files?Revision=master&FilePath={name}",
        # Format 6: download style
        f"{api}/{ds}/download?Revision=master&FilePath={name}",
        # Format 7: direct URL (like clicking download on the web page)
        f"{base}/datasets/{ds}/resolve/master/{name}",
        # Format 8: dedicated data-files endpoint
        f"{api}/{ds}/data-files?Revision=master&FilePath={name}",
        # Format 9: numeric hub_id + resolve
        None if hub is None else f"{api}/{hub}/resolve/master/{name}",
        # Format 10: Source=SDK without the View parameter
        f"{api}/{ns_and_name}/repo?Source=SDK&Revision=master&FilePath={name}",
    ]


def probe_head(url, timeout=15):
    """Send a HEAD request, print its outcome, and return True on HTTP 200."""
    # Broad catch is deliberate: this is a diagnostic probe that reports every
    # failure on stdout and moves on to the next attempt.
    try:
        req = urllib.request.Request(url, method="HEAD", headers=REQUEST_HEADERS)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            size = resp.headers.get("Content-Length", "?")
            ctype = resp.headers.get("Content-Type", "?")
            print(f" HEAD: {resp.status} | size={size} | type={ctype}")
            if resp.status == 200:
                print(" >>> 成功! <<<")
                return True
    except urllib.error.HTTPError as e:
        print(f" HEAD: {e.code}")
    except Exception as e:
        print(f" HEAD: {e}")
    return False


def probe_get(url, timeout=15):
    """Send a GET request, print its outcome, and return True if it is a zip.

    Only the first ``HEAD_BYTES`` bytes of the body are read.
    """
    try:
        req = urllib.request.Request(url, headers=REQUEST_HEADERS)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read(HEAD_BYTES)
            ctype = resp.headers.get("Content-Type", "?")
            is_zip = looks_like_zip(data)
            print(
                f" GET: {resp.status} | {len(data)} bytes"
                f" | type={ctype} | is_zip={is_zip}"
            )
            if is_zip:
                print(" >>> 成功! 是 ZIP 文件! <<<")
            return is_zip
    except urllib.error.HTTPError as e:
        print(f" GET: {e.code}")
    except Exception as e:
        print(f" GET: {e}")
    return False


def main(argv=None):
    """Run the probe for the dataset named in *argv* (default: ``sys.argv[1:]``)."""
    args = sys.argv[1:] if argv is None else list(argv)
    dataset_id = args[0] if args else DEFAULT_DATASET_ID
    try:
        split_dataset_id(dataset_id)
    except ValueError as e:
        sys.exit(str(e))
    print(f"数据集: {dataset_id}, 目标文件: {zip_file}\n")

    # Look up the hub id first; failure is reported but not fatal.
    hub_id = None
    try:
        hub_id = fetch_hub_id(dataset_id)
        print(f"hub_id = {hub_id}\n")
    except Exception as e:
        print(f"获取 hub_id 失败: {e}\n")

    # Try each URL format in turn.
    for i, url in enumerate(build_candidate_urls(dataset_id, hub_id), 1):
        print(f"--- 格式{i} ---")
        if url is None:
            print(" 跳过: 未获取到 hub_id")
        else:
            print(f" URL: {url}")
            # GET is only worth trying when HEAD did not already succeed.
            if not probe_head(url):
                probe_get(url)
        print()


if __name__ == "__main__":
    main()
|