|
@@ -1,81 +1,83 @@
|
|
|
#!/usr/bin/env python3
|
|
#!/usr/bin/env python3
|
|
|
-"""测试 ModelScope 图片数据集下载流程。"""
|
|
|
|
|
|
|
+"""Test different download URL formats for a zip in the ModelScope dataset data-file area."""
|
|
|
import json
|
|
import json
|
|
|
import urllib.request
|
|
import urllib.request
|
|
|
import urllib.parse
|
|
import urllib.parse
|
|
|
import sys
|
|
import sys
|
|
|
-import os
|
|
|
|
|
-import tempfile
|
|
|
|
|
-import zipfile
|
|
|
|
|
|
|
|
|
|
api_base = "https://www.modelscope.cn"
|
|
api_base = "https://www.modelscope.cn"
|
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
|
-namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
|
|
|
|
|
|
|
+# An id without a namespace ("name" rather than "owner/name") must not crash
+# the tuple unpacking with ValueError; fall back to an empty namespace.
+namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
|
|
|
|
|
+zip_file = "train.zip"
|
|
|
|
|
|
|
|
-print(f"测试数据集: {dataset_id}\n")
|
|
|
|
|
|
|
+print(f"数据集: {dataset_id}, 目标文件: {zip_file}\n")
|
|
|
|
|
|
|
|
-# Step 1: 下载配置文件
|
|
|
|
|
-print("=== Step1: 下载配置文件 ===")
|
|
|
|
|
-config_url = (f"{api_base}/api/v1/datasets/{dataset_id}/repo"
|
|
|
|
|
- f"?Source=SDK&Revision=master&FilePath={ds_name}.json&View=false")
|
|
|
|
|
-print(f"URL: {config_url}")
|
|
|
|
|
|
|
+# Fetch the hub_id first
|
|
|
|
|
+hub_id = None
|
|
|
try:
|
|
try:
|
|
|
- req = urllib.request.Request(config_url, headers={"User-Agent": "Test"})
|
|
|
|
|
|
|
+ url = f"{api_base}/api/v1/datasets/{dataset_id}"
|
|
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
- config = json.loads(resp.read().decode())
|
|
|
|
|
- print(f"配置内容: {json.dumps(config, ensure_ascii=False, indent=2)}")
|
|
|
|
|
|
|
+ info = json.loads(resp.read().decode())
|
|
|
|
|
+        # If "Data" is present but null, .get("Data", {}) returns None and the
+        # chained .get() raises AttributeError; normalise to a dict first.
+        data = info.get("Data") or {}
+        hub_id = data.get("Id") or data.get("id")
|
|
|
|
|
+ print(f"hub_id = {hub_id}\n")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- print(f"失败: {e}")
|
|
|
|
|
- sys.exit(1)
|
|
|
|
|
|
|
+ print(f"获取 hub_id 失败: {e}\n")
|
|
|
|
|
|
|
|
-# Step 2: 收集压缩包文件名
|
|
|
|
|
-print("\n=== Step2: 收集压缩包文件名 ===")
|
|
|
|
|
-archive_files = set()
|
|
|
|
|
-for subset in config.values():
|
|
|
|
|
- if isinstance(subset, dict):
|
|
|
|
|
- for split_info in subset.values():
|
|
|
|
|
- if isinstance(split_info, dict):
|
|
|
|
|
- fname = split_info.get("file", "")
|
|
|
|
|
- if fname:
|
|
|
|
|
- archive_files.add(fname)
|
|
|
|
|
-print(f"找到压缩包: {archive_files}")
|
|
|
|
|
|
|
+# Test different URL formats
|
|
|
|
|
+urls_to_test = [
|
|
|
|
|
+    # Format 1: repo + Source=SDK (confirmed 404)
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{dataset_id}/repo?Source=SDK&Revision=master&FilePath={zip_file}&View=false",
|
|
|
|
|
+    # Format 2: repo without Source
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{dataset_id}/repo?Revision=master&FilePath={zip_file}",
|
|
|
|
|
+    # Format 3: resolve-style path (similar to HuggingFace)
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{dataset_id}/resolve/master/{zip_file}",
|
|
|
|
|
+    # Format 4: use the numeric hub_id
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{hub_id}/repo?Revision=master&FilePath={zip_file}",
|
|
|
|
|
+    # Format 5: repo/files style
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{dataset_id}/repo/files?Revision=master&FilePath={zip_file}",
|
|
|
|
|
+    # Format 6: download style
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{dataset_id}/download?Revision=master&FilePath={zip_file}",
|
|
|
|
|
+    # Format 7: direct URL (like clicking download on the web page)
|
|
|
|
|
+ f"{api_base}/datasets/{dataset_id}/resolve/master/{zip_file}",
|
|
|
|
|
+    # Format 8: dedicated data-files endpoint
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{dataset_id}/data-files?Revision=master&FilePath={zip_file}",
|
|
|
|
|
+    # Format 9: numeric id in place of dataset_id + resolve
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{hub_id}/resolve/master/{zip_file}",
|
|
|
|
|
+    # Format 10: without the View parameter
|
|
|
|
|
+ f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo?Source=SDK&Revision=master&FilePath={zip_file}",
|
|
|
|
|
+]
|
|
|
|
|
|
|
|
-# Step 3: 测试下载第一个压缩包(只下载前 1MB 验证)
|
|
|
|
|
-print("\n=== Step3: 测试下载压缩包 ===")
|
|
|
|
|
-for fname in archive_files:
|
|
|
|
|
- params = urllib.parse.urlencode({
|
|
|
|
|
- "Source": "SDK", "Revision": "master",
|
|
|
|
|
- "FilePath": fname, "View": "false",
|
|
|
|
|
- })
|
|
|
|
|
- dl_url = f"{api_base}/api/v1/datasets/{dataset_id}/repo?{params}"
|
|
|
|
|
- print(f"文件: {fname}")
|
|
|
|
|
- print(f"URL: {dl_url}")
|
|
|
|
|
-
|
|
|
|
|
- # 只读 Content-Length 验证可下载
|
|
|
|
|
|
|
+for i, url in enumerate(urls_to_test, 1):
|
|
|
|
|
+ print(f"--- 格式{i} ---")
|
|
|
|
|
+    print(f"  URL: {url}")
+    # When the hub_id lookup failed, the hub_id-based formats contain the literal
+    # "None" as the dataset id; probing them is meaningless, so skip them.
+    if hub_id is None and url.startswith(f"{api_base}/api/v1/datasets/None/"):
+        print("  跳过: 未获取到 hub_id")
+        print()
+        continue
|
|
|
|
|
+        # HEAD request test
|
|
|
try:
|
|
try:
|
|
|
- req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
|
|
|
|
|
- req.method = "HEAD"
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
|
|
- size = resp.headers.get("Content-Length", "unknown")
|
|
|
|
|
- content_type = resp.headers.get("Content-Type", "unknown")
|
|
|
|
|
- print(f" Content-Length: {size}")
|
|
|
|
|
- print(f" Content-Type: {content_type}")
|
|
|
|
|
- print(f" 状态: {resp.status}")
|
|
|
|
|
|
|
+ req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
+ size = resp.headers.get("Content-Length", "?")
|
|
|
|
|
+ ctype = resp.headers.get("Content-Type", "?")
|
|
|
|
|
+ print(f" HEAD: {resp.status} | size={size} | type={ctype}")
|
|
|
|
|
+ if resp.status == 200:
|
|
|
|
|
+ print(f" >>> 成功! <<<")
|
|
|
|
|
+                    # Deliberately no `continue` here: a 200 on HEAD does not prove
+                    # the body is a zip (it may be an HTML or JSON page), so fall
+                    # through to the GET probe, which checks the ZIP magic bytes.
|
|
|
|
|
+ except urllib.error.HTTPError as e:
|
|
|
|
|
+ print(f" HEAD: {e.code}")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
- print(f" HEAD 请求失败: {e}")
|
|
|
|
|
- # 尝试 GET 前 1MB
|
|
|
|
|
- try:
|
|
|
|
|
- req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
|
|
- data = resp.read(1024 * 1024)
|
|
|
|
|
- print(f" GET 成功, 前 1MB 读取 {len(data)} bytes")
|
|
|
|
|
- # 检查是否是 zip
|
|
|
|
|
- if data[:4] == b'PK\x03\x04':
|
|
|
|
|
- print(f" 确认是 ZIP 格式!")
|
|
|
|
|
- else:
|
|
|
|
|
- print(f" 前 4 字节: {data[:4]}")
|
|
|
|
|
- except Exception as e2:
|
|
|
|
|
- print(f" GET 也失败: {e2}")
|
|
|
|
|
- break
|
|
|
|
|
|
|
+ print(f" HEAD: {e}")
|
|
|
|
|
|
|
|
-print("\n=== 完成 ===")
|
|
|
|
|
|
|
+ # GET 请求测试(只读前 1024 字节)
|
|
|
|
|
+ try:
|
|
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
+ data = resp.read(1024)
|
|
|
|
|
+ ctype = resp.headers.get("Content-Type", "?")
|
|
|
|
|
+ is_zip = data[:4] == b'PK\x03\x04'
|
|
|
|
|
+ print(f" GET: {resp.status} | {len(data)} bytes | type={ctype} | is_zip={is_zip}")
|
|
|
|
|
+ if is_zip:
|
|
|
|
|
+ print(f" >>> 成功! 是 ZIP 文件! <<<")
|
|
|
|
|
+ except urllib.error.HTTPError as e:
|
|
|
|
|
+ print(f" GET: {e.code}")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ print(f" GET: {e}")
|
|
|
|
|
+ print()
|