|
|
@@ -1,8 +1,9 @@
|
|
|
#!/usr/bin/env python3
|
|
|
-"""测试 HubApi 的 get_dataset_access_config 获取数据文件 CDN 链接。"""
|
|
|
+"""测试通过 OSS 凭证下载数据文件区的图片。"""
|
|
|
import json
|
|
|
+import urllib.request
|
|
|
+import urllib.parse
|
|
|
import sys
|
|
|
-import inspect
|
|
|
|
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
|
namespace, ds_name = dataset_id.split("/", 1)
|
|
|
@@ -12,87 +13,102 @@ print(f"数据集: {dataset_id}\n")
|
|
|
from modelscope.hub.api import HubApi
|
|
|
api = HubApi()
|
|
|
|
|
|
-# 获取 dataset_id (数字)
|
|
|
-print("=== 获取 dataset_id ===")
|
|
|
-try:
|
|
|
- ds_id, ds_type = api.get_dataset_id_and_type(namespace=namespace, dataset_name=ds_name)
|
|
|
- print(f"dataset_id={ds_id}, type={ds_type}")
|
|
|
-except Exception as e:
|
|
|
- print(f"失败: {e}")
|
|
|
- ds_id = None
|
|
|
+# 获取 OSS 凭证
|
|
|
+print("=== 获取 OSS 凭证 ===")
|
|
|
+config = api.get_dataset_access_config(
|
|
|
+ dataset_name=ds_name,
|
|
|
+ namespace=namespace,
|
|
|
+ revision="master",
|
|
|
+)
|
|
|
+host = config["Host"]
|
|
|
+backup_dir = config["BackupDir"] # zip 文件
|
|
|
+unzip_dir = config["Dir"] # 已解压的文件
|
|
|
+access_id = config["AccessId"]
|
|
|
+access_secret = config["AccessSecret"]
|
|
|
+security_token = config["SecurityToken"]
|
|
|
+print(f"Host: {host}")
|
|
|
+print(f"Zip 目录: {backup_dir}")
|
|
|
+print(f"解压目录: {unzip_dir}")
|
|
|
+print(f"过期时间: {config['Expiration']}")
|
|
|
|
|
|
-# 测试 get_dataset_access_config
|
|
|
-print("\n=== get_dataset_access_config ===")
|
|
|
+# 测试1: 直接下载 zip(公开访问?)
|
|
|
+print("\n=== 测试1: 直接访问 public-zip/train.zip ===")
|
|
|
+url = f"{host}/{backup_dir}train.zip"
|
|
|
+print(f"URL: {url}")
|
|
|
try:
|
|
|
- sig = inspect.signature(api.get_dataset_access_config)
|
|
|
- print(f"签名: {sig}")
|
|
|
- result = api.get_dataset_access_config(
|
|
|
- dataset_name=ds_name,
|
|
|
- namespace=namespace,
|
|
|
- revision="master",
|
|
|
- )
|
|
|
- print(f"结果: {json.dumps(result, indent=2, ensure_ascii=False, default=str)[:3000]}")
|
|
|
+ req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
+ print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
|
|
|
except Exception as e:
|
|
|
- print(f"失败: {e}")
|
|
|
+ print(f"HEAD: {e}")
|
|
|
|
|
|
-# 测试 get_dataset_access_config_for_unzipped
|
|
|
-print("\n=== get_dataset_access_config_for_unzipped ===")
|
|
|
+# 测试2: 带 STS 签名下载 zip
|
|
|
+print("\n=== 测试2: 带 STS 签名访问 train.zip ===")
|
|
|
+# OSS STS 签名 URL 格式: ?OSSAccessKeyId=xxx&Expires=xxx&Signature=xxx&security-token=xxx
|
|
|
+import time
|
|
|
+import hmac
|
|
|
+import hashlib
|
|
|
+import base64
|
|
|
+expires = str(int(time.time()) + 3600)
|
|
|
+string_to_sign = f"HEAD\n\n\n{expires}\n/{config['Bucket']}/{backup_dir}train.zip?security-token={security_token}"  # STS: security-token is a signed sub-resource, so it must appear (unencoded) in the canonicalized resource
|
|
|
+h = hmac.new(access_secret.encode(), string_to_sign.encode(), hashlib.sha1)
|
|
|
+signature = urllib.parse.quote(base64.b64encode(h.digest()))
|
|
|
+url = (f"{host}/{backup_dir}train.zip"
|
|
|
+ f"?OSSAccessKeyId={urllib.parse.quote(access_id)}"
|
|
|
+ f"&Expires={expires}"
|
|
|
+ f"&Signature={signature}"
|
|
|
+ f"&security-token={urllib.parse.quote(security_token)}")
|
|
|
+print(f"URL: {url[:200]}...")
|
|
|
try:
|
|
|
- sig = inspect.signature(api.get_dataset_access_config_for_unzipped)
|
|
|
- print(f"签名: {sig}")
|
|
|
- result = api.get_dataset_access_config_for_unzipped(
|
|
|
- dataset_name=ds_name,
|
|
|
- namespace=namespace,
|
|
|
- revision="master",
|
|
|
- )
|
|
|
- print(f"结果: {json.dumps(result, indent=2, ensure_ascii=False, default=str)[:3000]}")
|
|
|
+ req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
+ print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
|
|
|
+ print(f">>> 成功! <<<")
|
|
|
except Exception as e:
|
|
|
- print(f"失败: {e}")
|
|
|
+ print(f"HEAD: {e}")
|
|
|
|
|
|
-# 测试 get_dataset_infos
|
|
|
-print("\n=== get_dataset_infos ===")
|
|
|
+# 测试3: GET 下载前 1MB
|
|
|
+print("\n=== 测试3: GET 下载 train.zip 前 1MB ===")
|
|
|
try:
|
|
|
- sig = inspect.signature(api.get_dataset_infos)
|
|
|
- print(f"签名: {sig}")
|
|
|
- result = api.get_dataset_infos(
|
|
|
- dataset_name=ds_name,
|
|
|
- namespace=namespace,
|
|
|
- )
|
|
|
- print(f"结果: {json.dumps(result, indent=2, ensure_ascii=False, default=str)[:3000]}")
|
|
|
+ # The signature covers the HTTP verb, so the HEAD-signed URL from test 2 cannot be reused: sign again for GET.
|
|
|
+ get_string_to_sign = f"GET\n\n\n{expires}\n/{config['Bucket']}/{backup_dir}train.zip?security-token={security_token}"  # security-token is a signed sub-resource under STS
|
|
|
+ get_signature = urllib.parse.quote(base64.b64encode(hmac.new(access_secret.encode(), get_string_to_sign.encode(), hashlib.sha1).digest()))
|
|
|
+ get_url = f"{host}/{backup_dir}train.zip?OSSAccessKeyId={urllib.parse.quote(access_id)}&Expires={expires}&Signature={get_signature}&security-token={urllib.parse.quote(security_token)}"
|
|
|
+ req = urllib.request.Request(get_url, headers={"User-Agent": "Test", "Range": "bytes=0-1048575"})
|
|
|
+ with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
+ data = resp.read(1048576)
|
|
|
+ is_zip = data[:4] == b'PK\x03\x04'
|
|
|
+ print(f"GET: {resp.status} | {len(data)} bytes | is_zip={is_zip}")
|
|
|
+ if is_zip:
|
|
|
+ print(f">>> 成功! 是 ZIP 文件! <<<")
|
|
|
except Exception as e:
|
|
|
- print(f"失败: {e}")
|
|
|
+ print(f"GET: {e}")
|
|
|
|
|
|
-# 测试 get_dataset_file_url
|
|
|
-print("\n=== get_dataset_file_url (train.csv) ===")
|
|
|
+# 测试4: 访问 public-unzip-dataset(已解压的图片)
|
|
|
+print("\n=== 测试4: 直接访问 public-unzip-dataset(已解压图片) ===")
|
|
|
+url = f"{host}/{unzip_dir}"
|
|
|
+print(f"URL: {url}")
|
|
|
try:
|
|
|
- url = api.get_dataset_file_url(
|
|
|
- file_name="train.csv",
|
|
|
- dataset_name=ds_name,
|
|
|
- namespace=namespace,
|
|
|
- revision="master",
|
|
|
- )
|
|
|
- print(f"URL: {url}")
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
+ content = resp.read().decode("utf-8", errors="replace")
|
|
|
+ print(f"状态: {resp.status}")
|
|
|
+ print(f"内容前 500 字符: {content[:500]}")
|
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
|
|
|
|
-# 测试 get_dataset_file_url (train.zip - 数据文件区)
|
|
|
-print("\n=== get_dataset_file_url (train.zip) ===")
|
|
|
+# 测试5: 带签名访问 public-unzip-dataset
|
|
|
+print("\n=== 测试5: 带签名访问 public-unzip-dataset ===")
|
|
|
+string_to_sign = f"GET\n\n\n{expires}\n/{config['Bucket']}/{unzip_dir}?security-token={security_token}"  # STS: security-token is a signed sub-resource, so it must appear (unencoded) in the canonicalized resource
|
|
|
+h = hmac.new(access_secret.encode(), string_to_sign.encode(), hashlib.sha1)
|
|
|
+signature = urllib.parse.quote(base64.b64encode(h.digest()))
|
|
|
+url = (f"{host}/{unzip_dir}"
|
|
|
+ f"?OSSAccessKeyId={urllib.parse.quote(access_id)}"
|
|
|
+ f"&Expires={expires}"
|
|
|
+ f"&Signature={signature}"
|
|
|
+ f"&security-token={urllib.parse.quote(security_token)}")
|
|
|
try:
|
|
|
- url = api.get_dataset_file_url(
|
|
|
- file_name="train.zip",
|
|
|
- dataset_name=ds_name,
|
|
|
- namespace=namespace,
|
|
|
- revision="master",
|
|
|
- )
|
|
|
- print(f"URL: {url}")
|
|
|
- # 尝试下载验证
|
|
|
- import urllib.request
|
|
|
- req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
- try:
|
|
|
- with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
- print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
|
|
|
- except Exception as e2:
|
|
|
- print(f"HEAD: {e2}")
|
|
|
+ req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
+ content = resp.read().decode("utf-8", errors="replace")
|
|
|
+ print(f"状态: {resp.status}")
|
|
|
+ print(f"内容前 500 字符: {content[:500]}")
|
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
|
|