|
@@ -1,115 +1,61 @@
|
|
|
#!/usr/bin/env python3
|
|
#!/usr/bin/env python3
|
|
|
-"""测试通过 OSS 凭证下载数据文件区的图片。"""
|
|
|
|
|
|
|
+"""测试 MsDataset.load() 完整下载流程。"""
|
|
|
import json
|
|
import json
|
|
|
-import urllib.request
|
|
|
|
|
-import urllib.parse
|
|
|
|
|
import sys
|
|
import sys
|
|
|
|
|
+import os
|
|
|
|
|
+import tempfile
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
|
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
|
namespace, ds_name = dataset_id.split("/", 1)
|
|
namespace, ds_name = dataset_id.split("/", 1)
|
|
|
|
|
|
|
|
print(f"数据集: {dataset_id}\n")
|
|
print(f"数据集: {dataset_id}\n")
|
|
|
|
|
|
|
|
-from modelscope.hub.api import HubApi
|
|
|
|
|
-api = HubApi()
|
|
|
|
|
-
|
|
|
|
|
-# 获取 OSS 凭证
|
|
|
|
|
-print("=== 获取 OSS 凭证 ===")
|
|
|
|
|
-config = api.get_dataset_access_config(
|
|
|
|
|
- dataset_name=ds_name,
|
|
|
|
|
- namespace=namespace,
|
|
|
|
|
- revision="master",
|
|
|
|
|
-)
|
|
|
|
|
-host = config["Host"]
|
|
|
|
|
-backup_dir = config["BackupDir"] # zip 文件
|
|
|
|
|
-unzip_dir = config["Dir"] # 已解压的文件
|
|
|
|
|
-access_id = config["AccessId"]
|
|
|
|
|
-access_secret = config["AccessSecret"]
|
|
|
|
|
-security_token = config["SecurityToken"]
|
|
|
|
|
-print(f"Host: {host}")
|
|
|
|
|
-print(f"Zip 目录: {backup_dir}")
|
|
|
|
|
-print(f"解压目录: {unzip_dir}")
|
|
|
|
|
-print(f"过期时间: {config['Expiration']}")
|
|
|
|
|
-
|
|
|
|
|
-# 测试1: 直接下载 zip(公开访问?)
|
|
|
|
|
-print("\n=== 测试1: 直接访问 public-zip/train.zip ===")
|
|
|
|
|
-url = f"{host}/{backup_dir}train.zip"
|
|
|
|
|
-print(f"URL: {url}")
|
|
|
|
|
-try:
|
|
|
|
|
- req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
- print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
|
|
|
|
|
-except Exception as e:
|
|
|
|
|
- print(f"HEAD: {e}")
|
|
|
|
|
-
|
|
|
|
|
-# 测试2: 带 STS 签名下载 zip
|
|
|
|
|
-print("\n=== 测试2: 带 STS 签名访问 train.zip ===")
|
|
|
|
|
-# OSS STS 签名 URL 格式: ?OSSAccessKeyId=xxx&Expires=xxx&Signature=xxx&security-token=xxx
|
|
|
|
|
-import time
|
|
|
|
|
-import hmac
|
|
|
|
|
-import hashlib
|
|
|
|
|
-import base64
|
|
|
|
|
-expires = str(int(time.time()) + 3600)
|
|
|
|
|
-string_to_sign = f"HEAD\n\n\n{expires}\n/{config['Bucket']}/{backup_dir}train.zip"
|
|
|
|
|
-h = hmac.new(access_secret.encode(), string_to_sign.encode(), hashlib.sha1)
|
|
|
|
|
-signature = urllib.parse.quote(base64.b64encode(h.digest()))
|
|
|
|
|
-url = (f"{host}/{backup_dir}train.zip"
|
|
|
|
|
- f"?OSSAccessKeyId={urllib.parse.quote(access_id)}"
|
|
|
|
|
- f"&Expires={expires}"
|
|
|
|
|
- f"&Signature={signature}"
|
|
|
|
|
- f"&security-token={urllib.parse.quote(security_token)}")
|
|
|
|
|
-print(f"URL: {url[:200]}...")
|
|
|
|
|
-try:
|
|
|
|
|
- req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
- print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
|
|
|
|
|
- print(f">>> 成功! <<<")
|
|
|
|
|
-except Exception as e:
|
|
|
|
|
- print(f"HEAD: {e}")
|
|
|
|
|
-
|
|
|
|
|
-# 测试3: GET 下载前 1MB
|
|
|
|
|
-print("\n=== 测试3: GET 下载 train.zip 前 1MB ===")
|
|
|
|
|
-try:
|
|
|
|
|
- req = urllib.request.Request(url, headers={"User-Agent": "Test", "Range": "bytes=0-1048575"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=30) as resp:
|
|
|
|
|
- data = resp.read(1048576)
|
|
|
|
|
- is_zip = data[:4] == b'PK\x03\x04'
|
|
|
|
|
- print(f"GET: {resp.status} | {len(data)} bytes | is_zip={is_zip}")
|
|
|
|
|
- if is_zip:
|
|
|
|
|
- print(f">>> 成功! 是 ZIP 文件! <<<")
|
|
|
|
|
-except Exception as e:
|
|
|
|
|
- print(f"GET: {e}")
|
|
|
|
|
-
|
|
|
|
|
-# 测试4: 访问 public-unzip-dataset(已解压的图片)
|
|
|
|
|
-print("\n=== 测试4: 直接访问 public-unzip-dataset(已解压图片) ===")
|
|
|
|
|
-url = f"{host}/{unzip_dir}"
|
|
|
|
|
-print(f"URL: {url}")
|
|
|
|
|
-try:
|
|
|
|
|
- req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
- content = resp.read().decode("utf-8", errors="replace")
|
|
|
|
|
- print(f"状态: {resp.status}")
|
|
|
|
|
- print(f"内容前 500 字符: {content[:500]}")
|
|
|
|
|
-except Exception as e:
|
|
|
|
|
- print(f"失败: {e}")
|
|
|
|
|
-
|
|
|
|
|
-# 测试5: 带签名访问 public-unzip-dataset
|
|
|
|
|
-print("\n=== 测试5: 带签名访问 public-unzip-dataset ===")
|
|
|
|
|
-string_to_sign = f"GET\n\n\n{expires}\n/{config['Bucket']}/{unzip_dir}"
|
|
|
|
|
-h = hmac.new(access_secret.encode(), string_to_sign.encode(), hashlib.sha1)
|
|
|
|
|
-signature = urllib.parse.quote(base64.b64encode(h.digest()))
|
|
|
|
|
-url = (f"{host}/{unzip_dir}"
|
|
|
|
|
- f"?OSSAccessKeyId={urllib.parse.quote(access_id)}"
|
|
|
|
|
- f"&Expires={expires}"
|
|
|
|
|
- f"&Signature={signature}"
|
|
|
|
|
- f"&security-token={urllib.parse.quote(security_token)}")
|
|
|
|
|
-try:
|
|
|
|
|
- req = urllib.request.Request(url, headers={"User-Agent": "Test"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
- content = resp.read().decode("utf-8", errors="replace")
|
|
|
|
|
- print(f"状态: {resp.status}")
|
|
|
|
|
- print(f"内容前 500 字符: {content[:500]}")
|
|
|
|
|
-except Exception as e:
|
|
|
|
|
- print(f"失败: {e}")
|
|
|
|
|
-
|
|
|
|
|
-print("\n=== 完成 ===")
|
|
|
|
|
|
|
+# Probe MsDataset.load() with the usual split names and keep the first one
+# that loads.
+print("=== 用 MsDataset.load() 下载 ===")
+from modelscope.msdatasets import MsDataset
+from PIL import Image
+
+ds = None
+for split in ("train", "validation", "test"):
+    try:
+        # `namespace` comes from splitting dataset_id on the first "/"; it is
+        # only empty when the id starts with "/", in which case the raw id is
+        # passed through unchanged.
+        if namespace:
+            ds = MsDataset.load(ds_name, namespace=namespace, split=split)
+        else:
+            ds = MsDataset.load(dataset_id, split=split)
+        # Compare against None instead of relying on truthiness: an object
+        # that defines __len__ is falsy when empty, which would make a
+        # successfully loaded but empty split look like a failure.
+        if ds is not None:
+            total = len(ds) if hasattr(ds, "__len__") else "?"
+            print(f"split='{split}' 成功, 共 {total} 条")
+            break
+    except Exception as e:
+        # Never carry a dataset from a split that was reported as failed
+        # into the final check below.
+        ds = None
+        print(f"split='{split}' 失败: {e}")
+
+if ds is None:
+    print("所有 split 都失败")
+    sys.exit(1)
|
|
|
|
|
+
|
|
|
|
|
+# Show the first 2 records, one line per field.
+print("\n=== 前 2 条数据 ===")
+for count, row in enumerate(ds):
+    if count >= 2:
+        break
+    print(f"\n--- Record {count} ---")
+    for k, v in row.items():
+        if isinstance(v, Image.Image):
+            # Alpha/palette modes are converted to RGB before the JPEG save.
+            img = v.convert("RGB") if v.mode in ("RGBA", "P", "LA") else v
+            # Simulate saving inside a throwaway directory: no file handle is
+            # left open while Pillow writes to the path, and the file is
+            # removed even if save() raises.
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = os.path.join(tmp_dir, "preview.jpg")
+                img.save(tmp_path, format="JPEG", quality=90)
+                size = os.path.getsize(tmp_path)
+            print(f" {k}: PIL.Image ({img.size[0]}x{img.size[1]}, mode={img.mode}) -> saved as {size} bytes")
+        elif isinstance(v, str) and len(v) > 100:
+            # Long strings are summarised by length only.
+            print(f" {k}: str (len={len(v)})")
+        else:
+            print(f" {k}: {type(v).__name__} = {v}")
+
+print("\n=== 测试通过! ===")
|