|
@@ -1,103 +1,98 @@
|
|
|
#!/usr/bin/env python3
|
|
#!/usr/bin/env python3
|
|
|
-"""测试不同方式下载 ModelScope 数据文件区图片。"""
|
|
|
|
|
|
|
+"""测试 HubApi 的 get_dataset_access_config 获取数据文件 CDN 链接。"""
|
|
|
import json
|
|
import json
|
|
|
-import urllib.request
|
|
|
|
|
-import urllib.parse
|
|
|
|
|
import sys
|
|
import sys
|
|
|
-import subprocess
|
|
|
|
|
-import os
|
|
|
|
|
|
|
+import inspect
|
|
|
|
|
|
|
|
-api_base = "https://www.modelscope.cn"
|
|
|
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
|
|
|
namespace, ds_name = dataset_id.split("/", 1)
|
|
namespace, ds_name = dataset_id.split("/", 1)
|
|
|
|
|
|
|
|
print(f"数据集: {dataset_id}\n")
|
|
print(f"数据集: {dataset_id}\n")
|
|
|
|
|
|
|
|
-# 先查看当前版本
|
|
|
|
|
-print("=== 当前版本 ===")
|
|
|
|
|
-for pkg in ["modelscope", "datasets"]:
|
|
|
|
|
- try:
|
|
|
|
|
- result = subprocess.run(
|
|
|
|
|
- ["pip", "show", pkg], capture_output=True, text=True, timeout=10
|
|
|
|
|
- )
|
|
|
|
|
- for line in result.stdout.splitlines():
|
|
|
|
|
- if line.startswith("Version:") or line.startswith("Name:"):
|
|
|
|
|
- print(f" {line}")
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- print(f" {pkg}: {e}")
|
|
|
|
|
|
|
+from modelscope.hub.api import HubApi
|
|
|
|
|
+api = HubApi()
|
|
|
|
|
|
|
|
-# 方式1: 直接用 hub.api(跳过 msdatasets 的 import 问题)
|
|
|
|
|
-print("\n=== 方式1: HubApi snapshot_download ===")
|
|
|
|
|
|
|
+# 获取 dataset_id (数字)
|
|
|
|
|
+print("=== 获取 dataset_id ===")
|
|
|
try:
|
|
try:
|
|
|
- from modelscope.hub.snapshot_download import dataset_snapshot_download
|
|
|
|
|
- print("dataset_snapshot_download 可用!")
|
|
|
|
|
- from modelscope.utils.constant import DownloadMode
|
|
|
|
|
- cache_dir = "/tmp/ms_test_cache"
|
|
|
|
|
- os.makedirs(cache_dir, exist_ok=True)
|
|
|
|
|
- result = dataset_snapshot_download(
|
|
|
|
|
- dataset_id=dataset_id,
|
|
|
|
|
- cache_dir=cache_dir,
|
|
|
|
|
- download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
|
|
|
|
|
- )
|
|
|
|
|
- print(f"成功! 缓存目录: {result}")
|
|
|
|
|
- # 列出文件
|
|
|
|
|
- for root, dirs, files in os.walk(result):
|
|
|
|
|
- for f in files:
|
|
|
|
|
- fp = os.path.join(root, f)
|
|
|
|
|
- size = os.path.getsize(fp)
|
|
|
|
|
- print(f" {os.path.relpath(fp, result)} ({size} bytes)")
|
|
|
|
|
|
|
+ ds_id, ds_type = api.get_dataset_id_and_type(namespace=namespace, dataset_name=ds_name)
|
|
|
|
|
+ print(f"dataset_id={ds_id}, type={ds_type}")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
print(f"失败: {e}")
|
|
|
|
|
+ ds_id = None
|
|
|
|
|
|
|
|
-# 方式2: 尝试 dataset_snapshot_download 不带 DownloadMode
|
|
|
|
|
-print("\n=== 方式2: dataset_snapshot_download (简化调用) ===")
|
|
|
|
|
|
|
+# 测试 get_dataset_access_config
|
|
|
|
|
+print("\n=== get_dataset_access_config ===")
|
|
|
try:
|
|
try:
|
|
|
- from modelscope.hub.snapshot_download import dataset_snapshot_download
|
|
|
|
|
- cache_dir = "/tmp/ms_test_cache2"
|
|
|
|
|
- os.makedirs(cache_dir, exist_ok=True)
|
|
|
|
|
- result = dataset_snapshot_download(
|
|
|
|
|
- dataset_id=dataset_id,
|
|
|
|
|
- cache_dir=cache_dir,
|
|
|
|
|
|
|
+ sig = inspect.signature(api.get_dataset_access_config)
|
|
|
|
|
+ print(f"签名: {sig}")
|
|
|
|
|
+ result = api.get_dataset_access_config(
|
|
|
|
|
+ dataset_name=ds_name,
|
|
|
|
|
+ namespace=namespace,
|
|
|
|
|
+ revision="master",
|
|
|
)
|
|
)
|
|
|
- print(f"成功! 缓存目录: {result}")
|
|
|
|
|
- for root, dirs, files in os.walk(result):
|
|
|
|
|
- for f in files:
|
|
|
|
|
- fp = os.path.join(root, f)
|
|
|
|
|
- size = os.path.getsize(fp)
|
|
|
|
|
- print(f" {os.path.relpath(fp, result)} ({size} bytes)")
|
|
|
|
|
|
|
+ print(f"结果: {json.dumps(result, indent=2, ensure_ascii=False, default=str)[:3000]}")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
print(f"失败: {e}")
|
|
|
|
|
|
|
|
-# 方式3: 尝试直接用 HubApi 的 get_dataset_file_url_with_token 或类似方法
|
|
|
|
|
-print("\n=== 方式3: HubApi 获取下载 URL ===")
|
|
|
|
|
|
|
+# 测试 get_dataset_access_config_for_unzipped
|
|
|
|
|
+print("\n=== get_dataset_access_config_for_unzipped ===")
|
|
|
try:
|
|
try:
|
|
|
- from modelscope.hub.api import HubApi
|
|
|
|
|
- api = HubApi()
|
|
|
|
|
- # 列出所有可用方法
|
|
|
|
|
- methods = [m for m in dir(api) if 'dataset' in m.lower() or 'download' in m.lower()]
|
|
|
|
|
- print(f"可用方法: {methods}")
|
|
|
|
|
|
|
+ sig = inspect.signature(api.get_dataset_access_config_for_unzipped)
|
|
|
|
|
+ print(f"签名: {sig}")
|
|
|
|
|
+ result = api.get_dataset_access_config_for_unzipped(
|
|
|
|
|
+ dataset_name=ds_name,
|
|
|
|
|
+ namespace=namespace,
|
|
|
|
|
+ revision="master",
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"结果: {json.dumps(result, indent=2, ensure_ascii=False, default=str)[:3000]}")
|
|
|
|
|
+except Exception as e:
|
|
|
|
|
+ print(f"失败: {e}")
|
|
|
|
|
|
|
|
- # 尝试 list_repo_tree
|
|
|
|
|
- for method_name in ['list_repo_tree', 'get_dataset_meta_file_list']:
|
|
|
|
|
- if hasattr(api, method_name):
|
|
|
|
|
- print(f"\n尝试 {method_name}...")
|
|
|
|
|
- try:
|
|
|
|
|
- method = getattr(api, method_name)
|
|
|
|
|
- result = method(ds_name, namespace=namespace, revision="master")
|
|
|
|
|
- print(f" 结果: {result}")
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- print(f" 失败: {e}")
|
|
|
|
|
|
|
+# 测试 get_dataset_infos
|
|
|
|
|
+print("\n=== get_dataset_infos ===")
|
|
|
|
|
+try:
|
|
|
|
|
+ sig = inspect.signature(api.get_dataset_infos)
|
|
|
|
|
+ print(f"签名: {sig}")
|
|
|
|
|
+ result = api.get_dataset_infos(
|
|
|
|
|
+ dataset_name=ds_name,
|
|
|
|
|
+ namespace=namespace,
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"结果: {json.dumps(result, indent=2, ensure_ascii=False, default=str)[:3000]}")
|
|
|
|
|
+except Exception as e:
|
|
|
|
|
+ print(f"失败: {e}")
|
|
|
|
|
|
|
|
|
|
+# 测试 get_dataset_file_url
|
|
|
|
|
+print("\n=== get_dataset_file_url (train.csv) ===")
|
|
|
|
|
+try:
|
|
|
|
|
+ url = api.get_dataset_file_url(
|
|
|
|
|
+ file_name="train.csv",
|
|
|
|
|
+ dataset_name=ds_name,
|
|
|
|
|
+ namespace=namespace,
|
|
|
|
|
+ revision="master",
|
|
|
|
|
+ )
|
|
|
|
|
+ print(f"URL: {url}")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
print(f"失败: {e}")
|
|
|
|
|
|
|
|
-# 方式4: pip 查看 modelscope 可用版本
|
|
|
|
|
-print("\n=== 方式4: pip 检查 ===")
|
|
|
|
|
|
|
+# 测试 get_dataset_file_url (train.zip - 数据文件区)
|
|
|
|
|
+print("\n=== get_dataset_file_url (train.zip) ===")
|
|
|
try:
|
|
try:
|
|
|
- result = subprocess.run(
|
|
|
|
|
- ["pip", "index", "versions", "modelscope"], capture_output=True, text=True, timeout=15
|
|
|
|
|
|
|
+ url = api.get_dataset_file_url(
|
|
|
|
|
+ file_name="train.zip",
|
|
|
|
|
+ dataset_name=ds_name,
|
|
|
|
|
+ namespace=namespace,
|
|
|
|
|
+ revision="master",
|
|
|
)
|
|
)
|
|
|
- print(result.stdout[:500])
|
|
|
|
|
|
|
+ print(f"URL: {url}")
|
|
|
|
|
+ # 尝试下载验证
|
|
|
|
|
+ import urllib.request
|
|
|
|
|
+ req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
|
|
+ try:
|
|
|
|
|
+ with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
+ print(f"HEAD: {resp.status} | size={resp.headers.get('Content-Length', '?')}")
|
|
|
|
|
+ except Exception as e2:
|
|
|
|
|
+ print(f"HEAD: {e2}")
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f"失败: {e}")
|
|
print(f"失败: {e}")
|
|
|
|
|
|