|
@@ -1,83 +1,104 @@
|
|
|
#!/usr/bin/env python3
|
|
#!/usr/bin/env python3
|
|
|
-"""测试 ModelScope 数据文件区 zip 的不同下载 URL 格式。"""
|
|
|
|
|
|
|
+"""测试不同方式下载 ModelScope 数据文件区图片。"""
|
|
|
import json
|
|
import json
|
|
|
import urllib.request
|
|
import urllib.request
|
|
|
import urllib.parse
|
|
import urllib.parse
|
|
|
import sys
|
|
import sys
|
|
|
|
|
+import subprocess
|
|
|
|
|
+import os
|
|
|
|
|
|
|
|
# Base URL of the ModelScope site/API.
api_base = "https://www.modelscope.cn"
# Dataset to probe: first CLI argument, or a known public dataset by default.
dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
# Split "<namespace>/<name>" at the first slash. partition() never raises, so
# a malformed id is reported explicitly instead of dying with an opaque
# "not enough values to unpack" ValueError.
namespace, _sep, ds_name = dataset_id.partition("/")
if not namespace or not ds_name:
    sys.exit(f"Invalid dataset id {dataset_id!r}: expected '<namespace>/<name>'")

print(f"数据集: {dataset_id}\n")
|
|
|
|
|
|
|
|
-# 先获取 hub_id
|
|
|
|
|
-hub_id = None
|
|
|
|
|
|
|
# Report which versions of the relevant packages are installed.
print("=== 当前版本 ===")
for pkg in ["modelscope", "datasets"]:
    try:
        # Run pip through the interpreter executing this script so the report
        # describes this environment, not whichever `pip` is first on PATH.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "show", pkg],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            # `pip show` exits non-zero with empty stdout for a missing
            # package; surface that instead of printing nothing at all.
            print(f" {pkg}: {result.stderr.strip() or 'not installed'}")
            continue
        for line in result.stdout.splitlines():
            if line.startswith(("Version:", "Name:")):
                print(f" {line}")
    except Exception as e:
        # Best-effort diagnostics: a missing or hung pip must not abort the
        # rest of the script.
        print(f" {pkg}: {e}")
|
|
|
|
|
+
|
|
|
|
|
# Approach 1: call the hub API's dataset_snapshot_download directly, which
# sidesteps the msdatasets import problem.
print("\n=== 方式1: HubApi snapshot_download ===")
try:
    from modelscope.hub.snapshot_download import dataset_snapshot_download

    print("dataset_snapshot_download 可用!")
    from modelscope.utils.constant import DownloadMode

    cache_dir = "/tmp/ms_test_cache"
    os.makedirs(cache_dir, exist_ok=True)
    # NOTE(review): download_mode may not be a keyword accepted by
    # dataset_snapshot_download; if so this attempt fails and is reported
    # below - confirm against the installed modelscope version.
    result = dataset_snapshot_download(
        dataset_id=dataset_id,
        cache_dir=cache_dir,
        download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    )
    print(f"成功! 缓存目录: {result}")

    # Print every downloaded file with its path relative to the snapshot
    # directory and its size.
    for dirpath, _subdirs, filenames in os.walk(result):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            n_bytes = os.path.getsize(full_path)
            rel_path = os.path.relpath(full_path, result)
            print(f" {rel_path} ({n_bytes} bytes)")
except Exception as e:
    # Any failure (missing package, bad argument, network) is only reported,
    # so the remaining approaches still run.
    print(f"失败: {e}")
|
|
|
|
|
|
|
|
-# 测试不同 URL 格式
|
|
|
|
|
-urls_to_test = [
|
|
|
|
|
- # 格式1: repo + Source=SDK (已确认 404)
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{dataset_id}/repo?Source=SDK&Revision=master&FilePath={zip_file}&View=false",
|
|
|
|
|
- # 格式2: repo 不带 Source
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{dataset_id}/repo?Revision=master&FilePath={zip_file}",
|
|
|
|
|
- # 格式3: resolve 格式 (类似 HuggingFace)
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{dataset_id}/resolve/master/{zip_file}",
|
|
|
|
|
- # 格式4: 用数字 hub_id
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{hub_id}/repo?Revision=master&FilePath={zip_file}",
|
|
|
|
|
- # 格式5: repo/files 格式
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{dataset_id}/repo/files?Revision=master&FilePath={zip_file}",
|
|
|
|
|
- # 格式6: download 格式
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{dataset_id}/download?Revision=master&FilePath={zip_file}",
|
|
|
|
|
- # 格式7: 直接 URL(类似网页点击下载)
|
|
|
|
|
- f"{api_base}/datasets/{dataset_id}/resolve/master/{zip_file}",
|
|
|
|
|
- # 格式8: data-files 专用端点
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{dataset_id}/data-files?Revision=master&FilePath={zip_file}",
|
|
|
|
|
- # 格式9: dataset_id 用数字 + resolve
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{hub_id}/resolve/master/{zip_file}",
|
|
|
|
|
- # 格式10: 不带 View 参数
|
|
|
|
|
- f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo?Source=SDK&Revision=master&FilePath={zip_file}",
|
|
|
|
|
-]
|
|
|
|
|
|
|
# Approach 2: the same dataset_snapshot_download call, but without passing a
# DownloadMode.
print("\n=== 方式2: dataset_snapshot_download (简化调用) ===")
try:
    from modelscope.hub.snapshot_download import dataset_snapshot_download

    cache_dir = "/tmp/ms_test_cache2"
    os.makedirs(cache_dir, exist_ok=True)
    result = dataset_snapshot_download(dataset_id=dataset_id, cache_dir=cache_dir)
    print(f"成功! 缓存目录: {result}")

    # Print every downloaded file with its path relative to the snapshot
    # directory and its size.
    for dirpath, _subdirs, filenames in os.walk(result):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            n_bytes = os.path.getsize(full_path)
            rel_path = os.path.relpath(full_path, result)
            print(f" {rel_path} ({n_bytes} bytes)")
except Exception as e:
    # Report the failure and carry on with the next approach.
    print(f"失败: {e}")
|
|
|
|
|
|
|
|
-for i, url in enumerate(urls_to_test, 1):
|
|
|
|
|
- print(f"--- 格式{i} ---")
|
|
|
|
|
- print(f" URL: {url}")
|
|
|
|
|
- # HEAD 请求测试
|
|
|
|
|
- try:
|
|
|
|
|
- req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
|
|
|
|
|
- with urllib.request.urlopen(req, timeout=15) as resp:
|
|
|
|
|
- size = resp.headers.get("Content-Length", "?")
|
|
|
|
|
- ctype = resp.headers.get("Content-Type", "?")
|
|
|
|
|
- print(f" HEAD: {resp.status} | size={size} | type={ctype}")
|
|
|
|
|
- if resp.status == 200:
|
|
|
|
|
- print(f" >>> 成功! <<<")
|
|
|
|
|
- continue
|
|
|
|
|
- except urllib.error.HTTPError as e:
|
|
|
|
|
- print(f" HEAD: {e.code}")
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- print(f" HEAD: {e}")
|
|
|
|
|
|
|
# Approach 3: look on HubApi for a way to obtain download URLs (something like
# get_dataset_file_url_with_token) and probe its file-listing calls.
print("\n=== 方式3: HubApi 获取下载 URL ===")
try:
    from modelscope.hub.api import HubApi

    api = HubApi()
    # Collect every attribute whose name mentions datasets or downloads.
    keywords = ("dataset", "download")
    methods = [m for m in dir(api) if any(k in m.lower() for k in keywords)]
    print(f"可用方法: {methods}")

    # Call whichever of the listing methods this HubApi object provides.
    for method_name in ['list_repo_tree', 'get_dataset_meta_file_list']:
        if not hasattr(api, method_name):
            continue
        print(f"\n尝试 {method_name}...")
        try:
            method = getattr(api, method_name)
            result = method(ds_name, namespace=namespace, revision="master")
            print(f" 结果: {result}")
        except Exception as e:
            # A failing probe must not prevent the next method being tried.
            print(f" 失败: {e}")

except Exception as e:
    print(f"失败: {e}")
|
|
|
|
|
+
|
|
|
|
|
# Approach 4: ask pip which modelscope versions are available.
print("\n=== 方式4: pip 检查 ===")
try:
    # Run pip through the interpreter executing this script so the answer
    # applies to this environment, not whichever `pip` is first on PATH.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "index", "versions", "modelscope"],
        capture_output=True,
        text=True,
        timeout=15,
    )
    if result.returncode == 0:
        print(result.stdout[:500])
    else:
        # On failure stdout may be empty; show pip's error text so the
        # failure is not silent.
        print(f"失败: {result.stderr.strip()[:500]}")
except Exception as e:
    print(f"失败: {e}")

print("\n=== 完成 ===")
|