# test_ms_api.py (2.5 KB)

  1. #!/usr/bin/env python3
  2. """测试 ModelScope API 能否正确获取数据集文件列表并下载压缩包。"""
  3. import json
  4. import urllib.request
  5. import urllib.parse
  6. import sys
  7. api_base = "https://www.modelscope.cn"
  8. dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
  9. print(f"测试数据集: {dataset_id}\n")
  10. # Step 1: 获取数字 hub ID
  11. print("=== Step1: 获取 hub ID ===")
  12. try:
  13. info_url = f"{api_base}/api/v1/datasets/{dataset_id}"
  14. print(f"请求: {info_url}")
  15. req = urllib.request.Request(info_url, headers={"User-Agent": "Test"})
  16. with urllib.request.urlopen(req, timeout=30) as resp:
  17. info = json.loads(resp.read().decode())
  18. hub_id = info.get("Data", {}).get("Id") or info.get("Data", {}).get("id")
  19. print(f"hub_id = {hub_id}\n")
  20. except Exception as e:
  21. print(f"失败: {e}\n")
  22. hub_id = None
  23. # Step 2: 列出文件
  24. print("=== Step2: 列出文件 ===")
  25. files = []
  26. for test_id in filter(None, [hub_id, dataset_id]):
  27. try:
  28. tree_url = (f"{api_base}/api/v1/datasets/{test_id}/repo/tree"
  29. f"?Revision=master&Root=/&Recursive=True&PageNumber=1&PageSize=10000")
  30. print(f"请求: {tree_url}")
  31. req = urllib.request.Request(tree_url, headers={"User-Agent": "Test"})
  32. with urllib.request.urlopen(req, timeout=30) as resp:
  33. result = json.loads(resp.read().decode())
  34. files = result.get("Data", {}).get("Files", [])
  35. print(f"成功! 共 {len(files)} 个文件:")
  36. for f in files:
  37. name = f.get("Name", f.get("name", ""))
  38. size = f.get("Size", f.get("size", ""))
  39. print(f" {name} (size={size})")
  40. if files:
  41. break
  42. except Exception as e:
  43. print(f"失败: {e}")
  44. print()
  45. # Step 3: 筛选压缩包
  46. print("=== Step3: 压缩包文件 ===")
  47. archive_exts = (".zip", ".tar.gz", ".tgz", ".tar.bz2", ".tbz2", ".tar")
  48. namespace, ds_name = dataset_id.split("/", 1)
  49. found = False
  50. for f in files:
  51. name = f.get("Name", f.get("name", ""))
  52. if any(name.lower().endswith(ext) for ext in archive_exts):
  53. path = f.get("Path", f.get("path", name))
  54. params = urllib.parse.urlencode({
  55. "Source": "SDK", "Revision": "master",
  56. "FilePath": path, "View": "false",
  57. })
  58. dl_url = f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo?{params}"
  59. print(f" {name}")
  60. print(f" 路径: {path}")
  61. print(f" 下载URL: {dl_url}")
  62. found = True
  63. if not found:
  64. print(" 未找到压缩包文件")