# test_ms_api.py 3.5 KB
  1. #!/usr/bin/env python3
  2. """测试 ModelScope 数据文件区 zip 的不同下载 URL 格式。"""
  3. import json
  4. import urllib.request
  5. import urllib.parse
  6. import sys
  7. api_base = "https://www.modelscope.cn"
  8. dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
  9. namespace, ds_name = dataset_id.split("/", 1)
  10. zip_file = "train.zip"
  11. print(f"数据集: {dataset_id}, 目标文件: {zip_file}\n")
  12. # 先获取 hub_id
  13. hub_id = None
  14. try:
  15. url = f"{api_base}/api/v1/datasets/{dataset_id}"
  16. req = urllib.request.Request(url, headers={"User-Agent": "Test"})
  17. with urllib.request.urlopen(req, timeout=30) as resp:
  18. info = json.loads(resp.read().decode())
  19. hub_id = info.get("Data", {}).get("Id") or info.get("Data", {}).get("id")
  20. print(f"hub_id = {hub_id}\n")
  21. except Exception as e:
  22. print(f"获取 hub_id 失败: {e}\n")
  23. # 测试不同 URL 格式
  24. urls_to_test = [
  25. # 格式1: repo + Source=SDK (已确认 404)
  26. f"{api_base}/api/v1/datasets/{dataset_id}/repo?Source=SDK&Revision=master&FilePath={zip_file}&View=false",
  27. # 格式2: repo 不带 Source
  28. f"{api_base}/api/v1/datasets/{dataset_id}/repo?Revision=master&FilePath={zip_file}",
  29. # 格式3: resolve 格式 (类似 HuggingFace)
  30. f"{api_base}/api/v1/datasets/{dataset_id}/resolve/master/{zip_file}",
  31. # 格式4: 用数字 hub_id
  32. f"{api_base}/api/v1/datasets/{hub_id}/repo?Revision=master&FilePath={zip_file}",
  33. # 格式5: repo/files 格式
  34. f"{api_base}/api/v1/datasets/{dataset_id}/repo/files?Revision=master&FilePath={zip_file}",
  35. # 格式6: download 格式
  36. f"{api_base}/api/v1/datasets/{dataset_id}/download?Revision=master&FilePath={zip_file}",
  37. # 格式7: 直接 URL(类似网页点击下载)
  38. f"{api_base}/datasets/{dataset_id}/resolve/master/{zip_file}",
  39. # 格式8: data-files 专用端点
  40. f"{api_base}/api/v1/datasets/{dataset_id}/data-files?Revision=master&FilePath={zip_file}",
  41. # 格式9: dataset_id 用数字 + resolve
  42. f"{api_base}/api/v1/datasets/{hub_id}/resolve/master/{zip_file}",
  43. # 格式10: 不带 View 参数
  44. f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo?Source=SDK&Revision=master&FilePath={zip_file}",
  45. ]
  46. for i, url in enumerate(urls_to_test, 1):
  47. print(f"--- 格式{i} ---")
  48. print(f" URL: {url}")
  49. # HEAD 请求测试
  50. try:
  51. req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
  52. with urllib.request.urlopen(req, timeout=15) as resp:
  53. size = resp.headers.get("Content-Length", "?")
  54. ctype = resp.headers.get("Content-Type", "?")
  55. print(f" HEAD: {resp.status} | size={size} | type={ctype}")
  56. if resp.status == 200:
  57. print(f" >>> 成功! <<<")
  58. continue
  59. except urllib.error.HTTPError as e:
  60. print(f" HEAD: {e.code}")
  61. except Exception as e:
  62. print(f" HEAD: {e}")
  63. # GET 请求测试(只读前 1024 字节)
  64. try:
  65. req = urllib.request.Request(url, headers={"User-Agent": "Test"})
  66. with urllib.request.urlopen(req, timeout=15) as resp:
  67. data = resp.read(1024)
  68. ctype = resp.headers.get("Content-Type", "?")
  69. is_zip = data[:4] == b'PK\x03\x04'
  70. print(f" GET: {resp.status} | {len(data)} bytes | type={ctype} | is_zip={is_zip}")
  71. if is_zip:
  72. print(f" >>> 成功! 是 ZIP 文件! <<<")
  73. except urllib.error.HTTPError as e:
  74. print(f" GET: {e.code}")
  75. except Exception as e:
  76. print(f" GET: {e}")
  77. print()