test_ms_api.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #!/usr/bin/env python3
  2. """测试 ModelScope 图片数据集下载流程。"""
  3. import json
  4. import urllib.request
  5. import urllib.parse
  6. import sys
  7. import os
  8. import tempfile
  9. import zipfile
  10. api_base = "https://www.modelscope.cn"
  11. dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
  12. namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
  13. print(f"测试数据集: {dataset_id}\n")
  14. # Step 1: 下载配置文件
  15. print("=== Step1: 下载配置文件 ===")
  16. config_url = (f"{api_base}/api/v1/datasets/{dataset_id}/repo"
  17. f"?Source=SDK&Revision=master&FilePath={ds_name}.json&View=false")
  18. print(f"URL: {config_url}")
  19. try:
  20. req = urllib.request.Request(config_url, headers={"User-Agent": "Test"})
  21. with urllib.request.urlopen(req, timeout=30) as resp:
  22. config = json.loads(resp.read().decode())
  23. print(f"配置内容: {json.dumps(config, ensure_ascii=False, indent=2)}")
  24. except Exception as e:
  25. print(f"失败: {e}")
  26. sys.exit(1)
  27. # Step 2: 收集压缩包文件名
  28. print("\n=== Step2: 收集压缩包文件名 ===")
  29. archive_files = set()
  30. for subset in config.values():
  31. if isinstance(subset, dict):
  32. for split_info in subset.values():
  33. if isinstance(split_info, dict):
  34. fname = split_info.get("file", "")
  35. if fname:
  36. archive_files.add(fname)
  37. print(f"找到压缩包: {archive_files}")
  38. # Step 3: 测试下载第一个压缩包(只下载前 1MB 验证)
  39. print("\n=== Step3: 测试下载压缩包 ===")
  40. for fname in archive_files:
  41. params = urllib.parse.urlencode({
  42. "Source": "SDK", "Revision": "master",
  43. "FilePath": fname, "View": "false",
  44. })
  45. dl_url = f"{api_base}/api/v1/datasets/{dataset_id}/repo?{params}"
  46. print(f"文件: {fname}")
  47. print(f"URL: {dl_url}")
  48. # 只读 Content-Length 验证可下载
  49. try:
  50. req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
  51. req.method = "HEAD"
  52. with urllib.request.urlopen(req, timeout=30) as resp:
  53. size = resp.headers.get("Content-Length", "unknown")
  54. content_type = resp.headers.get("Content-Type", "unknown")
  55. print(f" Content-Length: {size}")
  56. print(f" Content-Type: {content_type}")
  57. print(f" 状态: {resp.status}")
  58. except Exception as e:
  59. print(f" HEAD 请求失败: {e}")
  60. # 尝试 GET 前 1MB
  61. try:
  62. req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
  63. with urllib.request.urlopen(req, timeout=30) as resp:
  64. data = resp.read(1024 * 1024)
  65. print(f" GET 成功, 前 1MB 读取 {len(data)} bytes")
  66. # 检查是否是 zip
  67. if data[:4] == b'PK\x03\x04':
  68. print(f" 确认是 ZIP 格式!")
  69. else:
  70. print(f" 前 4 字节: {data[:4]}")
  71. except Exception as e2:
  72. print(f" GET 也失败: {e2}")
  73. break
  74. print("\n=== 完成 ===")