Bladeren bron

修复测试文件

lxylxy123321 8 uur geleden
bovenliggende
commit
f7ee1ab393
2 gewijzigde bestanden met toevoegingen van 72 en 66 verwijderingen
  1. +7 -3
      backend/app/services/dataset_service.py
  2. +65 -63
      backend/scripts/test_ms_api.py

+ 7 - 3
backend/app/services/dataset_service.py

@@ -420,9 +420,13 @@ def _download_modelscope_dataset_cli(dataset_id: str, ds_dir: Path) -> tuple[Pat
         logger.error(f"ModelScope CLI download failed (code={proc.returncode}): {proc.stderr[:500]}")
         logger.error(f"ModelScope CLI download failed (code={proc.returncode}): {proc.stderr[:500]}")
         raise RuntimeError(f"ModelScope download failed: {proc.stderr[:500]}")
         raise RuntimeError(f"ModelScope download failed: {proc.stderr[:500]}")
 
 
-    # CLI 下载完 git 仓库文件后,通过 API 下载数据文件区的压缩包并解压
-    _download_modelscope_data_files(dataset_id, ds_dir)
-    _extract_archives(ds_dir)
+    # CLI 下载完 git 仓库文件后,尝试下载数据文件区的压缩包并解压(图片数据集用)
+    # 即使失败也不影响原来的下载流程
+    try:
+        _download_modelscope_data_files(dataset_id, ds_dir)
+        _extract_archives(ds_dir)
+    except Exception as e:
+        logger.warning(f"数据文件下载/解压失败(不影响主流程): {e}")
 
 
     # 扫描下载目录中的所有文件
     # 扫描下载目录中的所有文件
     all_files = [p for p in ds_dir.rglob("*") if p.is_file()]
     all_files = [p for p in ds_dir.rglob("*") if p.is_file()]

+ 65 - 63
backend/scripts/test_ms_api.py

@@ -1,81 +1,83 @@
 #!/usr/bin/env python3
 #!/usr/bin/env python3
-"""测试 ModelScope 图片数据集下载流程。"""
+"""测试 ModelScope 数据文件区 zip 的不同下载 URL 格式。"""
 import json
 import json
 import urllib.request
 import urllib.request
 import urllib.parse
 import urllib.parse
 import sys
 import sys
-import os
-import tempfile
-import zipfile
 
 
 api_base = "https://www.modelscope.cn"
 api_base = "https://www.modelscope.cn"
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
-namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
+namespace, ds_name = dataset_id.split("/", 1)
+zip_file = "train.zip"
 
 
-print(f"测试数据集: {dataset_id}\n")
+print(f"数据集: {dataset_id}, 目标文件: {zip_file}\n")
 
 
-# Step 1: 下载配置文件
-print("=== Step1: 下载配置文件 ===")
-config_url = (f"{api_base}/api/v1/datasets/{dataset_id}/repo"
-              f"?Source=SDK&Revision=master&FilePath={ds_name}.json&View=false")
-print(f"URL: {config_url}")
+# 先获取 hub_id
+hub_id = None
 try:
 try:
-    req = urllib.request.Request(config_url, headers={"User-Agent": "Test"})
+    url = f"{api_base}/api/v1/datasets/{dataset_id}"
+    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
     with urllib.request.urlopen(req, timeout=30) as resp:
     with urllib.request.urlopen(req, timeout=30) as resp:
-        config = json.loads(resp.read().decode())
-    print(f"配置内容: {json.dumps(config, ensure_ascii=False, indent=2)}")
+        info = json.loads(resp.read().decode())
+    hub_id = info.get("Data", {}).get("Id") or info.get("Data", {}).get("id")
+    print(f"hub_id = {hub_id}\n")
 except Exception as e:
 except Exception as e:
-    print(f"失败: {e}")
-    sys.exit(1)
+    print(f"获取 hub_id 失败: {e}\n")
 
 
-# Step 2: 收集压缩包文件名
-print("\n=== Step2: 收集压缩包文件名 ===")
-archive_files = set()
-for subset in config.values():
-    if isinstance(subset, dict):
-        for split_info in subset.values():
-            if isinstance(split_info, dict):
-                fname = split_info.get("file", "")
-                if fname:
-                    archive_files.add(fname)
-print(f"找到压缩包: {archive_files}")
+# 测试不同 URL 格式
+urls_to_test = [
+    # 格式1: repo + Source=SDK (已确认 404)
+    f"{api_base}/api/v1/datasets/{dataset_id}/repo?Source=SDK&Revision=master&FilePath={zip_file}&View=false",
+    # 格式2: repo 不带 Source
+    f"{api_base}/api/v1/datasets/{dataset_id}/repo?Revision=master&FilePath={zip_file}",
+    # 格式3: resolve 格式 (类似 HuggingFace)
+    f"{api_base}/api/v1/datasets/{dataset_id}/resolve/master/{zip_file}",
+    # 格式4: 用数字 hub_id
+    f"{api_base}/api/v1/datasets/{hub_id}/repo?Revision=master&FilePath={zip_file}",
+    # 格式5: repo/files 格式
+    f"{api_base}/api/v1/datasets/{dataset_id}/repo/files?Revision=master&FilePath={zip_file}",
+    # 格式6: download 格式
+    f"{api_base}/api/v1/datasets/{dataset_id}/download?Revision=master&FilePath={zip_file}",
+    # 格式7: 直接 URL(类似网页点击下载)
+    f"{api_base}/datasets/{dataset_id}/resolve/master/{zip_file}",
+    # 格式8: data-files 专用端点
+    f"{api_base}/api/v1/datasets/{dataset_id}/data-files?Revision=master&FilePath={zip_file}",
+    # 格式9: dataset_id 用数字 + resolve
+    f"{api_base}/api/v1/datasets/{hub_id}/resolve/master/{zip_file}",
+    # 格式10: 不带 View 参数
+    f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo?Source=SDK&Revision=master&FilePath={zip_file}",
+]
 
 
-# Step 3: 测试下载第一个压缩包(只下载前 1MB 验证)
-print("\n=== Step3: 测试下载压缩包 ===")
-for fname in archive_files:
-    params = urllib.parse.urlencode({
-        "Source": "SDK", "Revision": "master",
-        "FilePath": fname, "View": "false",
-    })
-    dl_url = f"{api_base}/api/v1/datasets/{dataset_id}/repo?{params}"
-    print(f"文件: {fname}")
-    print(f"URL: {dl_url}")
-
-    # 只读 Content-Length 验证可下载
+for i, url in enumerate(urls_to_test, 1):
+    print(f"--- 格式{i} ---")
+    print(f"  URL: {url}")
+    # HEAD 请求测试
     try:
     try:
-        req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
-        req.method = "HEAD"
-        with urllib.request.urlopen(req, timeout=30) as resp:
-            size = resp.headers.get("Content-Length", "unknown")
-            content_type = resp.headers.get("Content-Type", "unknown")
-            print(f"  Content-Length: {size}")
-            print(f"  Content-Type: {content_type}")
-            print(f"  状态: {resp.status}")
+        req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": "Test"})
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            size = resp.headers.get("Content-Length", "?")
+            ctype = resp.headers.get("Content-Type", "?")
+            print(f"  HEAD: {resp.status} | size={size} | type={ctype}")
+            if resp.status == 200:
+                print(f"  >>> 成功! <<<")
+                continue
+    except urllib.error.HTTPError as e:
+        print(f"  HEAD: {e.code}")
     except Exception as e:
     except Exception as e:
-        print(f"  HEAD 请求失败: {e}")
-        # 尝试 GET 前 1MB
-        try:
-            req = urllib.request.Request(dl_url, headers={"User-Agent": "Test"})
-            with urllib.request.urlopen(req, timeout=30) as resp:
-                data = resp.read(1024 * 1024)
-                print(f"  GET 成功, 前 1MB 读取 {len(data)} bytes")
-                # 检查是否是 zip
-                if data[:4] == b'PK\x03\x04':
-                    print(f"  确认是 ZIP 格式!")
-                else:
-                    print(f"  前 4 字节: {data[:4]}")
-        except Exception as e2:
-            print(f"  GET 也失败: {e2}")
-    break
+        print(f"  HEAD: {e}")
 
 
-print("\n=== 完成 ===")
+    # GET 请求测试(只读前 1024 字节)
+    try:
+        req = urllib.request.Request(url, headers={"User-Agent": "Test"})
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            data = resp.read(1024)
+            ctype = resp.headers.get("Content-Type", "?")
+            is_zip = data[:4] == b'PK\x03\x04'
+            print(f"  GET: {resp.status} | {len(data)} bytes | type={ctype} | is_zip={is_zip}")
+            if is_zip:
+                print(f"  >>> 成功! 是 ZIP 文件! <<<")
+    except urllib.error.HTTPError as e:
+        print(f"  GET: {e.code}")
+    except Exception as e:
+        print(f"  GET: {e}")
+    print()