瀏覽代碼

修复测试文件

lxylxy123321 6 小時之前
父節點
當前提交
9b53d25b4f
共有 2 個文件被更改,包括 82 次插入和 91 次刪除
  1. 66 51
      backend/scripts/test_ms_api.py
  2. 16 40
      result.txt

+ 66 - 51
backend/scripts/test_ms_api.py

@@ -1,66 +1,81 @@
 #!/usr/bin/env python3
-"""测试 MsDataset.load() 能否正确下载图片数据集。"""
-import sys
+"""测试 ModelScope 数据集 API,查找数据文件区的 CDN 链接。"""
 import json
+import urllib.request
+import urllib.parse
+import sys
 
 
+api_base = "https://www.modelscope.cn"
 dataset_id = sys.argv[1] if len(sys.argv) > 1 else "tany0699/carBrands50"
-namespace, ds_name = dataset_id.split("/", 1) if "/" in dataset_id else ("", dataset_id)
 
 
-print(f"测试数据集: {dataset_id}")
-print(f"namespace: {namespace}, name: {ds_name}\n")
+print(f"测试数据集: {dataset_id}\n")
 
 
-print("=== 用 MsDataset.load() 下载 ===")
+# Test 1: 完整查看数据集 info API 返回
+print("=== Test1: 数据集 info API 完整响应 ===")
 try:
-    from modelscope.msdatasets import MsDataset
+    url = f"{api_base}/api/v1/datasets/{dataset_id}"
+    print(f"请求: {url}")
+    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        info = json.loads(resp.read().decode())
+    print(json.dumps(info, indent=2, ensure_ascii=False)[:3000])
+except Exception as e:
+    print(f"失败: {e}")
 
 
-    ds = None
-    for split in ("train", "validation", "test"):
-        try:
-            if namespace:
-                ds = MsDataset.load(ds_name, namespace=namespace, split=split)
-            else:
-                ds = MsDataset.load(dataset_id, split=split)
-            if ds:
-                print(f"加载 split='{split}' 成功, 共 {len(ds) if hasattr(ds, '__len__') else '?'} 条")
-                break
-        except Exception as e:
-            print(f"split='{split}' 失败: {e}")
+# Test 2: 尝试 HubApi(跳过 msdatasets 的 import 问题)
+print("\n=== Test2: HubApi 直接调用 ===")
+try:
+    from modelscope.hub.api import HubApi
+    api = HubApi()
 
 
-    if not ds:
-        try:
-            if namespace:
-                ds = MsDataset.load(ds_name, namespace=namespace)
-            else:
-                ds = MsDataset.load(dataset_id)
-            print(f"不带 split 加载成功, 类型: {type(ds)}")
-        except Exception as e:
-            print(f"不带 split 也失败: {e}")
-            sys.exit(1)
+    # 获取数据集文件列表
+    print("尝试 get_dataset_files...")
+    try:
+        namespace, ds_name = dataset_id.split("/", 1)
+        files = api.get_dataset_files(ds_name, namespace=namespace, recursive=True)
+        print(f"get_dataset_files 返回 {len(files)} 个文件:")
+        for f in files:
+            print(f"  {f}")
+    except Exception as e:
+        print(f"get_dataset_files 失败: {e}")
 
 
-    if not hasattr(ds, "__iter__"):
-        print(f"数据集不可迭代, 类型: {type(ds)}")
-        sys.exit(1)
+    # 尝试获取文件下载 URL
+    print("\n尝试 get_dataset_file_url...")
+    try:
+        namespace, ds_name = dataset_id.split("/", 1)
+        url = api.get_dataset_file_url("train.csv", ds_name, namespace, revision="master")
+        print(f"train.csv 下载 URL: {url}")
+    except Exception as e:
+        print(f"get_dataset_file_url 失败: {e}")
 
 
-    # 查看前 3 条数据
-    print("\n=== 前 3 条数据 ===")
-    count = 0
-    for row in ds:
-        if count >= 3:
-            break
-        print(f"\n--- Record {count} ---")
-        for k, v in row.items():
-            vtype = type(v).__name__
-            if vtype == "Image":
-                print(f"  {k}: PIL.Image (size={v.size}, mode={v.mode})")
-            elif isinstance(v, str) and len(v) > 100:
-                print(f"  {k}: str (len={len(v)}) '{v[:100]}...'")
-            else:
-                print(f"  {k}: {vtype} = {v}")
-        count += 1
+except ImportError as e:
+    print(f"import 失败: {e}")
+except Exception as e:
+    print(f"失败: {e}")
 
 
-    print(f"\n=== 完成 ===")
+# Test 3: 查看 carBrands50.json 配置文件(可能包含数据文件 URL)
+print("\n=== Test3: carBrands50.json 配置文件 ===")
+try:
+    namespace, ds_name = dataset_id.split("/", 1)
+    url = (f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo"
+           f"?Revision=master&FilePath=carBrands50.json&View=false")
+    print(f"请求: {url}")
+    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        config = resp.read().decode()
+    print(config[:2000])
+except Exception as e:
+    print(f"失败: {e}")
 
 
+# Test 4: 查看 dataset_infos.json
+print("\n=== Test4: dataset_infos.json ===")
+try:
+    url = (f"{api_base}/api/v1/datasets/{namespace}/{ds_name}/repo"
+           f"?Revision=master&FilePath=dataset_infos.json&View=false")
+    print(f"请求: {url}")
+    req = urllib.request.Request(url, headers={"User-Agent": "Test"})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        config = resp.read().decode()
+    print(config[:2000])
 except Exception as e:
     print(f"失败: {e}")
-    import traceback
-    traceback.print_exc()

+ 16 - 40
result.txt

@@ -1,41 +1,17 @@
-import time
-import hmac
-import hashlib
-import requests
-import uuid
+lq@lq:~/Fine-tuning$ cp backend/scripts/test_ms_api.py backend/data/ && sudo docker exec -it finetune-backend python3 /root/Fine-tuning/backend/data/test_ms_api.py tany0699/carBrands50
+测试数据集: tany0699/carBrands50
+namespace: tany0699, name: carBrands50
 
 
-# 1. 配置(直接填你的)
-app_id = "hmDeOtXZVbeo2AZ-x58yPssZLg4Tcb1W"
-app_secret = "pj9UirhGUFPsFnCizCz-Qo1dOGi3kxRIrDKKmJZu2aRCPgtTogTubDRW1weM4KNL"
-url = "http://192.168.92.61:8003/api/v1/open/auth/token"  # 注意带 /open/
-
-# 2. 生成参数
-timestamp = str(int(time.time()))  # 秒级时间戳
-nonce = uuid.uuid4().hex  # 随机字符串
-message = app_id + timestamp + nonce
-
-# 3. 计算 HMAC-SHA256 签名
-signature = hmac.new(
-    key=app_secret.encode("utf-8"),
-    msg=message.encode("utf-8"),
-    digestmod=hashlib.sha256
-).hexdigest()
-
-# 4. 构造请求头
-headers = {
-    "Content-Type": "application/json",
-    "X-Api-Key": app_id,
-    "X-Timestamp": timestamp,
-    "X-Nonce": nonce,
-    "X-Signature": signature
-}
-
-# 5. 发送请求(body 为空 {})
-body = {}
-resp = requests.post(url, json=body, headers=headers)
-
-# 6. 打印结果
-print("=== 请求信息 ===")
-print("headers:", headers)
-print("status_code:", resp.status_code)
-print("response:", resp.text)
+=== 用 MsDataset.load() 下载 ===
+失败: cannot import name 'get_metadata_patterns' from 'datasets.data_files' (/usr/local/lib/python3.10/site-packages/datasets/data_files.py)
+Traceback (most recent call last):
+  File "/root/Fine-tuning/backend/data/test_ms_api.py", line 14, in <module>
+    from modelscope.msdatasets import MsDataset
+  File "/usr/local/lib/python3.10/site-packages/modelscope/msdatasets/__init__.py", line 2, in <module>
+    from modelscope.msdatasets.ms_dataset import MsDataset
+  File "/usr/local/lib/python3.10/site-packages/modelscope/msdatasets/ms_dataset.py", line 24, in <module>
+    from modelscope.msdatasets.utils.hf_datasets_util import load_dataset_with_ctx
+  File "/usr/local/lib/python3.10/site-packages/modelscope/msdatasets/utils/hf_datasets_util.py", line 19, in <module>
+    from datasets.data_files import (
+ImportError: cannot import name 'get_metadata_patterns' from 'datasets.data_files' (/usr/local/lib/python3.10/site-packages/datasets/data_files.py)
+lq@lq:~/Fine-tuning$