lxylxy123321 пре 7 часа
родитељ
комит
6852f7a763
2 измењена фајла са 29 додато и 21 уклоњено
  1. 1 7
      backend/app/services/dataset_service.py
  2. 28 14
      result.txt

+ 1 - 7
backend/app/services/dataset_service.py

@@ -534,13 +534,7 @@ def _download_modelscope_dataset_cli(dataset_id: str, ds_dir: Path) -> tuple[Pat
         logger.error(f"ModelScope CLI download failed (code={proc.returncode}): {proc.stderr[:500]}")
         raise RuntimeError(f"ModelScope download failed: {proc.stderr[:500]}")
 
-    # CLI 下载完 git 仓库文件后,尝试下载数据文件区的压缩包并解压(图片数据集用)
-    # 即使失败也不影响原来的下载流程
-    try:
-        _download_modelscope_data_files(dataset_id, ds_dir)
-        _extract_archives(ds_dir)
-    except Exception as e:
-        logger.warning(f"数据文件下载/解压失败(不影响主流程): {e}")
+    # 注意:CLI 只能下载 git 仓库中的文件(元数据);数据文件区的图片需要通过 MsDataset.load 获取
 
     # 扫描下载目录中的所有文件
     all_files = [p for p in ds_dir.rglob("*") if p.is_file()]

+ 28 - 14
result.txt

@@ -2,21 +2,35 @@ lq@lq:~/Fine-tuning$ cp backend/scripts/test_ms_api.py backend/data/ && sudo doc
 数据集: tany0699/carBrands50
 
 === 用 MsDataset.load() 下载 ===
-2026-05-28 07:23:48,514 - modelscope - INFO - No subset_name specified, defaulting to the default
-2026-05-28 07:23:50,442 - modelscope - WARNING - Reusing dataset dataset_builder (/root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files)
-2026-05-28 07:23:50,443 - modelscope - INFO - Generating dataset dataset_builder (/root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files)
-2026-05-28 07:23:50,443 - modelscope - INFO - Loading meta-data file ...
-4398it [00:00, 39702.70it/s]
-100% split='train' 成功, 共 4397 条
+2026-05-28 07:26:26,648 - modelscope - INFO - No subset_name specified, defaulting to the default
+2026-05-28 07:26:27,880 - modelscope - WARNING - Reusing dataset dataset_builder (/root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files)
+2026-05-28 07:26:27,880 - modelscope - INFO - Generating dataset dataset_builder (/root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files)
+2026-05-28 07:26:27,880 - modelscope - INFO - Reusing cached meta-data file: /root/.cache/modelscope/hub/datasets/tany0699/carBrands50/master/data_files/a6ade4dacefa0beffa564bf1f50f7ffd
+split='train' 成功, 共 4397 条
 
-=== 前 2 条数据 ===
+=== 处理前 5 条数据(复制图片) ===
+Record 0: image:FILE -> copied Bentley_025.jpg (6811 bytes) as 000000.jpg
+Record 1: image:FILE -> copied Bentley_004.jpg (12710 bytes) as 000001.jpg
+Record 2: image:FILE -> copied Bentley_038.jpg (6993 bytes) as 000002.jpg
+Record 3: image:FILE -> copied Bentley_011.jpg (8629 bytes) as 000003.jpg
+Record 4: image:FILE -> copied Bentley_006.jpg (7090 bytes) as 000004.jpg
 
---- Record 0 ---
-  image:FILE: str (len=180)
-  category: int = 4
+=== 写入 JSONL ===
+写入 5 条记录到 /tmp/ms_test_download/data.jsonl
 
---- Record 1 ---
-  image:FILE: str (len=180)
-  category: int = 4
+=== JSONL 内容 ===
+0: {"image:FILE": "images/000000.jpg", "category": 4}
+1: {"image:FILE": "images/000001.jpg", "category": 4}
+2: {"image:FILE": "images/000002.jpg", "category": 4}
+3: {"image:FILE": "images/000003.jpg", "category": 4}
+4: {"image:FILE": "images/000004.jpg", "category": 4}
 
-=== 测试通过! ===
+=== images 目录 ===
+  000000.jpg (6811 bytes)
+  000001.jpg (12710 bytes)
+  000002.jpg (6993 bytes)
+  000003.jpg (8629 bytes)
+  000004.jpg (7090 bytes)
+
+=== 测试完成! ===
+数据集目录: /tmp/ms_test_download