Просмотр исходного кода

修复数据集无法正常显示

lxylxy123321 1 неделю назад
Родитель
Коммит
732a30502f
1 изменённый файл: 9 добавлений и 2 удаления
  1. 9 2
      backend/app/services/dataset_service.py

+ 9 - 2
backend/app/services/dataset_service.py

@@ -112,12 +112,19 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
 
     local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
 
-    # 扫描所有文件,识别训练数据文件
+    # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
+    # 而 local_path 指向的目录只有元数据文件,需要额外扫描 downloads 目录
     all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
+    downloads_dir = settings.processed_dir / "downloads"
+    if downloads_dir.exists():
+        for p in downloads_dir.rglob("*"):
+            if p.is_file() and str(p.parent) != str(ds_dir):
+                all_files.append(p)
+
+    # 识别训练数据文件
     data_files = [f for f in all_files if _is_training_data_file(f)]
 
     if not data_files:
-        # 回退:列出所有 JSON/JSONL 文件方便调试
         fallback = [f for f in all_files if f.suffix in (".json", ".jsonl")]
         logger.warning(f"No training data files found in {dataset_id}. "
                        f"Available JSON files: {[f.name for f in fallback]}")