lxylxy123321 пре 1 недеља
родитељ
комит
c9814eafff
1 измењених фајлова са 23 додато и 8 уклоњено
  1. 23 8
      backend/app/services/dataset_service.py

+ 23 - 8
backend/app/services/dataset_service.py

@@ -111,14 +111,29 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
     ds_dir.mkdir(parents=True, exist_ok=True)
 
     local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
-
-    # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
-    # 而 local_path 指向的目录只有元数据文件,需要额外扫描 downloads 目录
-    all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
-    downloads_dir = settings.processed_dir / "downloads"
-    if downloads_dir.exists():
-        for p in downloads_dir.rglob("*"):
-            if p.is_file() and str(p.parent) != str(ds_dir):
+    logger.info(f"ModelScope snapshot_download returned local_path: {local_path}")
+
+    # 收集所有文件:先扫描 local_path 本身
+    all_files = []
+    if Path(local_path).exists():
+        all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
+        logger.info(f"Found {len(all_files)} files in local_path")
+
+    # ModelScope 可能把实际文件存到 cache_dir/downloads/<hash> 或 cache_dir/hub 里
+    # 额外扫描几个可能的目录
+    for subdir_name in ("downloads", "hub", f"models/{dataset_id}"):
+        extra_dir = settings.processed_dir / subdir_name
+        if extra_dir.exists():
+            for p in extra_dir.rglob("*"):
+                if p.is_file() and p not in all_files:
+                    all_files.append(p)
+            logger.info(f"Found additional files in {subdir_name}: {[p.name for p in all_files if str(p.parent).startswith(str(extra_dir))]}")
+
+    # 如果 local_path 和上面几个额外目录里都没找到任何文件,尝试扫描整个 processed_dir
+    if not all_files:
+        logger.warning(f"No files found in local_path, scanning entire processed_dir")
+        for p in settings.processed_dir.rglob("*"):
+            if p.is_file() and p not in all_files:
                 all_files.append(p)
 
     # 识别训练数据文件