|
|
@@ -111,14 +111,29 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
|
|
|
ds_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
|
|
|
-
|
|
|
- # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
|
|
|
- # 而 local_path 指向的目录只有元数据文件,需要额外扫描 downloads 目录
|
|
|
- all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
|
|
|
- downloads_dir = settings.processed_dir / "downloads"
|
|
|
- if downloads_dir.exists():
|
|
|
- for p in downloads_dir.rglob("*"):
|
|
|
- if p.is_file() and str(p.parent) != str(ds_dir):
|
|
|
+ logger.info(f"ModelScope snapshot_download returned local_path: {local_path}")
|
|
|
+
|
|
|
+ # 收集所有文件:先扫描 local_path 本身
|
|
|
+ all_files = []
|
|
|
+ if Path(local_path).exists():
|
|
|
+ all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
|
|
|
+ logger.info(f"Found {len(all_files)} files in local_path")
|
|
|
+
|
|
|
+ # ModelScope 可能把实际文件存到 cache_dir/downloads/<hash> 或 cache_dir/hub 里
|
|
|
+ # 额外扫描几个可能的目录
|
|
|
+ for subdir_name in ("downloads", "hub", f"models/{dataset_id}"):
|
|
|
+ extra_dir = settings.processed_dir / subdir_name
|
|
|
+ if extra_dir.exists():
|
|
|
+ for p in extra_dir.rglob("*"):
|
|
|
+ if p.is_file() and p not in all_files:
|
|
|
+ all_files.append(p)
|
|
|
+ logger.info(f"Found additional files in {subdir_name}: {[p.name for p in all_files if str(p.parent).startswith(str(extra_dir))]}")
|
|
|
+
|
|
|
+ # 如果 local_path 里什么也没找到,尝试扫描整个 processed_dir
|
|
|
+ if not all_files:
|
|
|
+ logger.warning(f"No files found in local_path, scanning entire processed_dir")
|
|
|
+ for p in settings.processed_dir.rglob("*"):
|
|
|
+ if p.is_file() and p not in all_files:
|
|
|
all_files.append(p)
|
|
|
|
|
|
# 识别训练数据文件
|