|
|
@@ -112,12 +112,19 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
|
|
|
|
|
|
local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
|
|
|
|
|
|
- # 扫描所有文件,识别训练数据文件
|
|
|
+ # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
|
|
|
+ # 而 local_path 指向的目录只有元数据文件,需要额外扫描 downloads 目录
|
|
|
all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
|
|
|
+ downloads_dir = settings.processed_dir / "downloads"
|
|
|
+ if downloads_dir.exists():
|
|
|
+ for p in downloads_dir.rglob("*"):
|
|
|
+ if p.is_file() and str(p.parent) != str(ds_dir):
|
|
|
+ all_files.append(p)
|
|
|
+
|
|
|
+ # 识别训练数据文件
|
|
|
data_files = [f for f in all_files if _is_training_data_file(f)]
|
|
|
|
|
|
if not data_files:
|
|
|
- # 回退:列出所有 JSON/JSONL 文件方便调试
|
|
|
fallback = [f for f in all_files if f.suffix in (".json", ".jsonl")]
|
|
|
logger.warning(f"No training data files found in {dataset_id}. "
|
|
|
f"Available JSON files: {[f.name for f in fallback]}")
|