|
@@ -1,7 +1,7 @@
|
|
|
import asyncio
|
|
import asyncio
|
|
|
import json
|
|
import json
|
|
|
import uuid
|
|
import uuid
|
|
|
-from datetime import datetime
|
|
|
|
|
|
|
+from datetime import datetime, timezone
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
from typing import Any
|
|
from typing import Any
|
|
|
|
|
|
|
@@ -22,9 +22,6 @@ META_FILENAMES = frozenset({
|
|
|
"special_tokens_map.json", "tokenizer_config.json",
|
|
"special_tokens_map.json", "tokenizer_config.json",
|
|
|
"added_tokens.json", "vocab.json", "merges.txt",
|
|
"added_tokens.json", "vocab.json", "merges.txt",
|
|
|
"config.json", "preprocessor_config.json",
|
|
"config.json", "preprocessor_config.json",
|
|
|
- # HF/ModelScope dataset metadata
|
|
|
|
|
- "dataset_info.json", "dataset_infos.json", "dataset.json",
|
|
|
|
|
- "state.json", "dataset_dict.json",
|
|
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
# File size threshold: files smaller than this (bytes) are likely metadata
|
|
# File size threshold: files smaller than this (bytes) are likely metadata
|
|
@@ -69,25 +66,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
|
|
|
"""从 HuggingFace 或 ModelScope 下载数据集。"""
|
|
"""从 HuggingFace 或 ModelScope 下载数据集。"""
|
|
|
try:
|
|
try:
|
|
|
if req.use_modelscope:
|
|
if req.use_modelscope:
|
|
|
- # ModelScope 数据集是 HF 镜像,直接用 datasets 库加载
|
|
|
|
|
- from datasets import load_dataset
|
|
|
|
|
-
|
|
|
|
|
- ds_dir = settings.processed_dir / f"ms_{req.dataset_id.replace('/', '_')}"
|
|
|
|
|
- ds_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
- ds = load_dataset(req.dataset_id)
|
|
|
|
|
- if "train" in ds:
|
|
|
|
|
- split = ds["train"]
|
|
|
|
|
- else:
|
|
|
|
|
- split = ds[list(ds.keys())[0]]
|
|
|
|
|
- output_path = ds_dir / "data.jsonl"
|
|
|
|
|
- record_count = 0
|
|
|
|
|
- with open(output_path, "w", encoding="utf-8") as f:
|
|
|
|
|
- for item in split:
|
|
|
|
|
- f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
|
|
- record_count += 1
|
|
|
|
|
- if record_count == 0:
|
|
|
|
|
- raise RuntimeError("Dataset loaded but returned 0 records")
|
|
|
|
|
- jsonl_path = output_path
|
|
|
|
|
|
|
+ ds_dir, jsonl_path, record_count = await asyncio.to_thread(_download_modelscope_dataset, req.dataset_id)
|
|
|
else:
|
|
else:
|
|
|
from datasets import load_dataset
|
|
from datasets import load_dataset
|
|
|
|
|
|
|
@@ -104,8 +83,6 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
|
|
|
f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
jsonl_path = output_path
|
|
jsonl_path = output_path
|
|
|
record_count = len(split) if hasattr(split, "__len__") else 0
|
|
record_count = len(split) if hasattr(split, "__len__") else 0
|
|
|
- if record_count == 0:
|
|
|
|
|
- raise RuntimeError("HF dataset loaded but returned 0 records")
|
|
|
|
|
|
|
|
|
|
record = DatasetRecord(
|
|
record = DatasetRecord(
|
|
|
id=str(uuid.uuid4()),
|
|
id=str(uuid.uuid4()),
|
|
@@ -113,7 +90,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
|
|
|
format="jsonl",
|
|
format="jsonl",
|
|
|
record_count=record_count,
|
|
record_count=record_count,
|
|
|
file_path=str(jsonl_path),
|
|
file_path=str(jsonl_path),
|
|
|
- created_at=datetime.utcnow(),
|
|
|
|
|
|
|
+ created_at=datetime.now(timezone.utc),
|
|
|
)
|
|
)
|
|
|
async with async_session() as session:
|
|
async with async_session() as session:
|
|
|
session.add(record)
|
|
session.add(record)
|
|
@@ -126,55 +103,82 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
|
|
|
return DatasetDownloadResponse(dataset_id=req.dataset_id, status="failed", error=str(e))
|
|
return DatasetDownloadResponse(dataset_id=req.dataset_id, status="failed", error=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
-def _scan_and_convert_to_jsonl(ds_dir: Path) -> tuple[Path, int]:
|
|
|
|
|
- """扫描 CLI 下载的数据集目录,找训练数据文件并转为 JSONL。"""
|
|
|
|
|
- # 找所有可能的数据文件
|
|
|
|
|
- data_files = []
|
|
|
|
|
- for ext in ("*.jsonl", "*.json", "*.csv"):
|
|
|
|
|
- data_files.extend(ds_dir.rglob(ext))
|
|
|
|
|
- # 过滤掉元数据文件
|
|
|
|
|
- data_files = [f for f in data_files if f.name not in META_FILENAMES]
|
|
|
|
|
|
|
+def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
|
|
|
|
|
+ """用 snapshot_download 下载数据集文件,完全绕过 datasets 库,避免版本兼容问题。"""
|
|
|
|
|
+ from modelscope import snapshot_download
|
|
|
|
|
|
|
|
- if not data_files:
|
|
|
|
|
- raise RuntimeError(f"No dataset files found in {ds_dir}")
|
|
|
|
|
|
|
+ ds_dir = settings.processed_dir / f"ms_{dataset_id.replace('/', '_')}"
|
|
|
|
|
+ ds_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
+
|
|
|
|
|
+ local_path = snapshot_download(dataset_id, cache_dir=str(settings.processed_dir))
|
|
|
|
|
|
|
|
|
|
+ # ModelScope 的 snapshot_download 把实际数据存到 cache_dir/downloads/<hash> 里
|
|
|
|
|
+ # 而 local_path 指向的目录只有元数据文件,需要额外扫描 downloads 目录
|
|
|
|
|
+ all_files = [p for p in Path(local_path).rglob("*") if p.is_file()]
|
|
|
|
|
+ downloads_dir = settings.processed_dir / "downloads"
|
|
|
|
|
+ if downloads_dir.exists():
|
|
|
|
|
+ for p in downloads_dir.rglob("*"):
|
|
|
|
|
+ if p.is_file() and str(p.parent) != str(ds_dir):
|
|
|
|
|
+ all_files.append(p)
|
|
|
|
|
+
|
|
|
|
|
+ # 识别训练数据文件
|
|
|
|
|
+ data_files = [f for f in all_files if _is_training_data_file(f)]
|
|
|
|
|
+
|
|
|
|
|
+ if not data_files:
|
|
|
|
|
+ fallback = [f for f in all_files if f.suffix in (".json", ".jsonl")]
|
|
|
|
|
+ logger.warning(f"No training data files found in {dataset_id}. "
|
|
|
|
|
+ f"Available JSON files: {[f.name for f in fallback]}")
|
|
|
|
|
+ if fallback:
|
|
|
|
|
+ data_files = fallback
|
|
|
|
|
+ else:
|
|
|
|
|
+ raise ValueError(f"No JSON/JSONL data files found in dataset {dataset_id}")
|
|
|
|
|
+
|
|
|
|
|
+ # 优先取 train / data 开头的文件
|
|
|
|
|
+ target = None
|
|
|
|
|
+ for name in ("train.jsonl", "train.json", "data.jsonl", "data.json"):
|
|
|
|
|
+ for f in data_files:
|
|
|
|
|
+ if f.name == name:
|
|
|
|
|
+ target = f
|
|
|
|
|
+ break
|
|
|
|
|
+ if target:
|
|
|
|
|
+ break
|
|
|
|
|
+ if not target:
|
|
|
|
|
+ # 优先取数据量最大的文件
|
|
|
|
|
+ target = sorted(data_files, key=lambda f: f.stat().st_size, reverse=True)[0]
|
|
|
|
|
+
|
|
|
|
|
+ logger.info(f"Selected data file: {target} (size={target.stat().st_size})")
|
|
|
|
|
+
|
|
|
|
|
+ # 读取并统一转为 JSONL
|
|
|
jsonl_path = ds_dir / "data.jsonl"
|
|
jsonl_path = ds_dir / "data.jsonl"
|
|
|
record_count = 0
|
|
record_count = 0
|
|
|
|
|
+ content = target.read_text(encoding="utf-8")
|
|
|
|
|
+
|
|
|
|
|
+ if target.suffix == ".jsonl" or not target.suffix:
|
|
|
|
|
+ # JSONL 或无后缀文件:尝试逐行解析
|
|
|
|
|
+ records = []
|
|
|
|
|
+ for line in content.splitlines():
|
|
|
|
|
+ line = line.strip()
|
|
|
|
|
+ if not line:
|
|
|
|
|
+ continue
|
|
|
|
|
+ try:
|
|
|
|
|
+ records.append(json.loads(line))
|
|
|
|
|
+ except json.JSONDecodeError:
|
|
|
|
|
+ # 如果逐行解析失败,尝试整体解析(可能是 JSON 数组)
|
|
|
|
|
+ records = json.loads(content)
|
|
|
|
|
+ if not isinstance(records, list):
|
|
|
|
|
+ records = [records]
|
|
|
|
|
+ break
|
|
|
|
|
+ else:
|
|
|
|
|
+ records = json.loads(content)
|
|
|
|
|
+ if not isinstance(records, list):
|
|
|
|
|
+ records = [records]
|
|
|
|
|
+
|
|
|
|
|
+ with open(jsonl_path, "w", encoding="utf-8") as f:
|
|
|
|
|
+ for item in records:
|
|
|
|
|
+ f.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
|
|
+ record_count += 1
|
|
|
|
|
|
|
|
- with open(jsonl_path, "w", encoding="utf-8") as out:
|
|
|
|
|
- for data_file in data_files:
|
|
|
|
|
- if data_file.suffix == ".jsonl":
|
|
|
|
|
- with open(data_file, "r", encoding="utf-8") as f:
|
|
|
|
|
- for line in f:
|
|
|
|
|
- line = line.strip()
|
|
|
|
|
- if line:
|
|
|
|
|
- out.write(line + "\n")
|
|
|
|
|
- record_count += 1
|
|
|
|
|
- elif data_file.suffix == ".json":
|
|
|
|
|
- try:
|
|
|
|
|
- with open(data_file, "r", encoding="utf-8") as f:
|
|
|
|
|
- data = json.load(f)
|
|
|
|
|
- if isinstance(data, list):
|
|
|
|
|
- for item in data:
|
|
|
|
|
- out.write(json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
|
|
- record_count += 1
|
|
|
|
|
- elif isinstance(data, dict):
|
|
|
|
|
- # 跳过 HF/ModelScope dataset metadata(features/splits 结构)
|
|
|
|
|
- if "features" in data or "splits" in data or "dataset_name" in data:
|
|
|
|
|
- continue
|
|
|
|
|
- out.write(json.dumps(data, ensure_ascii=False) + "\n")
|
|
|
|
|
- record_count += 1
|
|
|
|
|
- except Exception:
|
|
|
|
|
- pass
|
|
|
|
|
- elif data_file.suffix == ".csv":
|
|
|
|
|
- import csv
|
|
|
|
|
- with open(data_file, "r", encoding="utf-8") as f:
|
|
|
|
|
- reader = csv.DictReader(f)
|
|
|
|
|
- for row in reader:
|
|
|
|
|
- out.write(json.dumps(dict(row), ensure_ascii=False) + "\n")
|
|
|
|
|
- record_count += 1
|
|
|
|
|
-
|
|
|
|
|
- return jsonl_path, record_count
|
|
|
|
|
|
|
+ return ds_dir, jsonl_path, record_count
|
|
|
|
|
|
|
|
|
|
|
|
|
async def upload_dataset(file: UploadFile) -> dict[str, Any]:
|
|
async def upload_dataset(file: UploadFile) -> dict[str, Any]:
|
|
@@ -200,7 +204,7 @@ async def upload_dataset(file: UploadFile) -> dict[str, Any]:
|
|
|
format=fmt,
|
|
format=fmt,
|
|
|
record_count=record_count,
|
|
record_count=record_count,
|
|
|
file_path=str(file_path),
|
|
file_path=str(file_path),
|
|
|
- created_at=datetime.utcnow(),
|
|
|
|
|
|
|
+ created_at=datetime.now(timezone.utc),
|
|
|
)
|
|
)
|
|
|
async with async_session() as session:
|
|
async with async_session() as session:
|
|
|
session.add(record)
|
|
session.add(record)
|