|
@@ -97,7 +97,7 @@ async def download_dataset(req: DatasetDownloadRequest) -> DatasetDownloadRespon
|
|
|
|
|
|
|
|
logger.info(f"Dataset download task started: {req.dataset_id} (task_id={task_id})")
|
|
logger.info(f"Dataset download task started: {req.dataset_id} (task_id={task_id})")
|
|
|
return DatasetDownloadResponse(
|
|
return DatasetDownloadResponse(
|
|
|
- dataset_id=req.dataset_id, status="pending", path=task_id
|
|
|
|
|
|
|
+ dataset_id=req.dataset_id, status="pending", task_id=task_id, path=task_id
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
@@ -301,9 +301,12 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
|
|
|
try:
|
|
try:
|
|
|
records.append(json.loads(line))
|
|
records.append(json.loads(line))
|
|
|
except json.JSONDecodeError:
|
|
except json.JSONDecodeError:
|
|
|
- records = json.loads(content)
|
|
|
|
|
- if not isinstance(records, list):
|
|
|
|
|
- records = [records]
|
|
|
|
|
|
|
+ # 单行解析失败,尝试整体解析
|
|
|
|
|
+ try:
|
|
|
|
|
+ data = json.loads(content)
|
|
|
|
|
+ records = data if isinstance(data, list) else [data]
|
|
|
|
|
+ except json.JSONDecodeError:
|
|
|
|
|
+ records = []
|
|
|
break
|
|
break
|
|
|
elif target.suffix == ".json":
|
|
elif target.suffix == ".json":
|
|
|
# JSON 文件:先尝试 JSON 数组,失败再逐行解析(可能是 JSONL 格式)
|
|
# JSON 文件:先尝试 JSON 数组,失败再逐行解析(可能是 JSONL 格式)
|
|
@@ -321,6 +324,8 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
|
|
|
records.append(json.loads(line))
|
|
records.append(json.loads(line))
|
|
|
except json.JSONDecodeError:
|
|
except json.JSONDecodeError:
|
|
|
continue
|
|
continue
|
|
|
|
|
+ else:
|
|
|
|
|
+ records = []
|
|
|
|
|
|
|
|
with open(jsonl_path, "w", encoding="utf-8") as f:
|
|
with open(jsonl_path, "w", encoding="utf-8") as f:
|
|
|
for item in records:
|
|
for item in records:
|