|
@@ -152,8 +152,22 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
|
|
|
jsonl_path = ds_dir / "data.jsonl"
|
|
jsonl_path = ds_dir / "data.jsonl"
|
|
|
record_count = 0
|
|
record_count = 0
|
|
|
content = target.read_text(encoding="utf-8")
|
|
content = target.read_text(encoding="utf-8")
|
|
|
- if target.suffix == ".jsonl":
|
|
|
|
|
- records = [json.loads(line.strip()) for line in content.splitlines() if line.strip()]
|
|
|
|
|
|
|
+
|
|
|
|
|
+ if target.suffix == ".jsonl" or not target.suffix:
|
|
|
|
|
+ # JSONL 或无后缀文件:尝试逐行解析
|
|
|
|
|
+ records = []
|
|
|
|
|
+ for line in content.splitlines():
|
|
|
|
|
+ line = line.strip()
|
|
|
|
|
+ if not line:
|
|
|
|
|
+ continue
|
|
|
|
|
+ try:
|
|
|
|
|
+ records.append(json.loads(line))
|
|
|
|
|
+ except json.JSONDecodeError:
|
|
|
|
|
+ # 如果逐行解析失败,尝试整体解析(可能是 JSON 数组)
|
|
|
|
|
+ records = json.loads(content)
|
|
|
|
|
+ if not isinstance(records, list):
|
|
|
|
|
+ records = [records]
|
|
|
|
|
+ break
|
|
|
else:
|
|
else:
|
|
|
records = json.loads(content)
|
|
records = json.loads(content)
|
|
|
if not isinstance(records, list):
|
|
if not isinstance(records, list):
|