|
|
@@ -308,14 +308,31 @@ def _convert_to_jsonl(file_path: Path) -> Path:
|
|
|
return jsonl_path
|
|
|
|
|
|
try:
|
|
|
- # 尝试作为 JSON 数组解析
|
|
|
+ # 尝试作为 JSON 解析
|
|
|
data = _json.loads(content)
|
|
|
if isinstance(data, list):
|
|
|
+ # JSON 数组
|
|
|
+ items = data
|
|
|
+ elif isinstance(data, dict):
|
|
|
+ # JSON 对象:查找嵌套的数组字段
|
|
|
+ items = None
|
|
|
+ for key in ("data", "items", "results", "records", "annotations", "samples"):
|
|
|
+ if key in data and isinstance(data[key], list):
|
|
|
+ items = data[key]
|
|
|
+ break
|
|
|
+ if items is None:
|
|
|
+ # 单个对象,包装为数组
|
|
|
+ items = [data]
|
|
|
+ else:
|
|
|
+ items = None
|
|
|
+
|
|
|
+ if items is not None:
|
|
|
with open(jsonl_path, "w", encoding="utf-8") as out:
|
|
|
- for item in data:
|
|
|
+ for item in items:
|
|
|
out.write(_json.dumps(item, ensure_ascii=False) + "\n")
|
|
|
- # 删除原始 JSON 文件
|
|
|
- file_path.unlink()
|
|
|
+ # 只有当原始文件与新文件不同时才删除(避免删除刚写入的文件)
|
|
|
+ if jsonl_path != file_path and file_path.exists():
|
|
|
+ file_path.unlink()
|
|
|
return jsonl_path
|
|
|
except _json.JSONDecodeError:
|
|
|
pass
|
|
|
@@ -334,7 +351,8 @@ def _convert_to_jsonl(file_path: Path) -> Path:
|
|
|
|
|
|
with open(jsonl_path, "w", encoding="utf-8") as out:
|
|
|
out.write("\n".join(valid_lines) + ("\n" if valid_lines else ""))
|
|
|
- file_path.unlink()
|
|
|
+ if jsonl_path != file_path and file_path.exists():
|
|
|
+ file_path.unlink()
|
|
|
return jsonl_path
|
|
|
|
|
|
|