|
|
@@ -170,7 +170,13 @@ async def preview_dataset(dataset_id: str, rows: int = 10) -> dict[str, Any]:
|
|
|
|
|
|
return {
|
|
|
"total_records": record.record_count,
|
|
|
- "preview_rows": [{"row_index": i, "data": row} for i, row in enumerate(preview_data)],
|
|
|
+ "preview_rows": [
|
|
|
+ {
|
|
|
+ "row_index": i,
|
|
|
+ "data": {k: _format_value(v) for k, v in row.items()},
|
|
|
+ }
|
|
|
+ for i, row in enumerate(preview_data)
|
|
|
+ ],
|
|
|
"columns": columns,
|
|
|
}
|
|
|
|
|
|
@@ -281,6 +287,28 @@ def _count_records(file_path: Path, fmt: str) -> int:
|
|
|
return 0
|
|
|
|
|
|
|
|
|
def _format_value(value: object, *, max_turn_chars: int = 200) -> str:
    """Format a preview cell value as a human-readable string.

    ShareGPT-style conversations get special treatment. A value counts as a
    conversation when it is a non-empty list made up entirely of dicts whose
    first element has both a ``"from"`` and a ``"value"`` key, e.g.
    ``[{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]``.
    Each turn is rendered as ``[role] text`` and the turns are joined with a
    ``---`` line between them.

    Any other list or dict is pretty-printed as JSON (non-ASCII characters are
    kept as-is). Everything else is passed through ``str()``.

    Args:
        value: The raw cell value to format.
        max_turn_chars: Maximum number of characters kept from each
            conversation turn; longer text is cut and suffixed with ``"..."``.

    Returns:
        The formatted string.
    """
    # Every element must be a dict, not just the first one: the loop below
    # calls ``.get`` on each turn, so a mixed list would otherwise raise
    # AttributeError. Such lists fall through to the generic JSON rendering.
    is_conversation = (
        isinstance(value, list)
        and bool(value)
        and all(isinstance(turn, dict) for turn in value)
        and "from" in value[0]
        and "value" in value[0]
    )
    if is_conversation:
        parts = []
        for turn in value:
            role = turn.get("from", "unknown")
            text = str(turn.get("value", ""))
            # Truncate overly long turns so the preview stays compact.
            if len(text) > max_turn_chars:
                text = text[:max_turn_chars] + "..."
            parts.append(f"[{role}] {text}")
        return "\n---\n".join(parts)

    if isinstance(value, (dict, list)):
        # default=str is deliberate: this function only produces display text,
        # so nested values JSON cannot encode are stringified rather than
        # making the whole preview fail with TypeError.
        return json.dumps(value, ensure_ascii=False, indent=2, default=str)

    return str(value)
|
|
|
+
|
|
|
+
|
|
|
def _read_records(file_path: Path, fmt: str, n: int) -> list[dict]:
|
|
|
if fmt == "jsonl":
|
|
|
records = []
|