lxylxy123321 1 неделя назад
Родитель
Коммит
30438fc60a
2 изменённых файла: 49 добавлений и 6 удалений
  1. 29 1
      backend/app/services/dataset_service.py
  2. 20 5
      frontend/src/pages/Datasets.tsx

+ 29 - 1
backend/app/services/dataset_service.py

@@ -170,7 +170,13 @@ async def preview_dataset(dataset_id: str, rows: int = 10) -> dict[str, Any]:
 
     return {
         "total_records": record.record_count,
-        "preview_rows": [{"row_index": i, "data": row} for i, row in enumerate(preview_data)],
+        "preview_rows": [
+            {
+                "row_index": i,
+                "data": {k: _format_value(v) for k, v in row.items()},
+            }
+            for i, row in enumerate(preview_data)
+        ],
         "columns": columns,
     }
 
@@ -281,6 +287,28 @@ def _count_records(file_path: Path, fmt: str) -> int:
     return 0
 
 
+def _format_value(value) -> str:
+    """Format a value as a human-readable string for dataset previews.
+
+    Handles ShareGPT-style ``conversations`` arrays specially: a non-empty
+    list whose first element is a dict containing both ``"from"`` and
+    ``"value"`` keys is rendered as one ``[role] text`` entry per turn, with
+    entries separated by a ``---`` line and each turn's text cut to 200
+    characters followed by ``...``.
+
+    Any other list or dict is returned as indented JSON with non-ASCII
+    characters left unescaped; every remaining value goes through ``str()``.
+    """
+    if isinstance(value, list) and len(value) > 0 and isinstance(value[0], dict):
+        # Detect ShareGPT format: [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]
+        # NOTE(review): only the first element is inspected; later turns are
+        # read with .get() and are therefore assumed to be dicts as well.
+        first = value[0]
+        if "from" in first and "value" in first:
+            parts = []
+            for turn in value:
+                role = turn.get("from", "unknown")
+                text = str(turn.get("value", ""))
+                # Truncate overly long text.
+                if len(text) > 200:
+                    text = text[:200] + "..."
+                parts.append(f"[{role}] {text}")
+            return "\n---\n".join(parts)
+        # Other arrays of objects: display as JSON.
+        return json.dumps(value, ensure_ascii=False, indent=2)
+    if isinstance(value, (dict, list)):
+        return json.dumps(value, ensure_ascii=False, indent=2)
+    return str(value)
+
+
 def _read_records(file_path: Path, fmt: str, n: int) -> list[dict]:
     if fmt == "jsonl":
         records = []

+ 20 - 5
frontend/src/pages/Datasets.tsx

@@ -174,11 +174,26 @@ export function Datasets() {
             <tbody>
               {previewData.rows.slice(0, 10).map((row, i) => (
                 <tr key={i} style={{ borderBottom: '1px solid #eee' }}>
-                  {previewData.columns.map(col => (
-                    <td key={col} style={{ padding: '6px 8px', maxWidth: 200, overflow: 'hidden', textOverflow: 'ellipsis', whiteSpace: 'nowrap' }}>
-                      {String(row.data[col] ?? '')}
-                    </td>
-                  ))}
+                  {previewData.columns.map(col => {
+                    const cellVal = String(row.data[col] ?? '')
+                    const isMultiline = cellVal.includes('\n') || cellVal.length > 100
+                    return (
+                      <td
+                        key={col}
+                        style={{
+                          padding: '6px 8px',
+                          maxWidth: isMultiline ? 500 : 200,
+                          overflow: isMultiline ? 'auto' : 'hidden',
+                          textOverflow: isMultiline ? undefined : 'ellipsis',
+                          whiteSpace: isMultiline ? 'pre-wrap' : 'nowrap',
+                          fontFamily: isMultiline ? 'monospace' : undefined,
+                          fontSize: isMultiline ? 12 : 13,
+                        }}
+                      >
+                        {cellVal}
+                      </td>
+                    )
+                  })}
                 </tr>
               ))}
             </tbody>