Просмотр исходного кода

Remove raw prompt/completion columns after tokenization to prevent tensor conversion error

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
lxylxy123321 1 неделя назад
Родитель
Коммит
e24d669952
Изменён 1 файл: 6 добавлений и 4 удаления
  1. 6 4
      backend/app/engines/text_engine.py

+ 6 - 4
backend/app/engines/text_engine.py

@@ -272,9 +272,6 @@ class TextEngine(BaseEngine):
         hf_dataset = HFDataset.from_list(data)
 
         def tokenize_fn(batch):
-            # batched=True: each value in batch is a list of samples.
-            # Some individual values may themselves be lists/dicts (e.g. from
-            # Alpaca template producing list values) — coerce each to string.
             def _to_str(v):
                 if isinstance(v, (list, dict)):
                     return json.dumps(v, ensure_ascii=False)
@@ -296,7 +293,12 @@ class TextEngine(BaseEngine):
             tokenized["labels"] = list(tokenized["input_ids"])
             return tokenized
 
-        return hf_dataset.map(tokenize_fn, batched=True)
+        tokenized_dataset = hf_dataset.map(
+            tokenize_fn,
+            batched=True,
+            remove_columns=["prompt", "completion"],
+        )
+        return tokenized_dataset
 
 
 class _ProgressCallback: