| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- """数据预处理器:将不同格式的数据集转换为训练所需格式。"""
- import json
- from pathlib import Path
- from typing import Any
def apply_alpaca_template(item: dict) -> dict:
    """Format an Alpaca-style record as a prompt/completion pair.

    The prompt is the instruction, followed by a blank line and the input
    when the input is non-empty. Missing or ``None`` fields become empty
    strings; every other value is converted with ``str``.

    Args:
        item: Record that may carry ``"instruction"``, ``"input"`` and
            ``"output"`` keys.

    Returns:
        A dict with string-valued ``"prompt"`` and ``"completion"`` keys.
    """

    def _as_text(key):
        # Treat both a missing key and an explicit None as "no text".
        value = item.get(key, "")
        return "" if value is None else str(value)

    instruction = _as_text("instruction")
    extra_input = _as_text("input")
    answer = _as_text("output")

    if extra_input:
        prompt = instruction + "\n\n" + extra_input
    else:
        prompt = instruction
    return {"prompt": prompt, "completion": answer}
def apply_sharegpt_template(item: dict) -> dict:
    """Convert a ShareGPT-style record into a prompt/completion pair.

    The first turn of ``item["conversations"]`` supplies the prompt and the
    second turn supplies the completion. Each turn's text is read from its
    ``"value"`` key, falling back to ``"content"``. ``None`` text becomes an
    empty string and any other non-string text is converted with ``str``.

    Args:
        item: Record that may carry a ``"conversations"`` sequence of turn
            dicts.

    Returns:
        A dict with string-valued ``"prompt"`` and ``"completion"`` keys.
        Both are empty when the conversation is missing, ``None``, or has
        fewer than two turns.
    """
    conversations = item.get("conversations")
    # An explicit null in the data would otherwise crash ``len``.
    if conversations is None or len(conversations) < 2:
        return {"prompt": "", "completion": ""}

    def _turn_text(turn: dict) -> str:
        content = turn.get("value", turn.get("content", ""))
        return "" if content is None else str(content)

    # NOTE(review): turns after the second are ignored, exactly as before
    # (the old loop broke out at index 1, leaving its "role: content" branch
    # unreachable). Confirm whether multi-turn prompts were intended.
    return {
        "prompt": _turn_text(conversations[0]),
        "completion": _turn_text(conversations[1]),
    }
def apply_raw_template(item: dict) -> dict:
    """Read prompt and completion straight from a record's own fields.

    The prompt comes from the first key present among ``"prompt"``,
    ``"text"`` and ``"input"``; the completion from the first key present
    among ``"completion"``, ``"output"`` and ``"target"``.

    Args:
        item: Source record.

    Returns:
        A dict with string-valued ``"prompt"`` and ``"completion"`` keys.
        Missing or ``None`` values become empty strings; other values are
        converted with ``str``.
    """

    def _as_text(value) -> str:
        # str(None) would yield the literal "None", which is truthy and
        # would pass downstream empty-prompt filtering as a real sample.
        return "" if value is None else str(value)

    prompt = item.get("prompt", item.get("text", item.get("input", "")))
    completion = item.get(
        "completion", item.get("output", item.get("target", ""))
    )
    return {"prompt": _as_text(prompt), "completion": _as_text(completion)}
def apply_dpo_template(item: dict) -> dict:
    """Build a DPO record with prompt, chosen and rejected fields.

    Each field is read from its primary key, falling back to an alias
    (``input``, ``positive``, ``negative`` respectively) and finally to an
    empty string. Values are passed through without conversion.

    Args:
        item: Source record.

    Returns:
        A dict with ``"prompt"``, ``"chosen"`` and ``"rejected"`` keys.
    """
    prompt = item.get("prompt", item.get("input", ""))
    preferred = item.get("chosen", item.get("positive", ""))
    dispreferred = item.get("rejected", item.get("negative", ""))
    return {"prompt": prompt, "chosen": preferred, "rejected": dispreferred}
def apply_kto_template(item: dict) -> dict:
    """Build a KTO record with prompt, completion and label fields.

    The prompt falls back to ``"input"`` and the completion to ``"output"``;
    both default to an empty string. The label defaults to ``True`` when
    absent. Values are passed through without conversion.

    Args:
        item: Source record.

    Returns:
        A dict with ``"prompt"``, ``"completion"`` and ``"label"`` keys.
    """
    prompt = item.get("prompt", item.get("input", ""))
    completion = item.get("completion", item.get("output", ""))
    label = item.get("label", True)
    return dict(prompt=prompt, completion=completion, label=label)
def apply_orpo_template(item: dict) -> dict:
    """Build an ORPO record with prompt, chosen and rejected fields.

    Each output key is read from the record under its own name, then under
    an alias, and finally defaults to an empty string. Values are passed
    through without conversion.

    Args:
        item: Source record.

    Returns:
        A dict with ``"prompt"``, ``"chosen"`` and ``"rejected"`` keys.
    """
    # (output key, alias accepted when the output key is absent)
    field_aliases = (
        ("prompt", "input"),
        ("chosen", "positive"),
        ("rejected", "negative"),
    )
    return {
        primary: item.get(primary, item.get(alias, ""))
        for primary, alias in field_aliases
    }
def apply_rm_template(item: dict) -> dict:
    """Build a reward-modeling record with prompt, chosen and rejected fields.

    Each field is read from its primary key, falling back to an alias
    (``input``, ``positive``, ``negative`` respectively) and finally to an
    empty string. Values are passed through without conversion.

    Args:
        item: Source record.

    Returns:
        A dict with ``"prompt"``, ``"chosen"`` and ``"rejected"`` keys.
    """
    record = {}
    record["prompt"] = item.get("prompt", item.get("input", ""))
    record["chosen"] = item.get("chosen", item.get("positive", ""))
    record["rejected"] = item.get("rejected", item.get("negative", ""))
    return record
# Dispatch table: task type -> dataset template name -> formatter function.
# Preference-style tasks (dpo/orpo/rm) read the same fields whatever the
# template name, so several names point at one formatter.
TEMPLATE_MAP = dict(
    sft=dict(
        alpaca=apply_alpaca_template,
        sharegpt=apply_sharegpt_template,
        raw=apply_raw_template,
    ),
    dpo=dict(
        alpaca=apply_dpo_template,
        sharegpt=apply_dpo_template,
        raw=apply_dpo_template,
    ),
    kto=dict(raw=apply_kto_template),
    orpo=dict(
        alpaca=apply_orpo_template,
        raw=apply_orpo_template,
    ),
    rm=dict(raw=apply_rm_template),
    ppo=dict(raw=apply_raw_template),
)
def _load_records(input_path: str) -> list[Any]:
    """Read all records from a .jsonl, .json, .csv or .parquet file."""
    ext = Path(input_path).suffix.lower()
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, which
    # would otherwise break json parsing or corrupt the first CSV header.
    if ext == ".jsonl":
        with open(input_path, "r", encoding="utf-8-sig") as f:
            return [json.loads(line) for line in f if line.strip()]
    if ext == ".json":
        with open(input_path, "r", encoding="utf-8-sig") as f:
            data = json.load(f)
        # A single top-level object is treated as a one-record dataset.
        return data if isinstance(data, list) else [data]
    if ext == ".csv":
        import csv

        # newline="" lets the csv module handle newlines embedded in
        # quoted fields itself.
        with open(input_path, "r", encoding="utf-8-sig", newline="") as f:
            return [dict(row) for row in csv.DictReader(f)]
    if ext == ".parquet":
        import pandas as pd

        return pd.read_parquet(input_path).to_dict(orient="records")
    raise ValueError(f"Unsupported format: {ext}")


def preprocess_file(
    input_path: str,
    output_path: str,
    task_type: str = "sft",
    template: str = "alpaca",
) -> list[dict[str, Any]]:
    """Load a dataset file, apply a template, and write the result as JSONL.

    The formatter is looked up in ``TEMPLATE_MAP`` by ``task_type`` and
    ``template``. An unknown ``task_type`` falls back to the ``"sft"``
    templates; an unknown ``template`` falls back to that task's ``"raw"``
    formatter (or ``apply_raw_template``). Both fallbacks are logged.

    Records whose formatter raises, or whose resulting ``"prompt"`` is
    falsy, are left out of the output. Failures are counted and logged
    rather than silently discarded.

    Args:
        input_path: Source file; the extension selects the reader
            (``.jsonl``, ``.json``, ``.csv`` or ``.parquet``).
        output_path: Destination JSONL file; parent directories are created
            as needed.
        task_type: Top-level key of ``TEMPLATE_MAP``.
        template: Template name within the chosen task type.

    Returns:
        The processed records, in input order, as written to
        ``output_path``.

    Raises:
        ValueError: If the input file extension is not supported.
    """
    import logging

    logger = logging.getLogger(__name__)

    raw_data = _load_records(input_path)

    # Resolve the formatter, keeping the lenient fallbacks but making
    # them visible in the logs.
    templates = TEMPLATE_MAP.get(task_type)
    if templates is None:
        logger.warning(
            "Unknown task_type %r; falling back to 'sft' templates", task_type
        )
        templates = TEMPLATE_MAP["sft"]
    apply_fn = templates.get(template)
    if apply_fn is None:
        logger.warning(
            "Template %r is not defined for task_type %r; falling back to 'raw'",
            template,
            task_type,
        )
        apply_fn = templates.get("raw", apply_raw_template)

    processed = []
    failed = 0
    for index, item in enumerate(raw_data):
        try:
            result = apply_fn(item)
            # The truthiness test stays inside the try: some prompt values
            # (e.g. array-like ones) can raise when evaluated as a bool.
            if result.get("prompt"):
                processed.append(result)
        except Exception:
            # Best-effort: one malformed record must not abort the run.
            failed += 1
            logger.debug("Skipping record %d", index, exc_info=True)
    if failed:
        logger.warning(
            "Skipped %d of %d records that could not be processed",
            failed,
            len(raw_data),
        )

    output_p = Path(output_path)
    output_p.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in processed
        )
    return processed
|