Przeglądaj źródła

英伟达环境初始化

lxylxy123321 1 tydzień temu
rodzic
commit
b303606d38

+ 0 - 12
backend/.dockerignore

@@ -1,12 +0,0 @@
-__pycache__/
-*.pyc
-*.pyo
-*.egg-info/
-dist/
-build/
-*.egg
-.venv/
-venv/
-.env
-data/
-*.db

+ 8 - 3
backend/.env

@@ -10,10 +10,15 @@ BACKEND_LOG_LEVEL=INFO
 BACKEND_CORS_ORIGINS=http://192.168.91.253:5173
 
 # 数据库
-DATABASE_URL=sqlite+aiosqlite:///root/Fine-tuning/backend/data/finetuning.db
+DATABASE_URL=postgresql+asyncpg://finetune:finetune123@localhost:5432/finetuning
 
-# 数据路径(Linux 服务器路径)
-DATA_DIR=/root/Fine-tuning/backend/data
+# 数据路径(Ubuntu 24.04 / 3090 服务器路径)
+DATA_DIR=/home/ubuntu/Fine-tuning/backend/data
+
+# GPU 配置 — RTX 3090 24GB
+CUDA_VISIBLE_DEVICES=0
+MAX_MEMORY_PER_GPU=0
+USE_UNSLOTH=false
 
 # 训练默认参数
 DEFAULT_PEFT_METHOD=lora

+ 0 - 35
backend/.env.docker

@@ -1,35 +0,0 @@
-# Docker 环境配置
-BACKEND_HOST=0.0.0.0
-BACKEND_PORT=8010
-BACKEND_ENV=production
-BACKEND_LOG_LEVEL=INFO
-BACKEND_CORS_ORIGINS=http://localhost:3000
-
-# PostgreSQL 数据库
-DATABASE_URL=postgresql+asyncpg://finetune:finetune123@postgres:5432/finetuning
-
-# 容器内数据目录
-DATA_DIR=/root/Fine-tuning/backend/data
-
-DEFAULT_PEFT_METHOD=lora
-DEFAULT_EPOCHS=3
-DEFAULT_BATCH_SIZE=4
-DEFAULT_GRADIENT_ACCUMULATION=4
-DEFAULT_LR=2e-4
-DEFAULT_MAX_SEQ_LENGTH=2048
-DEFAULT_WARMUP_RATIO=0.05
-DEFAULT_SAVE_STRATEGY=epoch
-DEFAULT_EVAL_STRATEGY=epoch
-DEFAULT_EVAL_STEPS=100
-
-LORA_R=16
-LORA_ALPHA=32
-LORA_DROPOUT=0.05
-LORA_TARGET_MODULES=all-linear
-
-QLORA_BITS=4
-QLORA_TYPE=nf4
-QLORA_DOUBLE_QUANT=true
-
-MAX_UPLOAD_SIZE_MB=500
-ALLOWED_DATASET_FORMATS=jsonl,csv,parquet,json

+ 8 - 3
backend/.env.example

@@ -10,10 +10,15 @@ BACKEND_LOG_LEVEL=INFO
 BACKEND_CORS_ORIGINS=http://192.168.91.253:5173
 
 # 数据库
-DATABASE_URL=sqlite+aiosqlite:///root/Fine-tuning/backend/data/finetuning.db
+DATABASE_URL=postgresql+asyncpg://finetune:finetune123@localhost:5432/finetuning
 
-# 数据路径(Linux 服务器路径)
-DATA_DIR=/root/Fine-tuning/backend/data
+# 数据路径(Ubuntu 24.04 / 3090 服务器路径)
+DATA_DIR=/home/ubuntu/Fine-tuning/backend/data
+
+# GPU 配置
+CUDA_VISIBLE_DEVICES=0
+MAX_MEMORY_PER_GPU=0
+USE_UNSLOTH=false
 
 # 训练默认参数
 DEFAULT_PEFT_METHOD=lora

+ 3 - 7
backend/Dockerfile

@@ -1,5 +1,5 @@
-# 主节点(151)后端 — 轻量级 Python 镜像,不含 GPU 依赖
-# 仅负责 API/DB/WebSocket/SSH 调度,实际训练/推理在 253 算力节点执行
+# 主节点后端 — 轻量级 Python 镜像,不含 GPU 依赖
+# 仅负责 API/DB/WebSocket 调度
 FROM docker.m.daocloud.io/library/python:3.10-slim
 
 WORKDIR /app
@@ -8,7 +8,7 @@ WORKDIR /app
 RUN sed -i 's|deb.debian.org|mirrors.aliyun.com|g' /etc/apt/sources.list.d/debian.sources && \
     sed -i 's|security.debian.org|mirrors.aliyun.com|g' /etc/apt/sources.list.d/debian.sources
 
-RUN apt-get update && apt-get install -y git openssh-client sshpass rsync && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
@@ -21,7 +21,3 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8010/health')" || exit 1
 
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8010"]
-
-COPY entrypoint.sh /entrypoint.sh
-RUN chmod +x /entrypoint.sh
-ENTRYPOINT ["/entrypoint.sh"]

+ 1 - 0
backend/app/api/training.py

@@ -46,6 +46,7 @@ async def cancel_training_job(job_id: str):
 @router.get("/jobs/{job_id}/logs")
 async def stream_training_logs(job_id: str):
     """通过 SSE 流式推送训练日志。"""
+    import os
     from fastapi.responses import StreamingResponse
 
     async def log_stream():

+ 1 - 19
backend/app/config.py

@@ -48,7 +48,7 @@ class Settings(BaseSettings):
         )
 
     # --- 数据路径 ---
-    data_dir: Path = Path("/root/Fine-tuning/backend/data")
+    data_dir: Path = Path("/home/ubuntu/Fine-tuning/backend/data")
 
     # --- HuggingFace / ModelScope ---
     hf_token: str = ""
@@ -98,19 +98,6 @@ class Settings(BaseSettings):
     max_upload_size_mb: int = 500
     allowed_dataset_formats: str = "jsonl,csv,parquet,json"
 
-    # --- 分布式计算节点 ---
-    compute_node_host: str = ""  # 算力节点 IP,为空则本地执行
-    compute_node_ssh_port: int = 22
-    compute_node_ssh_user: str = "root"
-    compute_node_ssh_password: str = ""  # SSH 密码(与密钥二选一)
-    compute_node_ssh_key: str = ""  # SSH 私钥路径
-    compute_node_docker_container: str = "finetune-trainer"  # 算力节点上的训练容器名
-    compute_node_python: str = "/opt/conda/bin/python"
-    compute_node_workdir: str = "/root/Fine-tuning/backend"
-    compute_node_remote_data_dir: str = "/root/Fine-tuning/backend/data"
-    compute_node_remote_env: str = "production"
-    compute_node_ssh_timeout: int = 300  # SSH 命令超时(秒)
-
     @field_validator("backend_cors_origins", mode="before")
     @classmethod
     def parse_cors_origins(cls, v):
@@ -134,11 +121,6 @@ class Settings(BaseSettings):
     def processed_dir(self) -> Path:
         return self.data_dir / "processed"
 
-    @property
-    def use_remote_compute(self) -> bool:
-        """是否启用远程算力节点。"""
-        return bool(self.compute_node_host)
-
     def ensure_dirs(self) -> None:
         self.data_dir.mkdir(parents=True, exist_ok=True)
         for d in [self.models_dir, self.adapters_dir, self.uploads_dir, self.processed_dir]:

+ 16 - 141
backend/app/core/job_queue.py

@@ -188,45 +188,22 @@ class JobQueue:
 
             await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
 
-            # 判断是否远程执行
-            if settings.use_remote_compute:
-                # 远程训练模式 — 数据集路径已由上面的代码查好
-                if not dataset_path:
-                    dataset_path = self._find_dataset_path(dataset_id)
-                if not dataset_path:
-                    raise FileNotFoundError(f"Dataset not found: {dataset_id}")
-
-                self.update_job(job_id, status=JobStatus.TRAINING)
-                await self._notify_callbacks()
-
-                from app.core.remote_executor import run_training_remote, is_process_running
-                pid = run_training_remote(job_id, model_id, model_type, dataset_path, config)
-
-                if not pid:
-                    raise RuntimeError("Failed to launch remote training")
-
-                # 轮询共享日志文件解析进度
-                await self._poll_remote_progress(job_id, pid)
-
-                logger.info(f"Remote training launched for job {job_id}")
-            else:
-                # 本地训练模式
-                await engine.load_model(model_id, quantization="4bit" if peft_method == "qlora" else None)
-                peft_config = engine.get_peft_config(peft_method, config)
-
-                self.update_job(job_id, status=JobStatus.TRAINING)
-                await self._notify_callbacks()
-
-                adapter_path = await engine.train(
-                    job_id=job_id,
-                    dataset_path=processed_path,
-                    peft_config=peft_config,
-                    training_args=config,
-                )
-
-                self.update_job(job_id, status=JobStatus.COMPLETED, adapter_path=adapter_path)
-                await self._notify_callbacks()
-                logger.info(f"Job {job_id} completed successfully")
+            await engine.load_model(model_id, quantization="4bit" if peft_method == "qlora" else None)
+            peft_config = engine.get_peft_config(peft_method, config)
+
+            self.update_job(job_id, status=JobStatus.TRAINING)
+            await self._notify_callbacks()
+
+            adapter_path = await engine.train(
+                job_id=job_id,
+                dataset_path=processed_path,
+                peft_config=peft_config,
+                training_args=config,
+            )
+
+            self.update_job(job_id, status=JobStatus.COMPLETED, adapter_path=adapter_path)
+            await self._notify_callbacks()
+            logger.info(f"Job {job_id} completed successfully")
 
         except asyncio.CancelledError:
             self.update_job(job_id, status=JobStatus.CANCELLED)
@@ -280,108 +257,6 @@ class JobQueue:
             from app.engines.text_engine import text_engine
             return text_engine
 
-    async def _poll_remote_progress(self, job_id: str, pid: str):
-        """通过 SSH 读取远程日志文件,解析训练进度(非阻塞)。"""
-        from app.config import get_settings
-        from app.core.websocket import send_progress, send_epoch_done, send_completed, send_error
-        from app.core.remote_executor import ssh_exec, is_process_running
-
-        settings = get_settings()
-        remote_log = f"{settings.compute_node_remote_data_dir}/logs/{job_id}.jsonl"
-        last_bytes = 0
-        poll_interval = 5
-        max_polls = 8640
-
-        for _ in range(max_polls):
-            if self.is_cancelled(job_id):
-                _s = get_settings()
-                await asyncio.to_thread(ssh_exec, f"docker exec {_s.compute_node_docker_container} bash -c 'kill {pid} 2>/dev/null'", timeout=10)
-                self.update_job(job_id, status=JobStatus.CANCELLED)
-                await self._notify_callbacks()
-                await send_error(job_id, "Training cancelled")
-                return
-
-            # 检查进程是否还在运行(非阻塞)
-            process_alive = await asyncio.to_thread(is_process_running, pid)
-
-            # 通过 SSH 远程读取日志文件(非阻塞)
-            cat_cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'wc -c < {remote_log} 2>/dev/null || echo 0'"
-            code, size_out, _ = await asyncio.to_thread(ssh_exec, cat_cmd, timeout=30)
-            try:
-                file_size = int(size_out.strip()) if code == 0 and size_out.strip() else 0
-            except ValueError:
-                file_size = 0
-
-            if file_size > last_bytes:
-                read_cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'tail -c +{last_bytes + 1} {remote_log} 2>/dev/null'"
-                code, log_content, _ = await asyncio.to_thread(ssh_exec, read_cmd, timeout=30)
-
-                if code == 0 and log_content.strip():
-                    for line in log_content.strip().split("\n"):
-                        line = line.strip()
-                        if not line:
-                            continue
-                        try:
-                            entry = json.loads(line)
-                        except json.JSONDecodeError:
-                            continue
-
-                        entry_type = entry.get("type")
-                        if entry_type == "progress":
-                            self.update_job(job_id,
-                                            current_step=entry.get("step", 0),
-                                            total_steps=entry.get("total_steps", 0),
-                                            loss=entry.get("loss"),
-                                            progress=round(entry.get("step", 0) / max(entry.get("total_steps", 1), 1) * 100, 1))
-                            await self._notify_callbacks()
-                            await send_progress(job_id, **{k: v for k, v in entry.items() if k != "type"})
-
-                        elif entry_type == "epoch_begin":
-                            self.update_job(job_id, current_epoch=entry.get("epoch", 0))
-                            await self._notify_callbacks()
-
-                        elif entry_type == "epoch_done":
-                            await self._notify_callbacks()
-                            await send_epoch_done(job_id, **{k: v for k, v in entry.items() if k not in ("type", "ts")})
-
-                        elif entry_type == "completed":
-                            adapter_path = entry.get("adapter_path", str(settings.adapters_dir / job_id))
-                            self.update_job(job_id,
-                                            status=JobStatus.COMPLETED,
-                                            adapter_path=adapter_path,
-                                            progress=100.0)
-                            await self._notify_callbacks()
-                            await send_completed(job_id, **{k: v for k, v in entry.items() if k not in ("type", "ts")})
-                            return
-
-                        elif entry_type == "error":
-                            self.update_job(job_id,
-                                            status=JobStatus.FAILED,
-                                            error_message=entry.get("message", "Unknown error"))
-                            await self._notify_callbacks()
-                            await send_error(job_id, entry.get("message", "Unknown error"))
-                            return
-
-                    last_bytes = file_size
-
-            # 进程已退出但日志里没有 completed/error
-            if not process_alive:
-                await asyncio.sleep(2)
-                if not await asyncio.to_thread(is_process_running, pid):
-                    self.update_job(job_id,
-                                    status=JobStatus.FAILED,
-                                    error_message=f"Remote process exited unexpectedly (pid={pid})")
-                    await self._notify_callbacks()
-                    await send_error(job_id, f"Remote process exited unexpectedly (pid={pid})")
-                    return
-
-            await asyncio.sleep(poll_interval)
-
-        # 超时
-        self.update_job(job_id, status=JobStatus.FAILED, error_message="Remote training timed out")
-        await self._notify_callbacks()
-        await send_error(job_id, "Remote training timed out")
-
     @property
     def jobs(self) -> dict[str, TrainingJob]:
         return dict(self._jobs)

+ 0 - 214
backend/app/core/remote_executor.py

@@ -1,214 +0,0 @@
-"""SSH 远程执行模块 — 在算力节点上运行 GPU 任务。"""
-import json
-import os
-import subprocess
-from typing import Any
-
-from app.config import get_settings
-from app.core.logging import logger
-
-settings = get_settings()
-
-
-def _get_ssh_prefix() -> list[str]:
-    """构建 ssh/scp 命令前缀,支持密钥或密码登录。"""
-    prefix = ["-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=10"]
-    return prefix
-
-
-def scp_to_remote(local_path: str, remote_path: str) -> tuple[int, str, str]:
-    """通过 SCP 把本地文件复制到远端主机,返回 (exit_code, stdout, stderr)。"""
-    target = f"{settings.compute_node_ssh_user}@{settings.compute_node_host}"
-    scp_args = ["scp", *_get_ssh_prefix(), "-P", str(settings.compute_node_ssh_port)]
-    if settings.compute_node_ssh_key:
-        scp_args += ["-i", settings.compute_node_ssh_key]
-    elif settings.compute_node_ssh_password:
-        scp_args = ["sshpass", "-p", settings.compute_node_ssh_password] + scp_args
-    scp_args += [local_path, f"{target}:{remote_path}"]
-
-    try:
-        proc = subprocess.run(scp_args, capture_output=True, text=True, timeout=30)
-        clean_stderr = "\n".join(line for line in proc.stderr.split("\n")
-                                  if not line.startswith("Warning:"))
-        return proc.returncode, proc.stdout, clean_stderr
-    except Exception as e:
-        logger.error(f"SCP failed: {e}")
-        return -1, "", str(e)
-
-
-def scp_to_remote_dir(local_path: str, remote_path: str) -> tuple[int, str, str]:
-    """通过 SCP 把本地目录递归复制到远端主机。"""
-    target = f"{settings.compute_node_ssh_user}@{settings.compute_node_host}"
-    scp_args = ["scp", "-r", *_get_ssh_prefix(), "-P", str(settings.compute_node_ssh_port)]
-    if settings.compute_node_ssh_key:
-        scp_args += ["-i", settings.compute_node_ssh_key]
-    elif settings.compute_node_ssh_password:
-        scp_args = ["sshpass", "-p", settings.compute_node_ssh_password] + scp_args
-    scp_args += [local_path, f"{target}:{remote_path}"]
-
-    try:
-        proc = subprocess.run(scp_args, capture_output=True, text=True, timeout=120)
-        clean_stderr = "\n".join(line for line in proc.stderr.split("\n")
-                                  if not line.startswith("Warning:"))
-        return proc.returncode, proc.stdout, clean_stderr
-    except Exception as e:
-        logger.error(f"SCP dir failed: {e}")
-        return -1, "", str(e)
-
-
-def ssh_exec(cmd: str, timeout: int | None = None) -> tuple[int, str, str]:
-    """通过 SSH 在算力节点执行命令,返回 (exit_code, stdout, stderr)。"""
-    if not settings.use_remote_compute:
-        raise RuntimeError("未配置算力节点(compute_node_host 为空)")
-
-    target = f"{settings.compute_node_ssh_user}@{settings.compute_node_host}"
-    ssh_args = [
-        "ssh", *_get_ssh_prefix(),
-        "-p", str(settings.compute_node_ssh_port),
-        target,
-        cmd,
-    ]
-
-    # sshpass 需要包裹 ssh 命令,而不是作为 ssh 的参数
-    if settings.compute_node_ssh_key:
-        ssh_args = ["ssh", "-i", settings.compute_node_ssh_key] + ssh_args[1:]
-    elif settings.compute_node_ssh_password:
-        ssh_args = ["sshpass", "-p", settings.compute_node_ssh_password] + ssh_args
-
-    timeout = timeout or settings.compute_node_ssh_timeout
-    try:
-        proc = subprocess.run(
-            ssh_args,
-            capture_output=True,
-            text=True,
-            timeout=timeout,
-        )
-        # 过滤 known_hosts 警告,这些不算真正的错误
-        clean_stderr = "\n".join(line for line in proc.stderr.split("\n")
-                                  if not line.startswith("Warning:"))
-        return proc.returncode, proc.stdout, clean_stderr
-    except subprocess.TimeoutExpired:
-        logger.error(f"SSH command timeout after {timeout}s: {cmd[:100]}")
-        return -1, "", f"Command timed out after {timeout}s"
-    except Exception as e:
-        logger.error(f"SSH exec failed: {e}")
-        return -1, "", str(e)
-
-
-def run_training_remote(
-    job_id: str,
-    model_id: str,
-    model_type: str,
-    dataset_path: str,
-    config: dict[str, Any],
-) -> str | None:
-    """在算力节点启动训练任务(通过 docker exec,后台执行)。
-
-    通过 SCP 把配置文件传到远端宿主机,再在容器内启动训练。
-    dataset_path 由主节点预先查好,直接传给远程脚本。
-    """
-    import tempfile
-
-    # 在 151 宿主机创建临时配置文件
-    config_tmp = tempfile.mktemp(suffix=".json", prefix=f"config_{job_id}_")
-    with open(config_tmp, "w", encoding="utf-8") as f:
-        json.dump(config, f, ensure_ascii=False)
-
-    # SCP 到远端宿主机(使用 data_dir,这个目录已通过 bind mount 共享给容器)
-    remote_config_path = f"{settings.compute_node_remote_data_dir}/config_{job_id}.json"
-    ret_code, _, _ = scp_to_remote(config_tmp, f"{remote_config_path}")
-    os.unlink(config_tmp)  # 删除本地临时文件
-
-    if ret_code != 0:
-        logger.error(f"SCP config file failed: ret_code={ret_code}")
-        return None
-
-    # 把数据集路径也传到远程(SCP 到 data/uploads/ 目录)
-    remote_dataset_name = os.path.basename(dataset_path)
-    remote_dataset_path = f"{settings.compute_node_remote_data_dir}/datasets/{remote_dataset_name}"
-
-    if os.path.isdir(dataset_path):
-        # 目录:用 scp -r
-        ret_code, _, _ = scp_to_remote_dir(dataset_path, remote_dataset_path)
-    else:
-        # 文件:普通 scp
-        ret_code, _, _ = scp_to_remote(dataset_path, remote_dataset_path)
-
-    if ret_code != 0:
-        logger.error(f"SCP dataset failed: ret_code={ret_code}")
-        return None
-
-    # 在容器内启动训练
-    remote_cmd = (
-        f"docker exec -w {settings.compute_node_workdir} "
-        f"{settings.compute_node_docker_container} "
-        f"bash -c '"
-        f"nohup {settings.compute_node_python} -m app.engines.remote_train "
-        f"{job_id} {model_id} {model_type} {remote_dataset_path} {remote_config_path} "
-        f"</dev/null >/tmp/train_{job_id}.log 2>&1 & echo $!'"
-    )
-
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=30)
-
-    if code != 0:
-        logger.error(f"Remote training launch failed: {stderr}")
-        return None
-
-    pid = stdout.strip()
-    logger.info(f"Remote training launched in container: job={job_id}, container_pid={pid}")
-    return pid
-
-
-def is_process_running(pid: str) -> bool:
-    """检查远程训练进程是否还在运行。
-
-    通过 docker exec 进入容器检查 PID 是否存在。
-    """
-    cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'kill -0 {pid} 2>/dev/null && echo running || echo stopped'"
-    code, stdout, stderr = ssh_exec(cmd, timeout=30)
-    return code == 0 and "running" in stdout
-
-
-def run_inference_remote(
-    model_id: str,
-    adapter_id: str,
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    repetition_penalty: float,
-    do_sample: bool,
-) -> dict[str, Any] | None:
-    """在算力节点执行推理。"""
-    safe_prompt = prompt.replace('"', '\\"').replace("'", "\\'").replace("\n", "\\n")
-
-    remote_cmd = (
-        f"docker exec {settings.compute_node_docker_container} "
-        f"{settings.compute_node_python} -c \""
-        "import asyncio, json; "
-        "from app.config import get_settings; "
-        "settings = get_settings(); "
-        "from app.services.inference_service import run_inference_single; "
-        f"result = asyncio.run(run_inference_single("
-        f"'{model_id}', '{adapter_id}', '{safe_prompt}', "
-        f"{max_new_tokens}, {temperature}, {top_p}, {repetition_penalty}, {str(do_sample).lower()}"
-        ")); "
-        "print(json.dumps(result, ensure_ascii=False))\" 2>&1"
-    )
-
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
-
-    if code != 0:
-        logger.error(f"Remote inference failed: {stderr}")
-        return {"error": stderr.strip() or "Remote inference failed"}
-
-    # 提取最后一行 JSON
-    for line in reversed(stdout.strip().split("\n")):
-        line = line.strip()
-        if line.startswith("{"):
-            try:
-                return json.loads(line)
-            except json.JSONDecodeError:
-                continue
-
-    return {"error": f"Invalid JSON response: {stdout[:500]}"}

+ 0 - 5
backend/app/engines/__main__.py

@@ -1,5 +0,0 @@
-"""远程训练入口:python -m app.engines.remote_train <args>"""
-from app.engines.remote_train import main
-
-if __name__ == "__main__":
-    main()

+ 0 - 162
backend/app/engines/remote_train.py

@@ -1,162 +0,0 @@
-"""远程训练入口脚本 — 在算力节点上执行。"""
-import asyncio
-import json
-import os
-import sys
-import signal
-import time
-import traceback
-from datetime import datetime, timezone
-from pathlib import Path
-
-# 禁用 FlashAttention
-os.environ["PYTORCH_NO_FLASH"] = "1"
-os.environ["FLASH_ATTENTION_ENABLED"] = "0"
-
-_progress_log_file = None
-
-
-def _init_log_file(data_dir: Path, job_id: str):
-    """初始化进度日志文件(通过 SSHFS 共享给主节点读取)。"""
-    global _progress_log_file
-    log_dir = data_dir / "logs"
-    log_dir.mkdir(parents=True, exist_ok=True)
-    _progress_log_file = log_dir / f"{job_id}.jsonl"
-    _write_log(type="start", job_id=job_id)
-
-
-def _write_log(**kwargs):
-    """追加一行 JSON 到共享日志文件。"""
-    if _progress_log_file:
-        entry = {"ts": datetime.now(timezone.utc).isoformat(), **kwargs}
-        with open(_progress_log_file, "a", encoding="utf-8") as f:
-            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
-            f.flush()
-
-
-class FileProgressCallback:
-    """HuggingFace Trainer 回调 — 写进度到共享日志文件。"""
-
-    def __init__(self, job_id: str):
-        self.job_id = job_id
-
-    def on_log(self, args, state, control, logs=None, **kwargs):
-        if logs and "loss" in logs:
-            _write_log(type="progress", epoch=int(state.epoch or 0),
-                       step=state.global_step, total_steps=state.max_steps or 0,
-                       loss=round(logs["loss"], 4),
-                       learning_rate=round(logs.get("learning_rate", 0), 8))
-
-    def on_epoch_begin(self, args, state, control, **kwargs):
-        _write_log(type="epoch_begin", epoch=int(state.epoch or 0))
-
-    def on_epoch_end(self, args, state, control, metrics=None, **kwargs):
-        _write_log(type="epoch_done", epoch=int(state.epoch or 0),
-                   eval_loss=metrics.get("eval_loss") if metrics and hasattr(metrics, "get") else None,
-                   eval_accuracy=metrics.get("eval_accuracy") if metrics and hasattr(metrics, "get") else None)
-
-    def on_train_end(self, args, state, control, **kwargs):
-        _write_log(type="completed", total_time_seconds=getattr(state, "train_runtime", 0),
-                   adapter_path=args.output_dir)
-
-    def on_train_begin(self, args, state, control, **kwargs):
-        _write_log(type="status", status="training")
-
-    def on_save(self, args, state, control, **kwargs):
-        _write_log(type="save", step=state.global_step)
-
-    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
-        if metrics:
-            _write_log(type="evaluate", epoch=int(state.epoch or 0),
-                       eval_loss=metrics.get("eval_loss"),
-                       eval_accuracy=metrics.get("eval_accuracy"))
-
-
-async def run_training(job_id: str, model_id: str, model_type: str, dataset_path: str, config: dict):
-    """执行单个训练任务(远程调用入口)。"""
-    from app.config import get_settings
-    from app.core.logging import logger
-
-    settings = get_settings()
-    _init_log_file(settings.data_dir, job_id)
-
-    try:
-        # dataset_path 由主节点直接传入
-        if not dataset_path or not Path(dataset_path).exists():
-            raise FileNotFoundError(f"Dataset not found: {dataset_path}")
-
-        _write_log(type="status", status="preprocessing")
-
-        # 预处理
-        processed_path = str(settings.processed_dir / f"{job_id}_processed.jsonl")
-        task_type = config.get("task_type", "sft")
-        template = config.get("dataset_template", "alpaca")
-
-        # 选择引擎
-        if model_type == "vision":
-            from app.engines.vision_engine import vision_engine
-            engine = vision_engine
-        elif model_type == "multimodal":
-            from app.engines.multimodal_engine import multimodal_engine
-            engine = multimodal_engine
-        else:
-            from app.engines.text_engine import text_engine
-            engine = text_engine
-
-        peft_method = config.get("peft_method", "lora")
-
-        await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
-
-        _write_log(type="status", status="loading_model")
-
-        # 加载模型
-        await engine.load_model(model_id, quantization="4bit" if peft_method == "qlora" else None)
-
-        # 构建 PEFT 配置
-        peft_config = engine.get_peft_config(peft_method, config)
-
-        _write_log(type="status", status="training")
-
-        # 训练 — 传入文件日志回调替代 WebSocket 回调
-        start_time = time.time()
-        file_cb = FileProgressCallback(job_id)
-
-        adapter_path = await engine.train(
-            job_id=job_id,
-            dataset_path=processed_path,
-            peft_config=peft_config,
-            training_args=config,
-            callbacks=[file_cb],
-        )
-
-        elapsed = round(time.time() - start_time, 2)
-        _write_log(type="completed", adapter_path=str(adapter_path), total_time=elapsed)
-
-        logger.info(f"Remote training completed: {job_id} -> {adapter_path} ({elapsed}s)")
-        return adapter_path
-
-    except Exception as e:
-        _write_log(type="error", message=str(e), traceback=traceback.format_exc())
-        logger.error(f"Remote training failed: {job_id} - {e}")
-        raise
-
-
-def main():
-    """命令行入口:python -m app.engines.remote_train <job_id> <model_id> <model_type> <dataset_path> <config_file>"""
-    if len(sys.argv) < 6:
-        print("Usage: python -m app.engines.remote_train <job_id> <model_id> <model_type> <dataset_path> <config_file>")
-        sys.exit(1)
-
-    job_id = sys.argv[1]
-    model_id = sys.argv[2]
-    model_type = sys.argv[3]
-    dataset_id = sys.argv[4]
-    config_path = sys.argv[5]
-    with open(config_path, encoding="utf-8") as f:
-        config = json.load(f)
-
-    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
-
-
-if __name__ == "__main__":
-    main()

+ 1 - 47
backend/app/services/inference_service.py

@@ -1,4 +1,4 @@
-"""推理服务 — 支持本地执行和 SSH 远程执行两种模式。"""
+"""推理服务 — 本地执行。"""
 import json
 from pathlib import Path
 from typing import Any
@@ -24,28 +24,6 @@ async def generate(
     if not base_model_id:
         return {"error": "无法找到基础模型信息,请确保训练任务已完成"}
 
-    if settings.use_remote_compute:
-        # 远程执行模式
-        from app.core.remote_executor import run_inference_remote
-
-        adapter_dir = Path(adapter_path)
-        adapter_id = adapter_dir.name
-
-        result = run_inference_remote(
-            model_id=base_model_id,
-            adapter_id=adapter_id,
-            prompt=prompt,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            repetition_penalty=repetition_penalty,
-            do_sample=do_sample,
-        )
-        if result:
-            return result
-        return {"error": "Remote inference failed"}
-
-    # 本地执行模式
     return _generate_local(
         adapter_path=adapter_path,
         base_model_id=base_model_id,
@@ -145,27 +123,3 @@ async def get_available_adapters() -> list[dict[str, Any]]:
                 "peft_type": cfg.get("peft_type", "unknown"),
             })
     return result
-
-
-async def run_inference_single(
-    model_id: str,
-    adapter_id: str,
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    repetition_penalty: float,
-    do_sample: bool,
-) -> dict[str, Any]:
-    """供远程 SSH 调用的单条推理入口。"""
-    adapter_path = str(settings.adapters_dir / adapter_id)
-    return _generate_local(
-        adapter_path=adapter_path,
-        base_model_id=model_id,
-        prompt=prompt,
-        max_new_tokens=max_new_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        do_sample=do_sample,
-    )

+ 0 - 1
backend/app/services/model_service.py

@@ -114,7 +114,6 @@ async def download_model(model_id: str, use_modelscope: bool = False) -> dict[st
         if "Connection" in error_msg or "timeout" in error_msg.lower() or "network" in error_msg.lower():
             error_msg += "\n提示: 可能是 HuggingFace 网络问题。尝试使用 ModelScope 下载。"
         return {"model_id": model_id, "status": "failed", "error": error_msg}
-        return {"model_id": model_id, "status": "failed", "error": error_msg}
 
 
 async def list_cached_models() -> list[dict[str, Any]]:

+ 1 - 164
backend/app/services/model_test_service.py

@@ -9,174 +9,11 @@ settings = get_settings()
 
 async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temperature: float = 0.8, top_p: float = 0.95) -> dict[str, Any]:
     """加载已缓存模型并生成测试响应。"""
-    if settings.use_remote_compute:
-        return await _test_model_remote(model_id, prompt, max_new_tokens, temperature, top_p)
     return await _test_model_local(model_id, prompt, max_new_tokens, temperature, top_p)
 
 
-async def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
-    """在算力节点容器内执行模型测试(通过 SSH + docker exec)。
-
-    方案:将 Python 脚本写入容器临时文件执行,避免 stdin 管道缓冲区限制。
-    """
-    import base64
-    import json
-    from app.core.remote_executor import ssh_exec
-
-    container = settings.compute_node_docker_container
-    python = settings.compute_node_python
-    workdir = settings.compute_node_workdir
-
-    # 将 prompt 进行 base64 编码,避免引号/特殊字符问题
-    prompt_b64 = base64.b64encode(prompt.encode("utf-8")).decode()
-    do_sample = str(temperature > 0).lower()
-
-    # 独立脚本:零 app/db 依赖,参数全部通过环境变量传入
-    # 开头通过 OS 级别重定向 fd 1 到 /dev/null,抑制 C 层调试输出
-    # 最后恢复 fd 1 以打印 JSON
-    script = rf"""\
-import os, sys, json, warnings, base64
-# 保存原始 fd 1(docker exec 的 stdout pipe),然后重定向到 /dev/null
-_orig_fd1 = os.dup(1)
-_devnull = os.open(os.devnull, os.O_WRONLY)
-os.dup2(_devnull, 1)
-os.close(_devnull)
-warnings.filterwarnings('ignore')
-os.environ['PYTHONWARNINGS'] = 'ignore'
-os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
-os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
-from pathlib import Path
-import torch
-from transformers import logging as tf_logging
-tf_logging.set_verbosity_error()
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
-
-def find_model_path(model_id):
-    for base in [
-        '/root/Fine-tuning/backend/data/models',
-        '/root/.cache/huggingface/hub',
-        '/root/.cache/modelscope/hub',
-        '/root/models',
-    ]:
-        bp = Path(base)
-        if not bp.is_dir():
-            continue
-        flat_name = model_id.replace("/", "_")
-        if (bp / flat_name / "config.json").exists():
-            return str(bp / flat_name)
-        if (bp / model_id / "config.json").exists():
-            return str(bp / model_id)
-        try:
-            for child in bp.rglob("config.json"):
-                if child.parent.is_dir():
-                    return str(child.parent)
-        except Exception:
-            pass
-    return None
-
-model_id = os.environ.get('MODEL_ID', '')
-prompt = base64.b64decode(os.environ.get('PROMPT_B64', '')).decode('utf-8')
-max_new_tokens = int(os.environ.get('MAX_TOKENS', '128'))
-temperature = float(os.environ.get('TEMPERATURE', '0.8'))
-top_p = float(os.environ.get('TOP_P', '0.95'))
-do_sample = os.environ.get('DO_SAMPLE', 'true').lower() == 'true'
-
-model_path = find_model_path(model_id)
-if model_path is None:
-    sys.stderr.write(json.dumps({{'error': f'Model not found: {{model_id}}'}}) + '\\n')
-    exit(1)
-
-t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-t.pad_token = t.pad_token or t.eos_token
-
-has_accelerate = False
-try:
-    import accelerate
-    has_accelerate = True
-except ImportError:
-    pass
-
-m = None
-load_errors = []
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-
-for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoModel, {{'trust_remote_code': True}})]:
-    for dtype_val, dtype_name in [(torch.float16, 'float16'), (torch.float32, 'float32')]:
-        try:
-            if has_accelerate:
-                m = cls.from_pretrained(model_path, dtype=dtype_val, device_map='auto', **kw)
-            else:
-                m = cls.from_pretrained(model_path, dtype=dtype_val, device_map=None, **kw)
-                m = m.to(device)
-            break
-        except Exception as e:
-            load_errors.append(f'{{cls.__name__}} {{dtype_name}}: {{str(e)[:200]}}')
-    if m is not None:
-        break
-
-if m is None:
-    sys.stderr.write(json.dumps({{'error': 'Unable to load model', 'details': load_errors}}) + '\\n')
-    exit(1)
-
-# 恢复 fd 1 到原始 stdout(docker exec 的 pipe)
-os.dup2(_orig_fd1, 1)
-os.close(_orig_fd1)
-
-m.eval()
-device = next(m.parameters()).device
-inp = t(prompt, return_tensors='pt').to(device)
-out = m.generate(**inp, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p, do_sample=do_sample, pad_token_id=t.eos_token_id)
-gen = t.decode(out[0][inp['input_ids'].shape[1]:], skip_special_tokens=True)
-print(json.dumps({{'generated_text': gen}}))
-"""
-
-    script_b64 = base64.b64encode(script.encode()).decode()
-
-    # 通过 docker exec -i 将解码后的脚本内容传入容器内的 cat,写入临时文件后执行
-    script_path = f"/tmp/test_model_{model_id.replace('/', '_')}.py"
-    remote_cmd = (
-        f"echo '{script_b64}' | base64 -d | "
-        f"docker exec -i -w {workdir} "
-        f"-e MODEL_ID={model_id} "
-        f"-e PROMPT_B64={prompt_b64} "
-        f"-e MAX_TOKENS={max_new_tokens} "
-        f"-e TEMPERATURE={temperature} "
-        f"-e TOP_P={top_p} "
-        f"-e DO_SAMPLE={do_sample} "
-        f"{container} bash -c 'cat > {script_path} && {python} {script_path}'"
-    )
-
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
-
-    # 清理容器内临时文件
-    ssh_exec(f"docker exec {container} rm -f {script_path}", timeout=5)
-
-    logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
-    if stdout:
-        logger.info(f"stdout (first 500): {stdout[:500]}")
-    if stderr:
-        logger.info(f"stderr (first 500): {stderr[:500]}")
-
-    if code != 0:
-        logger.error(f"Remote model test failed: {stderr}")
-        return {"error": stderr.strip() or "Remote test failed"}
-
-    for line in reversed(stdout.strip().split("\n")):
-        line = line.strip()
-        if line.startswith("{"):
-            try:
-                result = json.loads(line)
-                result["model_id"] = model_id
-                result["prompt"] = prompt
-                return result
-            except json.JSONDecodeError:
-                continue
-
-    return {"error": f"Invalid response: {stdout[:500]}"}
-
-
 async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
-    """本地执行模型测试(仅用于开发环境)。"""
+    """本地执行模型测试。"""
     import torch
     from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
 

+ 0 - 26
backend/entrypoint.sh

@@ -1,26 +0,0 @@
-#!/bin/bash
-# 容器启动时自动将 backend 代码同步到 253 训练节点
-
-REMOTE_USER="${COMPUTE_NODE_SSH_USER:-root}"
-REMOTE_HOST="${COMPUTE_NODE_HOST}"
-REMOTE_PASS="${COMPUTE_NODE_SSH_PASSWORD}"
-REMOTE_DIR="/root/Fine-tuning/backend"
-
-if [ -n "$REMOTE_HOST" ]; then
-  echo "=> Syncing backend code to compute node ${REMOTE_HOST} ..."
-  if [ -n "$REMOTE_PASS" ]; then
-    sshpass -p "$REMOTE_PASS" rsync -avz --delete \
-      -e "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5" \
-      /app/ ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/
-  else
-    rsync -avz --delete \
-      -e "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5" \
-      /app/ ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/
-  fi
-  echo "=> Sync done."
-else
-  echo "=> No compute node configured, skipping code sync."
-fi
-
-# 启动主进程
-exec "$@"

+ 10 - 5
backend/requirements.txt

@@ -1,5 +1,4 @@
-# --- 151 主节点 API 服务依赖(轻量) ---
-# 实际训练依赖(torch/transformers/peft/trl)在 253 沐曦容器中
+# --- 3090 单机部署依赖(前后端合一,含 GPU 训练栈) ---
 
 fastapi>=0.115.0
 uvicorn[standard]>=0.30.0
@@ -8,7 +7,6 @@ pydantic-settings>=2.0
 python-dotenv>=1.0
 sqlalchemy[asyncio]>=2.0
 aiosqlite>=0.20.0
-asyncpg>=0.29.0
 alembic>=1.13.0
 python-multipart>=0.0.9
 websockets>=12.0
@@ -20,6 +18,13 @@ pyarrow>=17.0.0
 addict>=2.4.0
 modelscope>=1.15.0,<1.18.0
 oss2>=2.18.0
-datasets
+datasets>=3.0.0
 huggingface_hub
-aiohttp>=3.9.0,<3.11.0
+aiohttp>=3.9.0,<3.11.0
+# GPU 训练依赖
+torch>=2.4.0
+transformers>=4.45.0
+peft>=0.13.0
+trl>=0.12.0
+accelerate>=1.0.0
+bitsandbytes>=0.44.0

+ 0 - 24
deploy.sh

@@ -1,24 +0,0 @@
-#!/bin/bash
-# 一键部署:拉最新代码 → 构建后端 → 同步到253 → 重启
-
-set -e
-
-PROJECT_DIR="/root/Fine-tuning"
-REMOTE_USER="root"
-REMOTE_HOST="192.168.91.253"
-REMOTE_PASS="ictrek"
-
-cd ${PROJECT_DIR}
-
-echo "=== Step 1: Git pull ==="
-git pull
-
-echo "=== Step 2: Build backend ==="
-docker compose up -d --build backend
-
-echo "=== Step 3: Sync backend to 253 ==="
-sshpass -p "${REMOTE_PASS}" rsync -avz --delete \
-  -e "ssh -o StrictHostKeyChecking=no -p 22" \
-  ${PROJECT_DIR}/backend/ ${REMOTE_USER}@${REMOTE_HOST}:/root/Fine-tuning/backend/
-
-echo "=== Deploy done ==="

+ 0 - 21
deploy_remote.sh

@@ -1,21 +0,0 @@
-#!/bin/bash
-# 将 151 上的 backend 代码同步到 253 训练节点
-
-REMOTE_USER="root"
-REMOTE_HOST="192.168.91.253"
-REMOTE_PASS="ictrek"
-REMOTE_DIR="/root/Fine-tuning"
-LOCAL_BACKEND="./backend"
-
-echo "=> Syncing backend to ${REMOTE_HOST}:${REMOTE_DIR}/backend ..."
-
-sshpass -p "$REMOTE_PASS" rsync -avz --delete \
-  -e "ssh -o StrictHostKeyChecking=no -p 22" \
-  ${LOCAL_BACKEND}/ ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/backend/
-
-if [ $? -eq 0 ]; then
-  echo "=> Sync done."
-else
-  echo "=> Sync failed!"
-  exit 1
-fi

+ 15 - 32
docker-compose.yml

@@ -13,8 +13,7 @@ services:
       - pgdata:/var/lib/postgresql/data
     ports:
       - "5432:5432"
-    networks:
-      - finetune-net
+    network_mode: host
 
   backend:
     build:
@@ -25,51 +24,35 @@ services:
     ports:
       - "8010:8010"
     volumes:
-      # 持久化数据和模型
-      - ./backend/data:/root/Fine-tuning/backend/data
+      - ./backend/data:/home/ubuntu/Fine-tuning/backend/data
     env_file:
-      - ./backend/.env.docker
+      - ./backend/.env
     environment:
       - BACKEND_HOST=0.0.0.0
       - BACKEND_PORT=8010
-      - DATABASE_URL=postgresql+asyncpg://finetune:finetune123@postgres:5432/finetuning
-      # --- 分布式计算节点(可选) ---
-      # 设置为 253 的 IP 即启用远程算力模式
-      - COMPUTE_NODE_HOST=192.168.91.253
-      - COMPUTE_NODE_SSH_PORT=22
-      - COMPUTE_NODE_SSH_USER=root
-      - COMPUTE_NODE_SSH_PASSWORD=ictrek
-      # - COMPUTE_NODE_SSH_KEY=/root/.ssh/id_rsa  # 优先用密钥,密码为备选
-      - COMPUTE_NODE_PYTHON=/opt/conda/bin/python
-      - COMPUTE_NODE_DOCKER_CONTAINER=finetune-trainer
-      - COMPUTE_NODE_WORKDIR=/root/Fine-tuning/backend
-      - COMPUTE_NODE_REMOTE_DATA_DIR=/root/Fine-tuning/backend/data
-      - COMPUTE_NODE_REMOTE_ENV=production
-      - COMPUTE_NODE_SSH_TIMEOUT=300
+      - DATABASE_URL=postgresql+asyncpg://finetune:finetune123@localhost:5432/finetuning
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    network_mode: host
     depends_on:
       - postgres
-    networks:
-      - finetune-net
 
   frontend:
     build:
       context: ./frontend
       dockerfile: Dockerfile
       args:
-        VITE_API_BASE_URL: /api/v1
-        VITE_WS_BASE_URL: /ws
+        VITE_API_BASE_URL: http://localhost:8010/api/v1
+        VITE_WS_BASE_URL: ws://localhost:8010/ws
     container_name: finetune-frontend
     restart: unless-stopped
     ports:
-      - "3000:80"
-    depends_on:
-      - backend
-    networks:
-      - finetune-net
-
-networks:
-  finetune-net:
-    driver: bridge
+      - "5173:80"
 
 volumes:
   pgdata:

+ 45 - 28
result.txt

@@ -1,28 +1,45 @@
-(base) [root@localhost ~]# docker exec -w /root/Fine-tuning/backend finetune-trainer /opt/conda/bin/python -m app.engines.remote_train "test-manual-001" "Qwen/Qwen3.5-0.8B" "text" "/root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/distill_r1_sft.json" "/root/Fine-tuning/backend/data/config_aa342346-a39e-4644-9a34-f3a9d3b961f8.json"
-2026-05-20 14:28:57 | ERROR    | peft-platform | Remote training failed: test-manual-001 - Extra data: line 2 column 1 (char 71)
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-    exec(code, run_globals)
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in <module>
-    main()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 158, in main
-    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
-  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-    return loop.run_until_complete(main)
-  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-    return future.result()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 108, in run_training
-    await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 119, in preprocess_dataset
-    processed = preprocess_file(dataset_path, output_path, task_type, template)
-  File "/root/Fine-tuning/backend/app/preprocessors/__init__.py", line 130, in preprocess_file
-    data = json.load(f)
-  File "/opt/conda/lib/python3.10/json/__init__.py", line 293, in load
-    return loads(fp.read(),
-  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
-    return _default_decoder.decode(s)
-  File "/opt/conda/lib/python3.10/json/decoder.py", line 340, in decode
-    raise JSONDecodeError("Extra data", s, end)
-json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 71)
+ubuntu@ubuntu:/home$ cd ubuntu/
+ubuntu@ubuntu:~$ cat /etc/os-release
+PRETTY_NAME="Ubuntu 24.04.3 LTS"
+NAME="Ubuntu"
+VERSION_ID="24.04"
+VERSION="24.04.3 LTS (Noble Numbat)"
+VERSION_CODENAME=noble
+ID=ubuntu
+ID_LIKE=debian
+HOME_URL="https://www.ubuntu.com/"
+SUPPORT_URL="https://help.ubuntu.com/"
+BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
+PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
+UBUNTU_CODENAME=noble
+LOGO=ubuntu-logo
+ubuntu@ubuntu:~$ df -h
+Filesystem                         Size  Used Avail Use% Mounted on
+tmpfs                              3.1G  1.4M  3.1G   1% /run
+/dev/mapper/ubuntu--vg-ubuntu--lv  298G   49G  250G  17% /
+tmpfs                               12G     0   12G   0% /dev/shm
+tmpfs                              5.0M     0  5.0M   0% /run/lock
+/dev/sda2                          2.0G  101M  1.7G   6% /boot
+tmpfs                              1.6G   20K  1.6G   1% /run/user/1000
+ubuntu@ubuntu:~$ nvidia-smi
+Wed May 20 06:43:18 2026       
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 580.159.03             Driver Version: 580.159.03     CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA GeForce RTX 3090        Off |   00000000:00:0A.0 Off |                  N/A |
+| 30%   27C    P8             16W /  350W |       1MiB /  24576MiB |      0%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+ubuntu@ubuntu:~$