Переглянути джерело

修改内存不足问题,更改平台名称

lxylxy123321 5 днів тому
батько
коміт
25e2de0635

Різницю між файлами не показано, бо вона завелика
+ 0 - 0
.claude-terminal


+ 3 - 2
CLAUDE.md

@@ -26,7 +26,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 在 253 服务器(192.168.91.253)上重建 `finetune-trainer` 容器的命令:
 
 ```bash
-docker stop finetune-trainer && docker rename finetune-trainer finetune-trainer-old && docker run -d --name finetune-trainer --privileged --network host --shm-size 64m -e MACA_MPS_MODE=1 -v /root/Fine-tuning/backend:/root/Fine-tuning/backend 5334348e7a9b tail -f /dev/null
+docker stop finetune-trainer && docker rename finetune-trainer finetune-trainer-old && docker run -d --name finetune-trainer --privileged --network host --shm-size 4g -e MACA_MPS_MODE=1 -v /root/Fine-tuning/backend:/root/Fine-tuning/backend 5334348e7a9b tail -f /dev/null
 ```
 
 ### 容器配置说明
@@ -34,7 +34,7 @@ docker stop finetune-trainer && docker rename finetune-trainer finetune-trainer-
 - **基础镜像**: `5334348e7a9b`(沐曦官方镜像的 image ID)
 - **特权模式**: `--privileged` 允许容器访问沐曦 GPU 设备
 - **网络模式**: `--network host` 使用宿主机网络
-- **共享内存**: `--shm-size 64m`
+- **共享内存**: `--shm-size 4g`(沐曦驱动 ring buffer 需要足够共享内存)
 - **MACA_MPS_MODE**: `1` 启用沐曦 MPS 模式
 - **代码目录**: 挂载 `/root/Fine-tuning/backend`(由 151 rsync 同步)
 - **Python 路径**: `/opt/conda/bin/python`(conda 环境)
@@ -45,4 +45,5 @@ docker stop finetune-trainer && docker rename finetune-trainer finetune-trainer-
 
 ```bash
 docker exec -it finetune-trainer /opt/conda/bin/pip install peft trl accelerate bitsandbytes datasets
+docker exec -it finetune-trainer /opt/conda/bin/pip install --no-deps --upgrade transformers huggingface-hub
 ```

+ 27 - 28
backend/app/core/job_queue.py

@@ -275,35 +275,34 @@ class JobQueue:
         return None
 
     async def _cleanup_remote_processes(self):
-        """通过 SSH 清理容器内所有残留的 python 进程(包括僵尸进程),释放 GPU ring buffer。"""
+        """通过 SSH 清理容器内所有残留的 python 进程(包括僵尸进程),释放 GPU ring buffer。
+
+        所有操作合并为一条 SSH 命令,避免多次连接导致超时。
+        """
         from app.config import get_settings
         from app.core.remote_executor import ssh_exec
 
         settings = get_settings()
         container = settings.compute_node_docker_container
 
-        # 先检查容器是否存活,避免在容器异常时卡住 SSH
-        check_cmd = f"docker inspect -f '{{{{.State.Running}}}}' {container} 2>/dev/null"
-        code, stdout, _ = await asyncio.to_thread(ssh_exec, check_cmd, timeout=10)
-        if code != 0 or "true" not in stdout.strip().lower():
-            logger.warning(f"Container {container} is not running, skipping cleanup")
-            return
-
-        # 查找所有 python 进程(包括僵尸)
-        cmd = f"docker exec {container} bash -c 'ps aux | grep \"[p]ython\" | grep -v grep | awk \"{{print \\$2}}\"'"
-        code, stdout, _ = await asyncio.to_thread(ssh_exec, cmd, timeout=30)
-        if code == 0 and stdout.strip():
-            pids = stdout.strip().split("\n")
-            for pid in pids:
-                pid = pid.strip()
-                if not pid:
-                    continue
-                # 强制 kill(僵尸进程需要父进程 reaper 清理,kill -9 后 PID 1 会自动 reap)
-                kill_cmd = f"docker exec {container} bash -c 'kill -9 {pid} 2>/dev/null; wait {pid} 2>/dev/null'"
-                await asyncio.to_thread(ssh_exec, kill_cmd, timeout=10)
-            logger.info(f"Cleaned up {len(pids)} remote python processes in container {container}")
+        # 一条命令完成:检查容器 → 查找 python 进程 → 逐个 kill → 输出清理结果
+        cmd = (
+            f"docker inspect -f '{{{{.State.Running}}}}' {container} 2>/dev/null || echo false; "
+            f"if [ \"$(docker inspect -f '{{{{.State.Running}}}}' {container} 2>/dev/null)\" = 'true' ]; then "
+            f"pids=$(docker exec {container} bash -c 'ps aux 2>/dev/null | grep \"[p]ython\" | grep -v grep | awk \"{{{{print \\$2}}}}\"'); "
+            f"if [ -n \"$pids\" ]; then "
+            f"echo \"$pids\" | while read pid; do "
+            f"docker exec {container} bash -c 'kill -9 $pid 2>/dev/null; wait $pid 2>/dev/null'; "
+            f"done; "
+            f"echo \"cleaned $(echo \"$pids\" | wc -l) processes\"; "
+            f"else echo 'no python processes'; fi; "
+            f"else echo 'container not running'; fi"
+        )
+        code, stdout, stderr = await asyncio.to_thread(ssh_exec, cmd, timeout=60)
+        if code != 0:
+            logger.warning(f"Remote cleanup failed: code={code}, stderr={stderr}")
         else:
-            logger.info(f"No python processes found in container {container}, no cleanup needed")
+            logger.info(f"Remote cleanup result: {stdout.strip()}")
 
     async def _lookup_dataset_db(self, dataset_id: str) -> str | None:
         """从数据库查找数据集路径。"""
@@ -363,8 +362,8 @@ class JobQueue:
                 )
                 logger.info(f"Killed remote process {pid} via docker exec")
                 return
-            except Exception:
-                pass
+            except Exception as e:
+                logger.warning(f"Failed to kill process {pid} via docker exec: {e}")
 
             # 方式2: nsenter 从宿主机直接进入进程 namespace 发信号
             try:
@@ -375,8 +374,8 @@ class JobQueue:
                 )
                 logger.info(f"Killed remote process {pid} via nsenter")
                 return
-            except Exception:
-                pass
+            except Exception as e:
+                logger.warning(f"Failed to kill process {pid} via nsenter: {e}")
 
             # 方式3: 终极方案 — 重启整个容器(释放所有 GPU 资源)
             try:
@@ -386,8 +385,8 @@ class JobQueue:
                     timeout=30,
                 )
                 logger.warning(f"Force restarted container {container} to release GPU resources")
-            except Exception:
-                pass
+            except Exception as e:
+                logger.error(f"Failed to restart container {container}: {e}")
 
         async def _mark_failed(error_msg: str):
             """统一标记失败:先 kill 远程进程,再更新状态。"""

+ 10 - 3
backend/app/core/remote_executor.py

@@ -12,7 +12,7 @@ settings = get_settings()
 
 def _get_ssh_prefix() -> list[str]:
     """构建 ssh/scp 命令前缀,支持密钥或密码登录。"""
-    prefix = ["-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=10"]
+    prefix = ["-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=30"]
     return prefix
 
 
@@ -160,6 +160,7 @@ def run_training_remote(
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
         f"-e METAX_VISIBLE_DEVICES=2,3 "
+        f"-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True "
         f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"bash -c '"
@@ -183,9 +184,15 @@ def run_training_remote(
 def is_process_running(pid: str) -> bool:
     """检查远程训练进程是否还在运行。
 
-    通过 docker exec 进入容器检查 PID 是否存在。
+    通过 docker exec 进入容器检查 PID 是否存在且不是僵尸进程
     """
-    cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'kill -0 {pid} 2>/dev/null && echo running || echo stopped'"
+    cmd = (
+        f"docker exec {settings.compute_node_docker_container} bash -c "
+        f"'state=$(cat /proc/{pid}/stat 2>/dev/null | awk \"{{{{print \\$3}}}}\"); "
+        f"if [ \"$state\" = \"Z\" ]; then echo zombie; "
+        f"elif kill -0 {pid} 2>/dev/null; then echo running; "
+        f"else echo stopped; fi'"
+    )
     code, stdout, stderr = ssh_exec(cmd, timeout=30)
     if code != 0:
         # SSH/docker exec 本身失败(容器可能挂了),视为进程不存活

+ 2 - 0
backend/app/engines/remote_train.py

@@ -14,6 +14,8 @@ from pathlib import Path
 # 禁用 FlashAttention
 os.environ["PYTORCH_NO_FLASH"] = "1"
 os.environ["FLASH_ATTENTION_ENABLED"] = "0"
+# 解决 PyTorch 显存碎片化问题(避免 reserved unallocated 占用大量显存)
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # 禁用 torch.compile,避免 fork 大量 inductor worker 进程
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"

+ 2 - 0
backend/app/engines/text_engine.py

@@ -8,6 +8,8 @@ os.environ["TORCH_FLASH_ATTN"] = "0"
 # 禁用 torch.compile,避免每个任务 fork 几十个 inductor worker
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
+# 解决 PyTorch 显存碎片化问题(避免 reserved unallocated 占用大量显存)
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 # 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
 # 沐曦 MPS 模式下 CUDA_VISIBLE_DEVICES 可能干扰设备映射,
 # 只设 METAX_VISIBLE_DEVICES,device_map 里用物理 GPU 号手动指定。

+ 1 - 1
backend/main.py

@@ -39,7 +39,7 @@ async def lifespan(app: FastAPI):
 
 def create_app() -> FastAPI:
     app = FastAPI(
-        title="PEFT Fine-Tuning Platform",
+        title="四川路桥模型微调平台",
         version="0.1.0",
         lifespan=lifespan,
     )

+ 1 - 1
backend/pyproject.toml

@@ -1,7 +1,7 @@
 [project]
 name = "peft-platform-backend"
 version = "0.1.0"
-description = "PEFT Fine-Tuning Platform Backend"
+description = "四川路桥模型微调平台"
 requires-python = ">=3.10"
 dependencies = [
     "fastapi>=0.115.0",

+ 1 - 1
deploy.sh

@@ -14,7 +14,7 @@ echo "=== Step 1: Git pull ==="
 git pull
 
 echo "=== Step 2: Build backend ==="
-docker compose up -d --build backend
+docker compose up -d --build
 
 echo "=== Step 3: Sync backend to 253 ==="
 sshpass -p "${REMOTE_PASS}" rsync -avz --delete \

+ 1 - 1
frontend/.env

@@ -1,4 +1,4 @@
 VITE_API_BASE_URL=/api/v1
 VITE_WS_BASE_URL=/ws
-VITE_APP_TITLE=PEFT Fine-Tuning Platform
+VITE_APP_TITLE=四川路桥模型微调平台
 VITE_MAX_UPLOAD_SIZE_MB=500

+ 1 - 1
frontend/.env.example

@@ -1,4 +1,4 @@
 VITE_API_BASE_URL=/api/v1
 VITE_WS_BASE_URL=/ws
-VITE_APP_TITLE=PEFT Fine-Tuning Platform
+VITE_APP_TITLE=四川路桥模型微调平台
 VITE_MAX_UPLOAD_SIZE_MB=500

+ 1 - 1
frontend/index.html

@@ -3,7 +3,7 @@
   <head>
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-    <title>PEFT Fine-Tuning Platform</title>
+    <title>四川路桥模型微调平台</title>
   </head>
   <body>
     <div id="root"></div>

+ 1 - 1
frontend/src/components/layout/Layout.tsx

@@ -31,7 +31,7 @@ export function Layout({ children }: { children: React.ReactNode }) {
         {/* Logo / Brand */}
         <div style={{ padding: '24px 20px', borderBottom: '1px solid rgba(0,0,0,0.06)' }}>
           <h2 style={{ fontSize: 18, fontWeight: 800, margin: 0, letterSpacing: '-0.5px', color: '#134e4a' }}>
-            PEFT Platform
+            四川路桥模型微调平台
           </h2>
           <p style={{ fontSize: 11, color: '#94a3b8', margin: '4px 0 0' }}>
             Fine-Tuning Studio

+ 2 - 2
frontend/src/pages/AuthLogin.tsx

@@ -62,10 +62,10 @@ export function Login() {
             <Sparkles size={24} color="#fff" />
           </div>
           <h1 style={{ fontSize: 22, fontWeight: 800, margin: '0 0 6px', color: '#134e4a', letterSpacing: '-0.5px' }}>
-            PEFT Fine-Tuning Platform
+            四川路桥模型微调平台
           </h1>
           <p style={{ color: '#94a3b8', fontSize: 13, margin: 0 }}>
-            高效、易用的参数高效微调平台
+            高效、易用的模型微调平台
           </p>
         </div>
 

+ 1 - 1
frontend/src/pages/Dashboard.tsx

@@ -73,7 +73,7 @@ export function Dashboard() {
           仪表盘
         </h1>
         <p style={{ color: '#64748b', fontSize: 14, margin: '6px 0 0' }}>
-          PEFT Fine-Tuning Platform v0.1.0 — 欢迎回来
+          四川路桥模型微调平台 v0.1.0 — 欢迎回来
         </p>
       </div>
 

Різницю між файлами не показано, бо вона завелика
+ 158 - 16
result.txt


Деякі файли не було показано, через те що забагато файлів було змінено