Sfoglia il codice sorgente

修复文件传输问题

lxylxy123321 1 settimana fa
parent
commit
70ff992343
2 file modificati con 43 aggiunte e 45 eliminazioni
  1. 22 13
      backend/app/core/remote_executor.py
  2. 21 32
      result.txt

+ 22 - 13
backend/app/core/remote_executor.py

@@ -84,23 +84,32 @@ def run_training_remote(
 ) -> str | None:
     """在算力节点启动训练任务(通过 docker exec,后台执行)。
 
-    通过 docker exec -i 将配置传入容器内,避免宿主机/容器路径混淆
+    通过 SCP 把配置文件传到远端宿主机,再在容器内启动训练
     """
-    import base64
+    import tempfile
 
-    config_json = json.dumps(config, ensure_ascii=False)
-    config_b64 = base64.b64encode(config_json.encode()).decode()
-    config_file = f"/tmp/config_{job_id}.json"
+    # 在 151 宿主机创建临时配置文件
+    config_tmp = tempfile.mktemp(suffix=".json", prefix=f"config_{job_id}_")
+    with open(config_tmp, "w", encoding="utf-8") as f:
+        json.dump(config, f, ensure_ascii=False)
 
-    # 通过 docker exec -i 把配置传入容器内,在容器里写入临时文件并启动训练
+    # SCP 到远端宿主机(使用 data_dir,这个目录已通过 bind mount 共享给容器)
+    remote_config_path = f"{settings.compute_node_remote_data_dir}/config_{job_id}.json"
+    ret_code, _, _ = scp_to_remote(config_tmp, f"{remote_config_path}")
+    os.unlink(config_tmp)  # 删除本地临时文件
+
+    if ret_code != 0:
+        logger.error(f"SCP config file failed: ret_code={ret_code}")
+        return None
+
+    # 在容器内启动训练(不再依赖 stdin pipe)
     remote_cmd = (
-        f"echo '{config_b64}' | base64 -d | "
-        f"docker exec -i {settings.compute_node_docker_container} bash -c '"
-        f"cat > {config_file} && "
-        f"cd {settings.compute_node_workdir} && "
+        f"docker exec -w {settings.compute_node_workdir} "
+        f"{settings.compute_node_docker_container} "
+        f"bash -c '"
         f"nohup {settings.compute_node_python} -m app.engines.remote_train "
-        f"{job_id} {model_id} {model_type} {dataset_id} {config_file} "
-        f">/tmp/train_{job_id}.log 2>&1 & echo $!'"
+        f"{job_id} {model_id} {model_type} {dataset_id} {remote_config_path} "
+        f"</dev/null >/tmp/train_{job_id}.log 2>&1 & echo $!'"
     )
 
     code, stdout, stderr = ssh_exec(remote_cmd, timeout=30)
@@ -120,7 +129,7 @@ def is_process_running(pid: str) -> bool:
     通过 docker exec 进入容器检查 PID 是否存在。
     """
     cmd = f"docker exec {settings.compute_node_docker_container} bash -c 'kill -0 {pid} 2>/dev/null && echo running || echo stopped'"
-    code, stdout, stderr = ssh_exec(cmd, timeout=10)
+    code, stdout, stderr = ssh_exec(cmd, timeout=30)
     return code == 0 and "running" in stdout
 
 

+ 21 - 32
result.txt

@@ -1,32 +1,21 @@
-finetune-backend  | 2026-05-20 03:20:25 | ERROR    | peft-platform | Dataset download failed: Extra data: line 2 column 1 (char 71)
-finetune-backend  | INFO:     172.20.0.4:57044 - "POST /api/v1/datasets/download HTTP/1.0" 400 Bad Request
-finetune-backend  | INFO:     127.0.0.1:33414 - "GET /health HTTP/1.1" 200 OK
-finetune-backend  | INFO:     172.20.0.4:48556 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-finetune-backend  | 2026-05-20 03:22:27 | ERROR    | peft-platform | SSH command timeout after 5s: docker exec finetune-trainer rm -f /tmp/test_model_Qwen_Qwen3.5-0.8B.py
-finetune-backend  | 2026-05-20 03:22:27 | INFO     | peft-platform | Remote test result: code=0, stdout_len=1701, stderr_len=2440
-finetune-backend  | 2026-05-20 03:22:27 | INFO     | peft-platform | stdout (first 500): 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | 1,7,16,128,128,64,1,1,None
-finetune-backend  | {"generated_te
-finetune-backend  | 2026-05-20 03:22:27 | INFO     | peft-platform | stderr (first 500): Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-finetune-backend  | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-finetune-backend  | torch.compile is not available in Python 3.10, using identity decorator instead
-finetune-backend  | 
-finetune-backend  | Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]
-finetune-backend  | Loading weights:   0%|          | 1/320 [00:02<11:11,  2.11s
-finetune-backend  | INFO:     172.20.0.4:48564 - "POST /api/v1/models/test HTTP/1.0" 200 OK
+(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_7bcbc0bb-72c7-408f-a4c6-c38fb05b8382.log
+Traceback (most recent call last):
+  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 179, in <module>
+    main()
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 173, in main
+    config = json.load(f)
+  File "/opt/conda/lib/python3.10/json/__init__.py", line 293, in load
+    return loads(fp.read(),
+  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
+    return _default_decoder.decode(s)
+  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
+    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
+  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
+    raise JSONDecodeError("Expecting value", s, err.value) from None
+json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import sys; sys.path.insert(0, '/root/Fine-tuning/backend'); from app.engines.remote_train import run_training; print('ok')"
+ok