Parcourir la source

修改远程传输

lxylxy123321 il y a 1 semaine
Parent
commit
570357acf6
2 fichiers modifiés avec 28 ajouts et 33 suppressions
  1. 14 5
      backend/app/core/remote_executor.py
  2. 14 28
      result.txt

+ 14 - 5
backend/app/core/remote_executor.py

@@ -116,28 +116,37 @@ def run_training_remote(
 
 
     # SCP 到远端宿主机(使用 data_dir,这个目录已通过 bind mount 共享给容器)
     # SCP 到远端宿主机(使用 data_dir,这个目录已通过 bind mount 共享给容器)
     remote_config_path = f"{settings.compute_node_remote_data_dir}/config_{job_id}.json"
     remote_config_path = f"{settings.compute_node_remote_data_dir}/config_{job_id}.json"
-    ret_code, _, _ = scp_to_remote(config_tmp, f"{remote_config_path}")
+    ret_code, stdout, stderr = scp_to_remote(config_tmp, f"{remote_config_path}")
     os.unlink(config_tmp)  # 删除本地临时文件
     os.unlink(config_tmp)  # 删除本地临时文件
 
 
     if ret_code != 0:
     if ret_code != 0:
-        logger.error(f"SCP config file failed: ret_code={ret_code}")
+        logger.error(f"SCP config file failed: ret_code={ret_code}, stderr={stderr}")
         return None
         return None
 
 
     # 把数据集也传到远程(SCP 到远端 data_dir 下的 datasets/ 目录)
     # 把数据集也传到远程(SCP 到远端 data_dir 下的 datasets/ 目录)
     remote_dataset_name = os.path.basename(dataset_path)
     remote_dataset_name = os.path.basename(dataset_path)
     remote_dataset_path = f"{settings.compute_node_remote_data_dir}/datasets/{remote_dataset_name}"
     remote_dataset_path = f"{settings.compute_node_remote_data_dir}/datasets/{remote_dataset_name}"
 
 
+    # 确保远程父目录存在
+    remote_dataset_dir = os.path.dirname(remote_dataset_path)
+    _, _, mkdir_stderr = ssh_exec(f"mkdir -p {remote_dataset_dir}")
+    logger.info(f"Created remote dataset directory: {remote_dataset_dir}")
+
     if os.path.isdir(dataset_path):
     if os.path.isdir(dataset_path):
         # 目录:用 scp -r
         # 目录:用 scp -r
-        ret_code, _, _ = scp_to_remote_dir(dataset_path, remote_dataset_path)
+        logger.info(f"Uploading dataset directory: {dataset_path} -> {remote_dataset_path}")
+        ret_code, _, stderr = scp_to_remote_dir(dataset_path, remote_dataset_path)
     else:
     else:
         # 文件:普通 scp
         # 文件:普通 scp
-        ret_code, _, _ = scp_to_remote(dataset_path, remote_dataset_path)
+        logger.info(f"Uploading dataset file: {dataset_path} -> {remote_dataset_path}")
+        ret_code, _, stderr = scp_to_remote(dataset_path, remote_dataset_path)
 
 
     if ret_code != 0:
     if ret_code != 0:
-        logger.error(f"SCP dataset failed: ret_code={ret_code}")
+        logger.error(f"SCP dataset failed: ret_code={ret_code}, stderr={stderr}")
         return None
         return None
 
 
+    logger.info(f"Dataset uploaded successfully: {remote_dataset_path}")
+
     # 在容器内启动训练
     # 在容器内启动训练
     remote_cmd = (
     remote_cmd = (
         f"docker exec -w {settings.compute_node_workdir} "
         f"docker exec -w {settings.compute_node_workdir} "

+ 14 - 28
result.txt

@@ -1,28 +1,14 @@
-(base) [root@localhost ~]# docker exec -w /root/Fine-tuning/backend finetune-trainer /opt/conda/bin/python -m app.engines.remote_train "test-manual-001" "Qwen/Qwen3.5-0.8B" "text" "/root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/distill_r1_sft.json" "/root/Fine-tuning/backend/data/config_aa342346-a39e-4644-9a34-f3a9d3b961f8.json"
-2026-05-20 14:28:57 | ERROR    | peft-platform | Remote training failed: test-manual-001 - Extra data: line 2 column 1 (char 71)
-Traceback (most recent call last):
-  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
-    return _run_code(code, main_globals, None,
-  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
-    exec(code, run_globals)
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in <module>
-    main()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 158, in main
-    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
-  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
-    return loop.run_until_complete(main)
-  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
-    return future.result()
-  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 108, in run_training
-    await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
-  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 119, in preprocess_dataset
-    processed = preprocess_file(dataset_path, output_path, task_type, template)
-  File "/root/Fine-tuning/backend/app/preprocessors/__init__.py", line 130, in preprocess_file
-    data = json.load(f)
-  File "/opt/conda/lib/python3.10/json/__init__.py", line 293, in load
-    return loads(fp.read(),
-  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
-    return _default_decoder.decode(s)
-  File "/opt/conda/lib/python3.10/json/decoder.py", line 340, in decode
-    raise JSONDecodeError("Extra data", s, end)
-json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 71)
+lq@lq:~/Fine-tuning$ sudo docker exec finetune-postgres psql -U finetune -d finetuning -c "SELECT id, name, file_path FROM datasets;"
+[sudo] password for lq: 
+                  id                  |       name        |                                file_path                                 
+--------------------------------------+-------------------+--------------------------------------------------------------------------
+ 3d5f8808-e71a-449d-94e9-c61c4881b2cf | yanalong/yanalong | /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl
+(1 row)
+
+lq@lq:~/Fine-tuning$ docker exec finetune-backend ls -la /root/Fine-tuning/backend/data/uploads/
+permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Get "http://%2Fvar%2Frun%2Fdocker.sock/v1.49/containers/finetune-backend/json": dial unix /var/run/docker.sock: connect: permission denied
+lq@lq:~/Fine-tuning$ sudo docker exec finetune-backend ls -la /root/Fine-tuning/backend/data/uploads/
+total 8
+drwxr-xr-x 2 root root 4096 May 20 06:35 .
+drwxrwxr-x 6 1000 1000 4096 May 21 01:27 ..
+lq@lq:~/Fine-tuning$