Przeglądaj źródła

不同步data目录

lxylxy123321 3 dni temu
rodzic
commit
f4030231ad
2 zmienionych plików z 105 dodań i 18 usunięć
  1. 2 0
      backend/entrypoint.sh
  2. 103 18
      result.txt

+ 2 - 0
backend/entrypoint.sh

@@ -11,10 +11,12 @@ if [ -n "$REMOTE_HOST" ]; then
   # 使用 --no-o --no-g 避免 chown 失败,去掉 --delete 避免无权删除远端文件
   if [ -n "$REMOTE_PASS" ]; then
     sshpass -p "$REMOTE_PASS" rsync -avz --no-o --no-g \
+      --exclude 'data' \
       -e "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5" \
       /app/ ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/
   else
     rsync -avz --no-o --no-g \
+      --exclude 'data' \
       -e "ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5" \
       /app/ ${REMOTE_USER}@${REMOTE_HOST}:${REMOTE_DIR}/
   fi

+ 103 - 18
result.txt

@@ -1,18 +1,103 @@
-(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/pip install bitsandbytes
-Looking in indexes: http://mirrors.aliyun.com/pypi/simple
-Collecting bitsandbytes
-  Using cached http://mirrors.aliyun.com/pypi/packages/19/57/3443d6f183436fbdaf5000aac332c4d5ddb056665d459244a5608e98ae92/bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
-Requirement already satisfied: torch<3,>=2.3 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (2.8.0+metax3.5.3.9)
-Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (1.26.4)
-Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (26.2)
-Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (3.29.0)
-Requirement already satisfied: typing-extensions>=4.10.0 in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (4.15.0)
-Requirement already satisfied: sympy>=1.13.3 in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (1.14.0)
-Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (3.4.2)
-Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (3.1.6)
-Requirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (2025.5.1)
-Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy>=1.13.3->torch<3,>=2.3->bitsandbytes) (1.3.0)
-Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch<3,>=2.3->bitsandbytes) (3.0.2)
-Installing collected packages: bitsandbytes
-Successfully installed bitsandbytes-0.49.2
-WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
+lq@lq:~/Fine-tuning$ sudo docker logs -f finetune-backend
+=> Syncing backend code to compute node 192.168.91.253 ...
+Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
+sending incremental file list
+app/engines/
+app/engines/__pycache__/
+
+sent 4,159 bytes  received 34 bytes  226.65 bytes/sec
+total size is 410,566  speedup is 97.92
+=> Sync done.
+INFO:     Started server process [1]
+INFO:     Waiting for application startup.
+2026-05-25 02:43:35 | INFO     | peft-platform | JobQueue started with 2 workers
+INFO:     Application startup complete.
+INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+INFO:     172.20.0.4:55314 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:52682 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:55330 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42010 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42740 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:33348 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:34080 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:33364 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45028 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:45030 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47366 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:47382 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:53934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:34352 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:53940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:57976 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:42202 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:54038 - "GET /api/v1/training/jobs HTTP/1.0" 401 Unauthorized
+INFO:     172.20.0.4:54052 - "POST /api/v1/auth/refresh HTTP/1.0" 200 OK
+INFO:     172.20.0.4:54060 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:41064 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:40778 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40782 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49102 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49120 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49108 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49136 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49148 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49166 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49158 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     127.0.0.1:50558 - "GET /health HTTP/1.1" 200 OK
+2026-05-25 02:46:13 | INFO     | peft-platform | Job c3938524-d1e2-4bd3-b73e-07d98e2020f4 enqueued
+2026-05-25 02:46:13 | INFO     | peft-platform | Training job created: c3938524-d1e2-4bd3-b73e-07d98e2020f4
+INFO:     172.20.0.4:49174 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 02:46:13 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
+INFO:     172.20.0.4:49190 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49208 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:49194 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42438 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35346 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35360 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 02:46:32 | INFO     | peft-platform | Remote cleanup result: true
+cleaned 15 processes
+2026-05-25 02:47:25 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-25 02:47:25 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 02:47:42 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-25 02:48:18 | INFO     | peft-platform | Remote training launched in container: job=c3938524-d1e2-4bd3-b73e-07d98e2020f4, container_pid=175470
+INFO:     127.0.0.1:44134 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:43880 - "GET /health HTTP/1.1" 200 OK
+INFO:     127.0.0.1:58442 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:59902 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:41198 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:41214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:40696 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:56650 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:56642 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39566 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39568 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39554 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:39584 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35208 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:35214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:43170 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:43186 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:59424 - "GET /health HTTP/1.1" 200 OK
+INFO:     172.20.0.4:37098 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:37106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:60088 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:36258 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:36270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     127.0.0.1:38910 - "GET /health HTTP/1.1" 200 OK
+2026-05-25 02:49:11 | ERROR    | peft-platform | Remote job c3938524-d1e2-4bd3-b73e-07d98e2020f4 failed: AdaLoRA does not work when `total_step` is None, supply a value > 0.
+INFO:     172.20.0.4:45988 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:46000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-25 02:49:21 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 175470 2>/dev/null; pkill -9 -P 175470 2>/dev/null'
+2026-05-25 02:49:21 | INFO     | peft-platform | Killed remote process 175470 via docker exec
+2026-05-25 02:49:21 | INFO     | peft-platform | Remote training launched for job c3938524-d1e2-4bd3-b73e-07d98e2020f4
+INFO:     172.20.0.4:42648 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+INFO:     172.20.0.4:42656 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+INFO:     127.0.0.1:37380 - "GET /health HTTP/1.1" 200 OK