|
@@ -1,18 +1,103 @@
|
|
|
-(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/pip install bitsandbytes
|
|
|
|
|
-Looking in indexes: http://mirrors.aliyun.com/pypi/simple
|
|
|
|
|
-Collecting bitsandbytes
|
|
|
|
|
- Using cached http://mirrors.aliyun.com/pypi/packages/19/57/3443d6f183436fbdaf5000aac332c4d5ddb056665d459244a5608e98ae92/bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl (60.7 MB)
|
|
|
|
|
-Requirement already satisfied: torch<3,>=2.3 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (2.8.0+metax3.5.3.9)
|
|
|
|
|
-Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (1.26.4)
|
|
|
|
|
-Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (26.2)
|
|
|
|
|
-Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (3.29.0)
|
|
|
|
|
-Requirement already satisfied: typing-extensions>=4.10.0 in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (4.15.0)
|
|
|
|
|
-Requirement already satisfied: sympy>=1.13.3 in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (1.14.0)
|
|
|
|
|
-Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (3.4.2)
|
|
|
|
|
-Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (3.1.6)
|
|
|
|
|
-Requirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from torch<3,>=2.3->bitsandbytes) (2025.5.1)
|
|
|
|
|
-Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy>=1.13.3->torch<3,>=2.3->bitsandbytes) (1.3.0)
|
|
|
|
|
-Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch<3,>=2.3->bitsandbytes) (3.0.2)
|
|
|
|
|
-Installing collected packages: bitsandbytes
|
|
|
|
|
-Successfully installed bitsandbytes-0.49.2
|
|
|
|
|
-WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
|
|
|
|
|
|
|
+lq@lq:~/Fine-tuning$ sudo docker logs -f finetune-backend
|
|
|
|
|
+=> Syncing backend code to compute node 192.168.91.253 ...
|
|
|
|
|
+Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
|
|
|
+sending incremental file list
|
|
|
|
|
+app/engines/
|
|
|
|
|
+app/engines/__pycache__/
|
|
|
|
|
+
|
|
|
|
|
+sent 4,159 bytes received 34 bytes 226.65 bytes/sec
|
|
|
|
|
+total size is 410,566 speedup is 97.92
|
|
|
|
|
+=> Sync done.
|
|
|
|
|
+INFO: Started server process [1]
|
|
|
|
|
+INFO: Waiting for application startup.
|
|
|
|
|
+2026-05-25 02:43:35 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
|
|
+INFO: Application startup complete.
|
|
|
|
|
+INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
|
|
|
+INFO: 172.20.0.4:55314 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:52682 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:55330 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42010 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42740 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:33348 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:34080 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:33364 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:45028 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:45030 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:47366 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:47382 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:53934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:34352 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:53940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:57974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:57976 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:42202 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:54038 - "GET /api/v1/training/jobs HTTP/1.0" 401 Unauthorized
|
|
|
|
|
+INFO: 172.20.0.4:54052 - "POST /api/v1/auth/refresh HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:54060 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:41064 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:40778 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:40782 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:40794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49102 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49120 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49108 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49136 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49148 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49166 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49158 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:50558 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-25 02:46:13 | INFO | peft-platform | Job c3938524-d1e2-4bd3-b73e-07d98e2020f4 enqueued
|
|
|
|
|
+2026-05-25 02:46:13 | INFO | peft-platform | Training job created: c3938524-d1e2-4bd3-b73e-07d98e2020f4
|
|
|
|
|
+INFO: 172.20.0.4:49174 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 02:46:13 | INFO | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
|
|
|
|
|
+INFO: 172.20.0.4:49190 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49208 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49194 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42438 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:35346 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:35360 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 02:46:32 | INFO | peft-platform | Remote cleanup result: true
|
|
|
|
|
+cleaned 15 processes
|
|
|
|
|
+2026-05-25 02:47:25 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
+2026-05-25 02:47:25 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-25 02:47:42 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-25 02:48:18 | INFO | peft-platform | Remote training launched in container: job=c3938524-d1e2-4bd3-b73e-07d98e2020f4, container_pid=175470
|
|
|
|
|
+INFO: 127.0.0.1:44134 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:43880 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:58442 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:59902 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:41198 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:41214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:40696 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:56650 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:56642 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:39566 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:39568 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:39554 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:39572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:39582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:39584 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:35208 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:35214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43170 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43186 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:59424 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:37098 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:37106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:60072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:60088 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:36258 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:36270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:38910 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-25 02:49:11 | ERROR | peft-platform | Remote job c3938524-d1e2-4bd3-b73e-07d98e2020f4 failed: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
|
|
+INFO: 172.20.0.4:45988 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:46000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 02:49:21 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 175470 2>/dev/null; pkill -9 -P 175470 2>/dev/null'
|
|
|
|
|
+2026-05-25 02:49:21 | INFO | peft-platform | Killed remote process 175470 via docker exec
|
|
|
|
|
+2026-05-25 02:49:21 | INFO | peft-platform | Remote training launched for job c3938524-d1e2-4bd3-b73e-07d98e2020f4
|
|
|
|
|
+INFO: 172.20.0.4:42648 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42656 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:37380 - "GET /health HTTP/1.1" 200 OK
|