|
@@ -1,31 +1,65 @@
|
|
|
-(base) [root@localhost ~]# docker exec finetune-trainer id
|
|
|
|
|
-uid=0(root) gid=0(root) groups=0(root)
|
|
|
|
|
-(base) [root@localhost ~]# docker exec finetune-trainer ls -la /root/Fine-tuning/backend/data/processed/
|
|
|
|
|
-total 160
|
|
|
|
|
-drwxrwxrwx. 1 root root 4096 May 21 10:14 .
|
|
|
|
|
-drwxrwxrwx. 1 1000 1000 4096 May 21 10:10 ..
|
|
|
|
|
-drwxrwxrwx. 1 root root 4096 May 20 00:50 ._____temp
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 21 09:47 0058e329-ea10-442c-b73f-7f4c64965478_processed.jsonl
|
|
|
|
|
--rw-r--r--. 1 root root 8287 May 21 10:03 20cbc0c5-c946-4166-9e21-a5a96e49a0da_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 21 09:27 3296b92a-8bd0-4c26-8c9d-d17778abfa14_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 11:31 35949b14-2872-47b4-9fdc-b87bda7279ef_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 21 09:23 7aa39fbf-b396-422d-82c9-73044ee4397e_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 11:48 7bcbc0bb-72c7-408f-a4c6-c38fb05b8382_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 13:53 92a0a9cd-46aa-48bc-b7ad-bd5a18270c51_processed.jsonl
|
|
|
|
|
--rw-r--r--. 1 root root 8287 May 21 10:10 9b7a9050-d1ec-405b-a8f0-6df562437794_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 13:12 a52d395e-d3c8-40d2-9be3-1839f597dc7f_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 14:27 aa342346-a39e-4644-9a34-f3a9d3b961f8_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 12:03 cce886de-4dd5-460a-b0ac-2404731cd9f8_processed.jsonl
|
|
|
|
|
--rwxrwxrwx. 1 root root 8287 May 20 11:38 d0412da9-b6d0-4ecf-8ae9-35600353bf3e_processed.jsonl
|
|
|
|
|
-drwxrwxrwx. 1 root root 4096 May 20 11:31 ms_yanalong_yanalong
|
|
|
|
|
--rw-r--r--. 1 1000 1000 0 May 21 10:14 test_write
|
|
|
|
|
-(base) [root@localhost ~]# docker exec finetune-trainer touch /root/Fine-tuning/backend/data/processed/test_container && echo "容器内成功" || echo "容器内失败"
|
|
|
|
|
-容器内成功
|
|
|
|
|
-(base) [root@localhost ~]# docker inspect finetune-trainer | grep -A5 '"Mounts"'
|
|
|
|
|
- "Mounts": [
|
|
|
|
|
- {
|
|
|
|
|
- "Type": "bind",
|
|
|
|
|
- "Source": "/root/Fine-tuning/backend",
|
|
|
|
|
- "Destination": "/root/Fine-tuning/backend",
|
|
|
|
|
- "Mode": "",
|
|
|
|
|
-(base) [root@localhost ~]#
|
|
|
|
|
|
|
+lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
|
|
|
|
|
+[sudo] password for lq:
|
|
|
|
|
+2026-05-21T02:34:43.351649510Z => Syncing backend code to compute node 192.168.91.253 ...
|
|
|
|
|
+2026-05-21T02:34:43.398837861Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
|
|
|
+2026-05-21T02:35:01.023370182Z sending incremental file list
|
|
|
|
|
+2026-05-21T02:35:01.050523499Z app/engines/
|
|
|
|
|
+2026-05-21T02:35:01.050592451Z app/preprocessors/
|
|
|
|
|
+2026-05-21T02:35:01.091659254Z
|
|
|
|
|
+2026-05-21T02:35:01.091745788Z sent 2,328 bytes received 31 bytes 127.51 bytes/sec
|
|
|
|
|
+2026-05-21T02:35:01.091758710Z total size is 203,735 speedup is 86.36
|
|
|
|
|
+2026-05-21T02:35:01.093507150Z => Sync done.
|
|
|
|
|
+2026-05-21T02:35:02.344222594Z INFO: Started server process [1]
|
|
|
|
|
+2026-05-21T02:35:02.344297685Z INFO: Waiting for application startup.
|
|
|
|
|
+2026-05-21T02:35:02.434311439Z 2026-05-21 02:35:02 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
|
|
+2026-05-21T02:35:02.434367300Z INFO: Application startup complete.
|
|
|
|
|
+2026-05-21T02:35:02.435502488Z INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
|
|
|
+2026-05-21T02:35:04.147983780Z INFO: 127.0.0.1:51418 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:35:08.814099882Z INFO: 172.20.0.4:40850 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:08.839124444Z INFO: 172.20.0.4:40860 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:08.923924366Z INFO: 172.20.0.4:40872 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:10.473798949Z INFO: 172.20.0.4:40876 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:10.575801902Z INFO: 172.20.0.4:40892 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:10.589381990Z INFO: 172.20.0.4:40902 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:13.164806549Z 2026-05-21 02:35:13 | INFO | peft-platform | Job 7fa42ee0-c310-4aaf-83eb-634790f9904d enqueued
|
|
|
|
|
+2026-05-21T02:35:13.164893550Z 2026-05-21 02:35:13 | INFO | peft-platform | Training job created: 7fa42ee0-c310-4aaf-83eb-634790f9904d
|
|
|
|
|
+2026-05-21T02:35:13.165016465Z INFO: 172.20.0.4:40910 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:35:13.209751718Z 2026-05-21 02:35:13 | INFO | peft-platform | Preprocessed 60 samples for sft/alpaca
|
|
|
|
|
+2026-05-21T02:36:06.256137111Z 2026-05-21 02:36:06 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
+2026-05-21T02:36:06.256252195Z 2026-05-21 02:36:06 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-21T02:36:23.951981838Z 2026-05-21 02:36:23 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-21T02:36:41.679737054Z 2026-05-21 02:36:41 | INFO | peft-platform | Remote training launched in container: job=7fa42ee0-c310-4aaf-83eb-634790f9904d, container_pid=37
|
|
|
|
|
+2026-05-21T02:36:41.690100936Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/7fa42ee0-c310-4aaf-83eb-634790f9904d_processed.jsonl
|
|
|
|
|
+2026-05-21T02:36:41.690253415Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
|
|
|
|
|
+2026-05-21T02:36:41.690265015Z [DEBUG] parent mode=0o40777
|
|
|
|
|
+2026-05-21T02:36:41.690272383Z [DEBUG] uid=0, gid=0
|
|
|
|
|
+2026-05-21T02:36:41.690279578Z INFO: 127.0.0.1:59794 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.751113159Z INFO: 127.0.0.1:34422 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.757340251Z INFO: 172.20.0.4:40928 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.853292345Z INFO: 172.20.0.4:40924 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.853552504Z INFO: 172.20.0.4:40940 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.912087522Z INFO: 172.20.0.4:54602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.913050949Z INFO: 172.20.0.4:54618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:41.918341110Z INFO: 172.20.0.4:44944 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.063986222Z INFO: 172.20.0.4:49000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.223399081Z INFO: 172.20.0.4:49016 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.233469047Z INFO: 172.20.0.4:49026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.331951157Z INFO: 172.20.0.4:49038 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.332695096Z INFO: 172.20.0.4:49042 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.389840552Z INFO: 172.20.0.4:49050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.390568905Z INFO: 172.20.0.4:49056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.391465567Z INFO: 172.20.0.4:49068 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.392360966Z INFO: 172.20.0.4:49072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:42.492577438Z INFO: 172.20.0.4:49086 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:36:54.529704158Z INFO: 127.0.0.1:57198 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:37:17.977243955Z INFO: 172.20.0.4:43980 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:37:20.444805620Z INFO: 172.20.0.4:43994 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:37:24.715196859Z INFO: 127.0.0.1:34044 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:37:26.193334161Z INFO: 172.20.0.4:55586 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:37:31.187154610Z INFO: 172.20.0.4:55588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:37:34.946677567Z 2026-05-21 02:37:34 | ERROR | peft-platform | Remote job 7fa42ee0-c310-4aaf-83eb-634790f9904d failed: local variable 'Path' referenced before assignment
|
|
|
|
|
+2026-05-21T02:37:34.968928307Z 2026-05-21 02:37:34 | INFO | peft-platform | Remote training launched for job 7fa42ee0-c310-4aaf-83eb-634790f9904d
|
|
|
|
|
+2026-05-21T02:37:36.185236838Z INFO: 172.20.0.4:40238 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:37:41.172814618Z INFO: 172.20.0.4:40254 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:37:54.917414999Z INFO: 127.0.0.1:43476 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:38:25.110170590Z INFO: 127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK
|