|
@@ -1,65 +1,62 @@
|
|
|
lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
|
|
lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
|
|
|
-[sudo] password for lq:
|
|
|
|
|
-2026-05-21T02:34:43.351649510Z => Syncing backend code to compute node 192.168.91.253 ...
|
|
|
|
|
-2026-05-21T02:34:43.398837861Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
|
|
|
-2026-05-21T02:35:01.023370182Z sending incremental file list
|
|
|
|
|
-2026-05-21T02:35:01.050523499Z app/engines/
|
|
|
|
|
-2026-05-21T02:35:01.050592451Z app/preprocessors/
|
|
|
|
|
-2026-05-21T02:35:01.091659254Z
|
|
|
|
|
-2026-05-21T02:35:01.091745788Z sent 2,328 bytes received 31 bytes 127.51 bytes/sec
|
|
|
|
|
-2026-05-21T02:35:01.091758710Z total size is 203,735 speedup is 86.36
|
|
|
|
|
-2026-05-21T02:35:01.093507150Z => Sync done.
|
|
|
|
|
-2026-05-21T02:35:02.344222594Z INFO: Started server process [1]
|
|
|
|
|
-2026-05-21T02:35:02.344297685Z INFO: Waiting for application startup.
|
|
|
|
|
-2026-05-21T02:35:02.434311439Z 2026-05-21 02:35:02 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
|
|
-2026-05-21T02:35:02.434367300Z INFO: Application startup complete.
|
|
|
|
|
-2026-05-21T02:35:02.435502488Z INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
|
|
|
-2026-05-21T02:35:04.147983780Z INFO: 127.0.0.1:51418 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-21T02:35:08.814099882Z INFO: 172.20.0.4:40850 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:08.839124444Z INFO: 172.20.0.4:40860 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:08.923924366Z INFO: 172.20.0.4:40872 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:10.473798949Z INFO: 172.20.0.4:40876 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:10.575801902Z INFO: 172.20.0.4:40892 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:10.589381990Z INFO: 172.20.0.4:40902 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:13.164806549Z 2026-05-21 02:35:13 | INFO | peft-platform | Job 7fa42ee0-c310-4aaf-83eb-634790f9904d enqueued
|
|
|
|
|
-2026-05-21T02:35:13.164893550Z 2026-05-21 02:35:13 | INFO | peft-platform | Training job created: 7fa42ee0-c310-4aaf-83eb-634790f9904d
|
|
|
|
|
-2026-05-21T02:35:13.165016465Z INFO: 172.20.0.4:40910 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:35:13.209751718Z 2026-05-21 02:35:13 | INFO | peft-platform | Preprocessed 60 samples for sft/alpaca
|
|
|
|
|
-2026-05-21T02:36:06.256137111Z 2026-05-21 02:36:06 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
-2026-05-21T02:36:06.256252195Z 2026-05-21 02:36:06 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
-2026-05-21T02:36:23.951981838Z 2026-05-21 02:36:23 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
-2026-05-21T02:36:41.679737054Z 2026-05-21 02:36:41 | INFO | peft-platform | Remote training launched in container: job=7fa42ee0-c310-4aaf-83eb-634790f9904d, container_pid=37
|
|
|
|
|
-2026-05-21T02:36:41.690100936Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/7fa42ee0-c310-4aaf-83eb-634790f9904d_processed.jsonl
|
|
|
|
|
-2026-05-21T02:36:41.690253415Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
|
|
|
|
|
-2026-05-21T02:36:41.690265015Z [DEBUG] parent mode=0o40777
|
|
|
|
|
-2026-05-21T02:36:41.690272383Z [DEBUG] uid=0, gid=0
|
|
|
|
|
-2026-05-21T02:36:41.690279578Z INFO: 127.0.0.1:59794 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.751113159Z INFO: 127.0.0.1:34422 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.757340251Z INFO: 172.20.0.4:40928 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.853292345Z INFO: 172.20.0.4:40924 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.853552504Z INFO: 172.20.0.4:40940 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.912087522Z INFO: 172.20.0.4:54602 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.913050949Z INFO: 172.20.0.4:54618 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:41.918341110Z INFO: 172.20.0.4:44944 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.063986222Z INFO: 172.20.0.4:49000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.223399081Z INFO: 172.20.0.4:49016 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.233469047Z INFO: 172.20.0.4:49026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.331951157Z INFO: 172.20.0.4:49038 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.332695096Z INFO: 172.20.0.4:49042 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.389840552Z INFO: 172.20.0.4:49050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.390568905Z INFO: 172.20.0.4:49056 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.391465567Z INFO: 172.20.0.4:49068 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.392360966Z INFO: 172.20.0.4:49072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:42.492577438Z INFO: 172.20.0.4:49086 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:36:54.529704158Z INFO: 127.0.0.1:57198 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-21T02:37:17.977243955Z INFO: 172.20.0.4:43980 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:37:20.444805620Z INFO: 172.20.0.4:43994 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:37:24.715196859Z INFO: 127.0.0.1:34044 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-21T02:37:26.193334161Z INFO: 172.20.0.4:55586 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:37:31.187154610Z INFO: 172.20.0.4:55588 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:37:34.946677567Z 2026-05-21 02:37:34 | ERROR | peft-platform | Remote job 7fa42ee0-c310-4aaf-83eb-634790f9904d failed: local variable 'Path' referenced before assignment
|
|
|
|
|
-2026-05-21T02:37:34.968928307Z 2026-05-21 02:37:34 | INFO | peft-platform | Remote training launched for job 7fa42ee0-c310-4aaf-83eb-634790f9904d
|
|
|
|
|
-2026-05-21T02:37:36.185236838Z INFO: 172.20.0.4:40238 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:37:41.172814618Z INFO: 172.20.0.4:40254 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-21T02:37:54.917414999Z INFO: 127.0.0.1:43476 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-21T02:38:25.110170590Z INFO: 127.0.0.1:56426 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
|
|
+2026-05-21T02:40:08.673136969Z => Syncing backend code to compute node 192.168.91.253 ...
|
|
|
|
|
+2026-05-21T02:40:08.717899573Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
|
|
|
+2026-05-21T02:40:26.357052143Z sending incremental file list
|
|
|
|
|
+2026-05-21T02:40:26.381542018Z app/engines/
|
|
|
|
|
+2026-05-21T02:40:26.381590199Z app/engines/__pycache__/
|
|
|
|
|
+2026-05-21T02:40:26.422772225Z
|
|
|
|
|
+2026-05-21T02:40:26.422838503Z sent 2,327 bytes received 31 bytes 127.46 bytes/sec
|
|
|
|
|
+2026-05-21T02:40:26.422848995Z total size is 204,130 speedup is 86.57
|
|
|
|
|
+2026-05-21T02:40:26.424904186Z => Sync done.
|
|
|
|
|
+2026-05-21T02:40:27.669950491Z INFO: Started server process [1]
|
|
|
|
|
+2026-05-21T02:40:27.670035430Z INFO: Waiting for application startup.
|
|
|
|
|
+2026-05-21T02:40:27.770134907Z 2026-05-21 02:40:27 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
|
|
+2026-05-21T02:40:27.770213838Z INFO: Application startup complete.
|
|
|
|
|
+2026-05-21T02:40:27.770578225Z INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
|
|
|
+2026-05-21T02:40:29.509509792Z INFO: 127.0.0.1:48930 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:40:32.217187935Z INFO: 172.20.0.4:50040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:32.224100080Z INFO: 172.20.0.4:50050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:32.230253988Z INFO: 172.20.0.4:50054 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:33.673475291Z INFO: 172.20.0.4:50058 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:33.683717171Z INFO: 172.20.0.4:50072 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:33.684756184Z INFO: 172.20.0.4:50078 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:35.724653433Z INFO: 172.20.0.4:35344 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:38.676563982Z INFO: 172.20.0.4:35356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:39.586231597Z 2026-05-21 02:40:39 | INFO | peft-platform | Job b6fa4a38-56e7-4d0c-b173-88b12899eb42 enqueued
|
|
|
|
|
+2026-05-21T02:40:39.586321192Z 2026-05-21 02:40:39 | INFO | peft-platform | Training job created: b6fa4a38-56e7-4d0c-b173-88b12899eb42
|
|
|
|
|
+2026-05-21T02:40:39.586331550Z INFO: 172.20.0.4:35366 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:40:39.625239455Z 2026-05-21 02:40:39 | INFO | peft-platform | Preprocessed 60 samples for sft/alpaca
|
|
|
|
|
+2026-05-21T02:41:32.509647929Z 2026-05-21 02:41:32 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
+2026-05-21T02:41:32.509820571Z 2026-05-21 02:41:32 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-21T02:41:50.177510125Z 2026-05-21 02:41:50 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-21T02:42:07.927323963Z 2026-05-21 02:42:07 | INFO | peft-platform | Remote training launched in container: job=b6fa4a38-56e7-4d0c-b173-88b12899eb42, container_pid=64
|
|
|
|
|
+2026-05-21T02:42:07.977298510Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/b6fa4a38-56e7-4d0c-b173-88b12899eb42_processed.jsonl
|
|
|
|
|
+2026-05-21T02:42:07.977375388Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
|
|
|
|
|
+2026-05-21T02:42:07.977386730Z [DEBUG] parent mode=0o40777
|
|
|
|
|
+2026-05-21T02:42:07.977395595Z [DEBUG] uid=0, gid=0
|
|
|
|
|
+2026-05-21T02:42:07.977404155Z INFO: 127.0.0.1:36332 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:42:07.985156303Z INFO: 127.0.0.1:38402 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.131460852Z INFO: 172.20.0.4:35378 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.133037399Z INFO: 172.20.0.4:35386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.133448205Z INFO: 172.20.0.4:35392 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.145805667Z INFO: 172.20.0.4:47482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.146808367Z INFO: 172.20.0.4:56662 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.152471235Z INFO: 172.20.0.4:56674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.317500767Z INFO: 172.20.0.4:59356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.318077808Z INFO: 172.20.0.4:59372 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.319005101Z INFO: 172.20.0.4:59386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.481764957Z INFO: 172.20.0.4:59388 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.482439440Z INFO: 172.20.0.4:59420 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.483310902Z INFO: 172.20.0.4:59404 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.626551262Z INFO: 172.20.0.4:59422 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.641395518Z INFO: 172.20.0.4:59424 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:08.649519187Z INFO: 172.20.0.4:59440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:09.044991986Z INFO: 172.20.0.4:59446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:19.939428924Z INFO: 127.0.0.1:52178 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:42:42.114448308Z INFO: 172.20.0.4:51834 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:42:50.137975669Z INFO: 127.0.0.1:33576 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-21T02:43:01.031805306Z 2026-05-21 02:43:01 | ERROR | peft-platform | Remote job b6fa4a38-56e7-4d0c-b173-88b12899eb42 failed: No module named 'sqlalchemy'
|
|
|
|
|
+2026-05-21T02:43:01.040583882Z 2026-05-21 02:43:01 | INFO | peft-platform | Remote training launched for job b6fa4a38-56e7-4d0c-b173-88b12899eb42
|
|
|
|
|
+2026-05-21T02:43:08.194343547Z INFO: 172.20.0.4:58674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:43:08.653925330Z INFO: 172.20.0.4:58688 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-21T02:43:20.361871810Z INFO: 127.0.0.1:50708 - "GET /health HTTP/1.1" 200 OK
|