result.txt 6.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
  2. 2026-05-21T02:40:08.673136969Z => Syncing backend code to compute node 192.168.91.253 ...
  3. 2026-05-21T02:40:08.717899573Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
  4. 2026-05-21T02:40:26.357052143Z sending incremental file list
  5. 2026-05-21T02:40:26.381542018Z app/engines/
  6. 2026-05-21T02:40:26.381590199Z app/engines/__pycache__/
  7. 2026-05-21T02:40:26.422772225Z
  8. 2026-05-21T02:40:26.422838503Z sent 2,327 bytes received 31 bytes 127.46 bytes/sec
  9. 2026-05-21T02:40:26.422848995Z total size is 204,130 speedup is 86.57
  10. 2026-05-21T02:40:26.424904186Z => Sync done.
  11. 2026-05-21T02:40:27.669950491Z INFO: Started server process [1]
  12. 2026-05-21T02:40:27.670035430Z INFO: Waiting for application startup.
  13. 2026-05-21T02:40:27.770134907Z 2026-05-21 02:40:27 | INFO | peft-platform | JobQueue started with 2 workers
  14. 2026-05-21T02:40:27.770213838Z INFO: Application startup complete.
  15. 2026-05-21T02:40:27.770578225Z INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
  16. 2026-05-21T02:40:29.509509792Z INFO: 127.0.0.1:48930 - "GET /health HTTP/1.1" 200 OK
  17. 2026-05-21T02:40:32.217187935Z INFO: 172.20.0.4:50040 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  18. 2026-05-21T02:40:32.224100080Z INFO: 172.20.0.4:50050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  19. 2026-05-21T02:40:32.230253988Z INFO: 172.20.0.4:50054 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  20. 2026-05-21T02:40:33.673475291Z INFO: 172.20.0.4:50058 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  21. 2026-05-21T02:40:33.683717171Z INFO: 172.20.0.4:50072 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  22. 2026-05-21T02:40:33.684756184Z INFO: 172.20.0.4:50078 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  23. 2026-05-21T02:40:35.724653433Z INFO: 172.20.0.4:35344 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  24. 2026-05-21T02:40:38.676563982Z INFO: 172.20.0.4:35356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  25. 2026-05-21T02:40:39.586231597Z 2026-05-21 02:40:39 | INFO | peft-platform | Job b6fa4a38-56e7-4d0c-b173-88b12899eb42 enqueued
  26. 2026-05-21T02:40:39.586321192Z 2026-05-21 02:40:39 | INFO | peft-platform | Training job created: b6fa4a38-56e7-4d0c-b173-88b12899eb42
  27. 2026-05-21T02:40:39.586331550Z INFO: 172.20.0.4:35366 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
  28. 2026-05-21T02:40:39.625239455Z 2026-05-21 02:40:39 | INFO | peft-platform | Preprocessed 60 samples for sft/alpaca
  29. 2026-05-21T02:41:32.509647929Z 2026-05-21 02:41:32 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
  30. 2026-05-21T02:41:32.509820571Z 2026-05-21 02:41:32 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
  31. 2026-05-21T02:41:50.177510125Z 2026-05-21 02:41:50 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
  32. 2026-05-21T02:42:07.927323963Z 2026-05-21 02:42:07 | INFO | peft-platform | Remote training launched in container: job=b6fa4a38-56e7-4d0c-b173-88b12899eb42, container_pid=64
  33. 2026-05-21T02:42:07.977298510Z [DEBUG] output_path=/root/Fine-tuning/backend/data/processed/b6fa4a38-56e7-4d0c-b173-88b12899eb42_processed.jsonl
  34. 2026-05-21T02:42:07.977375388Z [DEBUG] parent=/root/Fine-tuning/backend/data/processed, exists=True, writable=True
  35. 2026-05-21T02:42:07.977386730Z [DEBUG] parent mode=0o40777
  36. 2026-05-21T02:42:07.977395595Z [DEBUG] uid=0, gid=0
  37. 2026-05-21T02:42:07.977404155Z INFO: 127.0.0.1:36332 - "GET /health HTTP/1.1" 200 OK
  38. 2026-05-21T02:42:07.985156303Z INFO: 127.0.0.1:38402 - "GET /health HTTP/1.1" 200 OK
  39. 2026-05-21T02:42:08.131460852Z INFO: 172.20.0.4:35378 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  40. 2026-05-21T02:42:08.133037399Z INFO: 172.20.0.4:35386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  41. 2026-05-21T02:42:08.133448205Z INFO: 172.20.0.4:35392 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  42. 2026-05-21T02:42:08.145805667Z INFO: 172.20.0.4:47482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  43. 2026-05-21T02:42:08.146808367Z INFO: 172.20.0.4:56662 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  44. 2026-05-21T02:42:08.152471235Z INFO: 172.20.0.4:56674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  45. 2026-05-21T02:42:08.317500767Z INFO: 172.20.0.4:59356 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  46. 2026-05-21T02:42:08.318077808Z INFO: 172.20.0.4:59372 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  47. 2026-05-21T02:42:08.319005101Z INFO: 172.20.0.4:59386 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  48. 2026-05-21T02:42:08.481764957Z INFO: 172.20.0.4:59388 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  49. 2026-05-21T02:42:08.482439440Z INFO: 172.20.0.4:59420 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  50. 2026-05-21T02:42:08.483310902Z INFO: 172.20.0.4:59404 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  51. 2026-05-21T02:42:08.626551262Z INFO: 172.20.0.4:59422 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  52. 2026-05-21T02:42:08.641395518Z INFO: 172.20.0.4:59424 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  53. 2026-05-21T02:42:08.649519187Z INFO: 172.20.0.4:59440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  54. 2026-05-21T02:42:09.044991986Z INFO: 172.20.0.4:59446 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  55. 2026-05-21T02:42:19.939428924Z INFO: 127.0.0.1:52178 - "GET /health HTTP/1.1" 200 OK
  56. 2026-05-21T02:42:42.114448308Z INFO: 172.20.0.4:51834 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  57. 2026-05-21T02:42:50.137975669Z INFO: 127.0.0.1:33576 - "GET /health HTTP/1.1" 200 OK
  58. 2026-05-21T02:43:01.031805306Z 2026-05-21 02:43:01 | ERROR | peft-platform | Remote job b6fa4a38-56e7-4d0c-b173-88b12899eb42 failed: No module named 'sqlalchemy'
  59. 2026-05-21T02:43:01.040583882Z 2026-05-21 02:43:01 | INFO | peft-platform | Remote training launched for job b6fa4a38-56e7-4d0c-b173-88b12899eb42
  60. 2026-05-21T02:43:08.194343547Z INFO: 172.20.0.4:58674 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  61. 2026-05-21T02:43:08.653925330Z INFO: 172.20.0.4:58688 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  62. 2026-05-21T02:43:20.361871810Z INFO: 127.0.0.1:50708 - "GET /health HTTP/1.1" 200 OK