result.txt 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
  2. [sudo] password for lq:
  3. Sorry, try again.
  4. [sudo] password for lq:
  5. 2026-05-21T05:34:12.856748710Z => Syncing backend code to compute node 192.168.91.253 ...
  6. 2026-05-21T05:34:12.904890703Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
  7. 2026-05-21T05:34:30.582860442Z sending incremental file list
  8. 2026-05-21T05:34:30.614393978Z app/engines/
  9. 2026-05-21T05:34:30.655962590Z
  10. 2026-05-21T05:34:30.656054587Z sent 2,425 bytes received 29 bytes 132.65 bytes/sec
  11. 2026-05-21T05:34:30.656067153Z total size is 215,247 speedup is 87.71
  12. 2026-05-21T05:34:30.658073223Z => Sync done.
  13. 2026-05-21T05:34:31.901691988Z INFO: Started server process [1]
  14. 2026-05-21T05:34:31.901771435Z INFO: Waiting for application startup.
  15. 2026-05-21T05:34:31.999831081Z 2026-05-21 05:34:31 | INFO | peft-platform | JobQueue started with 2 workers
  16. 2026-05-21T05:34:31.999992582Z INFO: Application startup complete.
  17. 2026-05-21T05:34:32.000819280Z INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
  18. 2026-05-21T05:34:33.658164981Z INFO: 127.0.0.1:58612 - "GET /health HTTP/1.1" 200 OK
  19. 2026-05-21T05:34:40.128876661Z INFO: 172.20.0.4:57504 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  20. 2026-05-21T05:34:40.981739480Z INFO: 172.20.0.4:57510 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  21. 2026-05-21T05:34:41.037853040Z INFO: 172.20.0.4:57518 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  22. 2026-05-21T05:34:41.042812013Z INFO: 172.20.0.4:57516 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  23. 2026-05-21T05:34:44.446833463Z 2026-05-21 05:34:44 | INFO | peft-platform | Job 83732d01-5022-4d67-9c44-acdf47f89092 enqueued
  24. 2026-05-21T05:34:44.446919521Z 2026-05-21 05:34:44 | INFO | peft-platform | Training job created: 83732d01-5022-4d67-9c44-acdf47f89092
  25. 2026-05-21T05:34:44.447206930Z INFO: 172.20.0.4:38752 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
  26. 2026-05-21T05:34:44.571734837Z INFO: 172.20.0.4:38758 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  27. 2026-05-21T05:34:44.581743700Z 2026-05-21 05:34:44 | INFO | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
  28. 2026-05-21T05:35:37.883942489Z 2026-05-21 05:35:37 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
  29. 2026-05-21T05:35:37.884065139Z 2026-05-21 05:35:37 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
  30. 2026-05-21T05:35:55.509694297Z 2026-05-21 05:35:55 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
  31. 2026-05-21T05:36:30.923810649Z 2026-05-21 05:36:30 | INFO | peft-platform | Remote training launched in container: job=83732d01-5022-4d67-9c44-acdf47f89092, container_pid=2506
  32. 2026-05-21T05:36:30.928971588Z INFO: 127.0.0.1:56772 - "GET /health HTTP/1.1" 200 OK
  33. 2026-05-21T05:36:30.933549372Z INFO: 172.20.0.4:38766 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  34. 2026-05-21T05:36:30.933947243Z INFO: 172.20.0.4:38762 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  35. 2026-05-21T05:36:30.935698059Z INFO: 127.0.0.1:47622 - "GET /health HTTP/1.1" 200 OK
  36. 2026-05-21T05:36:30.936923403Z INFO: 127.0.0.1:36574 - "GET /health HTTP/1.1" 200 OK
  37. 2026-05-21T05:36:30.944840546Z INFO: 172.20.0.4:53070 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  38. 2026-05-21T05:36:30.945740117Z INFO: 172.20.0.4:53074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  39. 2026-05-21T05:36:30.946639063Z INFO: 172.20.0.4:34814 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  40. 2026-05-21T05:36:30.948415195Z INFO: 172.20.0.4:34830 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  41. 2026-05-21T05:36:31.294922087Z INFO: 172.20.0.4:40276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  42. 2026-05-21T05:36:31.316642706Z INFO: 172.20.0.4:40282 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  43. 2026-05-21T05:36:31.341533316Z INFO: 172.20.0.4:40288 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  44. 2026-05-21T05:36:32.082545706Z INFO: 172.20.0.4:40296 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  45. 2026-05-21T05:36:32.083579114Z INFO: 172.20.0.4:40300 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  46. 2026-05-21T05:36:32.094750861Z INFO: 172.20.0.4:40308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  47. 2026-05-21T05:36:32.101512544Z INFO: 172.20.0.4:40324 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  48. 2026-05-21T05:36:32.112351440Z INFO: 172.20.0.4:40336 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  49. 2026-05-21T05:36:42.269063163Z INFO: 172.20.0.4:58890 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  50. 2026-05-21T05:36:57.587271550Z INFO: 172.20.0.4:41132 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  51. 2026-05-21T05:37:01.147104111Z INFO: 127.0.0.1:50416 - "GET /health HTTP/1.1" 200 OK
  52. 2026-05-21T05:37:01.264907014Z INFO: 172.20.0.4:41134 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  53. 2026-05-21T05:37:06.265588600Z INFO: 172.20.0.4:36266 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  54. 2026-05-21T05:37:11.264471858Z INFO: 172.20.0.4:36276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  55. 2026-05-21T05:37:16.268764019Z INFO: 172.20.0.4:60828 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  56. 2026-05-21T05:37:21.263744160Z INFO: 172.20.0.4:60832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  57. 2026-05-21T05:37:26.277948988Z INFO: 172.20.0.4:42252 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  58. 2026-05-21T05:37:31.273684817Z INFO: 172.20.0.4:42256 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  59. 2026-05-21T05:37:31.346532880Z INFO: 127.0.0.1:37738 - "GET /health HTTP/1.1" 200 OK
  60. 2026-05-21T05:37:36.279624659Z INFO: 172.20.0.4:50096 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  61. 2026-05-21T05:37:41.270703742Z INFO: 172.20.0.4:50100 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  62. 2026-05-21T05:37:46.311247473Z INFO: 172.20.0.4:38902 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  63. 2026-05-21T05:37:51.278088336Z INFO: 172.20.0.4:38910 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  64. 2026-05-21T05:37:56.280272066Z INFO: 172.20.0.4:60532 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  65. 2026-05-21T05:38:00.370724449Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] === Training job started: 83732d01-5022-4d67-9c44-acdf47f89092 ===
  66. 2026-05-21T05:38:00.370793541Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
  67. 2026-05-21T05:38:00.370808018Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
  68. 2026-05-21T05:38:00.370817515Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 4, "gradient_accumulation": 4, "learning
  69. 2026-05-21T05:38:00.370844181Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
  70. 2026-05-21T05:38:00.370964860Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Step 1: Preprocessing dataset...
  71. 2026-05-21T05:38:00.371068870Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] task_type=sft, template=alpaca
  72. 2026-05-21T05:38:00.371128216Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] output_path=/root/Fine-tuning/backend/data/processed/83732d01-5022-4d67-9c44-acdf47f89092_processed.jsonl
  73. 2026-05-21T05:38:00.371142431Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Selecting engine for model_type=text...
  74. 2026-05-21T05:38:00.371157641Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Engine loaded: TextEngine
  75. 2026-05-21T05:38:00.371224686Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] PEFT method: lora
  76. 2026-05-21T05:38:00.371289339Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Running preprocess_dataset...
  77. 2026-05-21T05:38:00.371302088Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Preprocessing done, output: /root/Fine-tuning/backend/data/processed/83732d01-5022-4d67-9c44-acdf47f89092_processed.jsonl
  78. 2026-05-21T05:38:00.371329600Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
  79. 2026-05-21T05:38:00.371356109Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] [remote_train] Quantization: None
  80. 2026-05-21T05:38:00.371563114Z 2026-05-21 05:38:00 | WARNING | peft-platform | [253:83732d01] [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
  81. 2026-05-21T05:38:00.371662704Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
  82. 2026-05-21T05:38:00.371737137Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
  83. 2026-05-21T05:38:00.371772645Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] torch.compile is not available in Python 3.10, using identity decorator instead
  84. 2026-05-21T05:38:00.371889023Z 2026-05-21 05:38:00 | WARNING | peft-platform | [253:83732d01] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  85. 2026-05-21T05:38:00.372089241Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
  86. 2026-05-21T05:38:00.372109324Z 2026-05-21 05:38:00 | WARNING | peft-platform | [253:83732d01] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
  87. 2026-05-21T05:38:00.372121857Z 2026-05-21 05:38:00 | INFO | peft-platform | [253:83732d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
  88. 2026-05-21T05:38:00.372137426Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] [13:37:00.956][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
  89. 2026-05-21T05:38:00.372289777Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] [13:37:11.196][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
  90. 2026-05-21T05:38:00.372303934Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] [13:37:21.436][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
  91. 2026-05-21T05:38:00.372371497Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] [13:37:31.676][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
  92. 2026-05-21T05:38:00.372474247Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] [13:37:41.916][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
  93. 2026-05-21T05:38:00.372500910Z 2026-05-21 05:38:00 | ERROR | peft-platform | [253:83732d01] [13:37:52.156][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
  94. 2026-05-21T05:38:01.548682891Z INFO: 127.0.0.1:52318 - "GET /health HTTP/1.1" 200 OK
  95. 2026-05-21T05:38:31.744481381Z INFO: 127.0.0.1:55348 - "GET /health HTTP/1.1" 200 OK
  96. 2026-05-21T05:38:42.293320538Z INFO: 172.20.0.4:40146 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  97. 2026-05-21T05:38:47.635600218Z INFO: 172.20.0.4:43030 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  98. 2026-05-21T05:38:49.472820117Z INFO: 172.20.0.4:43050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
  99. 2026-05-21T05:38:49.474803507Z INFO: 172.20.0.4:43042 - "GET /api/v1/models/ HTTP/1.0" 200 OK
  100. 2026-05-21T05:38:49.475933409Z INFO: 172.20.0.4:43058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
  101. 2026-05-21T05:38:50.415585317Z 2026-05-21 05:38:50 | INFO | peft-platform | Job 83732d01-5022-4d67-9c44-acdf47f89092 cancelled
  102. 2026-05-21T05:38:50.420419117Z 2026-05-21 05:38:50 | INFO | peft-platform | Job cancelled: 83732d01-5022-4d67-9c44-acdf47f89092
  103. 2026-05-21T05:38:50.420771552Z INFO: 172.20.0.4:43064 - "POST /api/v1/training/jobs/83732d01-5022-4d67-9c44-acdf47f89092/cancel HTTP/1.0" 200 OK
  104. 2026-05-21T05:38:50.444305921Z INFO: 172.20.0.4:43074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK