|
@@ -2,102 +2,184 @@ lq@lq:~/Fine-tuning$ sudo docker logs -f finetune-backend
|
|
|
=> Syncing backend code to compute node 192.168.91.253 ...
|
|
=> Syncing backend code to compute node 192.168.91.253 ...
|
|
|
Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
|
sending incremental file list
|
|
sending incremental file list
|
|
|
-app/engines/
|
|
|
|
|
-app/engines/__pycache__/
|
|
|
|
|
|
|
+./
|
|
|
|
|
+.dockerignore
|
|
|
|
|
+.env.docker
|
|
|
|
|
+.env.example
|
|
|
|
|
+.python-version
|
|
|
|
|
+Dockerfile
|
|
|
|
|
+entrypoint.sh
|
|
|
|
|
+main.py
|
|
|
|
|
+pyproject.toml
|
|
|
|
|
+requirements.txt
|
|
|
|
|
+app/__init__.py
|
|
|
|
|
+app/config.py
|
|
|
|
|
+app/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/__pycache__/config.cpython-310.pyc
|
|
|
|
|
+app/api/__init__.py
|
|
|
|
|
+app/api/auth.py
|
|
|
|
|
+app/api/datasets.py
|
|
|
|
|
+app/api/deployment.py
|
|
|
|
|
+app/api/evaluation.py
|
|
|
|
|
+app/api/inference.py
|
|
|
|
|
+app/api/models.py
|
|
|
|
|
+app/api/sample_center.py
|
|
|
|
|
+app/api/training.py
|
|
|
|
|
+app/api/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/auth.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/datasets.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/deployment.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/evaluation.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/inference.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/models.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/sample_center.cpython-310.pyc
|
|
|
|
|
+app/api/__pycache__/training.cpython-310.pyc
|
|
|
|
|
+app/core/__init__.py
|
|
|
|
|
+app/core/auth.py
|
|
|
|
|
+app/core/background_tasks.py
|
|
|
|
|
+app/core/db.py
|
|
|
|
|
+app/core/job_queue.py
|
|
|
|
|
+app/core/logging.py
|
|
|
|
|
+app/core/remote_deploy.py
|
|
|
|
|
+app/core/remote_eval.py
|
|
|
|
|
+app/core/remote_executor.py
|
|
|
|
|
+app/core/security.py
|
|
|
|
|
+app/core/sso_client.py
|
|
|
|
|
+app/core/websocket.py
|
|
|
|
|
+app/core/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/auth.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/background_tasks.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/db.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/job_queue.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/logging.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/remote_deploy.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/remote_eval.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/remote_executor.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/security.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/sso_client.cpython-310.pyc
|
|
|
|
|
+app/core/__pycache__/websocket.cpython-310.pyc
|
|
|
|
|
+app/engines/__init__.py
|
|
|
|
|
+app/engines/__main__.py
|
|
|
|
|
+app/engines/base.py
|
|
|
|
|
+app/engines/multimodal_engine.py
|
|
|
|
|
+app/engines/remote_train.py
|
|
|
|
|
+app/engines/text_engine.py
|
|
|
|
|
+app/engines/vision_engine.py
|
|
|
|
|
+app/engines/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/engines/__pycache__/base.cpython-310.pyc
|
|
|
|
|
+app/engines/__pycache__/remote_train.cpython-310.pyc
|
|
|
|
|
+app/engines/__pycache__/text_engine.cpython-310.pyc
|
|
|
|
|
+app/peft/__init__.py
|
|
|
|
|
+app/peft/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/preprocessors/__init__.py
|
|
|
|
|
+app/preprocessors/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/schemas/__init__.py
|
|
|
|
|
+app/schemas/background_task.py
|
|
|
|
|
+app/schemas/common.py
|
|
|
|
|
+app/schemas/dataset.py
|
|
|
|
|
+app/schemas/deployment.py
|
|
|
|
|
+app/schemas/evaluation.py
|
|
|
|
|
+app/schemas/model.py
|
|
|
|
|
+app/schemas/model_test.py
|
|
|
|
|
+app/schemas/sample_center.py
|
|
|
|
|
+app/schemas/training.py
|
|
|
|
|
+app/schemas/__pycache__/__init__.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/background_task.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/common.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/dataset.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/deployment.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/evaluation.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/model.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/model_test.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/sample_center.cpython-310.pyc
|
|
|
|
|
+app/schemas/__pycache__/training.cpython-310.pyc
|
|
|
|
|
+app/services/dataset_service.py
|
|
|
|
|
+app/services/deploy_service.py
|
|
|
|
|
+app/services/eval_service.py
|
|
|
|
|
+app/services/inference_service.py
|
|
|
|
|
+app/services/model_service.py
|
|
|
|
|
+app/services/model_test_service.py
|
|
|
|
|
+app/services/sample_center_service.py
|
|
|
|
|
+app/services/training_service.py
|
|
|
|
|
+app/services/__pycache__/dataset_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/deploy_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/eval_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/inference_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/model_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/model_test_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/sample_center_service.cpython-310.pyc
|
|
|
|
|
+app/services/__pycache__/training_service.cpython-310.pyc
|
|
|
|
|
|
|
|
-sent 4,159 bytes received 34 bytes 226.65 bytes/sec
|
|
|
|
|
-total size is 410,566 speedup is 97.92
|
|
|
|
|
|
|
+sent 8,251 bytes received 5,885 bytes 764.11 bytes/sec
|
|
|
|
|
+total size is 410,728 speedup is 29.06
|
|
|
=> Sync done.
|
|
=> Sync done.
|
|
|
INFO: Started server process [1]
|
|
INFO: Started server process [1]
|
|
|
INFO: Waiting for application startup.
|
|
INFO: Waiting for application startup.
|
|
|
-2026-05-25 02:43:35 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
|
|
|
|
+2026-05-25 02:57:27 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
INFO: Application startup complete.
|
|
INFO: Application startup complete.
|
|
|
INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
|
-INFO: 172.20.0.4:55314 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:52682 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:55330 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42010 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42026 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42732 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42740 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:33348 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:34080 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:33364 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:45028 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:45030 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:47366 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:47382 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:53934 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:34352 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:53940 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:57974 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:57976 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:42202 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:54038 - "GET /api/v1/training/jobs HTTP/1.0" 401 Unauthorized
|
|
|
|
|
-INFO: 172.20.0.4:54052 - "POST /api/v1/auth/refresh HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:54060 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:41064 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:40778 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:40782 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:40794 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49102 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49120 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49108 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49136 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49148 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49166 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49158 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:50558 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-25 02:46:13 | INFO | peft-platform | Job c3938524-d1e2-4bd3-b73e-07d98e2020f4 enqueued
|
|
|
|
|
-2026-05-25 02:46:13 | INFO | peft-platform | Training job created: c3938524-d1e2-4bd3-b73e-07d98e2020f4
|
|
|
|
|
-INFO: 172.20.0.4:49174 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-25 02:46:13 | INFO | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
|
|
|
|
|
-INFO: 172.20.0.4:49190 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49208 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:49194 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42438 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42440 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35346 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35360 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-25 02:46:32 | INFO | peft-platform | Remote cleanup result: true
|
|
|
|
|
-cleaned 15 processes
|
|
|
|
|
-2026-05-25 02:47:25 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
-2026-05-25 02:47:25 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
-2026-05-25 02:47:42 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
-2026-05-25 02:48:18 | INFO | peft-platform | Remote training launched in container: job=c3938524-d1e2-4bd3-b73e-07d98e2020f4, container_pid=175470
|
|
|
|
|
-INFO: 127.0.0.1:44134 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:43880 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:58442 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:59902 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:41198 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:41214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:40696 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:56650 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:56642 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39566 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39568 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39554 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39572 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39582 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39584 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35208 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35214 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:43170 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:43186 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:59424 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:37098 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:37106 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:60072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:60088 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:36258 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:36270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:38910 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-25 02:49:11 | ERROR | peft-platform | Remote job c3938524-d1e2-4bd3-b73e-07d98e2020f4 failed: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
|
|
-INFO: 172.20.0.4:45988 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:46000 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-25 02:49:21 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 175470 2>/dev/null; pkill -9 -P 175470 2>/dev/null'
|
|
|
|
|
-2026-05-25 02:49:21 | INFO | peft-platform | Killed remote process 175470 via docker exec
|
|
|
|
|
-2026-05-25 02:49:21 | INFO | peft-platform | Remote training launched for job c3938524-d1e2-4bd3-b73e-07d98e2020f4
|
|
|
|
|
-INFO: 172.20.0.4:42648 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42656 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:37380 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
|
|
+INFO: 127.0.0.1:39240 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43618 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43626 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43628 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:55410 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:55418 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:55412 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:55428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42002 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 02:57:48 | INFO | peft-platform | Job 598a6794-9fad-4c85-8bc8-a5f31a2af184 enqueued
|
|
|
|
|
+2026-05-25 02:57:48 | INFO | peft-platform | Training job created: 598a6794-9fad-4c85-8bc8-a5f31a2af184
|
|
|
|
|
+INFO: 172.20.0.4:42018 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 02:57:48 | INFO | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
|
|
|
|
|
+INFO: 172.20.0.4:42024 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42028 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42044 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:50476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:47416 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:48496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 02:58:07 | INFO | peft-platform | Remote cleanup result: true
|
|
|
|
|
+cleaned 16 processes
|
|
|
|
|
+2026-05-25 02:59:00 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
+2026-05-25 02:59:00 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-25 02:59:17 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
|
|
+2026-05-25 02:59:53 | INFO | peft-platform | Remote training launched in container: job=598a6794-9fad-4c85-8bc8-a5f31a2af184, container_pid=176135
|
|
|
|
|
+INFO: 127.0.0.1:55276 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:44184 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:37066 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:35906 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:48512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:35908 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:51080 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:34534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:51084 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42942 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42952 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:42956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43060 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:43072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:44468 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:44482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:51686 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:58792 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:58806 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49734 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49736 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:37696 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 03:00:46 | ERROR | peft-platform | Remote job 598a6794-9fad-4c85-8bc8-a5f31a2af184 failed: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
|
|
+INFO: 172.20.0.4:37706 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:59420 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+2026-05-25 03:00:56 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 176135 2>/dev/null; pkill -9 -P 176135 2>/dev/null'
|
|
|
|
|
+2026-05-25 03:00:56 | INFO | peft-platform | Killed remote process 176135 via docker exec
|
|
|
|
|
+INFO: 172.20.0.4:59336 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+2026-05-25 03:00:56 | INFO | peft-platform | Remote training launched for job 598a6794-9fad-4c85-8bc8-a5f31a2af184
|
|
|
|
|
+INFO: 172.20.0.4:59340 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:33706 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:33722 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:34462 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:34474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 127.0.0.1:54736 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:52398 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:52412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
+INFO: 172.20.0.4:49268 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|