|
|
@@ -1,185 +1,65 @@
|
|
|
-lq@lq:~/Fine-tuning$ sudo docker logs -f finetune-backend
|
|
|
-=> Syncing backend code to compute node 192.168.91.253 ...
|
|
|
-Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
|
|
|
-sending incremental file list
|
|
|
-./
|
|
|
-.dockerignore
|
|
|
-.env.docker
|
|
|
-.env.example
|
|
|
-.python-version
|
|
|
-Dockerfile
|
|
|
-entrypoint.sh
|
|
|
-main.py
|
|
|
-pyproject.toml
|
|
|
-requirements.txt
|
|
|
-app/__init__.py
|
|
|
-app/config.py
|
|
|
-app/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/__pycache__/config.cpython-310.pyc
|
|
|
-app/api/__init__.py
|
|
|
-app/api/auth.py
|
|
|
-app/api/datasets.py
|
|
|
-app/api/deployment.py
|
|
|
-app/api/evaluation.py
|
|
|
-app/api/inference.py
|
|
|
-app/api/models.py
|
|
|
-app/api/sample_center.py
|
|
|
-app/api/training.py
|
|
|
-app/api/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/api/__pycache__/auth.cpython-310.pyc
|
|
|
-app/api/__pycache__/datasets.cpython-310.pyc
|
|
|
-app/api/__pycache__/deployment.cpython-310.pyc
|
|
|
-app/api/__pycache__/evaluation.cpython-310.pyc
|
|
|
-app/api/__pycache__/inference.cpython-310.pyc
|
|
|
-app/api/__pycache__/models.cpython-310.pyc
|
|
|
-app/api/__pycache__/sample_center.cpython-310.pyc
|
|
|
-app/api/__pycache__/training.cpython-310.pyc
|
|
|
-app/core/__init__.py
|
|
|
-app/core/auth.py
|
|
|
-app/core/background_tasks.py
|
|
|
-app/core/db.py
|
|
|
-app/core/job_queue.py
|
|
|
-app/core/logging.py
|
|
|
-app/core/remote_deploy.py
|
|
|
-app/core/remote_eval.py
|
|
|
-app/core/remote_executor.py
|
|
|
-app/core/security.py
|
|
|
-app/core/sso_client.py
|
|
|
-app/core/websocket.py
|
|
|
-app/core/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/core/__pycache__/auth.cpython-310.pyc
|
|
|
-app/core/__pycache__/background_tasks.cpython-310.pyc
|
|
|
-app/core/__pycache__/db.cpython-310.pyc
|
|
|
-app/core/__pycache__/job_queue.cpython-310.pyc
|
|
|
-app/core/__pycache__/logging.cpython-310.pyc
|
|
|
-app/core/__pycache__/remote_deploy.cpython-310.pyc
|
|
|
-app/core/__pycache__/remote_eval.cpython-310.pyc
|
|
|
-app/core/__pycache__/remote_executor.cpython-310.pyc
|
|
|
-app/core/__pycache__/security.cpython-310.pyc
|
|
|
-app/core/__pycache__/sso_client.cpython-310.pyc
|
|
|
-app/core/__pycache__/websocket.cpython-310.pyc
|
|
|
-app/engines/__init__.py
|
|
|
-app/engines/__main__.py
|
|
|
-app/engines/base.py
|
|
|
-app/engines/multimodal_engine.py
|
|
|
-app/engines/remote_train.py
|
|
|
-app/engines/text_engine.py
|
|
|
-app/engines/vision_engine.py
|
|
|
-app/engines/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/engines/__pycache__/base.cpython-310.pyc
|
|
|
-app/engines/__pycache__/remote_train.cpython-310.pyc
|
|
|
-app/engines/__pycache__/text_engine.cpython-310.pyc
|
|
|
-app/peft/__init__.py
|
|
|
-app/peft/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/preprocessors/__init__.py
|
|
|
-app/preprocessors/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/schemas/__init__.py
|
|
|
-app/schemas/background_task.py
|
|
|
-app/schemas/common.py
|
|
|
-app/schemas/dataset.py
|
|
|
-app/schemas/deployment.py
|
|
|
-app/schemas/evaluation.py
|
|
|
-app/schemas/model.py
|
|
|
-app/schemas/model_test.py
|
|
|
-app/schemas/sample_center.py
|
|
|
-app/schemas/training.py
|
|
|
-app/schemas/__pycache__/__init__.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/background_task.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/common.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/dataset.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/deployment.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/evaluation.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/model.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/model_test.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/sample_center.cpython-310.pyc
|
|
|
-app/schemas/__pycache__/training.cpython-310.pyc
|
|
|
-app/services/dataset_service.py
|
|
|
-app/services/deploy_service.py
|
|
|
-app/services/eval_service.py
|
|
|
-app/services/inference_service.py
|
|
|
-app/services/model_service.py
|
|
|
-app/services/model_test_service.py
|
|
|
-app/services/sample_center_service.py
|
|
|
-app/services/training_service.py
|
|
|
-app/services/__pycache__/dataset_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/deploy_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/eval_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/inference_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/model_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/model_test_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/sample_center_service.cpython-310.pyc
|
|
|
-app/services/__pycache__/training_service.cpython-310.pyc
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer find /root/Fine-tuning/backend -name '*.pyc' -delete && docker exec finetune-trainer find /root/Fine-tuning/backend -name '__pycache__' -type d -delete
|
|
|
+(base) [root@localhost ~]#
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer tail -200 /tmp/train_1e334a57-26f5-4e7e-a961-0a02330fa708.log
|
|
|
+[remote_train] === Training job started: 1e334a57-26f5-4e7e-a961-0a02330fa708 ===
|
|
|
+[remote_train] model_id=Qwen/Qwen1.5-0.5B, model_type=text
|
|
|
+[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
+[remote_train] config={"model_id": "Qwen/Qwen1.5-0.5B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "lear
|
|
|
+[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
+[remote_train] Step 1: Preprocessing dataset...
|
|
|
+[remote_train] task_type=sft, template=auto
|
|
|
+[remote_train] output_path=/root/Fine-tuning/backend/data/processed/1e334a57-26f5-4e7e-a961-0a02330fa708_processed.jsonl
|
|
|
+[remote_train] Selecting engine for model_type=text...
|
|
|
+[remote_train] Engine loaded: TextEngine
|
|
|
+[remote_train] PEFT method: adalora
|
|
|
+[remote_train] Running preprocess_dataset...
|
|
|
+[remote_train] Preprocessing done, output: /root/Fine-tuning/backend/data/processed/1e334a57-26f5-4e7e-a961-0a02330fa708_processed.jsonl
|
|
|
+[remote_train] Step 2: Loading model: Qwen/Qwen1.5-0.5B...
|
|
|
+[remote_train] Quantization: None
|
|
|
+Loading weights: 100%|██████████| 291/291 [00:04<00:00, 59.39it/s]
|
|
|
+[remote_train] Model loaded successfully
|
|
|
+[remote_train] Step 3: Building PEFT config...
|
|
|
+[remote_train] ERROR: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
+[remote_train] Traceback (most recent call last):
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in run_training
|
|
|
+ peft_config = engine.get_peft_config(peft_method, config)
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 149, in get_peft_config
|
|
|
+ return builder(params)
|
|
|
+ File "/root/Fine-tuning/backend/app/peft/__init__.py", line 43, in build_adalora_config
|
|
|
+ return AdaLoraConfig(
|
|
|
+ File "<string>", line 51, in __init__
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/config.py", line 102, in __post_init__
|
|
|
+ raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
|
|
|
+ValueError: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
|
|
|
-sent 8,251 bytes received 5,885 bytes 764.11 bytes/sec
|
|
|
-total size is 410,728 speedup is 29.06
|
|
|
-=> Sync done.
|
|
|
-INFO: Started server process [1]
|
|
|
-INFO: Waiting for application startup.
|
|
|
-2026-05-25 02:57:27 | INFO | peft-platform | JobQueue started with 2 workers
|
|
|
-INFO: Application startup complete.
|
|
|
-INFO: Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
|
|
|
-INFO: 127.0.0.1:39240 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 172.20.0.4:43618 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:43626 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:43628 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:55410 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:55418 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:55412 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:55428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42002 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-2026-05-25 02:57:48 | INFO | peft-platform | Job 598a6794-9fad-4c85-8bc8-a5f31a2af184 enqueued
|
|
|
-2026-05-25 02:57:48 | INFO | peft-platform | Training job created: 598a6794-9fad-4c85-8bc8-a5f31a2af184
|
|
|
-INFO: 172.20.0.4:42018 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-2026-05-25 02:57:48 | INFO | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
|
|
|
-INFO: 172.20.0.4:42024 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42028 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42044 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:50476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 127.0.0.1:47416 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:48496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-2026-05-25 02:58:07 | INFO | peft-platform | Remote cleanup result: true
|
|
|
-cleaned 16 processes
|
|
|
-2026-05-25 02:59:00 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
-2026-05-25 02:59:00 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
-2026-05-25 02:59:17 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
-2026-05-25 02:59:53 | INFO | peft-platform | Remote training launched in container: job=598a6794-9fad-4c85-8bc8-a5f31a2af184, container_pid=176135
|
|
|
-INFO: 127.0.0.1:55276 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 127.0.0.1:44184 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 127.0.0.1:37066 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 172.20.0.4:35906 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:48512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:35908 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:51080 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:34534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:51084 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42942 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42952 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:42956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:43060 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:43072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:44468 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:44482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 127.0.0.1:51686 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 172.20.0.4:58792 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:58806 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:49734 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:49736 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:37696 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-2026-05-25 03:00:46 | ERROR | peft-platform | Remote job 598a6794-9fad-4c85-8bc8-a5f31a2af184 failed: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
-INFO: 172.20.0.4:37706 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 127.0.0.1:59420 - "GET /health HTTP/1.1" 200 OK
|
|
|
-2026-05-25 03:00:56 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 176135 2>/dev/null; pkill -9 -P 176135 2>/dev/null'
|
|
|
-2026-05-25 03:00:56 | INFO | peft-platform | Killed remote process 176135 via docker exec
|
|
|
-INFO: 172.20.0.4:59336 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-2026-05-25 03:00:56 | INFO | peft-platform | Remote training launched for job 598a6794-9fad-4c85-8bc8-a5f31a2af184
|
|
|
-INFO: 172.20.0.4:59340 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:33706 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:33722 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:34462 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:34474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 127.0.0.1:54736 - "GET /health HTTP/1.1" 200 OK
|
|
|
-INFO: 172.20.0.4:52398 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:52412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
-INFO: 172.20.0.4:49268 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
+[remote_train] === Training job failed: 1e334a57-26f5-4e7e-a961-0a02330fa708 ===
|
|
|
+Traceback (most recent call last):
|
|
|
+ File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
|
|
|
+ return _run_code(code, main_globals, None,
|
|
|
+ File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
|
|
|
+ exec(code, run_globals)
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 213, in <module>
|
|
|
+ main()
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 209, in main
|
|
|
+ asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
|
|
|
+ File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
|
|
|
+ return loop.run_until_complete(main)
|
|
|
+ File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
|
|
|
+ return future.result()
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in run_training
|
|
|
+ peft_config = engine.get_peft_config(peft_method, config)
|
|
|
+ File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 149, in get_peft_config
|
|
|
+ return builder(params)
|
|
|
+ File "/root/Fine-tuning/backend/app/peft/__init__.py", line 43, in build_adalora_config
|
|
|
+ return AdaLoraConfig(
|
|
|
+ File "<string>", line 51, in __init__
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/config.py", line 102, in __post_init__
|
|
|
+ raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
|
|
|
+ValueError: AdaLoRA does not work when `total_step` is None, supply a value > 0.
|
|
|
+(base) [root@localhost ~]#
|
|
|
+(base) [root@localhost ~]# grep -n 'total_step\|init_r.*target_r' /root/Fine-tuning/backend/app/engines/text_engine.py
|
|
|
+190: # 计算总步数(AdaLoRA 需要在 get_peft_model 之前设置 total_step)
|
|
|
+194: # AdaLoRA 要求 total_step > 0(通过属性名判断而非 isinstance,避免导入路径问题)
|
|
|
+195: if hasattr(peft_config, "init_r") and hasattr(peft_config, "target_r"):
|
|
|
+196: peft_config.total_step = max_steps
|
|
|
+396: total_steps=state.max_steps or 0,
|