Przeglądaj źródła

adalora初始化total_step赋默认值

lxylxy123321 3 dni temu
rodzic
commit
a3e748d3c9
2 zmienionych plików z 69 dodań i 184 usunięć
  1. 5 0
      backend/app/peft/__init__.py
  2. 64 184
      result.txt

+ 5 - 0
backend/app/peft/__init__.py

@@ -40,10 +40,15 @@ def build_adalora_config(params: dict[str, Any]):
     """返回实际的 peft.AdaLoraConfig 对象。"""
     from peft import AdaLoraConfig, TaskType
 
+    # AdaLoraConfig.__post_init__ raises ValueError unless total_step is set to a value > 0.
+    # If the caller did not supply one, use a large placeholder (10000); train() overwrites it later.
+    total_step = params.get("total_step", 10000)
+
     return AdaLoraConfig(
         init_r=params.get("adalora_init_r", 8),
         target_r=params.get("adalora_target_r", 16),
         beta1=params.get("adalora_beta1", 0.85),
         beta2=params.get("adalora_beta2", 0.85),
         task_type=TaskType.CAUSAL_LM,
+        total_step=total_step,
     )

+ 64 - 184
result.txt

@@ -1,185 +1,65 @@
-lq@lq:~/Fine-tuning$ sudo docker logs -f finetune-backend
-=> Syncing backend code to compute node 192.168.91.253 ...
-Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
-sending incremental file list
-./
-.dockerignore
-.env.docker
-.env.example
-.python-version
-Dockerfile
-entrypoint.sh
-main.py
-pyproject.toml
-requirements.txt
-app/__init__.py
-app/config.py
-app/__pycache__/__init__.cpython-310.pyc
-app/__pycache__/config.cpython-310.pyc
-app/api/__init__.py
-app/api/auth.py
-app/api/datasets.py
-app/api/deployment.py
-app/api/evaluation.py
-app/api/inference.py
-app/api/models.py
-app/api/sample_center.py
-app/api/training.py
-app/api/__pycache__/__init__.cpython-310.pyc
-app/api/__pycache__/auth.cpython-310.pyc
-app/api/__pycache__/datasets.cpython-310.pyc
-app/api/__pycache__/deployment.cpython-310.pyc
-app/api/__pycache__/evaluation.cpython-310.pyc
-app/api/__pycache__/inference.cpython-310.pyc
-app/api/__pycache__/models.cpython-310.pyc
-app/api/__pycache__/sample_center.cpython-310.pyc
-app/api/__pycache__/training.cpython-310.pyc
-app/core/__init__.py
-app/core/auth.py
-app/core/background_tasks.py
-app/core/db.py
-app/core/job_queue.py
-app/core/logging.py
-app/core/remote_deploy.py
-app/core/remote_eval.py
-app/core/remote_executor.py
-app/core/security.py
-app/core/sso_client.py
-app/core/websocket.py
-app/core/__pycache__/__init__.cpython-310.pyc
-app/core/__pycache__/auth.cpython-310.pyc
-app/core/__pycache__/background_tasks.cpython-310.pyc
-app/core/__pycache__/db.cpython-310.pyc
-app/core/__pycache__/job_queue.cpython-310.pyc
-app/core/__pycache__/logging.cpython-310.pyc
-app/core/__pycache__/remote_deploy.cpython-310.pyc
-app/core/__pycache__/remote_eval.cpython-310.pyc
-app/core/__pycache__/remote_executor.cpython-310.pyc
-app/core/__pycache__/security.cpython-310.pyc
-app/core/__pycache__/sso_client.cpython-310.pyc
-app/core/__pycache__/websocket.cpython-310.pyc
-app/engines/__init__.py
-app/engines/__main__.py
-app/engines/base.py
-app/engines/multimodal_engine.py
-app/engines/remote_train.py
-app/engines/text_engine.py
-app/engines/vision_engine.py
-app/engines/__pycache__/__init__.cpython-310.pyc
-app/engines/__pycache__/base.cpython-310.pyc
-app/engines/__pycache__/remote_train.cpython-310.pyc
-app/engines/__pycache__/text_engine.cpython-310.pyc
-app/peft/__init__.py
-app/peft/__pycache__/__init__.cpython-310.pyc
-app/preprocessors/__init__.py
-app/preprocessors/__pycache__/__init__.cpython-310.pyc
-app/schemas/__init__.py
-app/schemas/background_task.py
-app/schemas/common.py
-app/schemas/dataset.py
-app/schemas/deployment.py
-app/schemas/evaluation.py
-app/schemas/model.py
-app/schemas/model_test.py
-app/schemas/sample_center.py
-app/schemas/training.py
-app/schemas/__pycache__/__init__.cpython-310.pyc
-app/schemas/__pycache__/background_task.cpython-310.pyc
-app/schemas/__pycache__/common.cpython-310.pyc
-app/schemas/__pycache__/dataset.cpython-310.pyc
-app/schemas/__pycache__/deployment.cpython-310.pyc
-app/schemas/__pycache__/evaluation.cpython-310.pyc
-app/schemas/__pycache__/model.cpython-310.pyc
-app/schemas/__pycache__/model_test.cpython-310.pyc
-app/schemas/__pycache__/sample_center.cpython-310.pyc
-app/schemas/__pycache__/training.cpython-310.pyc
-app/services/dataset_service.py
-app/services/deploy_service.py
-app/services/eval_service.py
-app/services/inference_service.py
-app/services/model_service.py
-app/services/model_test_service.py
-app/services/sample_center_service.py
-app/services/training_service.py
-app/services/__pycache__/dataset_service.cpython-310.pyc
-app/services/__pycache__/deploy_service.cpython-310.pyc
-app/services/__pycache__/eval_service.cpython-310.pyc
-app/services/__pycache__/inference_service.cpython-310.pyc
-app/services/__pycache__/model_service.cpython-310.pyc
-app/services/__pycache__/model_test_service.cpython-310.pyc
-app/services/__pycache__/sample_center_service.cpython-310.pyc
-app/services/__pycache__/training_service.cpython-310.pyc
+(base) [root@localhost ~]# docker exec finetune-trainer find /root/Fine-tuning/backend -name '*.pyc' -delete && docker exec finetune-trainer find /root/Fine-tuning/backend -name '__pycache__' -type d -delete
+(base) [root@localhost ~]# 
+(base) [root@localhost ~]# docker exec finetune-trainer tail -200 /tmp/train_1e334a57-26f5-4e7e-a961-0a02330fa708.log
+[remote_train] === Training job started: 1e334a57-26f5-4e7e-a961-0a02330fa708 ===
+[remote_train] model_id=Qwen/Qwen1.5-0.5B, model_type=text
+[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
+[remote_train] config={"model_id": "Qwen/Qwen1.5-0.5B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "adalora", "epochs": 3, "batch_size": 16, "gradient_accumulation": 4, "lear
+[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
+[remote_train] Step 1: Preprocessing dataset...
+[remote_train]   task_type=sft, template=auto
+[remote_train]   output_path=/root/Fine-tuning/backend/data/processed/1e334a57-26f5-4e7e-a961-0a02330fa708_processed.jsonl
+[remote_train]   Selecting engine for model_type=text...
+[remote_train]   Engine loaded: TextEngine
+[remote_train]   PEFT method: adalora
+[remote_train]   Running preprocess_dataset...
+[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/1e334a57-26f5-4e7e-a961-0a02330fa708_processed.jsonl
+[remote_train] Step 2: Loading model: Qwen/Qwen1.5-0.5B...
+[remote_train]   Quantization: None
+Loading weights: 100%|██████████| 291/291 [00:04<00:00, 59.39it/s] 
+[remote_train]   Model loaded successfully
+[remote_train] Step 3: Building PEFT config...
+[remote_train] ERROR: AdaLoRA does not work when `total_step` is None, supply a value > 0.
+[remote_train] Traceback (most recent call last):
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in run_training
+    peft_config = engine.get_peft_config(peft_method, config)
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 149, in get_peft_config
+    return builder(params)
+  File "/root/Fine-tuning/backend/app/peft/__init__.py", line 43, in build_adalora_config
+    return AdaLoraConfig(
+  File "<string>", line 51, in __init__
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/config.py", line 102, in __post_init__
+    raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
+ValueError: AdaLoRA does not work when `total_step` is None, supply a value > 0.
 
-sent 8,251 bytes  received 5,885 bytes  764.11 bytes/sec
-total size is 410,728  speedup is 29.06
-=> Sync done.
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
-2026-05-25 02:57:27 | INFO     | peft-platform | JobQueue started with 2 workers
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
-INFO:     127.0.0.1:39240 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:43618 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43626 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43628 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55410 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55418 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55412 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:55428 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42002 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 02:57:48 | INFO     | peft-platform | Job 598a6794-9fad-4c85-8bc8-a5f31a2af184 enqueued
-2026-05-25 02:57:48 | INFO     | peft-platform | Training job created: 598a6794-9fad-4c85-8bc8-a5f31a2af184
-INFO:     172.20.0.4:42018 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 02:57:48 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
-INFO:     172.20.0.4:42024 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42028 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42044 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50476 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:47416 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:50490 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48496 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 02:58:07 | INFO     | peft-platform | Remote cleanup result: true
-cleaned 16 processes
-2026-05-25 02:59:00 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-25 02:59:00 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-25 02:59:17 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
-2026-05-25 02:59:53 | INFO     | peft-platform | Remote training launched in container: job=598a6794-9fad-4c85-8bc8-a5f31a2af184, container_pid=176135
-INFO:     127.0.0.1:55276 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:44184 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:37066 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:35906 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:48512 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35908 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51080 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:34534 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51084 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42942 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42952 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42956 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43060 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43072 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:44468 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:44482 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:51686 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:58792 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:58806 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:49734 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:49736 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:37696 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 03:00:46 | ERROR    | peft-platform | Remote job 598a6794-9fad-4c85-8bc8-a5f31a2af184 failed: AdaLoRA does not work when `total_step` is None, supply a value > 0.
-INFO:     172.20.0.4:37706 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:59420 - "GET /health HTTP/1.1" 200 OK
-2026-05-25 03:00:56 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 176135 2>/dev/null; pkill -9 -P 176135 2>/dev/null'
-2026-05-25 03:00:56 | INFO     | peft-platform | Killed remote process 176135 via docker exec
-INFO:     172.20.0.4:59336 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-25 03:00:56 | INFO     | peft-platform | Remote training launched for job 598a6794-9fad-4c85-8bc8-a5f31a2af184
-INFO:     172.20.0.4:59340 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33706 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:33722 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:34462 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:34474 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:54736 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:52398 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:52412 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:49268 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+[remote_train] === Training job failed: 1e334a57-26f5-4e7e-a961-0a02330fa708 ===
+Traceback (most recent call last):
+  File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 213, in <module>
+    main()
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 209, in main
+    asyncio.run(run_training(job_id, model_id, model_type, dataset_id, config))
+  File "/opt/conda/lib/python3.10/asyncio/runners.py", line 44, in run
+    return loop.run_until_complete(main)
+  File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
+    return future.result()
+  File "/root/Fine-tuning/backend/app/engines/remote_train.py", line 162, in run_training
+    peft_config = engine.get_peft_config(peft_method, config)
+  File "/root/Fine-tuning/backend/app/engines/text_engine.py", line 149, in get_peft_config
+    return builder(params)
+  File "/root/Fine-tuning/backend/app/peft/__init__.py", line 43, in build_adalora_config
+    return AdaLoraConfig(
+  File "<string>", line 51, in __init__
+  File "/opt/conda/lib/python3.10/site-packages/peft/tuners/adalora/config.py", line 102, in __post_init__
+    raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")
+ValueError: AdaLoRA does not work when `total_step` is None, supply a value > 0.
+(base) [root@localhost ~]# 
+(base) [root@localhost ~]# grep -n 'total_step\|init_r.*target_r' /root/Fine-tuning/backend/app/engines/text_engine.py
+190:        # 计算总步数(AdaLoRA 需要在 get_peft_model 之前设置 total_step)
+194:        # AdaLoRA 要求 total_step > 0(通过属性名判断而非 isinstance,避免导入路径问题)
+195:        if hasattr(peft_config, "init_r") and hasattr(peft_config, "target_r"):
+196:            peft_config.total_step = max_steps
+396:                    total_steps=state.max_steps or 0,