Ver Fonte

修复engine问题

lxylxy123321 há 1 semana atrás
pai
commit
bd3e4f3267

+ 1 - 0
backend/app/engines/multimodal_engine.py

@@ -187,6 +187,7 @@ class _ProgressCallback:
     def on_save(self, args, state, control, **kwargs): pass
     def on_save(self, args, state, control, **kwargs): pass
     def on_predict(self, args, state, control, metrics=None, **kwargs): pass
     def on_predict(self, args, state, control, metrics=None, **kwargs): pass
     def on_init_end(self, args, state, control, **kwargs): pass
     def on_init_end(self, args, state, control, **kwargs): pass
+    def on_epoch_begin(self, args, state, control, **kwargs): pass
 
 
 
 
 from app.core.websocket import send_completed, send_epoch_done, send_progress
 from app.core.websocket import send_completed, send_epoch_done, send_progress

+ 3 - 0
backend/app/engines/text_engine.py

@@ -331,6 +331,9 @@ class _ProgressCallback:
     def on_init_end(self, args, state, control, **kwargs):
     def on_init_end(self, args, state, control, **kwargs):
         pass
         pass
 
 
+    def on_epoch_begin(self, args, state, control, **kwargs):
+        pass
+
 
 
 # 全局单例
 # 全局单例
 text_engine = TextEngine()
 text_engine = TextEngine()

+ 1 - 0
backend/app/engines/vision_engine.py

@@ -187,6 +187,7 @@ class _ProgressCallback:
     def on_save(self, args, state, control, **kwargs): pass
     def on_save(self, args, state, control, **kwargs): pass
     def on_predict(self, args, state, control, metrics=None, **kwargs): pass
     def on_predict(self, args, state, control, metrics=None, **kwargs): pass
     def on_init_end(self, args, state, control, **kwargs): pass
     def on_init_end(self, args, state, control, **kwargs): pass
+    def on_epoch_begin(self, args, state, control, **kwargs): pass
 
 
 
 
 from app.core.websocket import send_completed, send_epoch_done, send_progress
 from app.core.websocket import send_completed, send_epoch_done, send_progress

+ 223 - 12
result.txt

@@ -1,21 +1,232 @@
-INFO:     172.19.0.3:36528 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-15 14:43:01 | INFO     | peft-platform | Job 32178086-d3ff-40b2-8bb5-5b6403ca47a7 enqueued
-2026-05-15 14:43:01 | INFO     | peft-platform | Training job created: 32178086-d3ff-40b2-8bb5-5b6403ca47a7
-INFO:     172.19.0.3:36538 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-15 14:43:01 | INFO     | peft-platform | Preprocessed 60 samples for sft/alpaca
+(base) [root@localhost Fine-tuning]# ls -laR /root/Fine-tuning/data/
+/root/Fine-tuning/data/:
+total 8
+drwxr-xr-x. 6 root root   89 May 14 21:59 .
+drwxr-xr-x. 6 root root 4096 May 15 02:46 ..
+drwxr-xr-x. 4 root root   94 May 15 02:50 adapters
+-rw-r--r--. 1 root root    0 May 14 21:59 finetuning.db
+drwxr-xr-x. 6 root root   65 May 14 23:43 models
+drwxr-xr-x. 8 root root 4096 May 15 02:50 processed
+drwxr-xr-x. 2 root root    6 May 14 21:45 uploads
+
+/root/Fine-tuning/data/adapters:
+total 0
+drwxr-xr-x. 4 root root 94 May 15 02:50 .
+drwxr-xr-x. 6 root root 89 May 14 21:59 ..
+drwxr-xr-x. 2 root root  6 May 15 02:50 0987d6bb-7d38-4124-8139-d02a58e7988a
+drwxr-xr-x. 2 root root  6 May 15 02:43 32178086-d3ff-40b2-8bb5-5b6403ca47a7
+
+/root/Fine-tuning/data/adapters/0987d6bb-7d38-4124-8139-d02a58e7988a:
+total 0
+drwxr-xr-x. 2 root root  6 May 15 02:50 .
+drwxr-xr-x. 4 root root 94 May 15 02:50 ..
+
+/root/Fine-tuning/data/adapters/32178086-d3ff-40b2-8bb5-5b6403ca47a7:
+total 0
+drwxr-xr-x. 2 root root  6 May 15 02:43 .
+drwxr-xr-x. 4 root root 94 May 15 02:50 ..
+
+/root/Fine-tuning/data/models:
+total 0
+drwxr-xr-x. 6 root root 65 May 14 23:43 .
+drwxr-xr-x. 6 root root 89 May 14 21:59 ..
+drwxr-xr-x. 2 root root  6 May 15 02:50 .lock
+drwxr-xr-x. 3 root root 50 May 14 22:08 Qwen
+drwxr-xr-x. 4 root root 34 May 14 23:43 ._____temp
+drwxr-xr-x. 3 root root 22 May 14 23:43 yanalong
+
+/root/Fine-tuning/data/models/.lock:
+total 0
+drwxr-xr-x. 2 root root  6 May 15 02:50 .
+drwxr-xr-x. 6 root root 65 May 14 23:43 ..
+
+/root/Fine-tuning/data/models/Qwen:
+total 4
+drwxr-xr-x. 3 root root   50 May 14 22:08 .
+drwxr-xr-x. 6 root root   65 May 14 23:43 ..
+lrwxrwxrwx. 1 root root   59 May 14 22:08 Qwen3.5-0.8B -> /root/Fine-tuning/backend/data/models/Qwen/Qwen3___5-0___8B
+drwxr-xr-x. 2 root root 4096 May 14 22:08 Qwen3___5-0___8B
+
+/root/Fine-tuning/data/models/Qwen/Qwen3___5-0___8B:
+total 1728540
+drwxr-xr-x. 2 root root       4096 May 14 22:08 .
+drwxr-xr-x. 3 root root         50 May 14 22:08 ..
+-rw-r--r--. 1 root root       7755 May 14 22:02 chat_template.jinja
+-rw-r--r--. 1 root root       2907 May 14 22:02 config.json
+-rw-r--r--. 1 root root         51 May 14 22:02 configuration.json
+-rw-r--r--. 1 root root      11343 May 14 22:02 LICENSE
+-rw-r--r--. 1 root root         44 May 15 02:50 .mdl
+-rw-r--r--. 1 root root    3353259 May 14 22:02 merges.txt
+-rw-r--r--. 1 root root 1746942600 May 14 22:08 model.safetensors-00001-of-00001.safetensors
+-rw-r--r--. 1 root root      50900 May 14 22:02 model.safetensors.index.json
+-rw-------. 1 root root        977 May 14 22:08 .msc
+-rw-r--r--. 1 root root         36 May 15 02:50 .mv
+-rw-r--r--. 1 root root        390 May 14 22:02 preprocessor_config.json
+-rw-r--r--. 1 root root      61705 May 14 22:02 README.md
+-rw-r--r--. 1 root root      16709 May 14 22:02 tokenizer_config.json
+-rw-r--r--. 1 root root   12807982 May 14 22:02 tokenizer.json
+-rw-r--r--. 1 root root        385 May 14 22:02 video_preprocessor_config.json
+-rw-r--r--. 1 root root    6722759 May 14 22:02 vocab.json
+
+/root/Fine-tuning/data/models/._____temp:
+total 0
+drwxr-xr-x. 4 root root 34 May 14 23:43 .
+drwxr-xr-x. 6 root root 65 May 14 23:43 ..
+drwxr-xr-x. 3 root root 26 May 14 22:02 Qwen
+drwxr-xr-x. 3 root root 22 May 14 23:43 yanalong
+
+/root/Fine-tuning/data/models/._____temp/Qwen:
+total 0
+drwxr-xr-x. 3 root root 26 May 14 22:02 .
+drwxr-xr-x. 4 root root 34 May 14 23:43 ..
+drwxr-xr-x. 2 root root  6 May 14 22:08 Qwen3.5-0.8B
+
+/root/Fine-tuning/data/models/._____temp/Qwen/Qwen3.5-0.8B:
+total 0
+drwxr-xr-x. 2 root root  6 May 14 22:08 .
+drwxr-xr-x. 3 root root 26 May 14 22:02 ..
+
+/root/Fine-tuning/data/models/._____temp/yanalong:
+total 0
+drwxr-xr-x. 3 root root 22 May 14 23:43 .
+drwxr-xr-x. 4 root root 34 May 14 23:43 ..
+drwxr-xr-x. 2 root root  6 May 14 23:43 yanalong
+
+/root/Fine-tuning/data/models/._____temp/yanalong/yanalong:
+total 0
+drwxr-xr-x. 2 root root  6 May 14 23:43 .
+drwxr-xr-x. 3 root root 22 May 14 23:43 ..
+
+/root/Fine-tuning/data/models/yanalong:
+total 0
+drwxr-xr-x. 3 root root 22 May 14 23:43 .
+drwxr-xr-x. 6 root root 65 May 14 23:43 ..
+drwxr-xr-x. 2 root root 84 May 14 23:43 yanalong
+
+/root/Fine-tuning/data/models/yanalong/yanalong:
+total 20
+drwxr-xr-x. 2 root root   84 May 14 23:43 .
+drwxr-xr-x. 3 root root   22 May 14 23:43 ..
+-rw-r--r--. 1 root root   36 May 14 23:43 configuration.json
+-rw-r--r--. 1 root root   40 May 14 23:43 .mdl
+-rw-------. 1 root root  165 May 14 23:43 .msc
+-rw-r--r--. 1 root root   36 May 14 23:43 .mv
+-rw-r--r--. 1 root root 1385 May 14 23:43 README.md
+
+/root/Fine-tuning/data/processed:
+total 28
+drwxr-xr-x. 8 root root 4096 May 15 02:50 .
+drwxr-xr-x. 6 root root   89 May 14 21:59 ..
+-rw-r--r--. 1 root root 8287 May 15 02:50 0987d6bb-7d38-4124-8139-d02a58e7988a_processed.jsonl
+-rw-r--r--. 1 root root 8287 May 15 02:43 32178086-d3ff-40b2-8bb5-5b6403ca47a7_processed.jsonl
+drwxr-xr-x. 2 root root  155 May 15 00:34 downloads
+drwxr-xr-x. 2 root root    6 May 15 02:49 .lock
+drwxr-xr-x. 2 root root   24 May 15 02:20 ms_yanalong_yanalong
+drwxr-xr-x. 3 root root   22 May 15 00:41 ._____temp
+drwxr-xr-x. 3 root root   22 May 15 00:41 yanalong
+drwxr-xr-x. 3 root root   38 May 15 00:34 yanalong___yanalong
+
+/root/Fine-tuning/data/processed/downloads:
+total 16
+drwxr-xr-x. 2 root root  155 May 15 00:34 .
+drwxr-xr-x. 8 root root 4096 May 15 02:50 ..
+-rw-r--r--. 1 root root 7703 May 15 00:34 07accbecba3067d05a158915b80a160fd071498296059b329fa91f0f0ad966be
+-rw-r--r--. 1 root root  145 May 15 00:34 07accbecba3067d05a158915b80a160fd071498296059b329fa91f0f0ad966be.json
+
+/root/Fine-tuning/data/processed/.lock:
+total 4
+drwxr-xr-x. 2 root root    6 May 15 02:49 .
+drwxr-xr-x. 8 root root 4096 May 15 02:50 ..
+
+/root/Fine-tuning/data/processed/ms_yanalong_yanalong:
+total 12
+drwxr-xr-x. 2 root root   24 May 15 02:20 .
+drwxr-xr-x. 8 root root 4096 May 15 02:50 ..
+-rw-r--r--. 1 root root 7747 May 15 02:49 data.jsonl
+
+/root/Fine-tuning/data/processed/._____temp:
+total 4
+drwxr-xr-x. 3 root root   22 May 15 00:41 .
+drwxr-xr-x. 8 root root 4096 May 15 02:50 ..
+drwxr-xr-x. 3 root root   22 May 15 00:41 yanalong
+
+/root/Fine-tuning/data/processed/._____temp/yanalong:
+total 0
+drwxr-xr-x. 3 root root 22 May 15 00:41 .
+drwxr-xr-x. 3 root root 22 May 15 00:41 ..
+drwxr-xr-x. 2 root root  6 May 15 00:41 yanalong
+
+/root/Fine-tuning/data/processed/._____temp/yanalong/yanalong:
+total 0
+drwxr-xr-x. 2 root root  6 May 15 00:41 .
+drwxr-xr-x. 3 root root 22 May 15 00:41 ..
+
+/root/Fine-tuning/data/processed/yanalong:
+total 4
+drwxr-xr-x. 3 root root   22 May 15 00:41 .
+drwxr-xr-x. 8 root root 4096 May 15 02:50 ..
+drwxr-xr-x. 2 root root   84 May 15 00:41 yanalong
+
+/root/Fine-tuning/data/processed/yanalong/yanalong:
+total 20
+drwxr-xr-x. 2 root root   84 May 15 00:41 .
+drwxr-xr-x. 3 root root   22 May 15 00:41 ..
+-rw-r--r--. 1 root root   36 May 15 00:41 configuration.json
+-rw-r--r--. 1 root root   40 May 15 02:49 .mdl
+-rw-------. 1 root root  165 May 15 00:41 .msc
+-rw-r--r--. 1 root root   36 May 15 02:49 .mv
+-rw-r--r--. 1 root root 1385 May 15 00:41 README.md
+
+/root/Fine-tuning/data/processed/yanalong___yanalong:
+total 4
+drwxr-xr-x. 3 root root   38 May 15 00:34 .
+drwxr-xr-x. 8 root root 4096 May 15 02:50 ..
+drwxr-xr-x. 3 root root   19 May 15 00:34 default-7d50d822a140caa3
+
+/root/Fine-tuning/data/processed/yanalong___yanalong/default-7d50d822a140caa3:
+total 0
+drwxr-xr-x. 3 root root 19 May 15 00:34 .
+drwxr-xr-x. 3 root root 38 May 15 00:34 ..
+drwxr-xr-x. 3 root root 20 May 15 00:35 0.0.0
+
+/root/Fine-tuning/data/processed/yanalong___yanalong/default-7d50d822a140caa3/0.0.0:
+total 0
+drwxr-xr-x. 3 root root 20 May 15 00:35 .
+drwxr-xr-x. 3 root root 19 May 15 00:34 ..
+drwxr-xr-x. 2 root root 59 May 15 00:34 master
+
+/root/Fine-tuning/data/processed/yanalong___yanalong/default-7d50d822a140caa3/0.0.0/master:
+total 12
+drwxr-xr-x. 2 root root   59 May 15 00:34 .
+drwxr-xr-x. 3 root root   20 May 15 00:35 ..
+-rw-r--r--. 1 root root  504 May 15 00:34 dataset_info.json
+-rw-r--r--. 1 root root 7120 May 15 00:34 yanalong-train.arrow
+
+/root/Fine-tuning/data/uploads:
+total 0
+drwxr-xr-x. 2 root root  6 May 14 21:45 .
+drwxr-xr-x. 6 root root 89 May 14 21:59 ..
+(base) [root@lo
+
+
+
+
+
+INFO:     172.19.0.3:36448 - "GET /api/v1/models/ HTTP/1.0" 200 OK
 [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
 [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
-2026-05-15 14:43:12 | WARNING  | fla.utils | Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-2026-05-15 14:43:12 | WARNING  | fla.utils | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-2026-05-15 14:43:17 | WARNING  | fla.ops.rwkv7.fused_addcmul | torch.compile is not available in Python 3.10, using identity decorator instead
+2026-05-15 14:50:37 | WARNING  | fla.utils | Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+2026-05-15 14:50:37 | WARNING  | fla.utils | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+2026-05-15 14:50:42 | WARNING  | fla.ops.rwkv7.fused_addcmul | torch.compile is not available in Python 3.10, using identity decorator instead
 /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
 /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
   warnings.warn(_BETA_TRANSFORMS_WARNING)
   warnings.warn(_BETA_TRANSFORMS_WARNING)
 /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
 /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
   warnings.warn(_BETA_TRANSFORMS_WARNING)
   warnings.warn(_BETA_TRANSFORMS_WARNING)
-Loading weights: 100%|██████████| 320/320 [00:00<00:00, 447.82it/s]
-2026-05-15 14:43:18 | INFO     | peft-platform | Loaded model: Qwen/Qwen3.5-0.8B
-Map: 100%|██████████| 60/60 [00:00<00:00, 2251.91 examples/s]
+Loading weights: 100%|██████████| 320/320 [00:00<00:00, 395.55it/s]
+2026-05-15 14:50:44 | INFO     | peft-platform | Loaded model: Qwen/Qwen3.5-0.8B
+Map: 100%|██████████| 60/60 [00:00<00:00, 1780.68 examples/s]
 /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
 /opt/conda/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:1348: UserWarning: Model has `tie_word_embeddings=True` and a tied layer is part of the adapter, but `ensure_weight_tying` is not set to True. This can lead to complications, for example when merging the adapter or converting your model to formats other than safetensors. Check the discussion here: https://github.com/huggingface/peft/issues/2777
   warnings.warn(msg)
   warnings.warn(msg)
 [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
 [transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
-2026-05-15 14:43:32 | ERROR    | peft-platform | Job 32178086-d3ff-40b2-8bb5-5b6403ca47a7 failed: '_ProgressCallback' object has no attribute 'on_init_end'
 trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
 trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
+  0%|          | 0/12 [00:00<?, ?it/s]2026-05-15 14:50:59 | ERROR    | peft-platform | Training failed for job 0987d6bb-7d38-4124-8139-d02a58e7988a: '_ProgressCallback' object has no attribute 'on_epoch_begin'
+2026-05-15 14:50:59 | ERROR    | peft-platform | Job 0987d6bb-7d38-4124-8139-d02a58e7988a failed: '_ProgressCallback' object has no attribute 'on_epoch_begin'