浏览代码

优化训练过程

lxylxy123321 5 天之前
父节点
当前提交
6e9f8e7903
共有 4 个文件被更改，包括 58 次插入和 17 次删除
  1. 2 0
      backend/app/engines/multimodal_engine.py
  2. 3 1
      backend/app/engines/text_engine.py
  3. 2 0
      backend/app/engines/vision_engine.py
  4. 51 16
      result.txt

+ 2 - 0
backend/app/engines/multimodal_engine.py

@@ -133,6 +133,8 @@ class MultimodalEngine(BaseEngine):
             optim="adamw_torch",
             remove_unused_columns=False,
             report_to="none",
+            dataloader_num_workers=4,
+            dataloader_pin_memory=False,
         )
 
         all_callbacks = callbacks if callbacks else [_ProgressCallback(job_id)]

+ 3 - 1
backend/app/engines/text_engine.py

@@ -208,7 +208,7 @@ class TextEngine(BaseEngine):
             remove_unused_columns=False,
             report_to="none",
             gradient_checkpointing=True,
-            dataloader_num_workers=0,
+            dataloader_num_workers=4,
             dataloader_pin_memory=False,
             **({"deepspeed": deepspeed_config} if deepspeed_config else {}),
         )
@@ -240,6 +240,8 @@ class TextEngine(BaseEngine):
                 logging_steps=10,
                 fp16=True,
                 report_to="none",
+                dataloader_num_workers=4,
+                dataloader_pin_memory=False,
             )
 
             if task_type == "dpo":

+ 2 - 0
backend/app/engines/vision_engine.py

@@ -133,6 +133,8 @@ class VisionEngine(BaseEngine):
             optim="adamw_torch",
             remove_unused_columns=False,
             report_to="none",
+            dataloader_num_workers=4,
+            dataloader_pin_memory=False,
         )
 
         all_callbacks = callbacks if callbacks else [_ProgressCallback(job_id)]

+ 51 - 16
result.txt

@@ -1,16 +1,51 @@
-(base) [root@localhost ~]# docker exec -e MACA_MPS_MODE=1 finetune-trainer /opt/conda/bin/python -c "import torch; print(f'设备数: {torch.cuda.device_count()}'); [print(f'GPU {i}: {torch.cuda.get_device_name(i)}') for i in range(torch.cuda.device_count())]"
-设备数: 4
-GPU 0: MetaX N260
-GPU 1: MetaX N260
-GPU 2: MetaX N260
-GPU 3: MetaX N260
-(base) [root@localhost ~]# docker exec -e MACA_MPS_MODE=1 -e CUDA_VISIBLE_DEVICES=2,3 finetune-trainer /opt/conda/bin/python -c "import torch; print(f'设备数: {torch.cuda.device_count()}'); [print(f'逻辑GPU {i} -> 物理GPU: {torch.cuda.get_device_properties(i).name}') for i in range(torch.cuda.device_count())]"
-设备数: 2
-逻辑GPU 0 -> 物理GPU: MetaX N260
-逻辑GPU 1 -> 物理GPU: MetaX N260
-(base) [root@localhost ~]# docker exec -e MACA_MPS_MODE=1 -e METAX_VISIBLE_DEVICES=2,3 finetune-trainer /opt/conda/bin/python -c "import torch; print(f'设备数: {torch.cuda.device_count()}'); [print(f'GPU {i}: {torch.cuda.get_device_name(i)}') for i in range(torch.cuda.device_count())]"
-设备数: 4
-GPU 0: MetaX N260
-GPU 1: MetaX N260
-GPU 2: MetaX N260
-GPU 3: MetaX N260
+(base) [root@localhost ~]# docker exec finetune-trainer tail -n 50 /tmp/train_a26f4344-2575-48ca-b62a-8eca3c28fb05.log
+    self._run_epoch(
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1737, in _run_epoch
+    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1909, in training_step
+    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
+  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1981, in compute_loss
+    outputs = model(**inputs)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
+    return self._call_impl(*args, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
+    return forward_call(*args, **kwargs)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 195, in forward
+    return self.gather(outputs, self.output_device)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 218, in gather
+    return gather(outputs, output_device, dim=self.dim)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 134, in gather
+    res = gather_map(outputs)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 126, in gather_map
+    return type(out)((k, gather_map([d[k] for d in outputs])) for k in out)
+  File "<string>", line 8, in __init__
+  File "/opt/conda/lib/python3.10/site-packages/transformers/utils/generic.py", line 451, in __post_init__
+    for idx, element in enumerate(iterator):
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 126, in <genexpr>
+    return type(out)((k, gather_map([d[k] for d in outputs])) for k in out)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py", line 120, in gather_map
+    return Gather.apply(target_device, dim, *outputs)
+  File "/opt/conda/lib/python3.10/site-packages/torch/autograd/function.py", line 576, in apply
+    return super().apply(*args, **kwargs)  # type: ignore[misc]
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py", line 80, in forward
+    return comm.gather(inputs, ctx.dim, ctx.target_device)
+  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/comm.py", line 253, in gather
+    return torch._C._gather(tensors, dim, destination)
+torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 1.49 GiB. GPU 0 has a total capacity of 63.78 GiB of which 500.44 MiB is free. Of the allocated memory 2.26 GiB is allocated by PyTorch, and 46.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
+{'loss': '2.906', 'grad_norm': '1.203', 'learning_rate': '2.799e-06', 'epoch': '0.002334'}
+{'loss': '2.906', 'grad_norm': '1.112', 'learning_rate': '5.91e-06', 'epoch': '0.004669'}
+{'loss': '2.889', 'grad_norm': '1.232', 'learning_rate': '9.02e-06', 'epoch': '0.007003'}
+{'loss': '2.821', 'grad_norm': '1.221', 'learning_rate': '1.213e-05', 'epoch': '0.009338'}
+{'loss': '2.826', 'grad_norm': '1.137', 'learning_rate': '1.524e-05', 'epoch': '0.01167'}
+{'loss': '2.758', 'grad_norm': '1.051', 'learning_rate': '1.835e-05', 'epoch': '0.01401'}
+{'loss': '2.674', 'grad_norm': '1.227', 'learning_rate': '2.146e-05', 'epoch': '0.01634'}
+{'loss': '2.635', 'grad_norm': '1.046', 'learning_rate': '2.457e-05', 'epoch': '0.01868'}
+{'loss': '2.595', 'grad_norm': '1.148', 'learning_rate': '2.768e-05', 'epoch': '0.02101'}
+{'loss': '2.541', 'grad_norm': '1.157', 'learning_rate': '3.079e-05', 'epoch': '0.02334'}
+{'loss': '2.509', 'grad_norm': '1.101', 'learning_rate': '3.39e-05', 'epoch': '0.02568'}
+{'loss': '2.523', 'grad_norm': '1.242', 'learning_rate': '3.701e-05', 'epoch': '0.02801'}
+{'loss': '2.475', 'grad_norm': '1.377', 'learning_rate': '4.012e-05', 'epoch': '0.03035'}
+{'loss': '2.461', 'grad_norm': '1.403', 'learning_rate': '4.323e-05', 'epoch': '0.03268'}
+{'loss': '2.419', 'grad_norm': '1.246', 'learning_rate': '4.635e-05', 'epoch': '0.03502'}
+{'loss': '2.427', 'grad_norm': '1.41', 'learning_rate': '4.946e-05', 'epoch': '0.03735'}
+  1%|▏         | 166/12852 [18:46<23:54:59,  6.79s/it]