lxylxy123321 1 неделя назад
Родитель
Коммит
aa579fb02b
3 изменённых файла: 55 добавлено и 39 удалено
  1. 2 0
      backend/app/engines/remote_train.py
  2. 2 0
      backend/app/engines/text_engine.py
  3. 51 39
      result.txt

+ 2 - 0
backend/app/engines/remote_train.py

@@ -17,6 +17,8 @@ os.environ["FLASH_ATTENTION_ENABLED"] = "0"
 # 禁用 torch.compile,避免 fork 大量 inductor worker 进程
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
+# 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
+os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 
 _progress_log_file = None
 

+ 2 - 0
backend/app/engines/text_engine.py

@@ -8,6 +8,8 @@ os.environ["TORCH_FLASH_ATTN"] = "0"
 # 禁用 torch.compile,避免每个任务 fork 几十个 inductor worker
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
+# 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
+os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 
 import asyncio
 import json

+ 51 - 39
result.txt

@@ -1,39 +1,51 @@
-(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_d7309868-1c9c-4cf7-b051-8d189db189c2.log
-[remote_train] === Training job started: d7309868-1c9c-4cf7-b051-8d189db189c2 ===
-[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
-[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
-[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 4, "gradient_accumulation": 4, "learning
-[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
-[remote_train] Step 1: Preprocessing dataset...
-[remote_train]   task_type=sft, template=alpaca
-[remote_train]   output_path=/root/Fine-tuning/backend/data/processed/d7309868-1c9c-4cf7-b051-8d189db189c2_processed.jsonl
-[remote_train]   Selecting engine for model_type=text...
-[remote_train]   Engine loaded: TextEngine
-[remote_train]   PEFT method: lora
-[remote_train]   Running preprocess_dataset...
-[remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/d7309868-1c9c-4cf7-b051-8d189db189c2_processed.jsonl
-[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
-[remote_train]   Quantization: None
-[transformers] `torch_dtype` is deprecated! Use `dtype` instead!
-Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
-Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
-torch.compile is not available in Python 3.10, using identity decorator instead
-/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
-  warnings.warn(_BETA_TRANSFORMS_WARNING)
-[11:39:41.468][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:39:51.708][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:40:01.948][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:40:12.188][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:40:22.428][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:40:32.668][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:40:42.908][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:40:53.148][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:41:03.389][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:41:13.629][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:41:23.868][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:41:34.109][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:41:44.348][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:41:54.588][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
-[11:42:04.829][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}, mem={torch.cuda.get_device_properties(i).total_memory/1e9:.2f}GB, alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB') for i in range(4)]"
+GPU 0: MetaX N260, mem=68.48GB, alloc=0.00GB
+GPU 1: MetaX N260, mem=68.48GB, alloc=0.00GB
+GPU 2: MetaX N260, mem=68.48GB, alloc=0.00GB
+GPU 3: MetaX N260, mem=68.48GB, alloc=0.00GB
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import torch; print(torch.cuda.memory_allocated())"
+0
+(base) [root@localhost ~]# mx-smi 2>/dev/null || mcli-smi 2>/dev/null || echo "No smi tool found"
+mx-smi  version: 2.2.9
+
+=================== MetaX System Management Interface Log ===================
+Timestamp                                         : Thu May 21 01:30:13 2026
+
+Attached GPUs                                     : 4
++---------------------------------------------------------------------------------+
+| MX-SMI 2.2.9                       Kernel Mode Driver Version: 3.4.4            |
+| MACA Version: 3.3.0.15             BIOS Version: 1.30.0.0                       |
+|------------------+-----------------+---------------------+----------------------|
+| Board       Name | GPU   Persist-M | Bus-id              | GPU-Util      sGPU-M |
+| Pwr:Usage/Cap    | Temp       Perf | Memory-Usage        | GPU-State            |
+|==================+=================+=====================+======================|
+| 0     MetaX N260 | 0           Off | 0000:b5:00.0        | 0%          Disabled |
+| 53W / 225W       | 43C          P9 | 62108/65536 MiB     | Available            |
++------------------+-----------------+---------------------+----------------------+
+| 1     MetaX N260 | 1           Off | 0000:b6:00.0        | 0%          Disabled |
+| 49W / 225W       | 42C          P9 | 60952/65536 MiB     | Available            |
++------------------+-----------------+---------------------+----------------------+
+| 2     MetaX N260 | 2           Off | 0000:b9:00.0        | 0%          Disabled |
+| 53W / 225W       | 44C          P9 | 30691/65536 MiB     | Available            |
++------------------+-----------------+---------------------+----------------------+
+| 3     MetaX N260 | 3           Off | 0000:bd:00.0        | 0%          Disabled |
+| 51W / 225W       | 42C          P9 | 30469/65536 MiB     | Available            |
++------------------+-----------------+---------------------+----------------------+
+
++---------------------------------------------------------------------------------+
+| Process:                                                                        |
+|  GPU                    PID         Process Name                 GPU Memory     |
+|                                                                  Usage(MiB)     |
+|=================================================================================|
+|  0                  1007916         VLLM::Worker_TP              59790          |
+|  0                  1129825         python                       1618           |
+|  1                  1007917         VLLM::Worker_TP              59790          |
+|  1                  1129825         python                       490            |
+|  2                   888820         VLLM::EngineCor              29530          |
+|  2                  1129825         python                       490            |
+|  3                   894310         VLLM::EngineCor              29180          |
+|  3                  1129825         python                       618            |
++---------------------------------------------------------------------------------+
+
+End of Log
+(base) [root@loca