|
|
@@ -1,39 +1,51 @@
|
|
|
-(base) [root@localhost ~]# docker exec finetune-trainer cat /tmp/train_d7309868-1c9c-4cf7-b051-8d189db189c2.log
|
|
|
-[remote_train] === Training job started: d7309868-1c9c-4cf7-b051-8d189db189c2 ===
|
|
|
-[remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
|
|
|
-[remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
-[remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 4, "gradient_accumulation": 4, "learning
|
|
|
-[remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
|
|
|
-[remote_train] Step 1: Preprocessing dataset...
|
|
|
-[remote_train] task_type=sft, template=alpaca
|
|
|
-[remote_train] output_path=/root/Fine-tuning/backend/data/processed/d7309868-1c9c-4cf7-b051-8d189db189c2_processed.jsonl
|
|
|
-[remote_train] Selecting engine for model_type=text...
|
|
|
-[remote_train] Engine loaded: TextEngine
|
|
|
-[remote_train] PEFT method: lora
|
|
|
-[remote_train] Running preprocess_dataset...
|
|
|
-[remote_train] Preprocessing done, output: /root/Fine-tuning/backend/data/processed/d7309868-1c9c-4cf7-b051-8d189db189c2_processed.jsonl
|
|
|
-[remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
|
|
|
-[remote_train] Quantization: None
|
|
|
-[transformers] `torch_dtype` is deprecated! Use `dtype` instead!
|
|
|
-Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
|
|
|
-Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
|
|
|
-torch.compile is not available in Python 3.10, using identity decorator instead
|
|
|
-/opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
|
|
|
- warnings.warn(_BETA_TRANSFORMS_WARNING)
|
|
|
-/opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
|
|
|
- warnings.warn(_BETA_TRANSFORMS_WARNING)
|
|
|
-[11:39:41.468][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:39:51.708][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:40:01.948][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:40:12.188][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:40:22.428][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:40:32.668][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:40:42.908][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:40:53.148][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:41:03.389][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:41:13.629][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:41:23.868][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:41:34.109][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:41:44.348][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:41:54.588][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
-[11:42:04.829][MXKW][E]queues.c :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:9288 type:21. Retrying.
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}, mem={torch.cuda.get_device_properties(i).total_memory/1e9:.2f}GB, alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB') for i in range(4)]"
|
|
|
+GPU 0: MetaX N260, mem=68.48GB, alloc=0.00GB
|
|
|
+GPU 1: MetaX N260, mem=68.48GB, alloc=0.00GB
|
|
|
+GPU 2: MetaX N260, mem=68.48GB, alloc=0.00GB
|
|
|
+GPU 3: MetaX N260, mem=68.48GB, alloc=0.00GB
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import torch; print(torch.cuda.memory_allocated())"
|
|
|
+0
|
|
|
+(base) [root@localhost ~]# mx-smi 2>/dev/null || mcli-smi 2>/dev/null || echo "No smi tool found"
|
|
|
+mx-smi version: 2.2.9
|
|
|
+
|
|
|
+=================== MetaX System Management Interface Log ===================
|
|
|
+Timestamp : Thu May 21 01:30:13 2026
|
|
|
+
|
|
|
+Attached GPUs : 4
|
|
|
++---------------------------------------------------------------------------------+
|
|
|
+| MX-SMI 2.2.9 Kernel Mode Driver Version: 3.4.4 |
|
|
|
+| MACA Version: 3.3.0.15 BIOS Version: 1.30.0.0 |
|
|
|
+|------------------+-----------------+---------------------+----------------------|
|
|
|
+| Board Name | GPU Persist-M | Bus-id | GPU-Util sGPU-M |
|
|
|
+| Pwr:Usage/Cap | Temp Perf | Memory-Usage | GPU-State |
|
|
|
+|==================+=================+=====================+======================|
|
|
|
+| 0 MetaX N260 | 0 Off | 0000:b5:00.0 | 0% Disabled |
|
|
|
+| 53W / 225W | 43C P9 | 62108/65536 MiB | Available |
|
|
|
++------------------+-----------------+---------------------+----------------------+
|
|
|
+| 1 MetaX N260 | 1 Off | 0000:b6:00.0 | 0% Disabled |
|
|
|
+| 49W / 225W | 42C P9 | 60952/65536 MiB | Available |
|
|
|
++------------------+-----------------+---------------------+----------------------+
|
|
|
+| 2 MetaX N260 | 2 Off | 0000:b9:00.0 | 0% Disabled |
|
|
|
+| 53W / 225W | 44C P9 | 30691/65536 MiB | Available |
|
|
|
++------------------+-----------------+---------------------+----------------------+
|
|
|
+| 3 MetaX N260 | 3 Off | 0000:bd:00.0 | 0% Disabled |
|
|
|
+| 51W / 225W | 42C P9 | 30469/65536 MiB | Available |
|
|
|
++------------------+-----------------+---------------------+----------------------+
|
|
|
+
|
|
|
++---------------------------------------------------------------------------------+
|
|
|
+| Process: |
|
|
|
+| GPU PID Process Name GPU Memory |
|
|
|
+| Usage(MiB) |
|
|
|
+|=================================================================================|
|
|
|
+| 0 1007916 VLLM::Worker_TP 59790 |
|
|
|
+| 0 1129825 python 1618 |
|
|
|
+| 1 1007917 VLLM::Worker_TP 59790 |
|
|
|
+| 1 1129825 python 490 |
|
|
|
+| 2 888820 VLLM::EngineCor 29530 |
|
|
|
+| 2 1129825 python 490 |
|
|
|
+| 3 894310 VLLM::EngineCor 29180 |
|
|
|
+| 3 1129825 python 618 |
|
|
|
++---------------------------------------------------------------------------------+
|
|
|
+
|
|
|
+End of Log
|
|
|
+(base) [root@loca
|