lxylxy123321 пре 6 дана
родитељ
комит
b4e83df4f3
3 измењених фајлова са 108 додато и 51 уклоњено
  1. 2 0
      backend/app/engines/remote_train.py
  2. 2 0
      backend/app/engines/text_engine.py
  3. 104 51
      result.txt

+ 2 - 0
backend/app/engines/remote_train.py

@@ -18,6 +18,8 @@ os.environ["FLASH_ATTENTION_ENABLED"] = "0"
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
+# 沐曦 GPU 优先用 METAX_VISIBLE_DEVICES,同时设 CUDA_VISIBLE_DEVICES 兜底
+os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
 os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 
 _progress_log_file = None

+ 2 - 0
backend/app/engines/text_engine.py

@@ -9,6 +9,8 @@ os.environ["TORCH_FLASH_ATTN"] = "0"
 os.environ["PT2_COMPILE"] = "0"
 os.environ["TORCHINDUCTOR_MAX_WORKERS"] = "1"
 # 限制训练只用 GPU 2 和 3(GPU 0/1 被 VLLM 占用)
+# 沐曦 GPU 优先用 METAX_VISIBLE_DEVICES,同时设 CUDA_VISIBLE_DEVICES 兜底
+os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
 os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
 
 import asyncio

+ 104 - 51
result.txt

@@ -1,51 +1,104 @@
-(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import torch; [print(f'GPU {i}: {torch.cuda.get_device_name(i)}, mem={torch.cuda.get_device_properties(i).total_memory/1e9:.2f}GB, alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB') for i in range(4)]"
-GPU 0: MetaX N260, mem=68.48GB, alloc=0.00GB
-GPU 1: MetaX N260, mem=68.48GB, alloc=0.00GB
-GPU 2: MetaX N260, mem=68.48GB, alloc=0.00GB
-GPU 3: MetaX N260, mem=68.48GB, alloc=0.00GB
-(base) [root@localhost ~]# docker exec finetune-trainer /opt/conda/bin/python -c "import torch; print(torch.cuda.memory_allocated())"
-0
-(base) [root@localhost ~]# mx-smi 2>/dev/null || mcli-smi 2>/dev/null || echo "No smi tool found"
-mx-smi  version: 2.2.9
-
-=================== MetaX System Management Interface Log ===================
-Timestamp                                         : Thu May 21 01:30:13 2026
-
-Attached GPUs                                     : 4
-+---------------------------------------------------------------------------------+
-| MX-SMI 2.2.9                       Kernel Mode Driver Version: 3.4.4            |
-| MACA Version: 3.3.0.15             BIOS Version: 1.30.0.0                       |
-|------------------+-----------------+---------------------+----------------------|
-| Board       Name | GPU   Persist-M | Bus-id              | GPU-Util      sGPU-M |
-| Pwr:Usage/Cap    | Temp       Perf | Memory-Usage        | GPU-State            |
-|==================+=================+=====================+======================|
-| 0     MetaX N260 | 0           Off | 0000:b5:00.0        | 0%          Disabled |
-| 53W / 225W       | 43C          P9 | 62108/65536 MiB     | Available            |
-+------------------+-----------------+---------------------+----------------------+
-| 1     MetaX N260 | 1           Off | 0000:b6:00.0        | 0%          Disabled |
-| 49W / 225W       | 42C          P9 | 60952/65536 MiB     | Available            |
-+------------------+-----------------+---------------------+----------------------+
-| 2     MetaX N260 | 2           Off | 0000:b9:00.0        | 0%          Disabled |
-| 53W / 225W       | 44C          P9 | 30691/65536 MiB     | Available            |
-+------------------+-----------------+---------------------+----------------------+
-| 3     MetaX N260 | 3           Off | 0000:bd:00.0        | 0%          Disabled |
-| 51W / 225W       | 42C          P9 | 30469/65536 MiB     | Available            |
-+------------------+-----------------+---------------------+----------------------+
-
-+---------------------------------------------------------------------------------+
-| Process:                                                                        |
-|  GPU                    PID         Process Name                 GPU Memory     |
-|                                                                  Usage(MiB)     |
-|=================================================================================|
-|  0                  1007916         VLLM::Worker_TP              59790          |
-|  0                  1129825         python                       1618           |
-|  1                  1007917         VLLM::Worker_TP              59790          |
-|  1                  1129825         python                       490            |
-|  2                   888820         VLLM::EngineCor              29530          |
-|  2                  1129825         python                       490            |
-|  3                   894310         VLLM::EngineCor              29180          |
-|  3                  1129825         python                       618            |
-+---------------------------------------------------------------------------------+
-
-End of Log
-(base) [root@loca
+lq@lq:~/Fine-tuning$ sudo docker logs -f -t finetune-backend
+[sudo] password for lq: 
+Sorry, try again.
+[sudo] password for lq: 
+2026-05-21T05:34:12.856748710Z => Syncing backend code to compute node 192.168.91.253 ...
+2026-05-21T05:34:12.904890703Z Warning: Permanently added '192.168.91.253' (ED25519) to the list of known hosts.
+2026-05-21T05:34:30.582860442Z sending incremental file list
+2026-05-21T05:34:30.614393978Z app/engines/
+2026-05-21T05:34:30.655962590Z 
+2026-05-21T05:34:30.656054587Z sent 2,425 bytes  received 29 bytes  132.65 bytes/sec
+2026-05-21T05:34:30.656067153Z total size is 215,247  speedup is 87.71
+2026-05-21T05:34:30.658073223Z => Sync done.
+2026-05-21T05:34:31.901691988Z INFO:     Started server process [1]
+2026-05-21T05:34:31.901771435Z INFO:     Waiting for application startup.
+2026-05-21T05:34:31.999831081Z 2026-05-21 05:34:31 | INFO     | peft-platform | JobQueue started with 2 workers
+2026-05-21T05:34:31.999992582Z INFO:     Application startup complete.
+2026-05-21T05:34:32.000819280Z INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)
+2026-05-21T05:34:33.658164981Z INFO:     127.0.0.1:58612 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:34:40.128876661Z INFO:     172.20.0.4:57504 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:34:40.981739480Z INFO:     172.20.0.4:57510 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T05:34:41.037853040Z INFO:     172.20.0.4:57518 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:34:41.042812013Z INFO:     172.20.0.4:57516 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T05:34:44.446833463Z 2026-05-21 05:34:44 | INFO     | peft-platform | Job 83732d01-5022-4d67-9c44-acdf47f89092 enqueued
+2026-05-21T05:34:44.446919521Z 2026-05-21 05:34:44 | INFO     | peft-platform | Training job created: 83732d01-5022-4d67-9c44-acdf47f89092
+2026-05-21T05:34:44.447206930Z INFO:     172.20.0.4:38752 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:34:44.571734837Z INFO:     172.20.0.4:38758 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T05:34:44.581743700Z 2026-05-21 05:34:44 | INFO     | app.engines.text_engine | Preprocessed 60 samples for sft/alpaca
+2026-05-21T05:35:37.883942489Z 2026-05-21 05:35:37 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
+2026-05-21T05:35:37.884065139Z 2026-05-21 05:35:37 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/processed/ms_yanalong_yanalong/data.jsonl -> /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T05:35:55.509694297Z 2026-05-21 05:35:55 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T05:36:30.923810649Z 2026-05-21 05:36:30 | INFO     | peft-platform | Remote training launched in container: job=83732d01-5022-4d67-9c44-acdf47f89092, container_pid=2506
+2026-05-21T05:36:30.928971588Z INFO:     127.0.0.1:56772 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:36:30.933549372Z INFO:     172.20.0.4:38766 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:30.933947243Z INFO:     172.20.0.4:38762 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T05:36:30.935698059Z INFO:     127.0.0.1:47622 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:36:30.936923403Z INFO:     127.0.0.1:36574 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:36:30.944840546Z INFO:     172.20.0.4:53070 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:30.945740117Z INFO:     172.20.0.4:53074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:30.946639063Z INFO:     172.20.0.4:34814 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:30.948415195Z INFO:     172.20.0.4:34830 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:31.294922087Z INFO:     172.20.0.4:40276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:31.316642706Z INFO:     172.20.0.4:40282 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:31.341533316Z INFO:     172.20.0.4:40288 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:32.082545706Z INFO:     172.20.0.4:40296 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:32.083579114Z INFO:     172.20.0.4:40300 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:32.094750861Z INFO:     172.20.0.4:40308 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:32.101512544Z INFO:     172.20.0.4:40324 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:32.112351440Z INFO:     172.20.0.4:40336 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:42.269063163Z INFO:     172.20.0.4:58890 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:36:57.587271550Z INFO:     172.20.0.4:41132 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:01.147104111Z INFO:     127.0.0.1:50416 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:37:01.264907014Z INFO:     172.20.0.4:41134 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:06.265588600Z INFO:     172.20.0.4:36266 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:11.264471858Z INFO:     172.20.0.4:36276 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:16.268764019Z INFO:     172.20.0.4:60828 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:21.263744160Z INFO:     172.20.0.4:60832 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:26.277948988Z INFO:     172.20.0.4:42252 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:31.273684817Z INFO:     172.20.0.4:42256 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:31.346532880Z INFO:     127.0.0.1:37738 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:37:36.279624659Z INFO:     172.20.0.4:50096 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:41.270703742Z INFO:     172.20.0.4:50100 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:46.311247473Z INFO:     172.20.0.4:38902 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:51.278088336Z INFO:     172.20.0.4:38910 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:37:56.280272066Z INFO:     172.20.0.4:60532 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:38:00.370724449Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] === Training job started: 83732d01-5022-4d67-9c44-acdf47f89092 ===
+2026-05-21T05:38:00.370793541Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] model_id=Qwen/Qwen3.5-0.8B, model_type=text
+2026-05-21T05:38:00.370808018Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] dataset_path=/root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T05:38:00.370817515Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] config={"model_id": "Qwen/Qwen3.5-0.8B", "model_type": "text", "dataset_id": "3d5f8808-e71a-449d-94e9-c61c4881b2cf", "peft_method": "lora", "epochs": 3, "batch_size": 4, "gradient_accumulation": 4, "learning
+2026-05-21T05:38:00.370844181Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] Dataset file exists: /root/Fine-tuning/backend/data/datasets/data.jsonl
+2026-05-21T05:38:00.370964860Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] Step 1: Preprocessing dataset...
+2026-05-21T05:38:00.371068870Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   task_type=sft, template=alpaca
+2026-05-21T05:38:00.371128216Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   output_path=/root/Fine-tuning/backend/data/processed/83732d01-5022-4d67-9c44-acdf47f89092_processed.jsonl
+2026-05-21T05:38:00.371142431Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Selecting engine for model_type=text...
+2026-05-21T05:38:00.371157641Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Engine loaded: TextEngine
+2026-05-21T05:38:00.371224686Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   PEFT method: lora
+2026-05-21T05:38:00.371289339Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Running preprocess_dataset...
+2026-05-21T05:38:00.371302088Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Preprocessing done, output: /root/Fine-tuning/backend/data/processed/83732d01-5022-4d67-9c44-acdf47f89092_processed.jsonl
+2026-05-21T05:38:00.371329600Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train] Step 2: Loading model: Qwen/Qwen3.5-0.8B...
+2026-05-21T05:38:00.371356109Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] [remote_train]   Quantization: None
+2026-05-21T05:38:00.371563114Z 2026-05-21 05:38:00 | WARNING  | peft-platform | [253:83732d01] [transformers] `torch_dtype` is deprecated! Use `dtype` instead!
+2026-05-21T05:38:00.371662704Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+2026-05-21T05:38:00.371737137Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
+2026-05-21T05:38:00.371772645Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] torch.compile is not available in Python 3.10, using identity decorator instead
+2026-05-21T05:38:00.371889023Z 2026-05-21 05:38:00 | WARNING  | peft-platform | [253:83732d01] /opt/conda/lib/python3.10/site-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-21T05:38:00.372089241Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-21T05:38:00.372109324Z 2026-05-21 05:38:00 | WARNING  | peft-platform | [253:83732d01] /opt/conda/lib/python3.10/site-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().
+2026-05-21T05:38:00.372121857Z 2026-05-21 05:38:00 | INFO     | peft-platform | [253:83732d01] warnings.warn(_BETA_TRANSFORMS_WARNING)
+2026-05-21T05:38:00.372137426Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:00.956][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
+2026-05-21T05:38:00.372289777Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:11.196][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
+2026-05-21T05:38:00.372303934Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:21.436][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
+2026-05-21T05:38:00.372371497Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:31.676][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
+2026-05-21T05:38:00.372474247Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:41.916][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
+2026-05-21T05:38:00.372500910Z 2026-05-21 05:38:00 | ERROR    | peft-platform | [253:83732d01] [13:37:52.156][MXKW][E]queues.c                :826 : [mxkwCreateQueueBlock][Hint]ioctl create queue block timeout, gpu_id:8525 type:21. Retrying.
+2026-05-21T05:38:01.548682891Z INFO:     127.0.0.1:52318 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:38:31.744481381Z INFO:     127.0.0.1:55348 - "GET /health HTTP/1.1" 200 OK
+2026-05-21T05:38:42.293320538Z INFO:     172.20.0.4:40146 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:38:47.635600218Z INFO:     172.20.0.4:43030 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:38:49.472820117Z INFO:     172.20.0.4:43050 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
+2026-05-21T05:38:49.474803507Z INFO:     172.20.0.4:43042 - "GET /api/v1/models/ HTTP/1.0" 200 OK
+2026-05-21T05:38:49.475933409Z INFO:     172.20.0.4:43058 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
+2026-05-21T05:38:50.415585317Z 2026-05-21 05:38:50 | INFO     | peft-platform | Job 83732d01-5022-4d67-9c44-acdf47f89092 cancelled
+2026-05-21T05:38:50.420419117Z 2026-05-21 05:38:50 | INFO     | peft-platform | Job cancelled: 83732d01-5022-4d67-9c44-acdf47f89092
+2026-05-21T05:38:50.420771552Z INFO:     172.20.0.4:43064 - "POST /api/v1/training/jobs/83732d01-5022-4d67-9c44-acdf47f89092/cancel HTTP/1.0" 200 OK
+2026-05-21T05:38:50.444305921Z INFO:     172.20.0.4:43074 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK