lxylxy123321 22 часов назад
Родитель
Коммит
feb01ed135
2 изменённых файла: 73 добавлено и 66 удалено
  1. 1 0
      backend/app/engines/text_engine.py
  2. 72 66
      result.txt

+ 1 - 0
backend/app/engines/text_engine.py

@@ -372,6 +372,7 @@ class TextEngine(BaseEngine):
                 report_to="none",
                 dataloader_num_workers=4,
                 dataloader_pin_memory=False,
+                torch_compile=False,
             )
 
             # ppo_epochs: 新版叫 num_ppo_epochs,旧版叫 ppo_epochs

+ 72 - 66
result.txt

@@ -1,66 +1,72 @@
-INFO:     172.20.0.4:39360 - "POST /api/oauth/exchange-code HTTP/1.0" 200 OK
-INFO:     172.20.0.4:39364 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:39378 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:39394 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50946 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50944 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50952 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50958 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 02:30:41 | INFO     | peft-platform | Training job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf: num_gpus=1, batch_size=16
-2026-05-27 02:30:41 | INFO     | peft-platform | Job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf enqueued
-2026-05-27 02:30:41 | INFO     | peft-platform | Training job created: 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf
-INFO:     172.20.0.4:50972 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 02:30:41 | INFO     | app.engines.text_engine | Preprocessed 0 samples for ppo/alpaca
-INFO:     172.20.0.4:50998 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:50984 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51000 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
-INFO:     172.20.0.4:51012 - "WebSocket /ws/training/4fd86f1d-3f2f-48ac-92a4-8e236159d1cf?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5ODUwMjMzLCJpYXQiOjE3Nzk4NDkwMzMsInR5cGUiOiJhY2Nlc3MifQ.WvY2rgy_lvYhdR4UGaXA6x1X5MiMFvWKwqk3JzQdpOY" [accepted]
-2026-05-27 02:30:41 | INFO     | peft-platform | 客户端已连接到训练 WebSocket (job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf)
-INFO:     connection open
-INFO:     172.20.0.4:35710 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:35720 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:43638 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:40052 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:43646 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:59604 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 02:31:07 | INFO     | peft-platform | Remote cleanup result: true
-cleaned 147 processes
-2026-05-27 02:32:00 | INFO     | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
-2026-05-27 02:32:00 | INFO     | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/uploads/ppo_sample.jsonl -> /root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl
-2026-05-27 02:32:18 | INFO     | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl
-2026-05-27 02:32:53 | INFO     | peft-platform | Remote training launched in container: job=4fd86f1d-3f2f-48ac-92a4-8e236159d1cf, container_pid=26886
-INFO:     127.0.0.1:57260 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:59094 - "GET /health HTTP/1.1" 200 OK
-INFO:     127.0.0.1:55910 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:37264 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:59616 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:37248 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42048 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:45268 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42172 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42170 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42188 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42186 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42198 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42202 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42218 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42234 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42252 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42262 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42246 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42272 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:42284 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     172.20.0.4:44220 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:52000 - "GET /health HTTP/1.1" 200 OK
-2026-05-27 02:33:46 | ERROR    | peft-platform | Remote job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf failed: num_samples should be a positive integer value, but got num_samples=0
-INFO:     172.20.0.4:51606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-INFO:     127.0.0.1:54618 - "GET /health HTTP/1.1" 200 OK
-INFO:     172.20.0.4:47416 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
-2026-05-27 02:33:56 | ERROR    | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 26886 2>/dev/null; pkill -9 -P 26886 2>/dev/null'
-2026-05-27 02:33:56 | INFO     | peft-platform | Killed remote process 26886 via docker exec
-2026-05-27 02:33:56 | INFO     | peft-platform | Remote training launched for job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf
-2026-05-27 02:33:56 | INFO     | peft-platform | 客户端已从训练 WebSocket 断开 (job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf)
-INFO:     connection closed
+(base) [root@localhost ~]# docker exec finetune-trainer bash -c 'cat /proc/$(pgrep -f "remote_train" | head -1)/fd/1 2>/dev/null | tail -20'
+  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
+    lib = get_native_library()
+  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
+    raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
+RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
+[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
+/root/Fine-tuning/backend/app/engines/text_engine.py:310: TRLExperimentalWarning: You are importing from 'trl.experimental'. APIs here are unstable and may change or be removed without notice. Silence this warning by setting environment variable TRL_EXPERIMENTAL_SILENCE=1.
+  from trl.experimental.ppo import PPOConfig, PPOTrainer
+[transformers] `torch_dtype` is deprecated! Use `dtype` instead!
+trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
+Loading weights: 100%|██████████| 473/473 [00:06<00:00, 72.87it/s] 
+[transformers] Qwen3_5ForSequenceClassification LOAD REPORT from: /root/Fine-tuning/backend/data/models/Qwen_Qwen3.5-0.8B
+Key          | Status  | 
+-------------+---------+-
+score.weight | MISSING | 
+
+Notes:
+- MISSING:	those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
+===training policy===
+  0%|          | 0/1 [00:00<?, ?it/s](base) [root@localhost ~]# docker exec finetune-trainer ps aux | grep python | grep -v defunct | grep -v grep
+root       31506  104  2.0 197646012 10808312 ?  Rsl  11:31  19:50 /opt/conda/bin/python -m app.engines.remote_train 0b822b5e-105d-4d1e-ad51-7217a2d63c29 Qwen/Qwen3.5-0.8B text /root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl /root/Fine-tuning/backend/data/config_0b822b5e-105d-4d1e-ad51-7217a2d63c29.json
+root       31639  1.5  1.6 17399148 8921280 ?    Sl   11:31   0:17 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31725  0.0  0.9 17102096 4842292 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31727  0.0  0.9 17102096 4842164 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31729  0.0  0.9 17102096 4841516 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31731  0.0  0.9 17102096 4841516 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31733  0.0  0.9 17102096 4841516 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31735  0.0  0.9 17102096 4841528 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31737  0.0  0.9 17102096 4841528 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31739  0.0  0.9 17102096 4841528 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31741  0.0  0.9 17102096 4841528 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31743  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31745  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31747  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31749  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31751  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31753  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31755  0.0  0.9 17102096 4841532 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31757  0.0  0.9 17102096 4841536 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31759  0.0  0.9 17102096 4841536 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31761  0.0  0.9 17102096 4841540 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31763  0.0  0.9 17102096 4841540 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31765  0.0  0.9 17102096 4841544 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31767  0.0  0.9 17102096 4841544 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31769  0.0  0.9 17102096 4841544 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31771  0.0  0.9 17102096 4841548 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31773  0.0  0.9 17102096 4841548 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31778  0.0  0.9 17102096 4841548 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31780  0.0  0.9 17102096 4841884 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31782  0.0  0.9 17102096 4841884 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31784  0.0  0.9 17102096 4841888 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31849  0.0  0.9 17102096 4841888 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31851  0.0  0.9 17102096 4841496 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+root       31853  0.0  0.9 17102096 4841496 ?    Sl   11:31   0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
+(base) [root@localhost ~]# docker exec finetune-trainer mx-smi | grep python
+|  3                    31506         python                       4580           |
+(base) [root@localhost ~]# docker exec finetune-trainer tail -f /tmp/train_*.log 2>/dev/null | head -50
+(base) [root@localhost ~]# docker exec finetune-trainer top -b -n 1 | grep python | head -5
+  31506 root      20   0  188.5g  10.3g   4.0g R 100.0   2.0  20:20.44 python
+     29 root      20   0       0      0      0 Z   0.0   0.0   2:36.28 python
+    161 root      20   0       0      0      0 Z   0.0   0.0   0:17.39 python
+    499 root      20   0       0      0      0 Z   0.0   0.0   0:00.22 python
+    501 root      20   0       0      0      0 Z   0.0   0.0   0:00.29 python
+(base) [root@localhost ~]# docker exec finetune-trainer mx-smi | grep -E "MiB|python"
+| 52W / 225W       | 42C          P9 | 60459/65536 MiB     | Available            |
+| 49W / 225W       | 41C          P9 | 60459/65536 MiB     | Available            |
+| 53W / 225W       | 44C          P9 | 29988/65536 MiB     | Available            |
+| 51W / 225W       | 42C          P9 | 5248/65536 MiB      | Available            |
+|                                                                  Usage(MiB)     |
+|  3                    31506         python                       4580           |
+(base) [root@localhost ~]#