|
@@ -1,66 +1,72 @@
|
|
|
-INFO: 172.20.0.4:39360 - "POST /api/oauth/exchange-code HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39364 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39378 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:39394 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:50946 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:50944 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:50952 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:50958 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-27 02:30:41 | INFO | peft-platform | Training job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf: num_gpus=1, batch_size=16
|
|
|
|
|
-2026-05-27 02:30:41 | INFO | peft-platform | Job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf enqueued
|
|
|
|
|
-2026-05-27 02:30:41 | INFO | peft-platform | Training job created: 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf
|
|
|
|
|
-INFO: 172.20.0.4:50972 - "POST /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-27 02:30:41 | INFO | app.engines.text_engine | Preprocessed 0 samples for ppo/alpaca
|
|
|
|
|
-INFO: 172.20.0.4:50998 - "GET /api/v1/models/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:50984 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:51000 - "GET /api/v1/datasets/ HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:51012 - "WebSocket /ws/training/4fd86f1d-3f2f-48ac-92a4-8e236159d1cf?token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZjgyN2IxZC0wM2IxLTQwZGMtOTliMC1jOGRjYTEzNWEwNmUiLCJ1c2VybmFtZSI6InN1cGVyX2FkbWluIiwicm9sZXMiOlsic3VwZXJfYWRtaW4iXSwiZXhwIjoxNzc5ODUwMjMzLCJpYXQiOjE3Nzk4NDkwMzMsInR5cGUiOiJhY2Nlc3MifQ.WvY2rgy_lvYhdR4UGaXA6x1X5MiMFvWKwqk3JzQdpOY" [accepted]
|
|
|
|
|
-2026-05-27 02:30:41 | INFO | peft-platform | 客户端已连接到训练 WebSocket (job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf)
|
|
|
|
|
-INFO: connection open
|
|
|
|
|
-INFO: 172.20.0.4:35710 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:35720 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:43638 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:40052 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:43646 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:59604 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-27 02:31:07 | INFO | peft-platform | Remote cleanup result: true
|
|
|
|
|
-cleaned 147 processes
|
|
|
|
|
-2026-05-27 02:32:00 | INFO | peft-platform | Created remote dataset directory: /root/Fine-tuning/backend/data/datasets
|
|
|
|
|
-2026-05-27 02:32:00 | INFO | peft-platform | Uploading dataset file: /root/Fine-tuning/backend/data/uploads/ppo_sample.jsonl -> /root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl
|
|
|
|
|
-2026-05-27 02:32:18 | INFO | peft-platform | Dataset uploaded successfully: /root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl
|
|
|
|
|
-2026-05-27 02:32:53 | INFO | peft-platform | Remote training launched in container: job=4fd86f1d-3f2f-48ac-92a4-8e236159d1cf, container_pid=26886
|
|
|
|
|
-INFO: 127.0.0.1:57260 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:59094 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:55910 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:37264 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:59616 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:37248 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42048 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:45268 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42050 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42172 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42170 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42188 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42186 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42194 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42198 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42202 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42218 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42234 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42252 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42262 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42246 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42270 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42272 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:42284 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:44220 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:52000 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-2026-05-27 02:33:46 | ERROR | peft-platform | Remote job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf failed: num_samples should be a positive integer value, but got num_samples=0
|
|
|
|
|
-INFO: 172.20.0.4:51606 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-INFO: 127.0.0.1:54618 - "GET /health HTTP/1.1" 200 OK
|
|
|
|
|
-INFO: 172.20.0.4:47416 - "GET /api/v1/training/jobs HTTP/1.0" 200 OK
|
|
|
|
|
-2026-05-27 02:33:56 | ERROR | peft-platform | SSH command timeout after 10s: docker exec finetune-trainer bash -c 'kill -9 26886 2>/dev/null; pkill -9 -P 26886 2>/dev/null'
|
|
|
|
|
-2026-05-27 02:33:56 | INFO | peft-platform | Killed remote process 26886 via docker exec
|
|
|
|
|
-2026-05-27 02:33:56 | INFO | peft-platform | Remote training launched for job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf
|
|
|
|
|
-2026-05-27 02:33:56 | INFO | peft-platform | 客户端已从训练 WebSocket 断开 (job 4fd86f1d-3f2f-48ac-92a4-8e236159d1cf)
|
|
|
|
|
-INFO: connection closed
|
|
|
|
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer bash -c 'cat /proc/$(pgrep -f "remote_train" | head -1)/fd/1 2>/dev/null | tail -20'
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 320, in <module>
|
|
|
|
|
+ lib = get_native_library()
|
|
|
|
|
+ File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
|
|
|
|
|
+ raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
|
|
|
|
|
+RuntimeError: Configured CUDA binary not found at /opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
|
|
|
|
|
+[transformers] warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
|
|
|
|
|
+/root/Fine-tuning/backend/app/engines/text_engine.py:310: TRLExperimentalWarning: You are importing from 'trl.experimental'. APIs here are unstable and may change or be removed without notice. Silence this warning by setting environment variable TRL_EXPERIMENTAL_SILENCE=1.
|
|
|
|
|
+ from trl.experimental.ppo import PPOConfig, PPOTrainer
|
|
|
|
|
+[transformers] `torch_dtype` is deprecated! Use `dtype` instead!
|
|
|
|
|
+trainable params: 5,070,848 || all params: 757,463,872 || trainable%: 0.6695
|
|
|
|
|
+Loading weights: 100%|██████████| 473/473 [00:06<00:00, 72.87it/s]
|
|
|
|
|
+[transformers] Qwen3_5ForSequenceClassification LOAD REPORT from: /root/Fine-tuning/backend/data/models/Qwen_Qwen3.5-0.8B
|
|
|
|
|
+Key | Status |
|
|
|
|
|
+-------------+---------+-
|
|
|
|
|
+score.weight | MISSING |
|
|
|
|
|
+
|
|
|
|
|
+Notes:
|
|
|
|
|
+- MISSING: those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
|
|
|
|
|
+===training policy===
|
|
|
|
|
+ 0%| | 0/1 [00:00<?, ?it/s](base) [root@localhost ~]# docker exec finetune-trainer ps aux | grep python | grep -v defunct | grep -v grep
|
|
|
|
|
+root 31506 104 2.0 197646012 10808312 ? Rsl 11:31 19:50 /opt/conda/bin/python -m app.engines.remote_train 0b822b5e-105d-4d1e-ad51-7217a2d63c29 Qwen/Qwen3.5-0.8B text /root/Fine-tuning/backend/data/datasets/ppo_sample.jsonl /root/Fine-tuning/backend/data/config_0b822b5e-105d-4d1e-ad51-7217a2d63c29.json
|
|
|
|
|
+root 31639 1.5 1.6 17399148 8921280 ? Sl 11:31 0:17 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31725 0.0 0.9 17102096 4842292 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31727 0.0 0.9 17102096 4842164 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31729 0.0 0.9 17102096 4841516 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31731 0.0 0.9 17102096 4841516 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31733 0.0 0.9 17102096 4841516 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31735 0.0 0.9 17102096 4841528 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31737 0.0 0.9 17102096 4841528 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31739 0.0 0.9 17102096 4841528 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31741 0.0 0.9 17102096 4841528 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31743 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31745 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31747 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31749 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31751 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31753 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31755 0.0 0.9 17102096 4841532 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31757 0.0 0.9 17102096 4841536 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31759 0.0 0.9 17102096 4841536 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31761 0.0 0.9 17102096 4841540 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31763 0.0 0.9 17102096 4841540 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31765 0.0 0.9 17102096 4841544 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31767 0.0 0.9 17102096 4841544 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31769 0.0 0.9 17102096 4841544 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31771 0.0 0.9 17102096 4841548 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31773 0.0 0.9 17102096 4841548 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31778 0.0 0.9 17102096 4841548 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31780 0.0 0.9 17102096 4841884 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31782 0.0 0.9 17102096 4841884 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31784 0.0 0.9 17102096 4841888 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31849 0.0 0.9 17102096 4841888 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31851 0.0 0.9 17102096 4841496 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+root 31853 0.0 0.9 17102096 4841496 ? Sl 11:31 0:00 /opt/conda/bin/python /opt/conda/lib/python3.10/site-packages/torch/_inductor/compile_worker/__main__.py --pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler --kind=fork --workers=32 --parent=31506 --read-fd=10 --write-fd=13 --torch-key=kdnYoFpyXJfmeFh07c0N00WVSuau0TZN11yUZqCrSHo=
|
|
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer mx-smi | grep python
|
|
|
|
|
+| 3 31506 python 4580 |
|
|
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer tail -f /tmp/train_*.log 2>/dev/null | head -50
|
|
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer top -b -n 1 | grep python | head -5
|
|
|
|
|
+ 31506 root 20 0 188.5g 10.3g 4.0g R 100.0 2.0 20:20.44 python
|
|
|
|
|
+ 29 root 20 0 0 0 0 Z 0.0 0.0 2:36.28 python
|
|
|
|
|
+ 161 root 20 0 0 0 0 Z 0.0 0.0 0:17.39 python
|
|
|
|
|
+ 499 root 20 0 0 0 0 Z 0.0 0.0 0:00.22 python
|
|
|
|
|
+ 501 root 20 0 0 0 0 Z 0.0 0.0 0:00.29 python
|
|
|
|
|
+(base) [root@localhost ~]# docker exec finetune-trainer mx-smi | grep -E "MiB|python"
|
|
|
|
|
+| 52W / 225W | 42C P9 | 60459/65536 MiB | Available |
|
|
|
|
|
+| 49W / 225W | 41C P9 | 60459/65536 MiB | Available |
|
|
|
|
|
+| 53W / 225W | 44C P9 | 29988/65536 MiB | Available |
|
|
|
|
|
+| 51W / 225W | 42C P9 | 5248/65536 MiB | Available |
|
|
|
|
|
+| Usage(MiB) |
|
|
|
|
|
+| 3 31506 python 4580 |
|
|
|
|
|
+(base) [root@localhost ~]#
|