Sfoglia il codice sorgente

添加253服务器日志

lxylxy123321 1 settimana fa
parent
commit
402cbc1327
1 file modificato con 28 aggiunte e 3 eliminazioni
  1. 28 3
      backend/app/engines/remote_train.py

+ 28 - 3
backend/app/engines/remote_train.py

@@ -96,19 +96,29 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
     """执行单个训练任务(远程调用入口)。"""
     _init_log_file(job_id)
 
+    _remote_log(f"=== Training job started: {job_id} ===")
+    _remote_log(f"model_id={model_id}, model_type={model_type}")
+    _remote_log(f"dataset_path={dataset_path}")
+    _remote_log(f"config={json.dumps(config, ensure_ascii=False)[:200]}")
+
     try:
         # dataset_path 由主节点直接传入
         if not dataset_path or not Path(dataset_path).exists():
             raise FileNotFoundError(f"Dataset not found: {dataset_path}")
+        _remote_log(f"Dataset file exists: {dataset_path}")
 
         _write_log(type="status", status="preprocessing")
+        _remote_log("Step 1: Preprocessing dataset...")
 
         # 预处理
         processed_path = str(_PROCESSED_DIR / f"{job_id}_processed.jsonl")
         task_type = config.get("task_type", "sft")
         template = config.get("dataset_template", "alpaca")
+        _remote_log(f"  task_type={task_type}, template={template}")
+        _remote_log(f"  output_path={processed_path}")
 
         # 选择引擎
+        _remote_log(f"  Selecting engine for model_type={model_type}...")
         if model_type == "vision":
             from app.engines.vision_engine import vision_engine
             engine = vision_engine
@@ -118,20 +128,31 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
         else:
             from app.engines.text_engine import text_engine
             engine = text_engine
+        _remote_log(f"  Engine loaded: {engine.__class__.__name__}")
 
         peft_method = config.get("peft_method", "lora")
+        _remote_log(f"  PEFT method: {peft_method}")
 
+        _remote_log("  Running preprocess_dataset...")
         await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
+        _remote_log(f"  Preprocessing done, output: {processed_path}")
 
         _write_log(type="status", status="loading_model")
+        _remote_log(f"Step 2: Loading model: {model_id}...")
 
         # 加载模型
-        await engine.load_model(model_id, quantization="4bit" if peft_method == "qlora" else None)
+        quantization_mode = "4bit" if peft_method == "qlora" else None
+        _remote_log(f"  Quantization: {quantization_mode}")
+        await engine.load_model(model_id, quantization=quantization_mode)
+        _remote_log("  Model loaded successfully")
 
         # 构建 PEFT 配置
+        _remote_log("Step 3: Building PEFT config...")
         peft_config = engine.get_peft_config(peft_method, config)
+        _remote_log("  PEFT config built")
 
         _write_log(type="status", status="training")
+        _remote_log("Step 4: Starting training...")
 
         # 训练 — 传入文件日志回调替代 WebSocket 回调
         start_time = time.time()
@@ -148,11 +169,15 @@ async def run_training(job_id: str, model_id: str, model_type: str, dataset_path
         elapsed = round(time.time() - start_time, 2)
         _write_log(type="completed", adapter_path=str(adapter_path), total_time=elapsed)
         _remote_log(f"Remote training completed: {job_id} -> {adapter_path} ({elapsed}s)")
+        _remote_log(f"=== Training job finished: {job_id} ===")
         return adapter_path
 
     except Exception as e:
-        _write_log(type="error", message=str(e), traceback=traceback.format_exc())
-        _remote_log(f"Remote training failed: {job_id} - {e}")
+        tb = traceback.format_exc()
+        _write_log(type="error", message=str(e), traceback=tb)
+        _remote_log(f"ERROR: {e}")
+        _remote_log(tb)
+        _remote_log(f"=== Training job failed: {job_id} ===")
         raise