Kaynağa Gözat

优化前端界面,加入日志

lxylxy123321 1 hafta önce
ebeveyn
işleme
0eff6afdd6

+ 8 - 0
backend/app/core/job_queue.py

@@ -147,8 +147,10 @@ class JobQueue:
         """执行单个任务:预处理 → 训练 → 完成。"""
         job = self._jobs.get(job_id)
         if not job:
+            logger.warning(f"Job {job_id} not found in queue, skipping")
             return
 
+        logger.info(f"Job {job_id}: Starting execution (status=QUEUED)")
         self.update_job(job_id, status=JobStatus.QUEUED)
         await self._notify_callbacks()
 
@@ -186,14 +188,20 @@ class JobQueue:
             task_type = config.get("task_type", "sft")
             template = config.get("dataset_template", "alpaca")
 
+            logger.info(f"Job {job_id}: Preprocessing dataset (task_type={task_type}, template={template})")
             await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
+            logger.info(f"Job {job_id}: Preprocessing completed, output: {processed_path}")
 
+            logger.info(f"Job {job_id}: Loading model {model_id} (peft={peft_method})")
             await engine.load_model(model_id, quantization="4bit" if peft_method == "qlora" else None)
+            logger.info(f"Job {job_id}: Model loaded, building PEFT config")
             peft_config = engine.get_peft_config(peft_method, config)
+            logger.info(f"Job {job_id}: PEFT config built, starting training...")
 
             self.update_job(job_id, status=JobStatus.TRAINING)
             await self._notify_callbacks()
 
+            logger.info(f"Job {job_id}: Calling engine.train()...")
             adapter_path = await engine.train(
                 job_id=job_id,
                 dataset_path=processed_path,

+ 11 - 1
backend/app/core/logging.py

@@ -1,13 +1,23 @@
 import logging
+import sys
 
 from app.config import get_settings
 
 settings = get_settings()
 
+# Unified logging configuration: write explicitly to stderr so that docker logs can capture the output
+log_level = getattr(logging, settings.backend_log_level.upper(), logging.INFO)
+
 logging.basicConfig(
-    level=getattr(logging, settings.backend_log_level.upper(), logging.INFO),
+    level=log_level,
     format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
+    stream=sys.stderr,
 )
 
 logger = logging.getLogger("peft-platform")
+
+# Apply the configured log level to the uvicorn loggers (no gunicorn logger is configured here)
+logging.getLogger("uvicorn").setLevel(log_level)
+logging.getLogger("uvicorn.error").setLevel(log_level)
+logging.getLogger("uvicorn.access").setLevel(log_level)

+ 23 - 3
backend/app/engines/text_engine.py

@@ -40,6 +40,7 @@ class TextEngine(BaseEngine):
 
         # 如果本地没有,从 HF 下载
         if not (Path(local_path) / "config.json").exists():
+            logger.info(f"Model not found locally, downloading from HuggingFace: {model_id}")
             from huggingface_hub import snapshot_download
 
             snapshot_download(
@@ -47,9 +48,10 @@ class TextEngine(BaseEngine):
                 local_dir=local_path,
                 local_dir_use_symlinks=False,
             )
+            logger.info(f"Model download completed: {model_id}")
 
         quantization = kwargs.get("quantization", None)
-        
+
         # 日志:检查 GPU 状态
         logger.info(f"CUDA available: {torch.cuda.is_available()}")
         logger.info(f"CUDA device count: {torch.cuda.device_count()}")
@@ -59,8 +61,14 @@ class TextEngine(BaseEngine):
                 logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / (1024**3):.2f} GB")
         else:
             logger.warning("No GPU detected! Training will run on CPU.")
-        
-        max_memory = {i: "4GB" for i in range(torch.cuda.device_count())} if torch.cuda.is_available() else None
+
+        logger.info(f"Loading model from: {local_path} (quantization={quantization})")
+
+        max_memory = None
+        if torch.cuda.is_available():
+            # Cap max_memory at ~90% of each GPU's total memory (floored to whole GiB), replacing the fixed 4GB limit
+            max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.9 // (1024**3))}GiB"
+                          for i in range(torch.cuda.device_count())}
         
         load_kwargs: dict[str, Any] = {
             "torch_dtype": torch.float16,
@@ -84,6 +92,7 @@ class TextEngine(BaseEngine):
 
         self._model = AutoModelForCausalLM.from_pretrained(local_path, **load_kwargs)
         logger.info(f"Loaded model: {model_id}")
+        logger.info(f"Model loading complete, returning from load_model()")
 
     def get_peft_config(self, method: str, params: dict[str, Any]) -> Any:
         """根据 PEFT 方法返回对应的配置对象。"""
@@ -142,12 +151,21 @@ class TextEngine(BaseEngine):
         save_strategy = training_args.get("save_strategy", "epoch")
         deepspeed_config = training_args.get("deepspeed", None)
 
+        logger.info(f"Training args: task_type={task_type}, epochs={epochs}, batch_size={batch_size}, "
+                     f"gradient_accumulation={gradient_accumulation}, lr={learning_rate}, "
+                     f"max_seq_length={max_seq_length}, warmup_ratio={warmup_ratio}, "
+                     f"save_strategy={save_strategy}, deepspeed={'enabled' if deepspeed_config else 'disabled'}")
+        logger.info(f"Loading dataset from: {dataset_path}")
+
         dataset = self._tokenize_dataset(dataset_path, max_seq_length)
+        logger.info(f"Dataset tokenized: {len(dataset)} samples")
 
         self._model = get_peft_model(self._model, peft_config)
+        logger.info(f"PEFT model created, trainable parameters:")
         self._model.print_trainable_parameters()
 
         output_dir = str(settings.adapters_dir / job_id)
+        logger.info(f"Adapter output directory: {output_dir}")
         tr_args = TrainingArguments(
             output_dir=output_dir,
             num_train_epochs=epochs,
@@ -235,7 +253,9 @@ class TextEngine(BaseEngine):
                 )
 
         try:
+            logger.info(f"Trainer created, starting trainer.train()...")
             trainer.train()
+            logger.info(f"trainer.train() returned, saving adapter...")
             self._model.save_pretrained(output_dir)
             self._tokenizer.save_pretrained(output_dir)
             logger.info(f"Training completed for job {job_id}")

+ 4 - 1
backend/app/services/model_service.py

@@ -41,6 +41,7 @@ async def resolve_model_path(model_id: str) -> str | None:
 
 async def download_model(model_id: str, use_modelscope: bool = False) -> dict[str, Any]:
     """从 HF 或 ModelScope 下载模型到本地缓存。"""
+    logger.info(f"Starting model download: {model_id} (source={'ModelScope' if use_modelscope else 'HuggingFace'})")
     try:
         if use_modelscope:
             import subprocess
@@ -61,9 +62,11 @@ async def download_model(model_id: str, use_modelscope: bool = False) -> dict[st
         else:
             from huggingface_hub import snapshot_download
 
+            local_path_dir = str(settings.models_dir / model_id.replace("/", "_"))
+            logger.info(f"Downloading from HuggingFace: {model_id} -> {local_path_dir}")
             local_path = snapshot_download(
                 repo_id=model_id,
-                local_dir=str(settings.models_dir / model_id.replace("/", "_")),
+                local_dir=local_path_dir,
                 local_dir_use_symlinks=False,
             )
 

+ 10 - 2
backend/app/services/model_test_service.py

@@ -26,6 +26,8 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
     if not (model_dir / "config.json").exists():
         return {"error": f"Model directory not found: {model_dir}"}
 
+    logger.info(f"Loading model: {model_id} from {model_dir}")
+
     tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -43,14 +45,19 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
                 **kwargs,
             )
             break
-        except Exception:
+        except Exception as e:
+            logger.warning(f"Failed to load with {loader_cls.__name__}: {e}")
             continue
 
     if model is None:
         return {"error": f"Unable to load model with any available loader. Model type may not be supported yet."}
     model.eval()
 
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # 限制 prompt 长度,避免 OOM
+    max_prompt_len = getattr(settings, "default_max_seq_length", 2048)
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_prompt_len).to(model.device)
+    prompt_tokens = inputs["input_ids"].shape[1]
+    logger.info(f"Prompt tokenized: {prompt_tokens} tokens, generating up to {max_new_tokens} new tokens")
 
     with torch.no_grad():
         outputs = model.generate(
@@ -63,6 +70,7 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
         )
 
     generated_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+    logger.info(f"Generated {outputs.shape[1] - inputs['input_ids'].shape[1]} tokens")
 
     return {
         "model_id": model_id,

+ 1 - 1
frontend/src/api/client.ts

@@ -176,7 +176,7 @@ interface DatasetValidation {
   warnings?: string[]
 }
 
-interface TrainingJob {
+export interface TrainingJob {
   id: string
   model_id: string
   model_type: string

+ 28 - 1
frontend/src/pages/Training.tsx

@@ -260,6 +260,28 @@ export function Training() {
   const handleCreate = () => {
     if (!modelId.trim() || !datasetId.trim()) return
     setSubmitting(true)
+
+    // 乐观更新:立即在列表中添加一个 pending 任务
+    const tempId = 'temp-' + Date.now()
+    const tempJob: TrainingJob = {
+      id: tempId,
+      model_id: modelId,
+      peft_method: peftMethod,
+      status: 'pending',
+      progress: 0,
+      loss: null,
+      created_at: new Date().toISOString(),
+      started_at: null,
+      finished_at: null,
+      error_message: null,
+      adapter_path: null,
+      current_epoch: 0,
+      current_step: 0,
+      total_steps: 0,
+    }
+    setJobs(prev => [tempJob, ...prev])
+    setLoading(false)
+
     api.training.create({
       model_id: modelId,
       model_type: modelType,
@@ -277,10 +299,15 @@ export function Training() {
       .then(() => {
         setModelId('')
         setDatasetId('')
+        // 用真实数据替换占位
+        setJobs(prev => prev.filter(j => j.id !== tempId))
         fetchJobs()
         fetchOptions()
       })
-      .catch(console.error)
+      .catch(() => {
+        // 失败时移除占位任务
+        setJobs(prev => prev.filter(j => j.id !== tempId))
+      })
       .finally(() => setSubmitting(false))
   }