1 hafta önce · 0eff6afdd6
--- a/backend/app/core/job_queue.py
+++ b/backend/app/core/job_queue.py
@@ -147,8 +147,10 @@ class JobQueue:
 
				         """执行单个任务：预处理 → 训练 → 完成。"""
			
 
				         job = self._jobs.get(job_id)
			
 
				         if not job:
			
 
				+            logger.warning(f"Job {job_id} not found in queue, skipping")
			
 
				             return
			
 
				 
			
 
				+        logger.info(f"Job {job_id}: Starting execution (status=QUEUED)")
			
 
				         self.update_job(job_id, status=JobStatus.QUEUED)
			
 
				         await self._notify_callbacks()
			
 
				 
			
@@ -186,14 +188,20 @@ class JobQueue:
 
				             task_type = config.get("task_type", "sft")
			
 
				             template = config.get("dataset_template", "alpaca")
			
 
				 
			
 
				+            logger.info(f"Job {job_id}: Preprocessing dataset (task_type={task_type}, template={template})")
			
 
				             await engine.preprocess_dataset(dataset_path, processed_path, task_type=task_type, template=template)
			
 
				+            logger.info(f"Job {job_id}: Preprocessing completed, output: {processed_path}")
			
 
				 
			
 
				+            logger.info(f"Job {job_id}: Loading model {model_id} (peft={peft_method})")
			
 
				             await engine.load_model(model_id, quantization="4bit" if peft_method == "qlora" else None)
			
 
				+            logger.info(f"Job {job_id}: Model loaded, building PEFT config")
			
 
				             peft_config = engine.get_peft_config(peft_method, config)
			
 
				+            logger.info(f"Job {job_id}: PEFT config built, starting training...")
			
 
				 
			
 
				             self.update_job(job_id, status=JobStatus.TRAINING)
			
 
				             await self._notify_callbacks()
			
 
				 
			
 
				+            logger.info(f"Job {job_id}: Calling engine.train()...")
			
 
				             adapter_path = await engine.train(
			
 
				                 job_id=job_id,
			
 
				                 dataset_path=processed_path,
			
--- a/backend/app/core/logging.py
+++ b/backend/app/core/logging.py
@@ -1,13 +1,23 @@
 
				 import logging
			
 
				+import sys
			
 
				 
			
 
				 from app.config import get_settings
			
 
				 
			
 
				 settings = get_settings()
			
 
				 
			
 
				+# Unified logging config: send log output to stderr so that `docker logs` can capture it
			
 
				+log_level = getattr(logging, settings.backend_log_level.upper(), logging.INFO)
			
 
				+
			
 
				 logging.basicConfig(
			
 
				-    level=getattr(logging, settings.backend_log_level.upper(), logging.INFO),
			
 
				+    level=log_level,
			
 
				     format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
			
 
				     datefmt="%Y-%m-%d %H:%M:%S",
			
 
				+    stream=sys.stderr,
			
 
				 )
			
 
				 
			
 
				 logger = logging.getLogger("peft-platform")
			
 
				+
			
 
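				+# Apply the configured level to uvicorn's loggers (gunicorn loggers are not touched here); intended to avoid duplicated or silenced output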
			
 
				+logging.getLogger("uvicorn").setLevel(log_level)
			
 
				+logging.getLogger("uvicorn.error").setLevel(log_level)
			
 
				+logging.getLogger("uvicorn.access").setLevel(log_level)
			
--- a/backend/app/engines/text_engine.py
+++ b/backend/app/engines/text_engine.py
@@ -40,6 +40,7 @@ class TextEngine(BaseEngine):
 
				 
			
 
				         # 如果本地没有，从 HF 下载
			
 
				         if not (Path(local_path) / "config.json").exists():
			
 
				+            logger.info(f"Model not found locally, downloading from HuggingFace: {model_id}")
			
 
				             from huggingface_hub import snapshot_download
			
 
				 
			
 
				             snapshot_download(
			
@@ -47,9 +48,10 @@ class TextEngine(BaseEngine):
 
				                 local_dir=local_path,
			
 
				                 local_dir_use_symlinks=False,
			
 
				             )
			
 
				+            logger.info(f"Model download completed: {model_id}")
			
 
				 
			
 
				         quantization = kwargs.get("quantization", None)
			
 
				-        
			
 
				+
			
 
				         # 日志：检查 GPU 状态
			
 
				         logger.info(f"CUDA available: {torch.cuda.is_available()}")
			
 
				         logger.info(f"CUDA device count: {torch.cuda.device_count()}")
			
@@ -59,8 +61,14 @@ class TextEngine(BaseEngine):
 
				                 logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / (1024**3):.2f} GB")
			
 
				         else:
			
 
				             logger.warning("No GPU detected! Training will run on CPU.")
			
 
				-        
			
 
				-        max_memory = {i: "4GB" for i in range(torch.cuda.device_count())} if torch.cuda.is_available() else None
			
 
				+
			
 
				+        logger.info(f"Loading model from: {local_path} (quantization={quantization})")
			
 
				+
			
 
				+        max_memory = None
			
 
				+        if torch.cuda.is_available():
			
 
				+            # Cap max_memory for each GPU at 90% of its total memory, floored to whole GiB
			
 
				+            max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.9 // (1024**3))}GiB"
			
 
				+                          for i in range(torch.cuda.device_count())}
			
 
				         
			
 
				         load_kwargs: dict[str, Any] = {
			
 
				             "torch_dtype": torch.float16,
			
@@ -84,6 +92,7 @@ class TextEngine(BaseEngine):
 
				 
			
 
				         self._model = AutoModelForCausalLM.from_pretrained(local_path, **load_kwargs)
			
 
				         logger.info(f"Loaded model: {model_id}")
			
 
				+        logger.info(f"Model loading complete, returning from load_model()")
			
 
				 
			
 
				     def get_peft_config(self, method: str, params: dict[str, Any]) -> Any:
			
 
				         """根据 PEFT 方法返回对应的配置对象。"""
			
@@ -142,12 +151,21 @@ class TextEngine(BaseEngine):
 
				         save_strategy = training_args.get("save_strategy", "epoch")
			
 
				         deepspeed_config = training_args.get("deepspeed", None)
			
 
				 
			
 
				+        logger.info(f"Training args: task_type={task_type}, epochs={epochs}, batch_size={batch_size}, "
			
 
				+                     f"gradient_accumulation={gradient_accumulation}, lr={learning_rate}, "
			
 
				+                     f"max_seq_length={max_seq_length}, warmup_ratio={warmup_ratio}, "
			
 
				+                     f"save_strategy={save_strategy}, deepspeed={'enabled' if deepspeed_config else 'disabled'}")
			
 
				+        logger.info(f"Loading dataset from: {dataset_path}")
			
 
				+
			
 
				         dataset = self._tokenize_dataset(dataset_path, max_seq_length)
			
 
				+        logger.info(f"Dataset tokenized: {len(dataset)} samples")
			
 
				 
			
 
				         self._model = get_peft_model(self._model, peft_config)
			
 
				+        logger.info(f"PEFT model created, trainable parameters:")
			
 
				         self._model.print_trainable_parameters()
			
 
				 
			
 
				         output_dir = str(settings.adapters_dir / job_id)
			
 
				+        logger.info(f"Adapter output directory: {output_dir}")
			
 
				         tr_args = TrainingArguments(
			
 
				             output_dir=output_dir,
			
 
				             num_train_epochs=epochs,
			
@@ -235,7 +253,9 @@ class TextEngine(BaseEngine):
 
				                 )
			
 
				 
			
 
				         try:
			
 
				+            logger.info(f"Trainer created, starting trainer.train()...")
			
 
				             trainer.train()
			
 
				+            logger.info(f"trainer.train() returned, saving adapter...")
			
 
				             self._model.save_pretrained(output_dir)
			
 
				             self._tokenizer.save_pretrained(output_dir)
			
 
				             logger.info(f"Training completed for job {job_id}")
			
--- a/backend/app/services/model_service.py
+++ b/backend/app/services/model_service.py
@@ -41,6 +41,7 @@ async def resolve_model_path(model_id: str) -> str | None:
 
				 
			
 
				 async def download_model(model_id: str, use_modelscope: bool = False) -> dict[str, Any]:
			
 
				     """从 HF 或 ModelScope 下载模型到本地缓存。"""
			
 
				+    logger.info(f"Starting model download: {model_id} (source={'ModelScope' if use_modelscope else 'HuggingFace'})")
			
 
				     try:
			
 
				         if use_modelscope:
			
 
				             import subprocess
			
@@ -61,9 +62,11 @@ async def download_model(model_id: str, use_modelscope: bool = False) -> dict[st
 
				         else:
			
 
				             from huggingface_hub import snapshot_download
			
 
				 
			
 
				+            local_path_dir = str(settings.models_dir / model_id.replace("/", "_"))
			
 
				+            logger.info(f"Downloading from HuggingFace: {model_id} -> {local_path_dir}")
			
 
				             local_path = snapshot_download(
			
 
				                 repo_id=model_id,
			
 
				-                local_dir=str(settings.models_dir / model_id.replace("/", "_")),
			
 
				+                local_dir=local_path_dir,
			
 
				                 local_dir_use_symlinks=False,
			
 
				             )
			
 
				 
			
--- a/backend/app/services/model_test_service.py
+++ b/backend/app/services/model_test_service.py
@@ -26,6 +26,8 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
 
				     if not (model_dir / "config.json").exists():
			
 
				         return {"error": f"Model directory not found: {model_dir}"}
			
 
				 
			
 
				+    logger.info(f"Loading model: {model_id} from {model_dir}")
			
 
				+
			
 
				     tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
			
 
				     if tokenizer.pad_token is None:
			
 
				         tokenizer.pad_token = tokenizer.eos_token
			
@@ -43,14 +45,19 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
 
				                 **kwargs,
			
 
				             )
			
 
				             break
			
 
				-        except Exception:
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"Failed to load with {loader_cls.__name__}: {e}")
			
 
				             continue
			
 
				 
			
 
				     if model is None:
			
 
				         return {"error": f"Unable to load model with any available loader. Model type may not be supported yet."}
			
 
				     model.eval()
			
 
				 
			
 
				-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
			
 
				+    # 限制 prompt 长度，避免 OOM
			
 
				+    max_prompt_len = getattr(settings, "default_max_seq_length", 2048)
			
 
				+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_prompt_len).to(model.device)
			
 
				+    prompt_tokens = inputs["input_ids"].shape[1]
			
 
				+    logger.info(f"Prompt tokenized: {prompt_tokens} tokens, generating up to {max_new_tokens} new tokens")
			
 
				 
			
 
				     with torch.no_grad():
			
 
				         outputs = model.generate(
			
@@ -63,6 +70,7 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
 
				         )
			
 
				 
			
 
				     generated_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
			
 
				+    logger.info(f"Generated {outputs.shape[1] - inputs['input_ids'].shape[1]} tokens")
			
 
				 
			
 
				     return {
			
 
				         "model_id": model_id,
			
--- a/frontend/src/api/client.ts
+++ b/frontend/src/api/client.ts
@@ -176,7 +176,7 @@ interface DatasetValidation {
 
				   warnings?: string[]
			
 
				 }
			
 
				 
			
 
				-interface TrainingJob {
			
 
				+export interface TrainingJob {
			
 
				   id: string
			
 
				   model_id: string
			
 
				   model_type: string
			
--- a/frontend/src/pages/Training.tsx
+++ b/frontend/src/pages/Training.tsx
@@ -260,6 +260,28 @@ export function Training() {
 
				   const handleCreate = () => {
			
 
				     if (!modelId.trim() || !datasetId.trim()) return
			
 
				     setSubmitting(true)
			
 
				+
			
 
				+    // 乐观更新：立即在列表中添加一个 pending 任务
			
 
				+    const tempId = 'temp-' + Date.now()
			
 
				+    const tempJob: TrainingJob = {
			
 
				+      id: tempId,
			
 
				+      model_id: modelId,
			
 
				+      peft_method: peftMethod,
			
 
				+      status: 'pending',
			
 
				+      progress: 0,
			
 
				+      loss: null,
			
 
				+      created_at: new Date().toISOString(),
			
 
				+      started_at: null,
			
 
				+      finished_at: null,
			
 
				+      error_message: null,
			
 
				+      adapter_path: null,
			
 
				+      current_epoch: 0,
			
 
				+      current_step: 0,
			
 
				+      total_steps: 0,
			
 
				+    }
			
 
				+    setJobs(prev => [tempJob, ...prev])
			
 
				+    setLoading(false)
			
 
				+
			
 
				     api.training.create({
			
 
				       model_id: modelId,
			
 
				       model_type: modelType,
			
@@ -277,10 +299,15 @@ export function Training() {
 
				       .then(() => {
			
 
				         setModelId('')
			
 
				         setDatasetId('')
			
 
				+        // 用真实数据替换占位
			
 
				+        setJobs(prev => prev.filter(j => j.id !== tempId))
			
 
				         fetchJobs()
			
 
				         fetchOptions()
			
 
				       })
			
 
				-      .catch(console.error)
			
 
				+      .catch(() => {
			
 
				+        // 失败时移除占位任务
			
 
				+        setJobs(prev => prev.filter(j => j.id !== tempId))
			
 
				+      })
			
 
				       .finally(() => setSubmitting(false))
			
 
				   }