Przeglądaj źródła

优化训练推理过程

lxylxy123321 2 dni temu
rodzic
commit
0e2b2f7370

+ 3 - 0
backend/.env

@@ -15,6 +15,9 @@ DATABASE_URL=sqlite+aiosqlite:///root/Fine-tuning/backend/data/finetuning.db
 # 数据路径(Linux 服务器路径)
 DATA_DIR=/root/Fine-tuning/backend/data
 
+# GPU 配置(推理使用的物理 GPU,逗号分隔,多卡自动 model parallelism)
+# INFERENCE_CUDA_DEVICES=2,3
+
 # 训练默认参数
 DEFAULT_PEFT_METHOD=lora
 DEFAULT_EPOCHS=3

+ 16 - 0
backend/app/api/deployment.py

@@ -129,6 +129,22 @@ async def stop_serving(
     return result
 
 
+@router.post("/{task_id}/restart")
+async def restart_serving(
+    task_id: str,
+    current_user: dict = Depends(get_current_user),
+):
+    """Restart a stopped online service (no model re-export; only the worker is started).
+
+    Delegates to ``deploy_service.restart_serving`` using the ``sub`` entry of
+    ``current_user`` as the user id. A ``RuntimeError`` raised by the service,
+    or an ``"error"`` key in its result, is reported as HTTP 400 with the
+    message as ``detail``; otherwise the service result is returned as-is.
+    """
+    user_id = current_user.get("sub")
+    try:
+        result = await deploy_service.restart_serving(task_id, user_id)
+    except RuntimeError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    # The service signals validation failures through an "error" key rather than an exception.
+    if "error" in result:
+        raise HTTPException(status_code=400, detail=result["error"])
+    return result
+
+
 @router.get("/{deploy_id}/status", response_model=DeployResponse)
 async def get_deployment_status(deploy_id: str):
     """获取导出/部署任务状态。"""

+ 1 - 0
backend/app/config.py

@@ -58,6 +58,7 @@ class Settings(BaseSettings):
 
     # --- GPU / 硬件 ---
     cuda_visible_devices: str = "3"
+    inference_cuda_devices: str = "2,3"  # 推理使用的物理 GPU,逗号分隔(多卡 model parallelism)
     max_memory_per_gpu: str = "0"
     use_unsloth: bool = False
 

+ 47 - 3
backend/app/core/deploy_server_template.py

@@ -29,7 +29,7 @@ import torch
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
 
 app = FastAPI(title="Model Serving API", version="1.0.0")
 app.add_middleware(
@@ -130,6 +130,8 @@ async def chat_completions(req: ChatRequest):
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     prompt_tokens = inputs["input_ids"].shape[1]
 
+    stopping_criteria = _build_stop_criteria()
+
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -137,10 +139,14 @@ async def chat_completions(req: ChatRequest):
             temperature=max(req.temperature, 0.01),
             top_p=req.top_p,
             do_sample=req.temperature > 0,
+            repetition_penalty=1.1,
             pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            stopping_criteria=stopping_criteria,
         )
 
     generated = tokenizer.decode(outputs[0][prompt_tokens:], skip_special_tokens=True)
+    generated = _clean_generated(generated)
     completion_tokens = outputs.shape[1] - prompt_tokens
 
     return ChatResponse(
@@ -161,6 +167,8 @@ async def completions(req: CompletionRequest):
     inputs = tokenizer(req.prompt, return_tensors="pt").to(model.device)
     prompt_tokens = inputs["input_ids"].shape[1]
 
+    stopping_criteria = _build_stop_criteria()
+
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
@@ -168,10 +176,14 @@ async def completions(req: CompletionRequest):
             temperature=max(req.temperature, 0.01),
             top_p=req.top_p,
             do_sample=req.temperature > 0,
+            repetition_penalty=1.1,
             pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            stopping_criteria=stopping_criteria,
         )
 
     generated = tokenizer.decode(outputs[0][prompt_tokens:], skip_special_tokens=True)
+    generated = _clean_generated(generated)
     completion_tokens = outputs.shape[1] - prompt_tokens
 
     return CompletionResponse(
@@ -211,6 +223,37 @@ def _build_prompt(messages: list[Message]) -> str:
     return "\n".join(parts)
 
 
+def _build_stop_criteria():
+    """Build a StoppingCriteriaList that halts generation at a role-switch marker.
+
+    Stopping as soon as the output ends with one of the role markers is meant
+    to prevent the model from repeating / continuing the conversation itself.
+    The ``tokenizer`` name is not a parameter; it is resolved from the
+    enclosing module scope.
+    """
+    stop_phrases = ["<|user|>", "<|system|>", "<|assistant|>"]
+    # Pre-encode each marker (without special tokens) into its token-id list.
+    stop_token_ids = []
+    for phrase in stop_phrases:
+        ids = tokenizer.encode(phrase, add_special_tokens=False)
+        stop_token_ids.append(ids)
+
+    class StopOnRoleToken(StoppingCriteria):
+        def __init__(self, stop_sequences):
+            self.stop_sequences = stop_sequences
+
+        def __call__(self, input_ids, scores, **kwargs):
+            # Only row 0 of input_ids is inspected; stop when its tail equals any stop sequence.
+            gen_seq = input_ids[0].tolist()
+            for stop_ids in self.stop_sequences:
+                if len(gen_seq) >= len(stop_ids):
+                    if gen_seq[-len(stop_ids):] == stop_ids:
+                        return True
+            return False
+
+    return StoppingCriteriaList([StopOnRoleToken(stop_token_ids)])
+
+
+def _clean_generated(generated: str) -> str:
+    """Remove any leftover role markers from the generated text.
+
+    For each marker that occurs, the text is cut at that marker's first
+    occurrence (the marker and everything after it are dropped); the result
+    is returned with surrounding whitespace stripped.
+    """
+    for marker in ["<|user|>", "<|system|>", "<|assistant|>"]:
+        if marker in generated:
+            generated = generated[:generated.index(marker)]
+    return generated.strip()
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Model Serving API Server")
     parser.add_argument("--model-path", type=str, default="./model", help="模型目录路径")
@@ -224,9 +267,10 @@ if __name__ == "__main__":
     print(f"Loading model from: {model_path}")
 
     if args.device == "auto":
-        device_map = {"": 0} if torch.cuda.is_available() else "auto"
+        # device_map="auto" 自动将模型层分散到所有可见 GPU
+        device_map = "auto" if torch.cuda.is_available() else "cpu"
     elif args.device == "cuda":
-        device_map = {"": 0}
+        device_map = "auto"  # 自动多卡分散
     else:
         device_map = "cpu"
 

+ 42 - 2
backend/app/core/inference_worker.py

@@ -50,6 +50,35 @@ def _build_prompt_from_messages(messages: list[dict]) -> str:
     return "\n".join(parts)
 
 
+def _build_stop_criteria(tokenizer, model_device):
+    """Build a StoppingCriteriaList that halts generation at a role-switch marker.
+
+    Stopping as soon as the output ends with one of the role markers is meant
+    to prevent the model from repeating / continuing the conversation itself.
+    ``tokenizer`` is used to encode the markers; ``model_device`` is passed to
+    the criteria object and stored there.
+    """
+    from transformers import StoppingCriteria, StoppingCriteriaList
+
+    # Generation should stop once the model starts emitting the next role marker.
+    stop_phrases = ["<|user|>", "<|system|>", "<|assistant|>"]
+    # Pre-encode the stop phrases so they can be matched exactly as token-id lists.
+    stop_token_ids = []
+    for phrase in stop_phrases:
+        ids = tokenizer.encode(phrase, add_special_tokens=False)
+        stop_token_ids.append(ids)
+
+    class StopOnRoleToken(StoppingCriteria):
+        def __init__(self, stop_sequences, device):
+            self.stop_sequences = stop_sequences
+            # Kept on the instance; __call__ below does not read it.
+            self.device = device
+
+        def __call__(self, input_ids, scores, **kwargs):
+            # Check whether the most recent tokens (row 0 only) match any stop sequence.
+            gen_seq = input_ids[0].tolist()
+            for stop_ids in self.stop_sequences:
+                if len(gen_seq) >= len(stop_ids):
+                    if gen_seq[-len(stop_ids):] == stop_ids:
+                        return True
+            return False
+
+    return StoppingCriteriaList([StopOnRoleToken(stop_token_ids, model_device)])
+
+
 class InferenceWorker:
     def __init__(self, model_path: str):
         import torch
@@ -61,7 +90,8 @@ class InferenceWorker:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         print(f"[worker] Loading model from: {model_path}", flush=True)
-        device_map = {"": 0} if torch.cuda.is_available() else "auto"
+        # device_map="auto" 自动将模型层分散到所有可见 GPU(由 CUDA_VISIBLE_DEVICES 控制)
+        device_map = "auto" if torch.cuda.is_available() else "cpu"
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path, torch_dtype=torch.float16, device_map=device_map,
         )
@@ -82,11 +112,14 @@ class InferenceWorker:
         temperature = max(request.get("temperature", 0.7), 0.01)
         top_p = request.get("top_p", 0.9)
         do_sample = request.get("do_sample", temperature > 0)
-        repetition_penalty = request.get("repetition_penalty", 1.0)
+        repetition_penalty = request.get("repetition_penalty", 1.1)
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         prompt_tokens = inputs["input_ids"].shape[1]
 
+        # 构建 stop criteria:遇到角色标记就停止,防止复读
+        stopping_criteria = _build_stop_criteria(self.tokenizer, self.model.device)
+
         with self.torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
@@ -96,11 +129,18 @@ class InferenceWorker:
                 do_sample=do_sample,
                 repetition_penalty=repetition_penalty,
                 pad_token_id=self.tokenizer.eos_token_id,
+                eos_token_id=self.tokenizer.eos_token_id,
+                stopping_criteria=stopping_criteria,
             )
 
         generated = self.tokenizer.decode(
             outputs[0][prompt_tokens:], skip_special_tokens=True
         )
+        # 清理可能残留的角色标记(防止 StoppingCriteria 触发前的部分 token)
+        for marker in ["<|user|>", "<|system|>", "<|assistant|>"]:
+            if marker in generated:
+                generated = generated[:generated.index(marker)]
+        generated = generated.strip()
         completion_tokens = outputs.shape[1] - prompt_tokens
 
         return {

+ 1 - 1
backend/app/core/remote_executor.py

@@ -263,7 +263,7 @@ def run_inference_remote(
     remote_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e CUDA_VISIBLE_DEVICES=3 "
+        f"-e CUDA_VISIBLE_DEVICES={settings.inference_cuda_devices} "
         f"-w {settings.compute_node_workdir} "
         f"{settings.compute_node_docker_container} "
         f"{settings.compute_node_python} -c \""

+ 83 - 20
backend/app/services/deploy_service.py

@@ -135,7 +135,7 @@ async def _execute_export(task_id: str, job_id: str, merge_with_base: bool, expo
 
         # 把 inference_worker.py 和启动脚本复制到输出目录
         if output_path and settings.use_remote_compute:
-            _copy_worker_template_remote(output_path)
+            await _copy_worker_template_remote(output_path)
 
         await _update_deploy_status(task_id, "completed", output_path=output_path)
         return {"output_path": output_path}
@@ -227,7 +227,7 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
         f"docker exec {settings.compute_node_docker_container} "
         f"bash -c 'fuser -k {port}/tcp 2>/dev/null; sleep 1; true'"
     )
-    ssh_exec(kill_cmd, timeout=15)
+    await asyncio.to_thread(ssh_exec, kill_cmd, timeout=15)
 
     # worker 脚本在容器内的路径
     worker_template = f"{settings.compute_node_workdir}/app/core/inference_worker.py"
@@ -237,15 +237,15 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
         f"docker exec {settings.compute_node_docker_container} "
         f"bash -c 'cp {worker_template} {model_path}/inference_worker.py'"
     )
-    code, _, stderr = ssh_exec(copy_cmd, timeout=30)
+    code, _, stderr = await asyncio.to_thread(ssh_exec, copy_cmd, timeout=30)
     if code != 0:
         raise RuntimeError(f"复制 inference_worker.py 失败: {stderr}")
 
-    # 在容器内后台启动 worker
+    # 在容器内后台启动 worker(多卡推理:CUDA_VISIBLE_DEVICES 使用配置项)
     launch_cmd = (
         f"docker exec "
         f"-e MACA_MPS_MODE=1 "
-        f"-e CUDA_VISIBLE_DEVICES=3 "
+        f"-e CUDA_VISIBLE_DEVICES={settings.inference_cuda_devices} "
         f"-w {model_path} "
         f"{settings.compute_node_docker_container} "
         f"bash -c '"
@@ -256,7 +256,7 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
         f" echo $!'"
     )
 
-    code, stdout, stderr = ssh_exec(launch_cmd, timeout=30)
+    code, stdout, stderr = await asyncio.to_thread(ssh_exec, launch_cmd, timeout=30)
     if code != 0:
         raise RuntimeError(f"启动推理 worker 失败: {stderr}")
 
@@ -265,9 +265,8 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
 
     # 等待模型加载(可能需要较长时间),检查 READY 标记
     # 每次轮询只用一次 SSH 连接,同时检查 READY 和进程状态
-    import asyncio as _aio
     for attempt in range(60):  # 最多等 5 分钟(60 * 5s)
-        await _aio.sleep(5)
+        await asyncio.sleep(5)
         check_cmd = (
             f"docker exec {settings.compute_node_docker_container} "
             f"bash -c '"
@@ -277,7 +276,7 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
             f"  echo \"ALIVE\"; "
             f"'"
         )
-        code, stdout, stderr = ssh_exec(check_cmd, timeout=30)
+        code, stdout, stderr = await asyncio.to_thread(ssh_exec, check_cmd, timeout=30)
         if code == 0:
             result = stdout.strip()
             if result.startswith("READY:"):
@@ -289,7 +288,7 @@ async def _launch_remote_worker(task_id: str, model_path: str, port: int) -> str
                     f"docker exec {settings.compute_node_docker_container} "
                     f"bash -c 'tail -20 /tmp/serve_{task_id}.log 2>/dev/null'"
                 )
-                _, log_stdout, _ = ssh_exec(log_cmd, timeout=30)
+                _, log_stdout, _ = await asyncio.to_thread(ssh_exec, log_cmd, timeout=30)
                 raise RuntimeError(f"Worker 进程已退出: {log_stdout}")
             # result == "ALIVE" → 继续等待
 
@@ -332,20 +331,23 @@ async def stop_serving(task_id: str, user_id: str = "") -> dict[str, Any]:
             return {"error": "无权操作此任务"}
 
         pid = record.pid
+        port = record.port
+        output_path = record.output_path
+
         if pid and settings.use_remote_compute:
             # 方式1: kill -9 主进程及其子进程
             kill_cmd = (
                 f"docker exec {settings.compute_node_docker_container} "
                 f"bash -c 'kill -9 {pid} 2>/dev/null; pkill -9 -P {pid} 2>/dev/null; true'"
             )
-            code, _, _ = ssh_exec(kill_cmd, timeout=15)
+            code, _, _ = await asyncio.to_thread(ssh_exec, kill_cmd, timeout=15)
             # 方式2: fuser 兜底清理端口(防止进程 kill 失败仍占着端口)
-            if record.port:
+            if port:
                 fuser_cmd = (
                     f"docker exec {settings.compute_node_docker_container} "
-                    f"bash -c 'fuser -k {record.port}/tcp 2>/dev/null; sleep 1; true'"
+                    f"bash -c 'fuser -k {port}/tcp 2>/dev/null; sleep 1; true'"
                 )
-                ssh_exec(fuser_cmd, timeout=15)
+                await asyncio.to_thread(ssh_exec, fuser_cmd, timeout=15)
             logger.info(f"Stop serving: task={task_id} pid={pid} kill_code={code}")
 
         record.status = "stopped"
@@ -357,6 +359,63 @@ async def stop_serving(task_id: str, user_id: str = "") -> dict[str, Any]:
     return {"task_id": task_id, "status": "stopped"}
 
 
+async def restart_serving(task_id: str, user_id: str = "") -> dict[str, Any]:
+    """Restart a stopped online service (no model re-export; only the worker is started).
+
+    Validates the deploy record, allocates a new port, marks the task
+    ``pending`` and hands ``_execute_restart`` to the background task manager.
+
+    Returns a dict with a single ``"error"`` key when validation fails (task
+    missing, not a ``serve`` deployment, not in ``stopped`` status, owned by a
+    different user, or no ``output_path``); otherwise a dict with ``task_id``,
+    ``status``, ``deploy_mode`` and ``port``.
+    """
+    async with async_session() as session:
+        result = await session.execute(select(DeployTaskModel).where(DeployTaskModel.id == task_id))
+        record = result.scalar_one_or_none()
+        if not record:
+            return {"error": "任务不存在"}
+        if record.deploy_mode != "serve":
+            return {"error": "该任务不是在线服务"}
+        if record.status != "stopped":
+            return {"error": f"只能重启已停止的服务(当前状态: {record.status})"}
+        # Ownership is enforced only when both the caller and the record carry a user id.
+        if user_id and record.user_id and record.user_id != user_id:
+            return {"error": "无权操作此任务"}
+        if not record.output_path:
+            return {"error": "模型文件路径丢失,无法重启,请重新部署"}
+
+        output_path = record.output_path
+
+    # Allocate a new port.
+    port = await _allocate_port()
+
+    # Set the status to pending to mark that a restart is in progress.
+    # NOTE(review): the "stopped" check above and this write are separate steps,
+    # so two concurrent restart requests could presumably both pass — confirm.
+    await _update_deploy_status(task_id, "pending", port=port)
+
+    background_task_manager.register_task(task_id, "deployment", {"mode": "restart"})
+    await background_task_manager.run(
+        task_id, "deployment", _execute_restart(task_id, output_path, port)
+    )
+
+    logger.info(f"Restart serving: task={task_id} output_path={output_path} port={port}")
+    return {"task_id": task_id, "status": "pending", "deploy_mode": "serve", "port": port}
+
+
+async def _execute_restart(task_id: str, output_path: str, port: int) -> dict:
+    """Run the restart in the background: only launch the worker, no re-export.
+
+    Launches a remote or local worker depending on
+    ``settings.use_remote_compute``, then records the task as ``running`` with
+    its proxy endpoint, port and pid. Any exception is logged, the task is
+    marked ``failed`` with the error text, and ``{"error": ...}`` is returned
+    instead of raising.
+    """
+    try:
+        if settings.use_remote_compute:
+            pid = await _launch_remote_worker(task_id, output_path, port)
+        else:
+            pid = await _launch_local_worker(task_id, output_path, port)
+
+        # Relative proxy path under which the service is exposed.
+        endpoint_url = f"/api/v1/deployment/proxy/{task_id}/v1"
+        await _update_deploy_status(
+            task_id, "running",
+            output_path=output_path,
+            endpoint_url=endpoint_url,
+            port=port,
+            pid=pid,
+        )
+        return {"endpoint_url": endpoint_url, "port": port, "pid": pid}
+
+    except Exception as e:
+        # NOTE(review): restart_serving only accepts tasks in "stopped" status, so a
+        # task marked "failed" here cannot be restarted again via that path — confirm intended.
+        logger.error(f"Restart failed for task {task_id}: {e}")
+        await _update_deploy_status(task_id, "failed", error=str(e))
+        return {"error": str(e)}
+
+
 async def list_deployed_services(user_id: str = "") -> list[dict[str, Any]]:
     """列出 serve 模式的部署任务(按用户过滤)。"""
     async with async_session() as session:
@@ -373,7 +432,7 @@ async def list_deployed_services(user_id: str = "") -> list[dict[str, Any]]:
         # 对 running 状态,检查远程进程是否还活着
         if status == "running" and r.pid and settings.use_remote_compute:
             from app.core.remote_executor import is_process_running
-            if not is_process_running(r.pid):
+            if not await asyncio.to_thread(is_process_running, r.pid):
                 status = "stopped"
                 await _update_deploy_status(r.id, "stopped", error="进程已退出")
                 # 释放端口和 PID,确保下次分配时可用
@@ -452,7 +511,7 @@ async def _run_remote_export(task_id: str, job_id: str, merge_with_base: bool, e
         "print(json.dumps(result, ensure_ascii=False))\" 2>&1"
     )
 
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
+    code, stdout, stderr = await asyncio.to_thread(ssh_exec, remote_cmd, timeout=600)
 
     if code != 0:
         raise RuntimeError(f"Remote export failed: {stderr}")
@@ -515,14 +574,14 @@ async def _run_local_export(task_id: str, job_id: str, merge_with_base: bool) ->
     return {"output_path": str(output_path)}
 
 
-def _copy_worker_template_remote(output_path: str):
+async def _copy_worker_template_remote(output_path: str):
     """把 inference_worker.py 和启动脚本复制到远程模型目录。"""
     worker_template = f"{settings.compute_node_workdir}/app/core/inference_worker.py"
     copy_cmd = (
         f"docker exec {settings.compute_node_docker_container} "
         f"bash -c 'cp {worker_template} {output_path}/inference_worker.py'"
     )
-    code, _, stderr = ssh_exec(copy_cmd, timeout=30)
+    code, _, stderr = await asyncio.to_thread(ssh_exec, copy_cmd, timeout=30)
     if code != 0:
         logger.warning(f"复制 inference_worker.py 到 {output_path} 失败: {stderr}")
 
@@ -530,7 +589,7 @@ def _copy_worker_template_remote(output_path: str):
     start_script = (
         f"#!/bin/bash\n"
         f"cd {output_path}\n"
-        f"CUDA_VISIBLE_DEVICES=3 MACA_MPS_MODE=1 "
+        f"CUDA_VISIBLE_DEVICES={settings.inference_cuda_devices} MACA_MPS_MODE=1 "
         f"{settings.compute_node_python} inference_worker.py "
         f"--model-path . --port 8100\n"
     )
@@ -539,7 +598,7 @@ def _copy_worker_template_remote(output_path: str):
         f"bash -c 'cat > {output_path}/start.sh << \"EOF\"\n{start_script}EOF\n"
         f"chmod +x {output_path}/start.sh'"
     )
-    code, _, _ = ssh_exec(script_cmd, timeout=15)
+    code, _, _ = await asyncio.to_thread(ssh_exec, script_cmd, timeout=15)
     if code != 0:
         logger.warning(f"生成 start.sh 失败")
 
@@ -574,6 +633,10 @@ async def _update_deploy_status(
                 record.pid = pid
             if status in ("completed", "failed", "stopped"):
                 record.finished_at = datetime.utcnow()
+            if status == "pending":
+                # 重启时清除完成时间和错误信息
+                record.finished_at = None
+                record.error = None
             await session.commit()
 
     background_task_manager.update_task(

+ 2 - 1
backend/app/services/eval_service.py

@@ -1,3 +1,4 @@
+import asyncio
 import json
 import uuid
 from datetime import datetime, timezone
@@ -120,7 +121,7 @@ async def _run_remote_evaluation(eval_id: str, job_id: str) -> dict[str, Any]:
         "print(json.dumps(result, ensure_ascii=False))\" 2>&1"
     )
 
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=300)
+    code, stdout, stderr = await asyncio.to_thread(ssh_exec, remote_cmd, timeout=300)
 
     if code != 0:
         raise RuntimeError(f"Remote evaluation failed: {stderr}")

+ 32 - 7
backend/app/services/inference_service.py

@@ -1,4 +1,5 @@
 """推理服务 — 支持本地执行和 SSH 远程执行两种模式。"""
+import asyncio
 import json
 from pathlib import Path
 from typing import Any
@@ -25,13 +26,14 @@ async def generate(
         return {"error": "无法找到基础模型信息,请确保训练任务已完成"}
 
     if settings.use_remote_compute:
-        # 远程执行模式
+        # 远程执行模式(用 to_thread 避免阻塞事件循环)
         from app.core.remote_executor import run_inference_remote
 
         adapter_dir = Path(adapter_path)
         adapter_id = adapter_dir.name
 
-        result = run_inference_remote(
+        result = await asyncio.to_thread(
+            run_inference_remote,
             model_id=base_model_id,
             adapter_id=adapter_id,
             prompt=prompt,
@@ -45,8 +47,9 @@ async def generate(
             return result
         return {"error": "Remote inference failed"}
 
-    # 本地执行模式
-    return _generate_local(
+    # 本地执行模式(用 to_thread 避免 GPU 操作阻塞事件循环)
+    return await asyncio.to_thread(
+        _generate_local,
         adapter_path=adapter_path,
         base_model_id=base_model_id,
         prompt=prompt,
@@ -72,16 +75,17 @@ def _generate_local(
     try:
         import os
         import torch
-        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
         from peft import PeftModel
 
         tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # CUDA_VISIBLE_DEVICES=3 已将物理 GPU 3 映射为逻辑 GPU 0
+        # CUDA_VISIBLE_DEVICES 由调用方(deploy_service)设置为多卡
+        # device_map="auto" 自动将模型层分散到所有可见 GPU
         import torch
-        device_map = {"": 0}
+        device_map = "auto" if torch.cuda.is_available() else "cpu"
         torch.cuda.set_device(0)
 
         base_model = AutoModelForCausalLM.from_pretrained(
@@ -94,6 +98,20 @@ def _generate_local(
 
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
+        # 构建 stop criteria:遇到角色标记就停止,防止复读
+        stop_phrases = ["<|user|>", "<|system|>", "<|assistant|>"]
+        stop_token_ids = [tokenizer.encode(p, add_special_tokens=False) for p in stop_phrases]
+
+        class StopOnRoleToken(StoppingCriteria):
+            def __call__(self, input_ids, scores, **kwargs):
+                gen_seq = input_ids[0].tolist()
+                for s_ids in stop_token_ids:
+                    if len(gen_seq) >= len(s_ids) and gen_seq[-len(s_ids):] == s_ids:
+                        return True
+                return False
+
+        stopping_criteria = StoppingCriteriaList([StopOnRoleToken()])
+
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -103,10 +121,17 @@ def _generate_local(
                 repetition_penalty=repetition_penalty,
                 do_sample=do_sample,
                 pad_token_id=tokenizer.eos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                stopping_criteria=stopping_criteria,
             )
 
         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
         generated_only = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+        # 清理可能残留的角色标记
+        for marker in ["<|user|>", "<|system|>", "<|assistant|>"]:
+            if marker in generated_only:
+                generated_only = generated_only[:generated_only.index(marker)]
+        generated_only = generated_only.strip()
 
         return {
             "prompt": prompt,

+ 15 - 7
backend/app/services/model_test_service.py

@@ -1,5 +1,6 @@
 from pathlib import Path
 from typing import Any
+import asyncio
 
 from app.config import get_settings
 from app.core.logging import logger
@@ -146,10 +147,10 @@ print(json.dumps({{'generated_text': gen}}))
         f"{container} bash -c 'cat > {script_path} && {python} {script_path}'"
     )
 
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
+    code, stdout, stderr = await asyncio.to_thread(ssh_exec, remote_cmd, timeout=600)
 
     # 清理容器内临时文件
-    ssh_exec(f"docker exec {container} rm -f {script_path}", timeout=5)
+    await asyncio.to_thread(ssh_exec, f"docker exec {container} rm -f {script_path}", timeout=5)
 
     logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
     if stdout:
@@ -177,9 +178,6 @@ print(json.dumps({{'generated_text': gen}}))
 
 async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
     """本地执行模型测试(仅用于开发环境)。"""
-    import torch
-    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
-
     from app.services.model_service import resolve_model_path
     model_path = await resolve_model_path(model_id)
     if not model_path:
@@ -189,6 +187,17 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
     if not (model_dir / "config.json").exists():
         return {"error": f"Model directory not found: {model_dir}"}
 
+    # GPU 操作在线程池中执行,避免阻塞事件循环
+    return await asyncio.to_thread(
+        _run_local_inference, model_dir, prompt, max_new_tokens, temperature, top_p
+    )
+
+
+def _run_local_inference(model_dir: Path, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict:
+    """同步执行本地模型加载和推理。"""
+    import torch
+    from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+
     tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -210,7 +219,7 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
             continue
 
     if model is None:
-        return {"error": f"Unable to load model with any available loader. Model type may not be supported yet."}
+        return {"error": "Unable to load model with any available loader. Model type may not be supported yet."}
     model.eval()
 
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
@@ -228,7 +237,6 @@ async def _test_model_local(model_id: str, prompt: str, max_new_tokens: int, tem
     generated_text = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
 
     return {
-        "model_id": model_id,
         "prompt": prompt,
         "generated_text": generated_text,
     }

+ 2 - 0
frontend/src/api/client.ts

@@ -173,6 +173,8 @@ const api = {
       }).then(r => r.json()) as Promise<DeployResponse>,
     stop: (taskId: string) =>
       apiFetch(`/api/v1/deployment/${taskId}/stop`, { method: 'POST' }).then(r => r.json()),
+    restart: (taskId: string) =>
+      apiFetch(`/api/v1/deployment/${taskId}/restart`, { method: 'POST' }).then(r => r.json()),
     services: () =>
       apiFetch('/api/v1/deployment/services').then(r => r.json()) as Promise<DeployedServiceInfo[]>,
     status: (id: string) =>

+ 31 - 2
frontend/src/pages/Deployment.tsx

@@ -134,6 +134,12 @@ export function Deployment() {
       .catch(() => {})
   }
 
+  // Ask the backend to restart the given service, then reload the service list.
+  // Any rejection (request failure or a failing loadServices) is swallowed by the empty catch.
+  const handleRestart = (taskId: string) => {
+    api.deployment.restart(taskId)
+      .then(() => loadServices())
+      .catch(() => {})
+  }
+
   const tabStyle = (active: boolean): React.CSSProperties => ({
     padding: '8px 20px',
     borderRadius: 8,
@@ -462,7 +468,7 @@ export function Deployment() {
         ) : (
           <div style={{ display: 'flex', flexDirection: 'column', gap: 12 }}>
             {services.map(svc => (
-              <ServiceCard key={svc.task_id} service={svc} onStop={() => handleStop(svc.task_id)} />
+              <ServiceCard key={svc.task_id} service={svc} onStop={() => handleStop(svc.task_id)} onRestart={() => handleRestart(svc.task_id)} />
             ))}
           </div>
         )}
@@ -517,9 +523,11 @@ function TaskStatus({ result }: { result: DeployResponse }) {
   )
 }
 
-function ServiceCard({ service, onStop }: { service: DeployedServiceInfo; onStop: () => void }) {
+function ServiceCard({ service, onStop, onRestart }: { service: DeployedServiceInfo; onStop: () => void; onRestart: () => void }) {
   const [showUsage, setShowUsage] = useState(false)
   const isRunning = service.status === 'running'
+  const isStopped = service.status === 'stopped'
+  const isPending = service.status === 'pending'
   // endpoint_url 是相对路径(如 /api/v1/deployment/proxy/{task_id}/v1),拼接完整 URL
   const relativeUrl = service.endpoint_url || service.base_url || ''
   const baseUrl = relativeUrl ? `${window.location.origin}${relativeUrl}` : ''
@@ -580,6 +588,27 @@ function ServiceCard({ service, onStop }: { service: DeployedServiceInfo; onStop
               </button>
             </>
           )}
+          {isStopped && (
+            <button
+              onClick={onRestart}
+              style={{
+                padding: '6px 12px', borderRadius: 6,
+                border: '1px solid #86efac', background: '#fff', color: '#16a34a',
+                cursor: 'pointer', fontSize: 12, fontWeight: 500,
+              }}
+            >
+              重启
+            </button>
+          )}
+          {isPending && (
+            <span style={{
+              padding: '6px 12px', borderRadius: 6,
+              border: '1px solid #fde68a', background: '#fffbeb', color: '#d97706',
+              fontSize: 12, fontWeight: 500,
+            }}>
+              启动中...
+            </span>
+          )}
         </div>
       </div>
 

+ 1 - 1
test.py

@@ -1,7 +1,7 @@
 from openai import OpenAI
 
 client = OpenAI(
-    base_url="http://192.168.92.151:3000/api/v1/deployment/proxy/08b07765-f5cd-4421-ad2c-b8eb22e60399/v1",
+    base_url="http://192.168.92.151:3000/api/v1/deployment/proxy/abececf1-74e0-4374-8f0b-504b0f08730d/v1",
     api_key="sk-1wTkTvsfu0IiyZFhNAx8HMgtIf2TxLGP-DyrcNKYlIc"  # Replace with your own API key. NOTE(review): a literal key is committed here; rotate it and load it from the environment instead.
 )