"""Remote evaluation entry point, run inside the training container on a compute node."""

import json
import os
from pathlib import Path

# Per the original author's note: disable FlashAttention and enable MPS mode.
# These are set at import time; torch itself is only imported later, inside
# run_remote_eval(), so the variables are already in place by then.
os.environ["PYTORCH_NO_FLASH"] = "1"
os.environ["MACA_MPS_MODE"] = "1"
# Expose only physical GPUs 2 and 3 to this process (seen as logical GPUs 0 and 1).
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

# Root data directory; overridable through COMPUTE_NODE_REMOTE_DATA_DIR.
_DATA_DIR = Path(
    os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")
)
# Each job's adapter is expected at <_ADAPTERS_DIR>/<job_id>.
_ADAPTERS_DIR = _DATA_DIR / "adapters"

# Fixed smoke-test prompts used to compute the evaluation loss.
_SAMPLE_TEXTS = (
    "The quick brown fox jumps over the lazy dog.",
    "Hello, how are you doing today?",
)


async def run_remote_eval(job_id: str) -> dict:
    """Load the adapter for *job_id* and compute loss and perplexity on sample texts.

    Args:
        job_id: Name of the adapter directory under ``_ADAPTERS_DIR``.

    Returns:
        ``{"error": "..."}`` when the adapter directory does not exist.
        Otherwise ``{"metrics": {...}}`` with:

        * ``eval_loss``: mean of the per-sample losses, rounded to 4 places.
        * ``perplexity``: ``exp`` of that mean loss, rounded to 2 places.
        * ``num_samples``: number of sample texts evaluated.

    Raises:
        Exceptions raised while loading the model or tokenizer, or during the
        forward pass, are not caught here and propagate to the caller.
    """
    adapter_path = _ADAPTERS_DIR / job_id
    if not adapter_path.exists():
        return {"error": f"Adapter not found: {adapter_path}"}

    # Heavy imports are deferred so the missing-adapter path returns without
    # loading torch/transformers, and so the environment variables set at
    # module import time are in effect before torch is imported.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # CUDA_VISIBLE_DEVICES=2,3 maps physical GPUs 2,3 to logical GPUs 0,1;
    # {"": 0} places the whole model on logical GPU 0.
    device_map = {"": 0}
    model = AutoModelForCausalLM.from_pretrained(
        adapter_path, torch_dtype=torch.float16, device_map=device_map
    )
    tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    model.eval()

    losses = []
    with torch.no_grad():
        for text in _SAMPLE_TEXTS:
            inputs = tokenizer(text, return_tensors="pt").to(model.device)
            # Passing the input ids as labels makes the model return its
            # language-modeling loss for this text.
            outputs = model(**inputs, labels=inputs["input_ids"])
            losses.append(outputs.loss.item())

    if losses:
        avg_loss = sum(losses) / len(losses)
        # Always exponentiate a measured loss: a loss of exactly 0 is a
        # perplexity of 1 (not 0), and a NaN loss must stay NaN rather than
        # being silently reported as perplexity 0.
        perplexity = torch.exp(torch.tensor(avg_loss)).item()
    else:
        # No samples were evaluated; keep the original 0 sentinels.
        avg_loss = 0
        perplexity = 0

    return {
        "metrics": {
            "eval_loss": round(avg_loss, 4),
            "perplexity": round(perplexity, 2),
            "num_samples": len(_SAMPLE_TEXTS),
        }
    }