| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
"""Remote evaluation entry point, executed inside the training container on the compute node."""
import json
import os
from pathlib import Path

# Disable FlashAttention and enable MPS; also pin the visible GPUs.
# These are applied as a side effect of importing this module.
os.environ.update(
    {
        "PYTORCH_NO_FLASH": "1",
        "MACA_MPS_MODE": "1",
        "CUDA_VISIBLE_DEVICES": "2,3",
    }
)

# Data root on the compute node. COMPUTE_NODE_REMOTE_DATA_DIR, when set,
# overrides the built-in default path.
_DATA_DIR = Path(
    os.getenv("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")
)
# Adapters live in one sub-directory per job under the data root.
_ADAPTERS_DIR = _DATA_DIR.joinpath("adapters")
async def run_remote_eval(job_id: str) -> dict:
    """Load the adapter stored for ``job_id`` and compute loss and perplexity.

    The model and tokenizer are loaded from ``_ADAPTERS_DIR / job_id`` and
    evaluated on a small, fixed list of English probe sentences.

    Args:
        job_id: Name of the adapter directory under ``_ADAPTERS_DIR``.

    Returns:
        On success, ``{"metrics": {...}}`` with ``eval_loss`` (mean of the
        per-text losses, rounded to 4 places), ``perplexity`` (``exp`` of that
        mean, rounded to 2 places) and ``num_samples``. When no adapter
        directory exists for ``job_id``, ``{"error": "Adapter not found: ..."}``.
    """
    # NOTE(review): job_id is joined into the path without validation, so a
    # value such as "../x" or an absolute path would leave _ADAPTERS_DIR.
    # Confirm that callers only pass trusted identifiers.
    adapter_path = _ADAPTERS_DIR / job_id

    # An empty job_id resolves to the adapters root itself, and a plain file
    # would satisfy a bare existence check; neither can be loaded as an
    # adapter, so both are reported through the same "not found" error.
    if not job_id or not adapter_path.is_dir():
        return {"error": f"Adapter not found: {adapter_path}"}

    # Imported lazily: the missing-adapter path above needs neither package.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Load the adapter. CUDA_VISIBLE_DEVICES=2,3 has already remapped physical
    # GPUs 2,3 to logical GPUs 0,1, so {"": 0} places the whole model on
    # logical GPU 0.
    device_map = {"": 0}
    model = AutoModelForCausalLM.from_pretrained(
        adapter_path, torch_dtype=torch.float16, device_map=device_map
    )
    # NOTE(review): trust_remote_code=True allows custom code shipped with the
    # tokenizer files to run; confirm adapter directories are trusted.
    tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    model.eval()

    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Hello, how are you doing today?",
    ]

    def _loss_for(text: str) -> float:
        # Score one text, using its own token ids as the labels.
        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        outputs = model(**inputs, labels=inputs["input_ids"])
        return outputs.loss.item()

    with torch.no_grad():
        losses = [_loss_for(text) for text in sample_texts]

    avg_loss = sum(losses) / len(losses) if losses else 0
    # Perplexity is only reported for a strictly positive mean loss; otherwise
    # (including a NaN loss, which fails the comparison) it is reported as 0.
    perplexity = torch.exp(torch.tensor(avg_loss)).item() if avg_loss > 0 else 0

    return {
        "metrics": {
            "eval_loss": round(avg_loss, 4),
            "perplexity": round(perplexity, 2),
            "num_samples": len(sample_texts),
        }
    }
|