| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- """远程评估入口 — 在算力节点的训练容器里执行。"""
- import json
- import os
- from pathlib import Path
# Accelerator runtime switches: disable FlashAttention and enable MPS mode.
# These are written unconditionally when the module is imported, replacing any
# values already present in the process environment.
os.environ.update(
    {
        "PYTORCH_NO_FLASH": "1",
        "MACA_MPS_MODE": "1",
        "METAX_VISIBLE_DEVICES": "2,3",
    }
)

# Data root on the compute node; COMPUTE_NODE_REMOTE_DATA_DIR overrides the
# built-in default. Adapters live in the "adapters" directory beneath it.
_DATA_DIR = Path(
    os.getenv("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data")
)
_ADAPTERS_DIR = _DATA_DIR.joinpath("adapters")
def _sample_losses(adapter_path: Path, device_map: dict, sample_texts: list) -> list:
    """Load the model and tokenizer from ``adapter_path`` and score each text.

    Kept separate from the coroutine so that the model, tokenizer and all
    intermediate tensors go out of scope as soon as this function returns.

    Args:
        adapter_path: Directory passed to both ``from_pretrained`` calls.
        device_map: Device placement forwarded to the model loader.
        sample_texts: Texts to score, one forward pass per text.

    Returns:
        One Python float loss per entry of ``sample_texts``, in order.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model = AutoModelForCausalLM.from_pretrained(
        adapter_path, torch_dtype=torch.float16, device_map=device_map
    )
    # NOTE(security): trust_remote_code=True permits Python code shipped with
    # the tokenizer files to run; only point this at adapter directories whose
    # contents are trusted.
    tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
    model.eval()

    losses = []
    with torch.no_grad():
        for text in sample_texts:
            inputs = tokenizer(text, return_tensors="pt").to(model.device)
            # The input ids double as labels, so the returned loss is the
            # model's language-modelling loss on this text.
            outputs = model(**inputs, labels=inputs["input_ids"])
            losses.append(outputs.loss.item())
    return losses


async def run_remote_eval(job_id: str) -> dict:
    """Load the adapter saved for ``job_id`` and report its loss and perplexity.

    The adapter is looked up at ``_ADAPTERS_DIR / job_id``, loaded as a causal
    language model in float16, and scored on a small built-in list of English
    sentences. ``eval_loss`` is the unweighted mean of the per-sentence losses
    and ``perplexity`` is ``exp`` of that mean (``0`` when the mean is not
    positive).

    Args:
        job_id: Name of the adapter directory under ``_ADAPTERS_DIR``.

    Returns:
        ``{"error": "Adapter not found: <path>"}`` when no adapter directory
        exists for the job, otherwise
        ``{"metrics": {"eval_loss": ..., "perplexity": ..., "num_samples": ...}}``.

    Raises:
        ValueError: If the first entry of ``METAX_VISIBLE_DEVICES`` is
            non-empty but not an integer.

    Note:
        Nothing in this coroutine awaits, so model loading and the forward
        passes run synchronously on the caller's event loop.
    """
    adapter_path = _ADAPTERS_DIR / job_id
    # A regular file that happens to carry the job's name is not a loadable
    # adapter either, so require a directory rather than mere existence.
    if not adapter_path.is_dir():
        return {"error": f"Adapter not found: {adapter_path}"}

    # Imported only after the adapter check, so a missing adapter returns
    # without importing torch.
    import gc
    import math

    import torch

    # Place the whole model on the first device listed in METAX_VISIBLE_DEVICES.
    # NOTE(review): the listed id is used directly as the torch device index;
    # confirm the runtime does not renumber the visible devices from 0.
    visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
    first_entry = visible_devices.split(",")[0].strip()
    # An empty entry (e.g. the variable set to "") falls back to device 0, the
    # same default used when the variable is absent.
    first_gpu = int(first_entry) if first_entry else 0
    device_map = {"": first_gpu}

    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Hello, how are you doing today?",
    ]

    try:
        losses = _sample_losses(adapter_path, device_map, sample_texts)
    finally:
        # Collect the now-unreferenced model and hand PyTorch's cached device
        # memory back to the device so other processes can use it.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    avg_loss = sum(losses) / len(losses) if losses else 0
    if avg_loss > 0:
        try:
            perplexity = math.exp(avg_loss)
        except OverflowError:
            # Loss too large for a float; report infinity instead of raising.
            perplexity = float("inf")
    else:
        perplexity = 0

    return {
        "metrics": {
            "eval_loss": round(avg_loss, 4),
            "perplexity": round(perplexity, 2),
            "num_samples": len(sample_texts),
        }
    }
|