remote_eval.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. """远程评估入口 — 在算力节点的训练容器里执行。"""
  2. import json
  3. import os
  4. from pathlib import Path
  5. # 禁用 FlashAttention,启用 MPS
  6. os.environ["PYTORCH_NO_FLASH"] = "1"
  7. os.environ["MACA_MPS_MODE"] = "1"
  8. os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
  9. _DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
  10. _ADAPTERS_DIR = _DATA_DIR / "adapters"
  11. async def run_remote_eval(job_id: str) -> dict:
  12. """在远程容器里加载 adapter,计算 perplexity。"""
  13. adapter_path = _ADAPTERS_DIR / job_id
  14. if not adapter_path.exists():
  15. return {"error": f"Adapter not found: {adapter_path}"}
  16. import torch
  17. from transformers import AutoModelForCausalLM, AutoTokenizer
  18. # 加载 adapter(沐曦 MPS 模式下固定用第一张物理 GPU)
  19. visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
  20. first_gpu = int(visible_devices.split(",")[0])
  21. device_map = {"": first_gpu}
  22. model = AutoModelForCausalLM.from_pretrained(
  23. adapter_path, torch_dtype=torch.float16, device_map=device_map
  24. )
  25. tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
  26. model.eval()
  27. sample_texts = [
  28. "The quick brown fox jumps over the lazy dog.",
  29. "Hello, how are you doing today?",
  30. ]
  31. losses = []
  32. with torch.no_grad():
  33. for text in sample_texts:
  34. inputs = tokenizer(text, return_tensors="pt").to(model.device)
  35. outputs = model(**inputs, labels=inputs["input_ids"])
  36. losses.append(outputs.loss.item())
  37. avg_loss = sum(losses) / len(losses) if losses else 0
  38. perplexity = torch.exp(torch.tensor(avg_loss)).item() if avg_loss > 0 else 0
  39. return {
  40. "metrics": {
  41. "eval_loss": round(avg_loss, 4),
  42. "perplexity": round(perplexity, 2),
  43. "num_samples": len(sample_texts),
  44. }
  45. }