# remote_eval.py
"""Remote evaluation entry point — runs inside the training container on the compute node."""
import json
import os
from pathlib import Path
from typing import Optional, Sequence
  5. # 禁用 FlashAttention,启用 MPS
  6. os.environ["PYTORCH_NO_FLASH"] = "1"
  7. os.environ["MACA_MPS_MODE"] = "1"
  8. os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
  9. _DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
  10. _ADAPTERS_DIR = _DATA_DIR / "adapters"
  11. async def run_remote_eval(job_id: str) -> dict:
  12. """在远程容器里加载 adapter,计算 perplexity。"""
  13. adapter_path = _ADAPTERS_DIR / job_id
  14. if not adapter_path.exists():
  15. return {"error": f"Adapter not found: {adapter_path}"}
  16. import torch
  17. from transformers import AutoModelForCausalLM, AutoTokenizer
  18. # 加载 adapter(CUDA_VISIBLE_DEVICES=2,3 已将物理 GPU 2,3 映射为逻辑 GPU 0,1)
  19. device_map = {"": 0}
  20. model = AutoModelForCausalLM.from_pretrained(
  21. adapter_path, torch_dtype=torch.float16, device_map=device_map
  22. )
  23. tokenizer = AutoTokenizer.from_pretrained(adapter_path, trust_remote_code=True)
  24. model.eval()
  25. sample_texts = [
  26. "The quick brown fox jumps over the lazy dog.",
  27. "Hello, how are you doing today?",
  28. ]
  29. losses = []
  30. with torch.no_grad():
  31. for text in sample_texts:
  32. inputs = tokenizer(text, return_tensors="pt").to(model.device)
  33. outputs = model(**inputs, labels=inputs["input_ids"])
  34. losses.append(outputs.loss.item())
  35. avg_loss = sum(losses) / len(losses) if losses else 0
  36. perplexity = torch.exp(torch.tensor(avg_loss)).item() if avg_loss > 0 else 0
  37. return {
  38. "metrics": {
  39. "eval_loss": round(avg_loss, 4),
  40. "perplexity": round(perplexity, 2),
  41. "num_samples": len(sample_texts),
  42. }
  43. }