remote_deploy.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. """远程部署/导出入口 — 在算力节点的训练容器里执行。"""
  2. import json
  3. import os
  4. import shutil
  5. from pathlib import Path
  6. os.environ["PYTORCH_NO_FLASH"] = "1"
  7. os.environ["MACA_MPS_MODE"] = "1"
  8. os.environ["CUDA_VISIBLE_DEVICES"] = "3"
  9. _DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
  10. _ADAPTERS_DIR = _DATA_DIR / "adapters"
  11. async def run_remote_export(job_id: str, merge_with_base: bool = False, export_format: str = "safetensors") -> dict:
  12. """在远程容器里合并 adapter 或导出 GGUF。"""
  13. adapter_path = _ADAPTERS_DIR / job_id
  14. if not adapter_path.exists():
  15. return {"error": f"Adapter not found: {adapter_path}"}
  16. output_path = _ADAPTERS_DIR / f"{job_id}_merged"
  17. try:
  18. import torch
  19. from transformers import AutoModelForCausalLM, AutoTokenizer
  20. if merge_with_base:
  21. from peft import PeftModel
  22. base_model_id = _get_base_model_id(job_id)
  23. if base_model_id:
  24. device_map = {"": 0}
  25. base_model = AutoModelForCausalLM.from_pretrained(
  26. base_model_id, torch_dtype=torch.float16, device_map=device_map
  27. )
  28. peft_model = PeftModel.from_pretrained(base_model, adapter_path)
  29. merged = peft_model.merge_and_unload()
  30. merged.save_pretrained(output_path)
  31. tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  32. tokenizer.save_pretrained(output_path)
  33. else:
  34. from peft import PeftModel
  35. merged = PeftModel.from_pretrained(
  36. AutoModelForCausalLM.from_pretrained(
  37. str(adapter_path), torch_dtype=torch.float16, device_map={"": 0}
  38. ),
  39. adapter_path,
  40. )
  41. merged = merged.merge_and_unload()
  42. merged.save_pretrained(output_path)
  43. tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  44. tokenizer.save_pretrained(output_path)
  45. else:
  46. shutil.copytree(adapter_path, output_path)
  47. result = {"output_path": str(output_path)}
  48. if export_format == "gguf":
  49. gguf_path = output_path.with_suffix(".gguf")
  50. _export_to_gguf(output_path, gguf_path)
  51. result["gguf_path"] = str(gguf_path)
  52. return result
  53. except Exception as e:
  54. import traceback
  55. return {"error": str(e), "traceback": traceback.format_exc()}
  56. def _get_base_model_id(job_id: str):
  57. config_path = _ADAPTERS_DIR / job_id / "adapter_config.json"
  58. if config_path.exists():
  59. with open(config_path) as f:
  60. return json.load(f).get("base_model_name_or_path")
  61. return None
  62. def _export_to_gguf(model_path: Path, output_path: Path):
  63. try:
  64. import subprocess
  65. result = subprocess.run(
  66. ["python", "-m", "llama_cpp.convert_hf_to_gguf", str(model_path), "--outfile", str(output_path)],
  67. capture_output=True, text=True, timeout=600,
  68. )
  69. if result.returncode != 0:
  70. raise RuntimeError(result.stderr)
  71. except Exception as e:
  72. raise RuntimeError(f"GGUF export not available: {e}")