# remote_deploy.py
"""Remote deploy/export entry point; runs inside the training container on the compute node."""
  2. import json
  3. import os
  4. import shutil
  5. from pathlib import Path
  6. os.environ["PYTORCH_NO_FLASH"] = "1"
  7. os.environ["MACA_MPS_MODE"] = "1"
  8. os.environ["METAX_VISIBLE_DEVICES"] = "2,3"
  9. _DATA_DIR = Path(os.environ.get("COMPUTE_NODE_REMOTE_DATA_DIR", "/root/Fine-tuning/backend/data"))
  10. _ADAPTERS_DIR = _DATA_DIR / "adapters"
  11. async def run_remote_export(job_id: str, merge_with_base: bool = False, export_format: str = "safetensors") -> dict:
  12. """在远程容器里合并 adapter 或导出 GGUF。"""
  13. adapter_path = _ADAPTERS_DIR / job_id
  14. if not adapter_path.exists():
  15. return {"error": f"Adapter not found: {adapter_path}"}
  16. output_path = _ADAPTERS_DIR / f"{job_id}_merged"
  17. try:
  18. import torch
  19. from transformers import AutoModelForCausalLM, AutoTokenizer
  20. if merge_with_base:
  21. from peft import PeftModel
  22. base_model_id = _get_base_model_id(job_id)
  23. if base_model_id:
  24. visible_devices = os.environ.get("METAX_VISIBLE_DEVICES", "0")
  25. first_gpu = int(visible_devices.split(",")[0])
  26. device_map = {"": first_gpu}
  27. base_model = AutoModelForCausalLM.from_pretrained(
  28. base_model_id, torch_dtype=torch.float16, device_map=device_map
  29. )
  30. peft_model = PeftModel.from_pretrained(base_model, adapter_path)
  31. merged = peft_model.merge_and_unload()
  32. merged.save_pretrained(output_path)
  33. tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  34. tokenizer.save_pretrained(output_path)
  35. else:
  36. from peft import PeftModel
  37. merged = PeftModel.from_pretrained(
  38. AutoModelForCausalLM.from_pretrained(
  39. str(adapter_path), torch_dtype=torch.float16, device_map={"": 0}
  40. ),
  41. adapter_path,
  42. )
  43. merged = merged.merge_and_unload()
  44. merged.save_pretrained(output_path)
  45. tokenizer = AutoTokenizer.from_pretrained(adapter_path)
  46. tokenizer.save_pretrained(output_path)
  47. else:
  48. shutil.copytree(adapter_path, output_path)
  49. result = {"output_path": str(output_path)}
  50. if export_format == "gguf":
  51. gguf_path = output_path.with_suffix(".gguf")
  52. _export_to_gguf(output_path, gguf_path)
  53. result["gguf_path"] = str(gguf_path)
  54. return result
  55. except Exception as e:
  56. import traceback
  57. return {"error": str(e), "traceback": traceback.format_exc()}
  58. def _get_base_model_id(job_id: str):
  59. config_path = _ADAPTERS_DIR / job_id / "adapter_config.json"
  60. if config_path.exists():
  61. with open(config_path) as f:
  62. return json.load(f).get("base_model_name_or_path")
  63. return None
  64. def _export_to_gguf(model_path: Path, output_path: Path):
  65. try:
  66. import subprocess
  67. result = subprocess.run(
  68. ["python", "-m", "llama_cpp.convert_hf_to_gguf", str(model_path), "--outfile", str(output_path)],
  69. capture_output=True, text=True, timeout=600,
  70. )
  71. if result.returncode != 0:
  72. raise RuntimeError(result.stderr)
  73. except Exception as e:
  74. raise RuntimeError(f"GGUF export not available: {e}")