user_data.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. # flake8: noqa: W605
import copy
from typing import Dict, Optional, Any, List

import jinja2
import yaml
from gpustack_runtime.detector import ManufacturerEnum
  6. # default_user_data_template is assuming the NVIDIA drivers and container toolkit
  7. # are pre-installed on the base image
  8. default_user_data_template_jinja = """#cloud-config
  9. write_files:
  10. - path: /var/lib/gpustack/config.yaml
  11. permissions: '0600'
  12. content: |
  13. server_url: "{{ server_url }}"
  14. token: "{{ token }}"
  15. worker_name: "{{ worker_name }}"
  16. {%- for k, v in secret_configs.items() %}
  17. {%- if v is not none %}
  18. {{ k }}: {{ v }}
  19. {%- endif %}
  20. {%- endfor %}
  21. - path: /opt/gpustack-run-worker.sh
  22. permissions: '0755'
  23. content: |-
  24. #!/bin/bash
  25. set -e
  26. echo "$(date): trying to bring up gpustack worker container..." >> /var/log/post-reboot.log
  27. docker run -d --name gpustack-worker \\
  28. -e "GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=gpustack-worker" \\
  29. --restart=unless-stopped \\
  30. --privileged \\
  31. --network=host \\
  32. -v /var/lib/gpustack:/var/lib/gpustack \\
  33. -v /var/run/docker.sock:/var/run/docker.sock \\
  34. {{ image_name }} \\
  35. --config-file=/var/lib/gpustack/config.yaml
  36. echo "$(date): gpustack worker container started" >> /var/log/post-reboot.log
  37. """
  38. post_boot_service = """[Unit]
  39. Description=bootstrap gpustack worker container
  40. After=network.target docker.service
  41. Wants=network.target docker.service
  42. [Service]
  43. Type=oneshot
  44. RemainAfterExit=no
  45. ExecStart=/bin/bash -c "/opt/gpustack-run-worker.sh && systemctl disable post-reboot.service"
  46. StandardOutput=journal
  47. [Install]
  48. WantedBy=default.target
  49. """
  50. dkms_service = """[Unit]
  51. Description=DKMS Autoinstall
  52. After=network.target
  53. [Service]
  54. Type=oneshot
  55. ExecStart=/usr/sbin/dkms autoinstall
  56. RemainAfterExit=true
  57. [Install]
  58. WantedBy=multi-user.target
  59. """
  60. debian_driver_map = {"debian": "nvidia-open", "ubuntu": "nvidia-driver-570"}
  61. class UserDataTemplate:
  62. """
  63. Template for user data. Only supports Debian/Ubuntu and nvidia drivers for now.
  64. """
  65. server_url: str
  66. token: str
  67. image_name: str
  68. distribution: Optional[str]
  69. install_driver: Optional[ManufacturerEnum]
  70. setup_driver: Optional[ManufacturerEnum]
  71. _data: Optional[Dict[str, Any]]
  72. secret_configs: Dict[str, Any]
  73. # worker related data
  74. worker_name: str
  75. def __init__(
  76. self,
  77. server_url: str,
  78. token: str,
  79. image_name: str,
  80. worker_name: str,
  81. secret_configs: Dict[str, Any] = {},
  82. ):
  83. self.server_url = server_url
  84. self.token = token
  85. self.image_name = image_name
  86. self.install_driver = None
  87. self.setup_driver = None
  88. self.worker_name = worker_name
  89. self.distribution = None
  90. self.secret_configs = secret_configs
  91. template = jinja2.Environment().from_string(default_user_data_template_jinja)
  92. self._data = yaml.safe_load(
  93. template.render(
  94. server_url=self.server_url,
  95. token=self.token,
  96. image_name=self.image_name,
  97. secret_configs=self.secret_configs,
  98. worker_name=self.worker_name,
  99. )
  100. )
  101. self.distribution = "ubuntu"
  102. self._data.setdefault('packages', [])
  103. self._data.setdefault('runcmd', [])
  104. self._data.setdefault('write_files', [])
  105. def insert_runcmd(self, *commands: str):
  106. command_list: List[str] = self._data.setdefault('runcmd', [])
  107. for idx, command in enumerate(commands):
  108. command_list.insert(idx, command)
  109. def _process_install_driver(self) -> bool:
  110. """
  111. process_install_driver handles the installation of the GPU drivers.
  112. Returns True if a reboot is required after installation.
  113. """
  114. self._data['package_update'] = True
  115. self._data['package_upgrade'] = True
  116. packages: List[str] = self._data.get('packages')
  117. write_files: List[Dict[str, Any]] = self._data.get('write_files')
  118. # only supports nvidia and debian/ubuntu for now
  119. if self.install_driver != ManufacturerEnum.NVIDIA or self.distribution not in [
  120. "debian",
  121. "ubuntu",
  122. ]:
  123. return False
  124. driver_name = debian_driver_map.get(self.distribution, "nvidia-driver-570")
  125. nvidia_toolkit_version = "1.17.8-1"
  126. packages.extend(
  127. [
  128. "build-essential",
  129. "dkms",
  130. "linux-headers-generic",
  131. "curl",
  132. ]
  133. )
  134. write_files.append(
  135. {
  136. "path": "/etc/systemd/system/dkms-autoinstall.service",
  137. "content": dkms_service,
  138. }
  139. )
  140. self.insert_runcmd(
  141. 'echo "blacklist nouveau" >> /etc/modprobe.d/blacklist.conf',
  142. 'echo "options nouveau modeset=0" >> /etc/modprobe.d/blacklist.conf',
  143. 'update-initramfs -u',
  144. r"distribution=$(. /etc/os-release; echo $ID$(echo $VERSION_ID | sed 's/\.//g'))",
  145. 'wget "https://developer.download.nvidia.com/compute/cuda/repos/$distribution/$(uname -m)/cuda-keyring_1.1-1_all.deb" -O /tmp/cuda-keyring_1.1-1_all.deb',
  146. 'dpkg -i /tmp/cuda-keyring_1.1-1_all.deb',
  147. 'rm /tmp/cuda-keyring_1.1-1_all.deb',
  148. 'curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg',
  149. "curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list",
  150. "apt-get update",
  151. f"""DEBIAN_FRONTEND=noninteractive \
  152. apt-get install -y \
  153. {driver_name} \
  154. cuda-toolkit-12-8 \
  155. nvidia-container-toolkit={nvidia_toolkit_version} \
  156. nvidia-container-toolkit-base={nvidia_toolkit_version} \
  157. libnvidia-container-tools={nvidia_toolkit_version} \
  158. libnvidia-container1={nvidia_toolkit_version}""",
  159. "echo 'export PATH=/usr/local/cuda/bin:$PATH' | tee /etc/profile.d/cuda.sh",
  160. "echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' | tee /etc/profile.d/cuda_ld_library_path.sh",
  161. "systemctl enable dkms-autoinstall.service",
  162. )
  163. return True
  164. def _process_setup_driver(self) -> None:
  165. """
  166. process_setup_driver handles the setup of the GPU drivers.
  167. """
  168. # only supports nvidia and debian/ubuntu for now
  169. if self.setup_driver != ManufacturerEnum.NVIDIA or self.distribution not in [
  170. "debian",
  171. "ubuntu",
  172. ]:
  173. return
  174. runcmd: List[str] = self._data.get('runcmd')
  175. runcmd.extend(
  176. [
  177. "nvidia-ctk runtime configure --runtime=docker --set-as-default",
  178. "systemctl restart docker",
  179. ]
  180. )
  181. def format(self) -> str:
  182. # hand packages
  183. packages: List[str] = self._data.get('packages', [])
  184. runcmds: List[str] = self._data.get('runcmd', [])
  185. write_files: List[Dict[str, Any]] = self._data.get('write_files', [])
  186. should_restart = False
  187. if self.distribution in ["debian", "ubuntu"]:
  188. packages.append("docker.io")
  189. # handle driver installation
  190. if self._process_install_driver():
  191. should_restart = True
  192. # handle driver setup
  193. self._process_setup_driver()
  194. # handle start on first boot
  195. if not should_restart:
  196. runcmds.append("/opt/gpustack-run-worker.sh")
  197. else:
  198. write_files.append(
  199. {
  200. "content": post_boot_service,
  201. "path": "/etc/systemd/system/post-reboot.service",
  202. }
  203. )
  204. runcmds.append("systemctl enable post-reboot.service")
  205. self._data["power_state"] = {
  206. "mode": "reboot",
  207. "timeout": 30,
  208. "message": "Rebooting after initial setup.",
  209. }
  210. return "#cloud-config\n" + yaml.dump(self._data, default_style='')