| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230 |
- # flake8: noqa: W605
- import jinja2
- from typing import Dict, Optional, Any, List
- import yaml
- from gpustack_runtime.detector import ManufacturerEnum
- # default_user_data_template is assuming the NVIDIA drivers and container toolkit
- # are pre-installed on the base image
- default_user_data_template_jinja = """#cloud-config
- write_files:
- - path: /var/lib/gpustack/config.yaml
- permissions: '0600'
- content: |
- server_url: "{{ server_url }}"
- token: "{{ token }}"
- worker_name: "{{ worker_name }}"
- {%- for k, v in secret_configs.items() %}
- {%- if v is not none %}
- {{ k }}: {{ v }}
- {%- endif %}
- {%- endfor %}
- - path: /opt/gpustack-run-worker.sh
- permissions: '0755'
- content: |-
- #!/bin/bash
- set -e
- echo "$(date): trying to bring up gpustack worker container..." >> /var/log/post-reboot.log
- docker run -d --name gpustack-worker \\
- -e "GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=gpustack-worker" \\
- --restart=unless-stopped \\
- --privileged \\
- --network=host \\
- -v /var/lib/gpustack:/var/lib/gpustack \\
- -v /var/run/docker.sock:/var/run/docker.sock \\
- {{ image_name }} \\
- --config-file=/var/lib/gpustack/config.yaml
- echo "$(date): gpustack worker container started" >> /var/log/post-reboot.log
- """
- post_boot_service = """[Unit]
- Description=bootstrap gpustack worker container
- After=network.target docker.service
- Wants=network.target docker.service
- [Service]
- Type=oneshot
- RemainAfterExit=no
- ExecStart=/bin/bash -c "/opt/gpustack-run-worker.sh && systemctl disable post-reboot.service"
- StandardOutput=journal
- [Install]
- WantedBy=default.target
- """
- dkms_service = """[Unit]
- Description=DKMS Autoinstall
- After=network.target
- [Service]
- Type=oneshot
- ExecStart=/usr/sbin/dkms autoinstall
- RemainAfterExit=true
- [Install]
- WantedBy=multi-user.target
- """
- debian_driver_map = {"debian": "nvidia-open", "ubuntu": "nvidia-driver-570"}
- class UserDataTemplate:
- """
- Template for user data. Only supports Debian/Ubuntu and nvidia drivers for now.
- """
- server_url: str
- token: str
- image_name: str
- distribution: Optional[str]
- install_driver: Optional[ManufacturerEnum]
- setup_driver: Optional[ManufacturerEnum]
- _data: Optional[Dict[str, Any]]
- secret_configs: Dict[str, Any]
- # worker related data
- worker_name: str
- def __init__(
- self,
- server_url: str,
- token: str,
- image_name: str,
- worker_name: str,
- secret_configs: Dict[str, Any] = {},
- ):
- self.server_url = server_url
- self.token = token
- self.image_name = image_name
- self.install_driver = None
- self.setup_driver = None
- self.worker_name = worker_name
- self.distribution = None
- self.secret_configs = secret_configs
- template = jinja2.Environment().from_string(default_user_data_template_jinja)
- self._data = yaml.safe_load(
- template.render(
- server_url=self.server_url,
- token=self.token,
- image_name=self.image_name,
- secret_configs=self.secret_configs,
- worker_name=self.worker_name,
- )
- )
- self.distribution = "ubuntu"
- self._data.setdefault('packages', [])
- self._data.setdefault('runcmd', [])
- self._data.setdefault('write_files', [])
- def insert_runcmd(self, *commands: str):
- command_list: List[str] = self._data.setdefault('runcmd', [])
- for idx, command in enumerate(commands):
- command_list.insert(idx, command)
- def _process_install_driver(self) -> bool:
- """
- process_install_driver handles the installation of the GPU drivers.
- Returns True if a reboot is required after installation.
- """
- self._data['package_update'] = True
- self._data['package_upgrade'] = True
- packages: List[str] = self._data.get('packages')
- write_files: List[Dict[str, Any]] = self._data.get('write_files')
- # only supports nvidia and debian/ubuntu for now
- if self.install_driver != ManufacturerEnum.NVIDIA or self.distribution not in [
- "debian",
- "ubuntu",
- ]:
- return False
- driver_name = debian_driver_map.get(self.distribution, "nvidia-driver-570")
- nvidia_toolkit_version = "1.17.8-1"
- packages.extend(
- [
- "build-essential",
- "dkms",
- "linux-headers-generic",
- "curl",
- ]
- )
- write_files.append(
- {
- "path": "/etc/systemd/system/dkms-autoinstall.service",
- "content": dkms_service,
- }
- )
- self.insert_runcmd(
- 'echo "blacklist nouveau" >> /etc/modprobe.d/blacklist.conf',
- 'echo "options nouveau modeset=0" >> /etc/modprobe.d/blacklist.conf',
- 'update-initramfs -u',
- r"distribution=$(. /etc/os-release; echo $ID$(echo $VERSION_ID | sed 's/\.//g'))",
- 'wget "https://developer.download.nvidia.com/compute/cuda/repos/$distribution/$(uname -m)/cuda-keyring_1.1-1_all.deb" -O /tmp/cuda-keyring_1.1-1_all.deb',
- 'dpkg -i /tmp/cuda-keyring_1.1-1_all.deb',
- 'rm /tmp/cuda-keyring_1.1-1_all.deb',
- 'curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg',
- "curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list",
- "apt-get update",
- f"""DEBIAN_FRONTEND=noninteractive \
- apt-get install -y \
- {driver_name} \
- cuda-toolkit-12-8 \
- nvidia-container-toolkit={nvidia_toolkit_version} \
- nvidia-container-toolkit-base={nvidia_toolkit_version} \
- libnvidia-container-tools={nvidia_toolkit_version} \
- libnvidia-container1={nvidia_toolkit_version}""",
- "echo 'export PATH=/usr/local/cuda/bin:$PATH' | tee /etc/profile.d/cuda.sh",
- "echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' | tee /etc/profile.d/cuda_ld_library_path.sh",
- "systemctl enable dkms-autoinstall.service",
- )
- return True
- def _process_setup_driver(self) -> None:
- """
- process_setup_driver handles the setup of the GPU drivers.
- """
- # only supports nvidia and debian/ubuntu for now
- if self.setup_driver != ManufacturerEnum.NVIDIA or self.distribution not in [
- "debian",
- "ubuntu",
- ]:
- return
- runcmd: List[str] = self._data.get('runcmd')
- runcmd.extend(
- [
- "nvidia-ctk runtime configure --runtime=docker --set-as-default",
- "systemctl restart docker",
- ]
- )
- def format(self) -> str:
- # hand packages
- packages: List[str] = self._data.get('packages', [])
- runcmds: List[str] = self._data.get('runcmd', [])
- write_files: List[Dict[str, Any]] = self._data.get('write_files', [])
- should_restart = False
- if self.distribution in ["debian", "ubuntu"]:
- packages.append("docker.io")
- # handle driver installation
- if self._process_install_driver():
- should_restart = True
- # handle driver setup
- self._process_setup_driver()
- # handle start on first boot
- if not should_restart:
- runcmds.append("/opt/gpustack-run-worker.sh")
- else:
- write_files.append(
- {
- "content": post_boot_service,
- "path": "/etc/systemd/system/post-reboot.service",
- }
- )
- runcmds.append("systemctl enable post-reboot.service")
- self._data["power_state"] = {
- "mode": "reboot",
- "timeout": 30,
- "message": "Rebooting after initial setup.",
- }
- return "#cloud-config\n" + yaml.dump(self._data, default_style='')
|