| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- #!/bin/bash
# Block until a TCP port on 127.0.0.1 accepts connections.
#
# Arguments:
#   $1 - service name, used only in log messages
#   $2 - TCP port probed with `nc -z`
#   $3 - optional maximum number of probes (one per second);
#        0, unset or non-numeric means wait forever
# Outputs:
#   one progress line per probe on stdout; warnings/errors on stderr
# Returns:
#   0 as soon as the port is reachable. On timeout the calling shell is
#   terminated with `exit 1` (callers rely on this to abort startup).
function readinessCheck() {
    local name="$1"
    local port="$2"
    # Keep the budget local so the caller's own $timeout is never clobbered.
    local attempts_left="${3:-0}"

    if [[ "$attempts_left" =~ ^[0-9]+$ ]]; then
        # Force base 10 so values such as "08" are not parsed as octal.
        attempts_left=$((10#$attempts_left))
    else
        echo "[WARN] Invalid timeout '$attempts_left' for $name, waiting without a timeout." >&2
        attempts_left=0
    fi

    while true; do
        echo "Checking the readiness of $name..."
        if nc -z 127.0.0.1 "$port"; then
            return 0
        fi

        # Consume the budget only after a failed probe, so that a timeout of
        # N really yields N probes (a timeout of 1 still probes once).
        if ((attempts_left > 0)); then
            attempts_left=$((attempts_left - 1))
            if ((attempts_left == 0)); then
                echo "[ERROR] Timeout while waiting for $name to be ready." >&2
                exit 1
            fi
        fi

        sleep 1
    done
}
# Create a directory (including missing parents) and set its mode to 755.
#
# Arguments:
#   $1 - path of the directory to create
# Returns:
#   0 on success; non-zero when the path is empty, when mkdir fails (for
#   example because the path exists and is not a directory) or when chmod
#   fails.
function createDir() {
    local dir="$1"

    if [[ -z "$dir" ]]; then
        echo "[ERROR] createDir: missing directory path." >&2
        return 1
    fi

    # Stop on mkdir failure: otherwise chmod would run on whatever already
    # lives at that path and its success would hide the error.
    mkdir -p -- "$dir" || return
    chmod 755 -- "$dir"
}
# Block until a configuration file exists.
#
# Arguments:
#   $1 - component name, used only in log messages
#   $2 - path of the file to wait for
#   $3 - optional; pass "true" to suppress the progress messages
#        (default "false")
# Outputs:
#   one progress line per poll on stdout unless silent mode is enabled
# Behaviour:
#   The first 10 polls are 1 second apart. After that the delay doubles on
#   every poll (2, 4, 8, 16, ...) and is capped at 30 seconds. There is no
#   upper bound on the total wait.
function waitForConfig() {
    local name="$1"
    local path="$2"
    local silent_mode="${3:-false}"
    local count=0
    local sleep_time=1

    until [[ -f "$path" ]]; do
        # Plain assignment instead of ((count++)): the latter returns status 1
        # when the old value is 0 and would abort callers running `set -e`.
        count=$((count + 1))

        if ((count <= 10)); then
            if [[ "$silent_mode" != "true" ]]; then
                echo "Waiting for $name configuration from GPUStack..."
            fi
            sleep_time=1
        else
            if [[ "$silent_mode" != "true" ]]; then
                echo "$name configuration is still missing. This component may be disabled."
            fi
            # Exponential backoff, capped at 30 seconds.
            sleep_time=$((sleep_time * 2))
            if ((sleep_time > 30)); then
                sleep_time=30
            fi
        fi

        sleep "$sleep_time"
    done
}
# Report how a service ended, record a container exit code and halt the
# container. Every path ends by exec'ing /run/s6/basedir/bin/halt.
#
# Arguments:
#   $1 - service name, used only in log messages
#   $2 - exit code of the service (256 is treated as "ended by a signal")
#   $3 - signal number, meaningful when $2 is 256
# Side effects:
#   May overwrite /run/s6-linux-init-container-results/exitcode.
function handleServiceExit() {
    local svc="$1"
    local svc_code="$2"
    local svc_signal="$3"
    local result_file="/run/s6-linux-init-container-results/exitcode"
    local container_code=0

    # Snapshot the container exit code recorded so far (if any); the checks
    # below use this snapshot, not the file's later content.
    if [[ -f "${result_file}" ]]; then
        container_code=$(<"${result_file}")
    fi

    echo "[INFO] Service '${svc}' exited (code: ${svc_code}, signal: ${svc_signal})"

    if [[ "${svc_code}" -eq 256 ]]; then
        # Ended by a signal.
        # SIGTERM first resets the recorded container code to 0.
        # NOTE(review): when the snapshot above was 0, the write below then
        # replaces that 0 with 128+signal — confirm this ordering is intended.
        if [[ "${svc_signal}" -eq 15 ]]; then
            echo 0 > "${result_file}"
        fi
        # Record 128+signal only if no container code had been set before.
        if [[ "${container_code}" -eq 0 ]]; then
            echo $((128 + svc_signal)) > "${result_file}"
        fi
        echo "[INFO] Service '${svc}' exited by signal, shutting down container..."
        exec /run/s6/basedir/bin/halt
    elif [[ "${svc_code}" -ne 0 ]]; then
        # Ended with a failure code: propagate it unless one is already set.
        if [[ "${container_code}" -eq 0 ]]; then
            echo "${svc_code}" > "${result_file}"
        fi
        echo "[INFO] Service '${svc}' exited with non-zero code, shutting down container..."
        exec /run/s6/basedir/bin/halt
    fi

    # Clean exit: nothing to record, the container is still halted.
    echo "[INFO] Service '${svc}' exited normally."
    exec /run/s6/basedir/bin/halt
}
# Finish handler for optional services: instead of halting the container it
# decides whether s6 should restart the service, logs the decision on stderr
# and exits with the matching finish-script code.
#
# Finish-script exit codes used here:
#   0   - let s6 restart the service (default policy for a longrun)
#   125 - tell s6 not to restart the service
#
# Arguments:
#   $1 - service name, used only in log messages
#   $2 - service exit code (256 = killed by a signal, 0 = normal exit,
#        anything else = crash or error)
#   $3 - signal number, meaningful when $2 is 256
function handleOptionalServiceExit() {
    local svc="$1"
    local svc_code="$2"
    local svc_signal="$3"
    local headline
    local verdict
    local finish_code

    if [[ "${svc_code}" -eq 256 && "${svc_signal}" -eq 15 ]]; then
        # SIGTERM: the container is shutting down, do not restart.
        headline="[INFO] Service '${svc}' received SIGTERM (container shutting down)"
        verdict="[INFO] Service '${svc}' will not be restarted."
        finish_code=125
    elif [[ "${svc_code}" -eq 256 ]]; then
        # Any other signal (SIGKILL, SIGHUP, ...): restart.
        headline="[WARN] Service '${svc}' exited by signal ${svc_signal} (exit code: ${svc_code})"
        verdict="[INFO] Service '${svc}' will be restarted by s6."
        finish_code=0
    elif [[ "${svc_code}" -ne 0 ]]; then
        # Crash or error exit: restart.
        headline="[ERROR] Service '${svc}' exited with non-zero code: ${svc_code}"
        verdict="[INFO] Service '${svc}' will be restarted by s6."
        finish_code=0
    else
        # Clean exit: leave the service stopped.
        headline="[INFO] Service '${svc}' exited normally (code: ${svc_code}, signal: ${svc_signal})"
        verdict="[INFO] Service '${svc}' will not be restarted."
        finish_code=125
    fi

    echo "${headline}" >&2
    echo "${verdict}" >&2
    exit "${finish_code}"
}
# Runtime directory for GPUStack; an existing non-empty GPUSTACK_RUN_DIR from
# the environment wins, otherwise /run/gpustack is used. The directory is
# created as soon as this file is executed or sourced.
: "${GPUSTACK_RUN_DIR:=/run/gpustack}"
export GPUSTACK_RUN_DIR
createDir "$GPUSTACK_RUN_DIR"

# Locations of the per-component .env files below the runtime directory.
# They are only exported here — presumably consumed by the service scripts
# that source this file (confirm against callers).
# shellcheck disable=SC2034
export GPUSTACK_GATEWAY_CONFIG="${GPUSTACK_RUN_DIR}/gateway/.env"
# shellcheck disable=SC2034
export GPUSTACK_POSTGRES_CONFIG="${GPUSTACK_RUN_DIR}/postgresql/.env"
# shellcheck disable=SC2034
export GPUSTACK_OBSERVABILITY_CONFIG="${GPUSTACK_RUN_DIR}/observability/.env"
|