base.sh 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. #!/bin/bash
  #######################################
  # Block until a TCP port on 127.0.0.1 accepts connections.
  # Arguments:
  #   $1 - service name, used only in log messages
  #   $2 - TCP port probed with `nc -z`
  #   $3 - optional maximum number of probes; 0 or omitted waits forever
  # Outputs:
  #   Progress and timeout messages on stdout.
  # Returns:
  #   0 once the port is reachable. Exits the shell with status 1 when
  #   the probe budget is exhausted.
  #######################################
  function readinessCheck() {
    local name="$1"
    local port="$2"
    # Local on purpose: a caller's own $timeout variable must not be clobbered.
    local remaining="${3:-0}"
    local bounded="false"

    if [ "$remaining" -gt 0 ]; then
      bounded="true"
    fi

    while true; do
      echo "Checking the readiness of $name..."
      if nc -z 127.0.0.1 "$port"; then
        break
      fi

      # Charge the budget only after a failed probe, so that even a
      # timeout of 1 performs one real check before giving up.
      if [ "$bounded" = "true" ]; then
        remaining=$((remaining - 1))
        if [ "$remaining" -le 0 ]; then
          echo "[ERROR] Timeout while waiting for $name to be ready."
          exit 1
        fi
      fi

      sleep 1
    done
  }
  #######################################
  # Create a directory (including missing parents) with mode 755.
  # Arguments:
  #   $1 - directory path
  # Returns:
  #   0 when the directory exists and its mode was set; otherwise the
  #   failing command's status.
  #######################################
  function createDir() {
    local dir="$1"
    # Only chmod after mkdir succeeded: if the path is an existing regular
    # file, mkdir -p fails and the file's permissions must stay untouched.
    mkdir -p -- "$dir" && chmod 755 -- "$dir"
  }
  #######################################
  # Block until a configuration file exists.
  # Polls once per second for the first 10 attempts, then backs off by
  # doubling the delay up to a cap of 30 seconds. Never gives up.
  # Arguments:
  #   $1 - component name, used only in log messages
  #   $2 - path of the file to wait for
  #   $3 - optional "true" to suppress all progress messages
  # Outputs:
  #   Progress messages on stdout unless silent mode is enabled.
  # Returns:
  #   0 once the file exists.
  #######################################
  function waitForConfig() {
    local name="$1"
    local path="$2"
    local silent_mode="${3:-false}"
    local attempts=0
    local sleep_time=1

    until [[ -f "$path" ]]; do
      # Plain assignment instead of ((attempts++)): the arithmetic command
      # returns status 1 when the old value is 0, which would abort any
      # caller running under `set -e` on the very first iteration.
      attempts=$((attempts + 1))

      if [[ "$attempts" -le 10 ]]; then
        if [[ "$silent_mode" != "true" ]]; then
          echo "Waiting for $name configuration from GPUStack..."
        fi
        sleep_time=1
      else
        if [[ "$silent_mode" != "true" ]]; then
          echo "$name configuration is still missing. This component may be disabled."
        fi
        # Exponential backoff, capped at 30 seconds.
        sleep_time=$((sleep_time * 2))
        if [[ "$sleep_time" -gt 30 ]]; then
          sleep_time=30
        fi
      fi

      sleep "$sleep_time"
    done

    return 0
  }
  #######################################
  # Finish handler for a required service: record a container exit code
  # and halt the container, whatever the reason the service stopped.
  # Arguments:
  #   $1 - service name, used only in log messages
  #   $2 - service exit code (256 means the service was killed by a signal)
  #   $3 - signal number, meaningful when $2 is 256
  # Outputs:
  #   [INFO] messages on stdout; may rewrite the container exit-code file.
  # Returns:
  #   Does not return: replaces the shell with the halt command.
  #######################################
  function handleServiceExit() {
    local svc="$1"
    local svc_code="$2"
    local svc_signal="$3"
    local result_file="/run/s6-linux-init-container-results/exitcode"
    local halt_cmd="/run/s6/basedir/bin/halt"
    local recorded_code=0
    local outcome

    # Pick up whatever container exit code has been stored so far.
    if [[ -f "${result_file}" ]]; then
      recorded_code=$(<"${result_file}")
    fi

    echo "[INFO] Service '${svc}' exited (code: ${svc_code}, signal: ${svc_signal})"

    if [[ "${svc_code}" -eq 256 ]]; then
      # Killed by a signal. SIGTERM (15) first resets the stored code to 0.
      if [[ "${svc_signal}" -eq 15 ]]; then
        echo 0 > "${result_file}"
      fi
      # Store 128 + signal when the code read at entry was 0.
      # NOTE(review): when that code was 0, the SIGTERM reset above is
      # immediately overwritten with 143 here — confirm this is intended.
      if [[ "${recorded_code}" -eq 0 ]]; then
        echo $((128 + svc_signal)) > "${result_file}"
      fi
      outcome="exited by signal, shutting down container..."
    elif [[ "${svc_code}" -ne 0 ]]; then
      # Plain failure: keep an earlier non-zero container code if present.
      if [[ "${recorded_code}" -eq 0 ]]; then
        echo "${svc_code}" > "${result_file}"
      fi
      outcome="exited with non-zero code, shutting down container..."
    else
      # Clean exit: nothing to record.
      outcome="exited normally."
    fi

    echo "[INFO] Service '${svc}' ${outcome}"
    exec "${halt_cmd}"
  }
  #######################################
  # Finish handler for an optional service: decide whether s6 should
  # restart the service, without ever halting the container.
  #
  # Exit status handed back to s6 by this finish script:
  #   0   - finish succeeded; s6 restarts the service (longrun default)
  #   125 - tell s6 not to restart the service
  #
  # Arguments:
  #   $1 - service name, used only in log messages
  #   $2 - service exit code (256: killed by a signal, 0: normal exit,
  #        anything else: crash or error)
  #   $3 - signal number, meaningful when $2 is 256
  # Outputs:
  #   Log messages on stderr.
  # Returns:
  #   Does not return: always exits the shell with 0 or 125.
  #######################################
  function handleOptionalServiceExit() {
    local svc="$1"
    local svc_code="$2"
    local svc_signal="$3"

    if [[ "${svc_code}" -eq 256 ]]; then
      if [[ "${svc_signal}" -eq 15 ]]; then
        # SIGTERM: the container is shutting down, so do not restart.
        echo "[INFO] Service '${svc}' received SIGTERM (container shutting down)" >&2
        echo "[INFO] Service '${svc}' will not be restarted." >&2
        exit 125
      fi

      # Any other signal (SIGKILL, SIGHUP, ...): let s6 restart it.
      echo "[WARN] Service '${svc}' exited by signal ${svc_signal} (exit code: ${svc_code})" >&2
      echo "[INFO] Service '${svc}' will be restarted by s6." >&2
      exit 0
    fi

    if [[ "${svc_code}" -ne 0 ]]; then
      # Crash or error: let s6 restart it.
      echo "[ERROR] Service '${svc}' exited with non-zero code: ${svc_code}" >&2
      echo "[INFO] Service '${svc}' will be restarted by s6." >&2
      exit 0
    fi

    # Clean exit (code 0, no signal): leave the service stopped.
    echo "[INFO] Service '${svc}' exited normally (code: ${svc_code}, signal: ${svc_signal})" >&2
    echo "[INFO] Service '${svc}' will not be restarted." >&2
    exit 125
  }
  # Runtime directory shared by the services; an existing non-empty value
  # from the environment wins over the default.
  : "${GPUSTACK_RUN_DIR:=/run/gpustack}"
  export GPUSTACK_RUN_DIR
  createDir "${GPUSTACK_RUN_DIR}"

  # Per-component .env paths under the runtime directory. They are not read
  # in this file, only exported.
  # shellcheck disable=SC2034
  GPUSTACK_GATEWAY_CONFIG="${GPUSTACK_RUN_DIR}/gateway/.env"
  # shellcheck disable=SC2034
  GPUSTACK_POSTGRES_CONFIG="${GPUSTACK_RUN_DIR}/postgresql/.env"
  # shellcheck disable=SC2034
  GPUSTACK_OBSERVABILITY_CONFIG="${GPUSTACK_RUN_DIR}/observability/.env"
  export GPUSTACK_GATEWAY_CONFIG GPUSTACK_POSTGRES_CONFIG GPUSTACK_OBSERVABILITY_CONFIG