Dockerfile 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749
  1. # Package logic:
  2. # 1. base target:
  3. # - Install tools.
  4. # - Upgrade GCC if needed.
  5. # - Install C buildkit.
  6. # - Upgrade Python if needed.
  7. # - Install Python buildkit.
  8. # - Install Platform toolkit.
  9. # - Install S6-overlay.
  10. # 2. gpustack target.
  11. # - Install PostgreSQL.
  12. # - Install Higress standalone components.
  13. # - Install gpustack package from the mounted source code.
  14. # - Setup entrypoint to gpustack command.
  15. # Argument usage:
  16. # - PYTHON_VERSION: Version of Python to use.
  17. # - GPUSTACK_BASE_IMAGE: Base image for the gpustack stage.
  # - GPUSTACK_RUNTIME_ROCM_VERSION: Version of ROCm detection library for gpustack-runtime, update this if project dependencies have changed.
  19. # - GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS: Semicolon-separated list of labels to filter mirrored images when deploying mirrored deployment.
  20. # - HIGRESS_VERSION: Version of Higress to use.
  # - HIGRESS_APISERVER_VERSION: Version of Higress API server to use.
  # - PROMETHEUS_VERSION: Version of Prometheus to use.
  # - GRAFANA_VERSION: Version of Grafana to use.
  22. ARG PYTHON_VERSION=3.11
  23. ARG GPUSTACK_BASE_IMAGE=base
  24. ARG GPUSTACK_RUNTIME_ROCM_VERSION=7.0.2
  25. ARG GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS
  26. ARG HIGRESS_VERSION=2.1.9
  27. ARG HIGRESS_APISERVER_VERSION=0.0.26
  28. ARG PROMETHEUS_VERSION=3.5.1
  29. ARG GRAFANA_VERSION=12.2.4
  30. # Stage Base
  31. #
  32. # Example build command:
  33. # docker build --tag=gpustack/gpustack:base --file=pack/Dockerfile --target=base --progress=plain .
  34. #
  35. FROM gpustack/mirrored-higress-api-server:${HIGRESS_APISERVER_VERSION} AS apiserver
  36. FROM gpustack/mirrored-higress-higress:${HIGRESS_VERSION} AS controller
  37. FROM gpustack/mirrored-higress-pilot:${HIGRESS_VERSION} AS pilot
  38. FROM gpustack/mirrored-higress-gateway:${HIGRESS_VERSION} AS gateway
  39. FROM ubuntu:24.04@sha256:d1e2e92c075e5ca139d51a140fff46f84315c0fdce203eab2807c7e495eff4f9 AS base
  40. SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
  41. ARG TARGETPLATFORM
  42. ARG TARGETOS
  43. ARG TARGETARCH
  44. ## Install Tools
  45. ENV DEBIAN_FRONTEND=noninteractive \
  46. LANG='en_US.UTF-8' \
  47. LANGUAGE='en_US:en' \
  48. LC_ALL='en_US.UTF-8'
  RUN <<EOF
    # Tools
    #
    # Installs the APT helpers, registers the ubuntu-toolchain-r PPA, then pulls
    # in the general-purpose CLI tooling, the en_US.UTF-8 locale and the
    # Asia/Shanghai timezone.
    # NOTE: /var/lib/apt/lists is left in place on purpose; the following RUN
    # steps call "apt-get install" without refreshing the package index.

    # Refresh
    bootstrap_pkgs=(
        software-properties-common apt-transport-https
        ca-certificates gnupg2 lsb-release gnupg-agent
    )
    apt-get update -y \
        && apt-get install -y --no-install-recommends "${bootstrap_pkgs[@]}" \
        && apt-get update -y \
        && add-apt-repository -y ppa:ubuntu-toolchain-r/test \
        && apt-get update -y

    # Install
    tool_pkgs=(
        ca-certificates build-essential binutils bash openssl
        curl wget aria2
        git git-lfs
        unzip xz-utils
        tzdata locales
        iproute2 iputils-ping ifstat net-tools dnsutils pciutils ipmitool
        procps sysstat htop
        vim jq bc tree
        logrotate cron netcat-openbsd
    )
    apt-get install -y --no-install-recommends "${tool_pkgs[@]}"

    # Update locale
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

    # Update timezone
    zone="Asia/Shanghai"
    rm -f /etc/localtime \
        && ln -sf "/usr/share/zoneinfo/${zone}" /etc/localtime \
        && echo "${zone}" > /etc/timezone \
        && dpkg-reconfigure --frontend noninteractive tzdata

    # Cleanup
    rm -rf /var/tmp/* && rm -rf /tmp/* && rm -rf /var/cache/apt
EOF
  81. ## Upgrade GCC if needed
  RUN <<EOF
    # GCC
    #
    # Only Ubuntu releases older than 21.04 get GCC 11; newer releases leave
    # this step immediately.
    source /etc/os-release
    if (( $(echo "${VERSION_ID} >= 21.04" | bc -l) )); then
        echo "Skipping GCC upgrade for ${VERSION_ID}..."
        exit 0
    fi

    # Install
    apt-get install -y --no-install-recommends \
        gcc-11 g++-11 gfortran-11 gfortran

    # Update alternatives: drop any existing group for the tool, then register
    # the "-11" binary as its only candidate (priority 10).
    for tool in gcov-dump lto-dump gcov gcc gcc-nm cpp g++ gcc-ar gcov-tool gcc-ranlib gfortran; do
        if [[ -f "/etc/alternatives/${tool}" ]]; then
            update-alternatives --remove-all "${tool}"
        fi
        update-alternatives --install "/usr/bin/${tool}" "${tool}" "/usr/bin/${tool}-11" 10
    done

    # Cleanup
    rm -rf /var/tmp/* && rm -rf /tmp/* && rm -rf /var/cache/apt
EOF
  110. ## Install C buildkit
  RUN <<EOF
    # C buildkit
    #
    # Build drivers come from APT; CMake 3.31.7 is unpacked from the upstream
    # release tarball directly over /usr for the current machine architecture.

    # Install
    build_tools=(make ninja-build pkg-config ccache)
    apt-get install -y --no-install-recommends "${build_tools[@]}"

    cmake_url="https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-$(uname -m).tar.gz"
    curl --retry 3 --retry-connrefused -fL "${cmake_url}" | tar -zx -C /usr --strip-components 1

    # Install dependencies
    dev_libs=(
        openssl libssl-dev
        zlib1g zlib1g-dev libbz2-dev libffi-dev
        lzma lzma-dev uuid-dev liblzma-dev
        ffmpeg libjpeg-dev libpng-dev libtiff-dev libwebp-dev
        libsqlite3-dev
        libxml2 libxslt1-dev
        libnuma1 libnuma-dev
        libgpgme-dev libassuan-dev libbtrfs-dev libdevmapper-dev
        libjemalloc-dev
    )
    apt-get install -y --no-install-recommends "${dev_libs[@]}"

    # Cleanup
    rm -rf /var/tmp/* && rm -rf /tmp/* && rm -rf /var/cache/apt
EOF
  133. ## Upgrade Python if needed
  134. ARG PYTHON_VERSION
  135. ENV PYTHON_VERSION=${PYTHON_VERSION}
  RUN <<EOF
    # Python
    #
    # Keeps the distribution interpreter when it already matches
    # PYTHON_VERSION; otherwise installs the requested version from the
    # deadsnakes PPA, makes it the default python/python3 and bootstraps pip.

    # version_lte A B: succeeds when dotted version A sorts at or before B.
    # Versions are compared component-wise ("sort -V"), not as decimals, so
    # 3.9 is correctly treated as older than 3.11.
    version_lte() {
        [[ "$(printf '%s\n' "$1" "$2" | sort -V | head -n 1)" == "$1" ]]
    }

    # Major.minor of the interpreter currently on PATH (empty when absent).
    current_python="$(python3 --version 2>/dev/null | cut -d' ' -f2 | cut -d'.' -f1,2 || true)"

    if [[ "${current_python}" == "${PYTHON_VERSION}" ]]; then
        echo "Skipping Python upgrade for ${PYTHON_VERSION}..."
        # Register the interpreter's library directories with the dynamic
        # linker when libpython is not already in the ldconfig cache.
        if [[ -z "$(ldconfig -v 2>/dev/null | grep libpython${PYTHON_VERSION})" ]]; then
            PYTHON_LIB_PREFIX=$(python3 -c "import sys; print(sys.base_prefix);")
            echo "${PYTHON_LIB_PREFIX}/lib" >> /etc/ld.so.conf.d/python3.conf
            echo "${PYTHON_LIB_PREFIX}/lib64" >> /etc/ld.so.conf.d/python3.conf
            ldconfig -v
        fi
        exit 0
    fi

    # Add deadsnakes PPA for Python versions.
    # Retry up to three times, and stop the build when every attempt failed
    # instead of continuing without the repository.
    ppa_added="false"
    for attempt in 1 2 3; do
        if add-apt-repository -y ppa:deadsnakes/ppa; then
            ppa_added="true"
            break
        fi
        echo "Attempt ${attempt} failed, retrying in 5s..."
        sleep 5
    done
    if [[ "${ppa_added}" != "true" ]]; then
        echo "Failed to add ppa:deadsnakes/ppa after 3 attempts." >&2
        exit 1
    fi
    apt-get update -y

    # Install
    apt-get install -y --no-install-recommends \
        "python${PYTHON_VERSION}" \
        "python${PYTHON_VERSION}-dev" \
        "python${PYTHON_VERSION}-venv" \
        "python${PYTHON_VERSION}-lib2to3" \
        "python${PYTHON_VERSION}-gdbm" \
        "python${PYTHON_VERSION}-tk"
    # The distutils package is only requested for Python 3.11 and older.
    if version_lte "${PYTHON_VERSION}" "3.11"; then
        apt-get install -y --no-install-recommends \
            "python${PYTHON_VERSION}-distutils"
    fi

    # reset_alternative NAME: drop an existing alternatives group, if any.
    reset_alternative() {
        if [[ -f "/etc/alternatives/$1" ]]; then
            update-alternatives --remove-all "$1"
        fi
    }

    # Update alternatives
    reset_alternative python3
    update-alternatives --install /usr/bin/python3 python3 "/usr/bin/python${PYTHON_VERSION}" 1
    reset_alternative python
    update-alternatives --install /usr/bin/python python "/usr/bin/python${PYTHON_VERSION}" 1

    # -f makes curl fail on an HTTP error instead of piping an error page
    # into the interpreter.
    curl -fsSL "https://bootstrap.pypa.io/get-pip.py" | "python${PYTHON_VERSION}"

    # The remaining helpers are optional: a missing target must not fail the build.
    # NOTE(review): "2to3${PYTHON_VERSION}" expands to e.g. /usr/bin/2to33.11;
    # confirm whether "2to3-${PYTHON_VERSION}" was intended.
    reset_alternative 2to3
    update-alternatives --install /usr/bin/2to3 2to3 "/usr/bin/2to3${PYTHON_VERSION}" 1 || true
    reset_alternative pydoc3
    update-alternatives --install /usr/bin/pydoc3 pydoc3 "/usr/bin/pydoc${PYTHON_VERSION}" 1 || true
    reset_alternative idle3
    update-alternatives --install /usr/bin/idle3 idle3 "/usr/bin/idle${PYTHON_VERSION}" 1 || true
    reset_alternative python3-config
    update-alternatives --install /usr/bin/python3-config python3-config "/usr/bin/python${PYTHON_VERSION}-config" 1 || true

    # Cleanup
    rm -rf /var/tmp/* \
        && rm -rf /tmp/* \
        && rm -rf /var/cache/apt
EOF
  178. ## Install Python buildkit
  179. ENV PIP_NO_CACHE_DIR=1 \
  180. PIP_DISABLE_PIP_VERSION_CHECK=1 \
  181. PIP_ROOT_USER_ACTION=ignore \
  182. PIPX_HOME=/root/.local/share/pipx \
  183. PIPX_LOCAL_VENVS=/root/.local/share/pipx/venvs \
  184. POETRY_NO_CACHE=1 \
  185. UV_NO_CACHE=1 \
  186. UV_HTTP_TIMEOUT=500 \
  187. UV_INDEX_STRATEGY="unsafe-best-match"
  RUN <<EOF
    # Buildkit
    #
    # Writes the Python build tooling requirements (one specifier per line) to
    # a scratch file and installs them with pip.
    printf '%s\n' \
        'build' \
        'cmake<4' \
        'ninja<1.11' \
        'setuptools<80' \
        'setuptools-scm' \
        'packaging<25' \
        'wheel' \
        'pybind11<3' \
        'Cython' \
        'psutil' \
        'pipx' \
        'uv' \
        'yq' \
        'hatchling' \
        'py-spy' \
        'poetry' \
        > /tmp/requirements.txt
    pip install -r /tmp/requirements.txt

    # Cleanup
    rm -rf /var/tmp/* && rm -rf /tmp/*
EOF
  213. ## Install s6-overlay
  ARG S6_OVERLAY_VERSION=3.2.1.0
  # Docker reports the CPU variant (v6, v7, ...) in TARGETVARIANT; TARGETARCH
  # alone is only "arm" for 32-bit ARM platforms.
  ARG TARGETVARIANT
  RUN <<EOF
    # s6-overlay
    #
    # Downloads the noarch and the architecture-specific s6-overlay tarballs
    # for S6_OVERLAY_VERSION and unpacks both over /.
    set -eux

    # Combine architecture and variant ("arm" + "v7" -> "arm/v7") so the
    # 32-bit ARM arms below can actually match.
    platform_arch="${TARGETARCH}${TARGETVARIANT:+/${TARGETVARIANT}}"
    case "${platform_arch}" in
        amd64 | amd64/*) S6_ARCH="x86_64" ;;
        arm64 | arm64/*) S6_ARCH="aarch64" ;;
        arm/v7) S6_ARCH="armhf" ;;
        arm/v6) S6_ARCH="arm" ;;
        *)
            echo >&2 "⚠️ Warning: Unknown TARGETARCH='${TARGETARCH}', defaulting to x86_64"
            S6_ARCH="x86_64"
            ;;
    esac
    echo "Installing s6-overlay ${S6_OVERLAY_VERSION} for arch: ${S6_ARCH} (from TARGETARCH=${TARGETARCH})"

    base_url="https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}"
    # Kept as plain statements (not an "&&" list) so "set -e" stops the build
    # at the first failed download.
    for pkg in noarch "${S6_ARCH}"; do
        wget -q -O "/tmp/s6-overlay-${pkg}.tar.xz" "${base_url}/s6-overlay-${pkg}.tar.xz"
    done

    echo "📦 Extracting s6-overlay ..."
    tar -C / -Jxpf /tmp/s6-overlay-noarch.tar.xz
    tar -C / -Jxpf "/tmp/s6-overlay-${S6_ARCH}.tar.xz"
    rm -f /tmp/s6-overlay-*.tar.xz
    echo "Installed s6-overlay ${S6_OVERLAY_VERSION} successfully."
EOF
  236. ENV S6_KEEP_ENV=1 \
  237. S6_BEHAVIOUR_IF_STAGE2_FAILS=2 \
  238. S6_SERVICES_GRACETIME=3000 \
  239. S6_KILL_GRACETIME=3000 \
  240. S6_VERBOSITY=1 \
  241. S6_CMD_WAIT_FOR_SERVICES=1
  242. #
  243. # Stage GPUStack
  244. #
  245. # Example build command:
  246. # docker build --tag=gpustack/gpustack:main --file=pack/Dockerfile --progress=plain .
  247. #
  248. # Vendor ROCm libraries from ROCm base image,
  249. # now only linux/amd64 is supported.
  250. # Must build on linux/amd64 platform.
  251. FROM --platform=${BUILDPLATFORM} rocm/dev-ubuntu-22.04:${GPUSTACK_RUNTIME_ROCM_VERSION} AS rocm-base
  252. FROM ${GPUSTACK_BASE_IMAGE} AS gpustack
  253. SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
  254. ARG TARGETPLATFORM
  255. ARG TARGETOS
  256. ARG TARGETARCH
  257. ## Configure data volume
  258. VOLUME /var/lib/gpustack
  259. ## Install PostgreSQL
  260. ENV PGCONFIG_FILE=/etc/postgresql/main/postgresql.conf \
  261. POSTGRES_DB=gpustack
  RUN <<EOF
    # Create the postgres system account (uid/gid 9999) together with its home
    # directory before the PostgreSQL packages are installed.
    set -eux

    pg_home="/var/lib/postgresql"
    groupadd -r postgres --gid=9999
    useradd -r -g postgres --uid=9999 --home-dir="${pg_home}" --shell=/bin/bash postgres
    mkdir -p "${pg_home}"
    chown -R postgres:postgres "${pg_home}"
EOF
  RUN <<EOF
    # PostgreSQL
    #
    # Registers the PGDG APT repository, installs PostgreSQL 17 and gosu,
    # exposes version-independent paths and writes the access/logging config.
    # Every step is its own statement: inside an "&&" list "set -e" ignores a
    # failure of any command but the last one.
    set -eux

    # Add PostgreSQL APT repository
    wget -O /tmp/ACCC4CF8.asc https://www.postgresql.org/media/keys/ACCC4CF8.asc
    gpg --dearmor /tmp/ACCC4CF8.asc
    mv /tmp/ACCC4CF8.asc.gpg /usr/share/keyrings/postgresql-archive-keyring.gpg
    echo "deb [signed-by=/usr/share/keyrings/postgresql-archive-keyring.gpg] http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list

    # Install
    apt-get update -y
    apt-get install -y --no-install-recommends \
        postgresql-17 \
        gosu

    # Create symlinks for PostgreSQL 17 to simplify usage
    ln -s /usr/lib/postgresql/17/bin /usr/lib/postgresql/bin
    ln -s /etc/postgresql/17/main /etc/postgresql/main
    ln -s /var/lib/postgresql/17/main /var/lib/postgresql/main
    # Expose every PostgreSQL binary on the default PATH.
    for pg_bin in /usr/lib/postgresql/bin/*; do
        ln -sf "${pg_bin}" "/usr/bin/${pg_bin##*/}"
    done

    # Access configuration. The redirections are performed by this (root)
    # shell, so wrapping "echo" in gosu would not change who writes the files.
    echo "listen_addresses='*'" >> "$PGCONFIG_FILE"
    printf '%s\n' \
        "local all postgres peer" \
        "host all root 127.0.0.1/32 trust" \
        "host all root ::1/128 trust" \
        "host all all 0.0.0.0/0 scram-sha-256" \
        > /etc/postgresql/main/pg_hba.conf

    # Comment out the packaged data_directory/hba_file settings and switch on
    # the logging options; sed runs as postgres so the rewritten file keeps
    # that owner.
    gosu postgres sed -i \
        -e "s/^data_directory/#data_directory/" \
        -e "s/^hba_file/#hba_file/" \
        -e "s/^#log_destination/log_destination/" \
        -e "s/^#log_min_messages = warning/log_min_messages = info/" \
        -e "s/^#logging_collector = off/logging_collector = on/" \
        -e "s/^#log_filename/log_filename/" \
        -e "s/^#log_rotation_size/log_rotation_size/" \
        "$PGCONFIG_FILE"

    # Cleanup
    rm -rf /var/tmp/*
    rm -rf /tmp/*
    rm -rf /var/cache/apt
EOF
  300. ## Install Higress standalone components
  RUN --mount=type=bind,from=apiserver,source=/apiserver,dst=/mnt/apiserver,rw \
      --mount=type=bind,from=controller,source=/usr/local/bin/higress,dst=/mnt/higress,rw \
      --mount=type=bind,from=pilot,source=/usr/local/bin,dst=/mnt/pilot,rw \
      --mount=type=bind,from=gateway,source=/,dst=/mnt/gateway,rw <<EOF
    # Prepare Higress standalone components
    #
    # Copies the binaries and Envoy assets out of the mounted Higress images
    # (apiserver, controller, pilot, gateway) into this image.
    set -eux

    bin_dir="/usr/local/bin"
    envoy_dir="/var/lib/istio/envoy"
    gateway_root="/mnt/gateway"

    # Install API server
    cp /mnt/apiserver "${bin_dir}/apiserver"

    # Install controller
    cp /mnt/higress "${bin_dir}/higress"

    # Install pilot
    cp /mnt/pilot/pilot-discovery "${bin_dir}/pilot-discovery"
    cp /mnt/pilot/higress-pilot-start.sh "${bin_dir}/higress-pilot-start.sh"

    # Install gateway
    mkdir -p "${envoy_dir}/"
    cp "${gateway_root}${envoy_dir}"/*.json "${envoy_dir}/"
    cp "${gateway_root}${envoy_dir}"/*.so "${envoy_dir}/"
    cp "${gateway_root}${bin_dir}/pilot-agent" "${bin_dir}/pilot-agent"
    cp "${gateway_root}${bin_dir}/envoy" "${bin_dir}/envoy"

    # supercronic ships with an architecture suffix; a relative symlink next
    # to it provides the plain "supercronic" name.
    supercronic_bin="supercronic-linux-${TARGETARCH}"
    cp "${gateway_root}${bin_dir}/${supercronic_bin}" "${bin_dir}/"
    ln -s "${supercronic_bin}" supercronic && mv supercronic "${bin_dir}/"
EOF
  323. # Initialize configurations
  324. COPY pack/rootfs/ /
  325. COPY docker-compose/grafana/grafana_dashboards/ /etc/dashboards/
  326. ## END Install Higress standalone components
  327. ## Install Skopeo
  328. ARG GOPROXY="https://proxy.golang.org,direct"
  RUN <<EOF
    # Skopeo
    #
    # Builds Skopeo v1.20.0 from source with a temporary Go 1.23.3 toolchain,
    # installs the binary under /usr, writes a permissive image policy and
    # removes the toolchain again.

    # Install Go
    curl --retry 3 --retry-connrefused -fL "https://go.dev/dl/go1.23.3.${TARGETOS}-${TARGETARCH}.tar.gz" | tar -zx -C /usr/local
    export PATH="/usr/local/go/bin:${PATH}"
    export GOPROXY="${GOPROXY}"
    # Show the toolchain in the build log. The whole environment is no longer
    # dumped, since it can contain proxy settings or other sensitive build
    # arguments.
    go version

    # Download
    git -C /tmp clone --recursive --shallow-submodules \
        --depth 1 --branch v1.20.0 --single-branch \
        https://github.com/containers/skopeo.git skopeo

    # Build and install.
    # Separate statements let the shell's errexit stop at the first failing
    # step; in an "&&" list only the last command is checked.
    cd /tmp/skopeo
    # Drop the Makefile's own GOPROXY export so the value exported above wins.
    sed -i "/export GOPROXY=.*/d" Makefile
    make vendor
    DISABLE_DOCS=1 PREFIX=/usr make install-binary

    # Configure Skopeo to allow insecure registries.
    mkdir -p /etc/containers
    cat <<EOT > /etc/containers/policy.json
{
    "default": [
        {
            "type": "insecureAcceptAnything"
        }
    ]
}
EOT

    # Review
    skopeo --version

    # Cleanup go
    go clean -cache -modcache -testcache
    rm -rf /usr/local/go
    rm -rf /root/.cache/go-build

    # Cleanup
    rm -rf /var/tmp/*
    rm -rf /tmp/*
    rm -rf /var/cache/apt
EOF
  367. ## Install Prometheus and Grafana
  368. ARG PROMETHEUS_VERSION
  369. ARG GRAFANA_VERSION
  RUN <<EOF
    # Prometheus and Grafana
    #
    # Unpacks the upstream release tarballs under /opt, links the main
    # binaries into /usr/local/bin and keeps Grafana's sample configuration
    # under /etc/grafana.
    set -eux

    apt-get update -y && apt-get install -y --no-install-recommends fontconfig

    # Both projects name their amd64/arm64 artifacts after Docker's TARGETARCH.
    case "${TARGETARCH}" in
        amd64 | arm64)
            PROM_ARCH="${TARGETARCH}"
            GRAFANA_ARCH="${TARGETARCH}"
            ;;
        *)
            echo "Unsupported TARGETARCH: ${TARGETARCH}"
            exit 1
            ;;
    esac

    # Prometheus
    prom_release="prometheus-${PROMETHEUS_VERSION}.linux-${PROM_ARCH}"
    curl --retry 3 --retry-connrefused -fL \
        "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${prom_release}.tar.gz" \
        | tar -zx -C /opt
    mv "/opt/${prom_release}" /opt/prometheus
    for prom_bin in prometheus promtool; do
        ln -s "/opt/prometheus/${prom_bin}" "/usr/local/bin/${prom_bin}"
    done

    # Grafana
    grafana_release="grafana-${GRAFANA_VERSION}"
    curl --retry 3 --retry-connrefused -fL \
        "https://dl.grafana.com/oss/release/${grafana_release}.linux-${GRAFANA_ARCH}.tar.gz" \
        | tar -zx -C /opt
    mv "/opt/${grafana_release}" /opt/grafana
    for grafana_bin in grafana-server grafana-cli; do
        ln -s "/opt/grafana/bin/${grafana_bin}" "/usr/local/bin/${grafana_bin}"
    done
    mkdir -p /etc/grafana
    cp /opt/grafana/conf/sample.ini /etc/grafana/grafana.ini.sample

    # Cleanup
    rm -rf /var/tmp/* && rm -rf /tmp/* && rm -rf /var/cache/apt
EOF
  406. ## Install GPUStack
  RUN --mount=type=cache,target=/root/.cache \
      --mount=type=bind,target=/workspace/gpustack,rw <<EOF
    # Install GPUStack
    #
    # Builds the wheel from the bind-mounted source tree, installs it with the
    # extras available for the target architecture, downloads the bundled
    # tools and refreshes the PCI ID database on a best-effort basis.

    # The cache mount above makes caching worthwhile for this step only.
    export POETRY_NO_CACHE=0
    export UV_NO_CACHE=0
    export UV_SYSTEM_PYTHON=1
    export UV_LINK_MODE=copy

    # Build GPUStack
    cd /workspace/gpustack
    git config --global --add safe.directory /workspace/gpustack
    make build

    # Install GPUStack.
    # FIXME: There is no linux/arm64 vLLM prebuilt wheel,
    # so we only install the all wheel for linux/amd64.
    if [[ "${TARGETARCH}" == "amd64" ]]; then
        wheel_extras="all"
    else
        wheel_extras="audio"
    fi
    # Locate the built wheel(s) with a glob instead of parsing "ls" output.
    shopt -s nullglob
    wheels=(/workspace/gpustack/dist/*.whl)
    shopt -u nullglob
    if (( ${#wheels[@]} == 0 )); then
        echo "No wheel found in /workspace/gpustack/dist after 'make build'." >&2
        exit 1
    fi
    # Append "[<extras>]" to each wheel path; quoted so the brackets are never
    # treated as a glob pattern.
    uv pip install --no-build-isolation --extra-index-url https://download.pytorch.org/whl/cpu/ \
        "${wheels[@]/%/[${wheel_extras}]}"

    # Download tools
    gpustack download-tools
    tree -hs "$(pip show gpustack | grep Location: | head -n 1 | cut -d" " -f 2)/gpustack/third_party"

    # Set up environment
    mkdir -p /var/lib/gpustack
    chmod -R 0755 /var/lib/gpustack

    # Review
    uv pip tree \
        --package gpustack
    gpustack version

    # Try to update PCI IDs.
    # The fallback download goes to a scratch file and only replaces the
    # shipped database after a successful transfer, so an HTTP error page can
    # never end up in /usr/share/misc/pci.ids. Failure stays non-fatal.
    if ! update-pciids; then
        if ! { curl -fL -o /tmp/pci.ids https://pci-ids.ucw.cz/v2.2/pci.ids \
                && mv /tmp/pci.ids /usr/share/misc/pci.ids; }; then
            echo "Warning: unable to refresh PCI IDs, keeping the existing database." >&2
        fi
    fi

    # Cleanup
    rm -rf /var/tmp/*
    rm -rf /tmp/*
    rm -rf /workspace/gpustack/dist
EOF
  447. ## Entrypoint
  ## Activate detection of all AMD devices,
  449. ## works with (default) ROCm container runtime and privileged mode.
  450. ## See https://rocm.docs.amd.com/projects/amdsmi/en/latest/reference/amdsmi-py-api.html.
  451. ## Runs:
  452. ## - With container runtime installed:
  ## + If installed AMD container runtime as default runtime, try with:
  ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged ...
  ## + If there are multiple container runtimes installed, try with:
  456. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged --runtime amd ...
  457. ## + If failed to detect devices' name, try with:
  458. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/share:/usr/share:ro ...
  459. ## + If want to detect the correct host ROCm version, try with:
  460. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /opt/rocm:/opt/rocm:ro ...
  461. ## + Disallowing privileged, try with:
  462. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock ...
  463. ## - Without container runtime installed:
  464. ## + Allowing privileged, try with:
  465. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /opt/rocm:/opt/rocm:ro ...
  466. ## + Disallowing privileged, try with:
  467. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro --group-add video -v /opt/rocm:/opt/rocm:ro ...
  RUN --mount=type=bind,from=rocm-base,source=/opt/rocm/share,target=/opt/rocm/share,rw <<EOF
    # Reinstall amd-smi from the sources mounted out of the ROCm base image.
    export UV_SYSTEM_PYTHON=1
    export UV_PRERELEASE=allow
    uv pip install --no-build-isolation /opt/rocm/share/amd_smi
    uv pip tree

    # Hack to avoid: Fail to open libdrm_amdgpu.so: libdrm_amdgpu.so: cannot open shared object file: No such file or directory
    #
    # Creates the unversioned libdrm_amdgpu.so symlink next to the versioned
    # library name. When the versioned library is absent, a placeholder file
    # is created for the duration of the link step and removed afterwards.
    lib_dir="/usr/lib/$(uname -m)-linux-gnu"
    versioned_lib="libdrm_amdgpu.so.1"
    unversioned_link="libdrm_amdgpu.so"

    placeholder=""
    if [[ ! -e "${lib_dir}/${versioned_lib}" ]]; then
        placeholder="${lib_dir}/${versioned_lib}"
        touch "${placeholder}"
    fi

    pushd "${lib_dir}" \
        && ln -sf "${versioned_lib}" "${unversioned_link}"

    if [[ -n "${placeholder}" ]]; then
        rm -f "${placeholder}"
    fi
EOF
  490. ENV AMD_VISIBLE_DEVICES="all" \
  491. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/opt/rocm"
  ## Activate detection of all Ascend devices,
  493. ## works with (default) Ascend container runtime and privileged mode.
  494. ## See https://gitcode.com/Ascend/mind-cluster/blob/master/component/ascend-common/devmanager/dcmi/dcmi_interface_api.h.
  495. ## Runs:
  496. ## - With container runtime installed:
  497. ## + If installed Ascend container runtime as default runtime, try with:
  498. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -e "ASCEND_VISIBLE_DEVICES=$(npu-smi info -m | tail -n 1 | awk '{print $1}')" ...
  ## + If there are multiple container runtimes installed, try with:
  500. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -e "ASCEND_VISIBLE_DEVICES=$(npu-smi info -m | tail -n 1 | awk '{print $1}')" --runtime ascend ...
  501. ## + If want to detect the correct host CANN version and SoC name, try with:
  502. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -e "ASCEND_VISIBLE_DEVICES=$(npu-smi info -m | tail -n 1 | awk '{print $1}')" -v /usr/local/Ascend/ascend-toolkit:/usr/local/Ascend/ascend-toolkit:ro ...
  503. ## + Disallowing privileged, try with:
  504. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -e "ASCEND_VISIBLE_DEVICES=$(npu-smi info -m | grep -v mcu | awk '{if(NR>1){print $1}}' | uniq | paste -sd ',')" ...
  505. ## - Without container runtime installed:
  506. ## + Allowing privileged, try with:
  507. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/local/dcmi:/usr/local/dcmi:ro -v /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro -v /etc/hccn.conf:/etc/hccn.conf:ro -v /etc/ascend_install.info:/etc/ascend_install.info:ro ...
  508. ## + Disallowing privileged, try with:
  509. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro -v /usr/local/dcmi:/usr/local/dcmi:ro -v /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro -v /etc/hccn.conf:/etc/hccn.conf:ro -v /etc/ascend_install.info:/etc/ascend_install.info:ro ...
  # Ascend toolkit location and driver/runtime library directories.
  # The previous LD_LIBRARY_PATH is appended only when it is set: an
  # unconditional ":${LD_LIBRARY_PATH}" leaves a trailing ":" when the variable
  # is unset, and the dynamic loader reads that empty entry as the current
  # working directory.
  ENV ASCEND_HOME_PATH="/usr/local/Ascend/ascend-toolkit/latest" \
      LD_LIBRARY_PATH="/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/ascend-toolkit/latest/runtime/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
      GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/usr/local/Ascend/ascend-toolkit;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
  ## Activate detection of all Cambricon devices,
  514. ## works with (default) Cambricon container runtime and privileged mode.
  515. ## See https://github.com/Cambricon/cambricon-k8s-device-plugin/blob/master/device-plugin/pkg/cndev/include/cndev.h,
  516. ## https://github.com/Cambricon/cambricon-k8s-device-plugin/blob/master/device-plugin/pkg/cntopo/include/cntopo.h.
  517. ## Runs:
  518. ## - With container runtime installed:
  519. ## [TODO, TBD]
  520. ## - Without container runtime installed:
  521. ## + Allowing privileged, try with:
  522. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/local/neuware:/usr/local/neuware:ro -v /usr/bin/cnmon:/usr/bin/cnmon ...
  523. ## + Disallowing privileged, try with:
  524. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro -v /usr/local/neuware:/usr/local/neuware:ro -v /usr/bin/cnmon:/usr/bin/cnmon ...
  525. ENV CAMBRICON_VISIBLE_DEVICES="all" \
  526. NEUWARE_HOME="/usr/local/neuware" \
  527. LD_LIBRARY_PATH="/usr/local/neuware/lib64:${LD_LIBRARY_PATH}" \
  528. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/usr/local/neuware;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
  ## Activate detection of all Hygon devices,
  530. ## works with (default) Hygon container runtime and privileged mode.
  531. ## See https://github.com/Project-HAMi/dcu-dcgm/blob/master/pkg/dcgm/include/rocm_smi.h.
  532. ## Runs:
  533. ## - With container runtime installed:
  534. ## [TODO, TBD]
  535. ## - Without container runtime installed:
  536. ## + Allowing privileged, try with:
  537. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /opt/hyhal:/opt/hyhal:ro -v /opt/dtk:/opt/dtk:ro -e ROCM_SMI_LIB_PATH=/opt/hyhal/lib -e ROCM_PATH=/opt/dtk ...
  538. ## + Disallowing privileged, try with:
  539. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro --group-add video -v /opt/hyhal:/opt/hyhal:ro -v /opt/dtk:/opt/dtk:ro -e ROCM_SMI_LIB_PATH=/opt/hyhal/lib -e ROCM_PATH=/opt/dtk ...
  540. ENV HYGON_VISIBLE_DEVICES="all" \
  541. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/opt/dtk;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
  ## Activate detection of all Iluvatar devices,
  543. ## works with (default) Iluvatar container runtime and privileged mode.
  544. ## See https://github.com/Deep-Spark/ix-container-toolkit.
  545. ## Runs:
  546. ## - With container runtime installed:
  547. ## + If installed Iluvatar container runtime as default runtime, try with:
  548. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/local/corex:/usr/local/corex:ro ...
  ## + If there are multiple container runtimes installed, try with:
  550. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged --runtime iluvatar -v /usr/local/corex:/usr/local/corex:ro ...
  551. ## + Disallowing privileged, try with:
  552. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -v /usr/local/corex:/usr/local/corex:ro ...
  553. ## - Without container runtime installed:
  554. ## + Allowing privileged, try with:
  555. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/local/corex:/usr/local/corex:ro ...
  556. ## + Disallowing privileged, try with:
  557. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro -v /usr/local/corex:/usr/local/corex:ro ...
  558. ENV IX_VISIBLE_DEVICES="all" \
  559. COREX_HOME="/usr/local/corex" \
  560. LD_LIBRARY_PATH="/usr/local/corex/lib64:${LD_LIBRARY_PATH}" \
  561. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/usr/local/corex;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
  ## Activate detection of all MetaX devices,
  563. ## works with (default) MetaX container runtime and privileged mode.
  564. ## See https://developer.metax-tech.com/api/client/document/preview/626/k8s/03_component.html#container-runtime.
  565. ## Runs:
  566. ## - With container runtime installed:
  567. ## [TODO, TBD]
  568. ## - Without container runtime installed:
  569. ## + Allowing privileged, try with:
  570. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /opt/mxdriver:/opt/mxdriver:ro -v /opt/maca:/opt/maca:ro ...
  571. ## + Disallowing privileged, try with:
  572. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro -v /opt/mxdriver:/opt/mxdriver:ro -v /opt/maca:/opt/maca:ro ...
  573. ENV LD_LIBRARY_PATH="/opt/maca/lib:/opt/mxdriver/lib:${LD_LIBRARY_PATH}" \
  574. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/opt/maca;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
  575. ## Activate detection of all MThreads devices,
  576. ## works with (default) MThreads container runtime and privileged mode.
  577. ## See https://docs.mthreads.com/cloud-native/cloud-native-doc-online/install_guide.
  578. ## Runs:
  579. ## - With container runtime installed:
  580. ## + If the MThreads container runtime is installed as the default runtime, try with:
  581. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged ...
  582. ## + If there are multiple container runtimes installed, try with:
  583. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged --runtime mthreads ...
  584. ## + Disallowing privileged, try with:
  585. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock ...
  586. ## - Without container runtime installed:
  587. ## [TODO, TBD]
  588. ENV MTHREADS_VISIBLE_DEVICES="all" \
  589. MTHREADS_DRIVER_CAPABILITIES="compute,utility"
  590. ## Activate detection of all NVIDIA devices,
  591. ## works with (default) NVIDIA container runtime and privileged mode.
  592. ## See https://docs.nvidia.com/deploy/nvml-api/nvml-api-reference.html#nvml-api-reference.
  593. ## Runs:
  594. ## - With container runtime installed:
  595. ## + If the NVIDIA container runtime is installed as the default runtime, try with:
  596. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged ...
  597. ## + If there are multiple container runtimes installed, try with:
  598. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged --runtime nvidia ...
  599. ## + Disallowing privileged, try with:
  600. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock ...
  601. ## - Without container runtime installed:
  602. ## [TODO, TBD]
  603. ENV NVIDIA_DISABLE_REQUIRE="true" \
  604. NVIDIA_VISIBLE_DEVICES="all" \
  605. NVIDIA_DRIVER_CAPABILITIES="compute,utility"
  606. ## Activate detection of all T-Head devices,
  607. ## works with (default) T-Head container runtime and privileged mode.
  608. ## See https://help.aliyun.com/document_detail/2996754.html.
  609. ## Runs:
  610. ## - With container runtime installed:
  611. ## [TODO, TBD]
  612. ## - Without container runtime installed:
  613. ## + Allowing privileged, try with:
  614. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -e GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY=CDI -v /var/run/cdi:/var/run/cdi --privileged -v /usr/local/PPU_SDK:/usr/local/PPU_SDK:ro ...
  615. ## + Disallowing privileged, try with:
  616. ## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -e GPUSTACK_RUNTIME_DOCKER_RESOURCE_INJECTION_POLICY=CDI -v /var/run/cdi:/var/run/cdi --security-opt seccomp=unconfined -v /dev:/dev:ro -v /usr/local/PPU_SDK:/usr/local/PPU_SDK:ro ...
  617. ENV PPU_HOME="/usr/local/PPU_SDK" \
  618. LD_LIBRARY_PATH="/usr/local/PPU_SDK/CUDA_SDK/lib64:/usr/local/PPU_SDK/lib:${LD_LIBRARY_PATH}" \
  619. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/usr/local/PPU_SDK;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
  620. ## Activate GPUStack runtime mirrored deployment mode.
  621. ## If you get an error like "Found multiple Containers with the same hostname ...",
  622. ## use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact container name.
  623. ##
  624. ARG GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS
  625. ## GPUSTACK_RUNTIME_LOG_EXCEPTION=false: Disable logging exceptions from gpustack-runtime.
  626. ## GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY=/var/run/cdi: Set CDI specs directory.
  627. ## GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT=true: Enable mirrored deployment mode.
  628. ## GPUSTACK_RUNTIME_DEPLOY_CORRECT_RUNNER_IMAGE=false: Disable auto correction of runner images.
  629. ## GPUSTACK_RUNTIME_DOCKER_IMAGE_NO_PULL_VISUALIZATION=true: Disable visualizing image pull progress; use simple logs instead.
  630. ## GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS: Configure filter labels for mirrored deployment.
  631. ## GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES: Declare volumes to be ignored during mirrored deployment.
  632. ENV GPUSTACK_RUNTIME_LOG_EXCEPTION="false" \
  633. GPUSTACK_RUNTIME_DEPLOY_CDI_SPECS_DIRECTORY="/var/run/cdi" \
  634. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT="true" \
  635. GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/var/run/cdi;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}" \
  636. GPUSTACK_RUNTIME_DEPLOY_CORRECT_RUNNER_IMAGE="false" \
  637. GPUSTACK_RUNTIME_DOCKER_IMAGE_NO_PULL_VISUALIZATION="true" \
  638. GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS="${GPUSTACK_RUNTIME_DOCKER_MIRRORED_NAME_FILTER_LABELS}"
  639. COPY --chmod=755 pack/entrypoint.sh /usr/bin/entrypoint.sh
  640. WORKDIR /
  641. ENTRYPOINT [ "/usr/bin/entrypoint.sh" ]