# worker-daemonset.yaml 8.1 KB
  {{- /*
  GPUStack worker DaemonSet.

  Rendered only when .Values.worker.enabled is truthy.

  Values read by this template:
    worker.enabled, worker.gpuVendor, worker.port, worker.metricsPort,
    worker.dataDir, worker.extraVolumeMounts, worker.extraVolumes,
    server.apiPort, image.pullPolicy

  Named templates used (defined outside this file):
    chart_labels, gpustack.image, gpustack.imageTag

  Templated string scalars are quoted so that an empty or all-digit expansion
  cannot be re-typed by the YAML parser. $vendor is read once and defaulted to
  "" so the vendor comparisons below never compare against a missing value.
  */ -}}
{{- if .Values.worker.enabled }}
{{- $vendor := .Values.worker.gpuVendor | default "" }}
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: "{{ .Release.Name }}-worker"
    {{- include "chart_labels" . | nindent 4 }}
  name: "{{ .Release.Name }}-worker"
  namespace: {{ .Release.Namespace | quote }}
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: "{{ .Release.Name }}-worker"
  template:
    metadata:
      labels:
        app: "{{ .Release.Name }}-worker"
    spec:
      containers:
        - name: "{{ .Release.Name }}-worker"
          image: "{{ include "gpustack.image" . }}:{{ include "gpustack.imageTag" . }}"
          imagePullPolicy: {{ .Values.image.pullPolicy | quote }}
          env:
            # In-cluster URL of the server Service of the same release.
            - name: GPUSTACK_SERVER_URL
              value: "http://{{ .Release.Name }}-server.{{ .Release.Namespace }}.svc:{{ .Values.server.apiPort }}"
            # The worker is named after the node it is scheduled on.
            - name: GPUSTACK_WORKER_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: GPUSTACK_RUNTIME_DEPLOY
              value: "Kubernetes"
            - name: GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT
              value: "true"
            # Downward-API fields: this pod's name and namespace, and the
            # node's name and host IP.
            - name: GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: GPUSTACK_RUNTIME_KUBERNETES_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: GPUSTACK_RUNTIME_KUBERNETES_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: GPUSTACK_WORKER_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            {{- if eq $vendor "hygon" }}
            # These paths match the hygon toolkit/driver mounts further down.
            - name: ROCM_PATH
              value: /opt/dtk
            - name: ROCM_SMI_LIB_PATH
              value: /opt/hyhal/lib
            {{- end }}
          envFrom:
            # Required: the pod cannot start without this Secret.
            # NOTE(review): the name is not release-prefixed; confirm it
            # matches the object created elsewhere in the chart.
            - secretRef:
                name: registration-token
                optional: false
            # Optional: extra worker settings, if the ConfigMap exists.
            - configMapRef:
                name: worker-config
                optional: true
          resources: {}
          # NOTE(review): the container runs fully privileged; presumably
          # needed for device access on the host - confirm.
          securityContext:
            allowPrivilegeEscalation: true
            capabilities: {}
            privileged: true
            readOnlyRootFilesystem: false
            runAsNonRoot: false
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - name: gpustack-data-dir
              mountPath: /var/lib/gpustack
            - name: cdi
              mountPath: /var/run/cdi
            - name: kubelet-device-plugins
              mountPath: /var/lib/kubelet/device-plugins
            # Vendor-specific host directories; each mount has a matching
            # hostPath volume of the same name in the volumes list below.
            {{- if eq $vendor "amd" }}
            - name: gpustack-amd-driver
              mountPath: /opt/rocm
              readOnly: true
            {{- end }}
            {{- if eq $vendor "ascend" }}
            - name: gpustack-ascend-driver
              mountPath: /usr/local/Ascend/driver
              readOnly: true
            - name: gpustack-ascend-toolkit
              mountPath: /usr/local/Ascend/ascend-toolkit
              readOnly: true
            {{- end }}
            {{- if eq $vendor "hygon" }}
            - name: gpustack-hygon-driver
              mountPath: /opt/hyhal
              readOnly: true
            - name: gpustack-hygon-toolkit
              mountPath: /opt/dtk
              readOnly: true
            {{- end }}
            {{- if eq $vendor "metax" }}
            - name: gpustack-metax-driver
              mountPath: /opt/mxdriver
              readOnly: true
            - name: gpustack-metax-toolkit
              mountPath: /opt/maca
              readOnly: true
            {{- end }}
            {{- if eq $vendor "iluvatar" }}
            - name: gpustack-iluvatar-toolkit
              mountPath: /usr/local/corex
              readOnly: true
            {{- end }}
            {{- if eq $vendor "cambricon" }}
            # NOTE(review): unlike the other vendor mounts this one is not
            # readOnly - confirm that is intentional.
            - name: gpustack-cambricon-bin
              mountPath: /usr/bin/cnmon
            - name: gpustack-cambricon-toolkit
              mountPath: /usr/local/neuware
              readOnly: true
            {{- end }}
            {{- if eq $vendor "thead" }}
            - name: gpustack-thead-toolkit
              mountPath: /usr/local/PPU_SDK
              readOnly: true
            {{- end }}
            {{- with .Values.worker.extraVolumeMounts }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          ports:
            - name: api
              containerPort: {{ .Values.worker.port | int }}
              protocol: TCP
            - name: metrics
              containerPort: {{ .Values.worker.metricsPort | int }}
              protocol: TCP
          # Both probes target the named "api" port declared above.
          readinessProbe:
            httpGet:
              path: /readyz
              port: api
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 2
            failureThreshold: 3
            successThreshold: 1
          livenessProbe:
            httpGet:
              path: /healthz
              port: api
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 2
            failureThreshold: 5
            successThreshold: 1
      volumes:
        # NOTE(review): every hostPath below uses a "...OrCreate" type, so a
        # missing path is created empty on the node instead of failing the
        # pod - confirm this is wanted for the vendor driver/toolkit paths.
        - name: gpustack-data-dir
          hostPath:
            path: {{ .Values.worker.dataDir | quote }}
            type: DirectoryOrCreate
        - name: cdi
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate
        - name: kubelet-device-plugins
          hostPath:
            path: /var/lib/kubelet/device-plugins
            type: DirectoryOrCreate
        {{- if eq $vendor "amd" }}
        - name: gpustack-amd-driver
          hostPath:
            path: /opt/rocm
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq $vendor "ascend" }}
        - name: gpustack-ascend-driver
          hostPath:
            path: /usr/local/Ascend/driver
            type: DirectoryOrCreate
        - name: gpustack-ascend-toolkit
          hostPath:
            path: /usr/local/Ascend/ascend-toolkit
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq $vendor "hygon" }}
        - name: gpustack-hygon-driver
          hostPath:
            path: /opt/hyhal
            type: DirectoryOrCreate
        - name: gpustack-hygon-toolkit
          hostPath:
            path: /opt/dtk
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq $vendor "metax" }}
        - name: gpustack-metax-driver
          hostPath:
            path: /opt/mxdriver
            type: DirectoryOrCreate
        - name: gpustack-metax-toolkit
          hostPath:
            path: /opt/maca
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq $vendor "iluvatar" }}
        - name: gpustack-iluvatar-toolkit
          hostPath:
            path: /usr/local/corex
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq $vendor "cambricon" }}
        - name: gpustack-cambricon-bin
          hostPath:
            path: /usr/bin/cnmon
            type: FileOrCreate
        - name: gpustack-cambricon-toolkit
          hostPath:
            path: /usr/local/neuware
            type: DirectoryOrCreate
        {{- end }}
        {{- if eq $vendor "thead" }}
        - name: gpustack-thead-toolkit
          hostPath:
            path: /usr/local/PPU_SDK
            type: DirectoryOrCreate
        {{- end }}
        {{- with .Values.worker.extraVolumes }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      hostNetwork: true
      hostIPC: true
      serviceAccountName: "{{ .Release.Name }}-worker"
      # ClusterFirstWithHostNet keeps cluster DNS resolution working while
      # hostNetwork is enabled (needed for the *.svc server URL above).
      dnsPolicy: ClusterFirstWithHostNet
      {{- if or (eq $vendor "nvidia") (eq $vendor "mthreads") }}
      # For these two vendors the RuntimeClass is named after the vendor.
      runtimeClassName: {{ $vendor | quote }}
      {{- end }}
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
{{- end }}