gpustack-worker.json 47 KB

  1. {
  2. "annotations": {
  3. "list": [
  4. {
  5. "builtIn": 1,
  6. "datasource": {
  7. "type": "datasource",
  8. "uid": "grafana"
  9. },
  10. "enable": true,
  11. "hide": true,
  12. "iconColor": "rgba(0, 211, 255, 1)",
  13. "name": "Annotations & Alerts",
  14. "type": "dashboard"
  15. }
  16. ]
  17. },
  18. "editable": true,
  19. "fiscalYearStartMonth": 0,
  20. "graphTooltip": 0,
  21. "id": 11,
  22. "links": [],
  23. "panels": [
  24. {
  25. "datasource": {
  26. "type": "prometheus",
  27. "uid": "prometheus"
  28. },
  29. "fieldConfig": {
  30. "defaults": {
  31. "color": {
  32. "mode": "thresholds"
  33. },
  34. "decimals": 0,
  35. "mappings": [],
  36. "thresholds": {
  37. "mode": "absolute",
  38. "steps": [
  39. {
  40. "color": "blue",
  41. "value": 0
  42. }
  43. ]
  44. }
  45. },
  46. "overrides": [
  47. {
  48. "matcher": {
  49. "id": "byName",
  50. "options": "Total Workers"
  51. },
  52. "properties": [
  53. {
  54. "id": "color",
  55. "value": {
  56. "fixedColor": "blue",
  57. "mode": "fixed"
  58. }
  59. }
  60. ]
  61. },
  62. {
  63. "matcher": {
  64. "id": "byName",
  65. "options": "Total GPUs"
  66. },
  67. "properties": [
  68. {
  69. "id": "color",
  70. "value": {
  71. "fixedColor": "yellow",
  72. "mode": "fixed"
  73. }
  74. }
  75. ]
  76. }
  77. ]
  78. },
  79. "gridPos": {
  80. "h": 4,
  81. "w": 9,
  82. "x": 0,
  83. "y": 0
  84. },
  85. "id": 74,
  86. "options": {
  87. "colorMode": "background",
  88. "graphMode": "none",
  89. "justifyMode": "auto",
  90. "orientation": "auto",
  91. "percentChangeColorMode": "standard",
  92. "reduceOptions": {
  93. "calcs": [
  94. "lastNotNull"
  95. ],
  96. "fields": "",
  97. "values": false
  98. },
  99. "showPercentChange": false,
  100. "textMode": "auto",
  101. "wideLayout": true
  102. },
  103. "pluginVersion": "12.2.0",
  104. "targets": [
  105. {
  106. "datasource": {
  107. "type": "prometheus",
  108. "uid": "${idc}"
  109. },
  110. "editorMode": "code",
  111. "expr": "count(gpustack:worker_node_os_info{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  112. "hide": false,
  113. "instant": false,
  114. "legendFormat": "Total Workers",
  115. "range": true,
  116. "refId": "E"
  117. },
  118. {
  119. "datasource": {
  120. "type": "prometheus",
  121. "uid": "${idc}"
  122. },
  123. "editorMode": "code",
  124. "expr": "count(gpustack:worker_node_gpu_cores{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  125. "instant": false,
  126. "legendFormat": "Total GPUs",
  127. "range": true,
  128. "refId": "A"
  129. }
  130. ],
  131. "title": "Summary",
  132. "type": "stat"
  133. },
  134. {
  135. "datasource": {
  136. "type": "prometheus",
  137. "uid": "prometheus"
  138. },
  139. "fieldConfig": {
  140. "defaults": {
  141. "color": {
  142. "mode": "thresholds"
  143. },
  144. "mappings": [],
  145. "max": 100,
  146. "min": 0,
  147. "thresholds": {
  148. "mode": "absolute",
  149. "steps": [
  150. {
  151. "color": "green",
  152. "value": 0
  153. },
  154. {
  155. "color": "#EAB839",
  156. "value": 60
  157. },
  158. {
  159. "color": "red",
  160. "value": 80
  161. }
  162. ]
  163. },
  164. "unit": "percent"
  165. },
  166. "overrides": []
  167. },
  168. "gridPos": {
  169. "h": 8,
  170. "w": 5,
  171. "x": 9,
  172. "y": 0
  173. },
  174. "id": 89,
  175. "options": {
  176. "minVizHeight": 75,
  177. "minVizWidth": 75,
  178. "orientation": "horizontal",
  179. "reduceOptions": {
  180. "calcs": [
  181. "lastNotNull"
  182. ],
  183. "fields": "",
  184. "values": false
  185. },
  186. "showThresholdLabels": false,
  187. "showThresholdMarkers": true,
  188. "sizing": "auto"
  189. },
  190. "pluginVersion": "12.2.0",
  191. "targets": [
  192. {
  193. "datasource": {
  194. "type": "prometheus",
  195. "uid": "${idc}"
  196. },
  197. "editorMode": "code",
  198. "expr": "avg(gpustack:worker_node_gpu_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  199. "instant": false,
  200. "legendFormat": "__auto",
  201. "range": true,
  202. "refId": "A"
  203. }
  204. ],
  205. "title": "Average GPU Utilization",
  206. "type": "gauge"
  207. },
  208. {
  209. "datasource": {
  210. "type": "prometheus",
  211. "uid": "prometheus"
  212. },
  213. "fieldConfig": {
  214. "defaults": {
  215. "color": {
  216. "mode": "thresholds"
  217. },
  218. "mappings": [],
  219. "max": 100,
  220. "min": 0,
  221. "thresholds": {
  222. "mode": "absolute",
  223. "steps": [
  224. {
  225. "color": "green",
  226. "value": 0
  227. },
  228. {
  229. "color": "#EAB839",
  230. "value": 60
  231. },
  232. {
  233. "color": "red",
  234. "value": 80
  235. }
  236. ]
  237. },
  238. "unit": "percent"
  239. },
  240. "overrides": []
  241. },
  242. "gridPos": {
  243. "h": 8,
  244. "w": 5,
  245. "x": 14,
  246. "y": 0
  247. },
  248. "id": 90,
  249. "options": {
  250. "minVizHeight": 75,
  251. "minVizWidth": 75,
  252. "orientation": "horizontal",
  253. "reduceOptions": {
  254. "calcs": [
  255. "lastNotNull"
  256. ],
  257. "fields": "",
  258. "values": false
  259. },
  260. "showThresholdLabels": false,
  261. "showThresholdMarkers": true,
  262. "sizing": "auto"
  263. },
  264. "pluginVersion": "12.2.0",
  265. "targets": [
  266. {
  267. "datasource": {
  268. "type": "prometheus",
  269. "uid": "${idc}"
  270. },
  271. "editorMode": "code",
  272. "expr": "avg(gpustack:worker_node_gram_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  273. "instant": false,
  274. "legendFormat": "__auto",
  275. "range": true,
  276. "refId": "A"
  277. }
  278. ],
  279. "title": "Average VRAM Utilization",
  280. "type": "gauge"
  281. },
  282. {
  283. "datasource": {
  284. "type": "prometheus",
  285. "uid": "prometheus"
  286. },
  287. "fieldConfig": {
  288. "defaults": {
  289. "color": {
  290. "mode": "thresholds"
  291. },
  292. "mappings": [],
  293. "max": 90,
  294. "min": 0,
  295. "thresholds": {
  296. "mode": "absolute",
  297. "steps": [
  298. {
  299. "color": "green",
  300. "value": 0
  301. },
  302. {
  303. "color": "#EAB839",
  304. "value": 60
  305. },
  306. {
  307. "color": "red",
  308. "value": 80
  309. }
  310. ]
  311. },
  312. "unit": "celsius"
  313. },
  314. "overrides": []
  315. },
  316. "gridPos": {
  317. "h": 8,
  318. "w": 5,
  319. "x": 19,
  320. "y": 0
  321. },
  322. "id": 82,
  323. "options": {
  324. "minVizHeight": 75,
  325. "minVizWidth": 75,
  326. "orientation": "horizontal",
  327. "reduceOptions": {
  328. "calcs": [
  329. "lastNotNull"
  330. ],
  331. "fields": "",
  332. "values": false
  333. },
  334. "showThresholdLabels": false,
  335. "showThresholdMarkers": true,
  336. "sizing": "auto"
  337. },
  338. "pluginVersion": "12.2.0",
  339. "targets": [
  340. {
  341. "datasource": {
  342. "type": "prometheus",
  343. "uid": "${idc}"
  344. },
  345. "editorMode": "code",
  346. "expr": "avg(gpustack:worker_node_gpu_temperature_celsius{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  347. "instant": false,
  348. "legendFormat": "__auto",
  349. "range": true,
  350. "refId": "A"
  351. }
  352. ],
  353. "title": "Average GPU Temperature",
  354. "type": "gauge"
  355. },
  356. {
  357. "datasource": {
  358. "type": "prometheus",
  359. "uid": "prometheus"
  360. },
  361. "fieldConfig": {
  362. "defaults": {
  363. "color": {
  364. "mode": "thresholds"
  365. },
  366. "mappings": [],
  367. "max": 90,
  368. "min": 0,
  369. "thresholds": {
  370. "mode": "absolute",
  371. "steps": [
  372. {
  373. "color": "green",
  374. "value": 0
  375. },
  376. {
  377. "color": "#EAB839",
  378. "value": 60
  379. },
  380. {
  381. "color": "red",
  382. "value": 80
  383. }
  384. ]
  385. },
  386. "unit": "bytes"
  387. },
  388. "overrides": []
  389. },
  390. "gridPos": {
  391. "h": 4,
  392. "w": 3,
  393. "x": 0,
  394. "y": 4
  395. },
  396. "id": 95,
  397. "options": {
  398. "colorMode": "none",
  399. "graphMode": "none",
  400. "justifyMode": "auto",
  401. "orientation": "horizontal",
  402. "percentChangeColorMode": "standard",
  403. "reduceOptions": {
  404. "calcs": [
  405. "lastNotNull"
  406. ],
  407. "fields": "",
  408. "values": false
  409. },
  410. "showPercentChange": false,
  411. "textMode": "auto",
  412. "wideLayout": true
  413. },
  414. "pluginVersion": "12.2.0",
  415. "targets": [
  416. {
  417. "datasource": {
  418. "type": "prometheus",
  419. "uid": "${idc}"
  420. },
  421. "editorMode": "code",
  422. "expr": "sum(gpustack:worker_node_gram_total_bytes{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  423. "instant": false,
  424. "legendFormat": "__auto",
  425. "range": true,
  426. "refId": "A"
  427. }
  428. ],
  429. "title": "VRAM Size",
  430. "type": "stat"
  431. },
  432. {
  433. "datasource": {
  434. "type": "prometheus",
  435. "uid": "prometheus"
  436. },
  437. "fieldConfig": {
  438. "defaults": {
  439. "color": {
  440. "mode": "thresholds"
  441. },
  442. "mappings": [],
  443. "max": 90,
  444. "min": 0,
  445. "thresholds": {
  446. "mode": "absolute",
  447. "steps": [
  448. {
  449. "color": "green",
  450. "value": 0
  451. },
  452. {
  453. "color": "#EAB839",
  454. "value": 60
  455. },
  456. {
  457. "color": "red",
  458. "value": 80
  459. }
  460. ]
  461. },
  462. "unit": "bytes"
  463. },
  464. "overrides": []
  465. },
  466. "gridPos": {
  467. "h": 4,
  468. "w": 3,
  469. "x": 3,
  470. "y": 4
  471. },
  472. "id": 93,
  473. "options": {
  474. "colorMode": "none",
  475. "graphMode": "none",
  476. "justifyMode": "auto",
  477. "orientation": "horizontal",
  478. "percentChangeColorMode": "standard",
  479. "reduceOptions": {
  480. "calcs": [
  481. "lastNotNull"
  482. ],
  483. "fields": "",
  484. "values": false
  485. },
  486. "showPercentChange": false,
  487. "textMode": "auto",
  488. "wideLayout": true
  489. },
  490. "pluginVersion": "12.2.0",
  491. "targets": [
  492. {
  493. "datasource": {
  494. "type": "prometheus",
  495. "uid": "${idc}"
  496. },
  497. "editorMode": "code",
  498. "expr": "avg(gpustack:worker_node_memory_total_bytes{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  499. "instant": false,
  500. "legendFormat": "__auto",
  501. "range": true,
  502. "refId": "A"
  503. }
  504. ],
  505. "title": "Memory Size",
  506. "type": "stat"
  507. },
  508. {
  509. "datasource": {
  510. "type": "prometheus",
  511. "uid": "prometheus"
  512. },
  513. "fieldConfig": {
  514. "defaults": {
  515. "color": {
  516. "mode": "thresholds"
  517. },
  518. "mappings": [],
  519. "max": 90,
  520. "min": 0,
  521. "thresholds": {
  522. "mode": "absolute",
  523. "steps": [
  524. {
  525. "color": "green",
  526. "value": 0
  527. },
  528. {
  529. "color": "#EAB839",
  530. "value": 60
  531. },
  532. {
  533. "color": "red",
  534. "value": 80
  535. }
  536. ]
  537. },
  538. "unit": "none"
  539. },
  540. "overrides": []
  541. },
  542. "gridPos": {
  543. "h": 4,
  544. "w": 3,
  545. "x": 6,
  546. "y": 4
  547. },
  548. "id": 94,
  549. "options": {
  550. "colorMode": "none",
  551. "graphMode": "none",
  552. "justifyMode": "auto",
  553. "orientation": "horizontal",
  554. "percentChangeColorMode": "standard",
  555. "reduceOptions": {
  556. "calcs": [
  557. "lastNotNull"
  558. ],
  559. "fields": "",
  560. "values": false
  561. },
  562. "showPercentChange": false,
  563. "textMode": "auto",
  564. "wideLayout": true
  565. },
  566. "pluginVersion": "12.2.0",
  567. "targets": [
  568. {
  569. "datasource": {
  570. "type": "prometheus",
  571. "uid": "${idc}"
  572. },
  573. "editorMode": "code",
  574. "expr": "avg(gpustack:worker_node_cpu_cores{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"})",
  575. "instant": false,
  576. "legendFormat": "__auto",
  577. "range": true,
  578. "refId": "A"
  579. }
  580. ],
  581. "title": "CPU Cores",
  582. "type": "stat"
  583. },
  584. {
  585. "datasource": {
  586. "type": "prometheus",
  587. "uid": "prometheus"
  588. },
  589. "description": "",
  590. "fieldConfig": {
  591. "defaults": {
  592. "color": {
  593. "mode": "thresholds"
  594. },
  595. "custom": {
  596. "align": "center",
  597. "cellOptions": {
  598. "type": "auto"
  599. },
  600. "footer": {
  601. "reducers": []
  602. },
  603. "inspect": false
  604. },
  605. "mappings": [],
  606. "thresholds": {
  607. "mode": "absolute",
  608. "steps": [
  609. {
  610. "color": "green",
  611. "value": 0
  612. },
  613. {
  614. "color": "red",
  615. "value": 80
  616. }
  617. ]
  618. }
  619. },
  620. "overrides": [
  621. {
  622. "matcher": {
  623. "id": "byName",
  624. "options": "Cluster"
  625. },
  626. "properties": [
  627. {
  628. "id": "custom.cellOptions",
  629. "value": {
  630. "type": "color-background"
  631. }
  632. },
  633. {
  634. "id": "custom.width",
  635. "value": 173
  636. }
  637. ]
  638. },
  639. {
  640. "matcher": {
  641. "id": "byName",
  642. "options": "GPU VRAM Util Rate"
  643. },
  644. "properties": [
  645. {
  646. "id": "custom.cellOptions",
  647. "value": {
  648. "type": "color-background"
  649. }
  650. },
  651. {
  652. "id": "color",
  653. "value": {
  654. "fixedColor": "blue",
  655. "mode": "fixed"
  656. }
  657. }
  658. ]
  659. },
  660. {
  661. "matcher": {
  662. "id": "byName",
  663. "options": "Worker"
  664. },
  665. "properties": [
  666. {
  667. "id": "custom.cellOptions",
  668. "value": {
  669. "type": "color-text"
  670. }
  671. },
  672. {
  673. "id": "color",
  674. "value": {
  675. "fixedColor": "blue",
  676. "mode": "fixed"
  677. }
  678. },
  679. {
  680. "id": "custom.width",
  681. "value": 144
  682. }
  683. ]
  684. },
  685. {
  686. "matcher": {
  687. "id": "byName",
  688. "options": "Chip Index"
  689. },
  690. "properties": [
  691. {
  692. "id": "custom.cellOptions",
  693. "value": {
  694. "type": "color-text"
  695. }
  696. },
  697. {
  698. "id": "color",
  699. "value": {
  700. "fixedColor": "blue",
  701. "mode": "fixed"
  702. }
  703. },
  704. {
  705. "id": "custom.width",
  706. "value": 116
  707. }
  708. ]
  709. },
  710. {
  711. "matcher": {
  712. "id": "byName",
  713. "options": "GPU Index"
  714. },
  715. "properties": [
  716. {
  717. "id": "custom.cellOptions",
  718. "value": {
  719. "type": "color-text"
  720. }
  721. },
  722. {
  723. "id": "color",
  724. "value": {
  725. "fixedColor": "blue",
  726. "mode": "fixed"
  727. }
  728. },
  729. {
  730. "id": "custom.width",
  731. "value": 90
  732. }
  733. ]
  734. },
  735. {
  736. "matcher": {
  737. "id": "byName",
  738. "options": "GPU VRAM Utilization"
  739. },
  740. "properties": [
  741. {
  742. "id": "unit",
  743. "value": "percent"
  744. },
  745. {
  746. "id": "decimals",
  747. "value": 1
  748. },
  749. {
  750. "id": "custom.cellOptions",
  751. "value": {
  752. "type": "gauge"
  753. }
  754. },
  755. {
  756. "id": "color",
  757. "value": {
  758. "mode": "continuous-GrYlRd"
  759. }
  760. }
  761. ]
  762. },
  763. {
  764. "matcher": {
  765. "id": "byName",
  766. "options": "GPU Utilization"
  767. },
  768. "properties": [
  769. {
  770. "id": "unit",
  771. "value": "percent"
  772. },
  773. {
  774. "id": "decimals",
  775. "value": 1
  776. },
  777. {
  778. "id": "custom.cellOptions",
  779. "value": {
  780. "mode": "gradient",
  781. "type": "gauge",
  782. "valueDisplayMode": "text"
  783. }
  784. },
  785. {
  786. "id": "color",
  787. "value": {
  788. "mode": "continuous-GrYlRd"
  789. }
  790. },
  791. {
  792. "id": "custom.width",
  793. "value": 200
  794. }
  795. ]
  796. },
  797. {
  798. "matcher": {
  799. "id": "byName",
  800. "options": "GPU Name"
  801. },
  802. "properties": [
  803. {
  804. "id": "custom.width",
  805. "value": 279
  806. }
  807. ]
  808. },
  809. {
  810. "matcher": {
  811. "id": "byName",
  812. "options": "Chip Index"
  813. },
  814. "properties": [
  815. {
  816. "id": "custom.width",
  817. "value": 102
  818. }
  819. ]
  820. },
  821. {
  822. "matcher": {
  823. "id": "byName",
  824. "options": "Temperature"
  825. },
  826. "properties": [
  827. {
  828. "id": "custom.width",
  829. "value": 117
  830. }
  831. ]
  832. }
  833. ]
  834. },
  835. "gridPos": {
  836. "h": 7,
  837. "w": 24,
  838. "x": 0,
  839. "y": 8
  840. },
  841. "id": 66,
  842. "options": {
  843. "cellHeight": "sm",
  844. "frameIndex": 1,
  845. "showHeader": true,
  846. "sortBy": []
  847. },
  848. "pluginVersion": "12.2.0",
  849. "targets": [
  850. {
  851. "datasource": {
  852. "type": "prometheus",
  853. "uid": "${idc}"
  854. },
  855. "editorMode": "code",
  856. "exemplar": false,
  857. "expr": "gpustack:worker_node_gpu_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"} ",
  858. "format": "table",
  859. "instant": true,
  860. "legendFormat": "__auto",
  861. "range": false,
  862. "refId": "A"
  863. },
  864. {
  865. "datasource": {
  866. "type": "prometheus",
  867. "uid": "${idc}"
  868. },
  869. "editorMode": "code",
  870. "exemplar": false,
  871. "expr": "gpustack:worker_node_gram_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"} ",
  872. "format": "table",
  873. "hide": false,
  874. "instant": true,
  875. "legendFormat": "__auto",
  876. "range": false,
  877. "refId": "B"
  878. },
  879. {
  880. "datasource": {
  881. "type": "prometheus",
  882. "uid": "prometheus"
  883. },
  884. "editorMode": "code",
  885. "exemplar": false,
  886. "expr": "gpustack:worker_node_gpu_temperature_celsius{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"} ",
  887. "format": "table",
  888. "hide": false,
  889. "instant": true,
  890. "legendFormat": "__auto",
  891. "range": false,
  892. "refId": "C"
  893. }
  894. ],
  895. "title": "GPUs",
  896. "transformations": [
  897. {
  898. "id": "concatenate",
  899. "options": {
  900. "frameNameLabel": "frame",
  901. "frameNameMode": "field"
  902. }
  903. },
  904. {
  905. "id": "organize",
  906. "options": {
  907. "excludeByName": {
  908. "Time": true,
  909. "__name__ 1": true,
  910. "__name__ 2": true,
  911. "__name__ 3": true,
  912. "cluster_id 1": true,
  913. "cluster_id 2": true,
  914. "cluster_id 3": true,
  915. "cluster_name 2": true,
  916. "cluster_name 3": true,
  917. "exported_instance 1": true,
  918. "exported_instance 2": true,
  919. "exported_instance 3": true,
  920. "gpu_chip_index 1": false,
  921. "gpu_chip_index 2": true,
  922. "gpu_chip_index 3": true,
  923. "gpu_index 2": true,
  924. "gpu_index 3": true,
  925. "gpu_name 2": true,
  926. "gpu_name 3": true,
  927. "instance 1": true,
  928. "instance 2": true,
  929. "instance 3": true,
  930. "job 1": true,
  931. "job 2": true,
  932. "job 3": true,
  933. "worker_id 1": true,
  934. "worker_id 2": true,
  935. "worker_id 3": true,
  936. "worker_name 2": true,
  937. "worker_name 3": true
  938. },
  939. "includeByName": {},
  940. "indexByName": {
  941. "Time": 15,
  942. "Value #A": 12,
  943. "Value #B": 27,
  944. "Value #C": 11,
  945. "__name__ 1": 0,
  946. "__name__ 2": 16,
  947. "__name__ 3": 28,
  948. "cluster_id 1": 1,
  949. "cluster_id 2": 17,
  950. "cluster_id 3": 29,
  951. "cluster_name 1": 2,
  952. "cluster_name 2": 18,
  953. "cluster_name 3": 30,
  954. "exported_instance 1": 4,
  955. "exported_instance 2": 19,
  956. "exported_instance 3": 31,
  957. "gpu_chip_index 1": 5,
  958. "gpu_chip_index 2": 20,
  959. "gpu_chip_index 3": 32,
  960. "gpu_index 1": 6,
  961. "gpu_index 2": 21,
  962. "gpu_index 3": 33,
  963. "gpu_name 1": 7,
  964. "gpu_name 2": 22,
  965. "gpu_name 3": 34,
  966. "instance 1": 8,
  967. "instance 2": 23,
  968. "instance 3": 35,
  969. "job 1": 9,
  970. "job 2": 24,
  971. "job 3": 36,
  972. "worker_id 1": 10,
  973. "worker_id 2": 25,
  974. "worker_id 3": 37,
  975. "worker_name 1": 3,
  976. "worker_name 2": 26,
  977. "worker_name 3": 38
  978. },
  979. "renameByName": {
  980. "Value #A": "GPU Utilization",
  981. "Value #B": "GPU VRAM Utilization",
  982. "Value #C": "Temperature",
  983. "cluster_name 1": "Cluster",
  984. "exported_instance 1": "",
  985. "gpu_chip_index 1": "Chip Index",
  986. "gpu_index 1": "GPU Index",
  987. "gpu_name 1": "GPU Name",
  988. "worker_name 1": "Worker"
  989. }
  990. }
  991. }
  992. ],
  993. "type": "table"
  994. },
  995. {
  996. "datasource": {
  997. "type": "prometheus",
  998. "uid": "prometheus"
  999. },
  1000. "fieldConfig": {
  1001. "defaults": {
  1002. "color": {
  1003. "mode": "palette-classic"
  1004. },
  1005. "custom": {
  1006. "axisBorderShow": false,
  1007. "axisCenteredZero": false,
  1008. "axisColorMode": "text",
  1009. "axisLabel": "",
  1010. "axisPlacement": "auto",
  1011. "barAlignment": 0,
  1012. "barWidthFactor": 0.6,
  1013. "drawStyle": "line",
  1014. "fillOpacity": 10,
  1015. "gradientMode": "none",
  1016. "hideFrom": {
  1017. "legend": false,
  1018. "tooltip": false,
  1019. "viz": false
  1020. },
  1021. "insertNulls": false,
  1022. "lineInterpolation": "linear",
  1023. "lineWidth": 2,
  1024. "pointSize": 5,
  1025. "scaleDistribution": {
  1026. "type": "linear"
  1027. },
  1028. "showPoints": "never",
  1029. "showValues": false,
  1030. "spanNulls": true,
  1031. "stacking": {
  1032. "group": "A",
  1033. "mode": "none"
  1034. },
  1035. "thresholdsStyle": {
  1036. "mode": "off"
  1037. }
  1038. },
  1039. "mappings": [],
  1040. "min": 0,
  1041. "thresholds": {
  1042. "mode": "absolute",
  1043. "steps": [
  1044. {
  1045. "color": "green",
  1046. "value": 0
  1047. },
  1048. {
  1049. "color": "red",
  1050. "value": 80
  1051. }
  1052. ]
  1053. },
  1054. "unit": "bytes"
  1055. },
  1056. "overrides": []
  1057. },
  1058. "gridPos": {
  1059. "h": 13,
  1060. "w": 6,
  1061. "x": 0,
  1062. "y": 15
  1063. },
  1064. "id": 92,
  1065. "options": {
  1066. "dataLinks": [],
  1067. "legend": {
  1068. "calcs": [
  1069. "last",
  1070. "max"
  1071. ],
  1072. "displayMode": "table",
  1073. "placement": "bottom",
  1074. "showLegend": true
  1075. },
  1076. "tooltip": {
  1077. "hideZeros": false,
  1078. "mode": "multi",
  1079. "sort": "none"
  1080. }
  1081. },
  1082. "pluginVersion": "12.2.0",
  1083. "targets": [
  1084. {
  1085. "datasource": {
  1086. "type": "prometheus",
  1087. "uid": "prometheus"
  1088. },
  1089. "editorMode": "code",
  1090. "expr": "gpustack:worker_node_memory_used_bytes{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1091. "format": "time_series",
  1092. "hide": false,
  1093. "instant": false,
  1094. "interval": "",
  1095. "intervalFactor": 2,
  1096. "legendFormat": "{{worker_name}}",
  1097. "refId": "A"
  1098. }
  1099. ],
  1100. "title": "System Memory Usage",
  1101. "type": "timeseries"
  1102. },
  1103. {
  1104. "datasource": {
  1105. "type": "prometheus",
  1106. "uid": "prometheus"
  1107. },
  1108. "description": "",
  1109. "fieldConfig": {
  1110. "defaults": {
  1111. "color": {
  1112. "fixedColor": "blue",
  1113. "mode": "palette-classic"
  1114. },
  1115. "custom": {
  1116. "axisBorderShow": false,
  1117. "axisCenteredZero": false,
  1118. "axisColorMode": "text",
  1119. "axisLabel": "",
  1120. "axisPlacement": "auto",
  1121. "barAlignment": 0,
  1122. "barWidthFactor": 0.6,
  1123. "drawStyle": "line",
  1124. "fillOpacity": 10,
  1125. "gradientMode": "none",
  1126. "hideFrom": {
  1127. "legend": false,
  1128. "tooltip": false,
  1129. "viz": false
  1130. },
  1131. "insertNulls": false,
  1132. "lineInterpolation": "linear",
  1133. "lineWidth": 1,
  1134. "pointSize": 5,
  1135. "scaleDistribution": {
  1136. "type": "linear"
  1137. },
  1138. "showPoints": "never",
  1139. "showValues": false,
  1140. "spanNulls": false,
  1141. "stacking": {
  1142. "group": "A",
  1143. "mode": "none"
  1144. },
  1145. "thresholdsStyle": {
  1146. "mode": "off"
  1147. }
  1148. },
  1149. "fieldMinMax": false,
  1150. "mappings": [],
  1151. "thresholds": {
  1152. "mode": "absolute",
  1153. "steps": [
  1154. {
  1155. "color": "green",
  1156. "value": 0
  1157. },
  1158. {
  1159. "color": "red",
  1160. "value": 80
  1161. }
  1162. ]
  1163. },
  1164. "unit": "percent"
  1165. },
  1166. "overrides": []
  1167. },
  1168. "gridPos": {
  1169. "h": 13,
  1170. "w": 9,
  1171. "x": 6,
  1172. "y": 15
  1173. },
  1174. "id": 48,
  1175. "options": {
  1176. "legend": {
  1177. "calcs": [
  1178. "max",
  1179. "last"
  1180. ],
  1181. "displayMode": "table",
  1182. "placement": "bottom",
  1183. "showLegend": true
  1184. },
  1185. "tooltip": {
  1186. "hideZeros": false,
  1187. "mode": "single",
  1188. "sort": "none"
  1189. }
  1190. },
  1191. "pluginVersion": "12.2.0",
  1192. "targets": [
  1193. {
  1194. "datasource": {
  1195. "type": "prometheus",
  1196. "uid": "${idc}"
  1197. },
  1198. "editorMode": "code",
  1199. "exemplar": false,
  1200. "expr": "gpustack:worker_node_gpu_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1201. "format": "time_series",
  1202. "instant": false,
  1203. "legendFormat": "{{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1204. "range": true,
  1205. "refId": "A"
  1206. }
  1207. ],
  1208. "title": "GPU Utilization",
  1209. "type": "timeseries"
  1210. },
  1211. {
  1212. "datasource": {
  1213. "type": "prometheus",
  1214. "uid": "prometheus"
  1215. },
  1216. "description": "",
  1217. "fieldConfig": {
  1218. "defaults": {
  1219. "color": {
  1220. "fixedColor": "blue",
  1221. "mode": "palette-classic"
  1222. },
  1223. "custom": {
  1224. "axisBorderShow": false,
  1225. "axisCenteredZero": false,
  1226. "axisColorMode": "text",
  1227. "axisLabel": "",
  1228. "axisPlacement": "auto",
  1229. "barAlignment": 0,
  1230. "barWidthFactor": 0.6,
  1231. "drawStyle": "line",
  1232. "fillOpacity": 10,
  1233. "gradientMode": "none",
  1234. "hideFrom": {
  1235. "legend": false,
  1236. "tooltip": false,
  1237. "viz": false
  1238. },
  1239. "insertNulls": false,
  1240. "lineInterpolation": "linear",
  1241. "lineStyle": {
  1242. "fill": "solid"
  1243. },
  1244. "lineWidth": 1,
  1245. "pointSize": 5,
  1246. "scaleDistribution": {
  1247. "type": "linear"
  1248. },
  1249. "showPoints": "never",
  1250. "showValues": false,
  1251. "spanNulls": false,
  1252. "stacking": {
  1253. "group": "A",
  1254. "mode": "none"
  1255. },
  1256. "thresholdsStyle": {
  1257. "mode": "off"
  1258. }
  1259. },
  1260. "fieldMinMax": false,
  1261. "mappings": [],
  1262. "thresholds": {
  1263. "mode": "absolute",
  1264. "steps": [
  1265. {
  1266. "color": "green",
  1267. "value": 0
  1268. },
  1269. {
  1270. "color": "red",
  1271. "value": 80
  1272. }
  1273. ]
  1274. },
  1275. "unit": "percent"
  1276. },
  1277. "overrides": []
  1278. },
  1279. "gridPos": {
  1280. "h": 13,
  1281. "w": 9,
  1282. "x": 15,
  1283. "y": 15
  1284. },
  1285. "id": 86,
  1286. "options": {
  1287. "legend": {
  1288. "calcs": [
  1289. "max",
  1290. "last"
  1291. ],
  1292. "displayMode": "table",
  1293. "placement": "bottom",
  1294. "showLegend": true
  1295. },
  1296. "tooltip": {
  1297. "hideZeros": false,
  1298. "mode": "single",
  1299. "sort": "none"
  1300. }
  1301. },
  1302. "pluginVersion": "12.2.0",
  1303. "targets": [
  1304. {
  1305. "datasource": {
  1306. "type": "prometheus",
  1307. "uid": "prometheus"
  1308. },
  1309. "editorMode": "code",
  1310. "exemplar": false,
  1311. "expr": "gpustack:worker_node_gram_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1312. "format": "time_series",
  1313. "instant": false,
  1314. "legendFormat": "{{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1315. "range": true,
  1316. "refId": "A"
  1317. }
  1318. ],
  1319. "title": "VRAM Utilization",
  1320. "type": "timeseries"
  1321. },
  1322. {
  1323. "datasource": {
  1324. "type": "prometheus",
  1325. "uid": "prometheus"
  1326. },
  1327. "fieldConfig": {
  1328. "defaults": {
  1329. "color": {
  1330. "mode": "palette-classic"
  1331. },
  1332. "custom": {
  1333. "axisBorderShow": false,
  1334. "axisCenteredZero": false,
  1335. "axisColorMode": "text",
  1336. "axisLabel": "",
  1337. "axisPlacement": "auto",
  1338. "barAlignment": 0,
  1339. "barWidthFactor": 0.6,
  1340. "drawStyle": "line",
  1341. "fillOpacity": 10,
  1342. "gradientMode": "none",
  1343. "hideFrom": {
  1344. "legend": false,
  1345. "tooltip": false,
  1346. "viz": false
  1347. },
  1348. "insertNulls": false,
  1349. "lineInterpolation": "linear",
  1350. "lineWidth": 2,
  1351. "pointSize": 5,
  1352. "scaleDistribution": {
  1353. "type": "linear"
  1354. },
  1355. "showPoints": "never",
  1356. "showValues": false,
  1357. "spanNulls": false,
  1358. "stacking": {
  1359. "group": "A",
  1360. "mode": "none"
  1361. },
  1362. "thresholdsStyle": {
  1363. "mode": "off"
  1364. }
  1365. },
  1366. "mappings": [],
  1367. "thresholds": {
  1368. "mode": "absolute",
  1369. "steps": [
  1370. {
  1371. "color": "green",
  1372. "value": 0
  1373. },
  1374. {
  1375. "color": "red",
  1376. "value": 80
  1377. }
  1378. ]
  1379. },
  1380. "unit": "percent"
  1381. },
  1382. "overrides": []
  1383. },
  1384. "gridPos": {
  1385. "h": 13,
  1386. "w": 6,
  1387. "x": 0,
  1388. "y": 28
  1389. },
  1390. "id": 91,
  1391. "options": {
  1392. "dataLinks": [],
  1393. "legend": {
  1394. "calcs": [
  1395. "last",
  1396. "max"
  1397. ],
  1398. "displayMode": "table",
  1399. "placement": "bottom",
  1400. "showLegend": true
  1401. },
  1402. "tooltip": {
  1403. "hideZeros": false,
  1404. "mode": "multi",
  1405. "sort": "none"
  1406. }
  1407. },
  1408. "pluginVersion": "12.2.0",
  1409. "targets": [
  1410. {
  1411. "datasource": {
  1412. "type": "prometheus",
  1413. "uid": "prometheus"
  1414. },
  1415. "editorMode": "code",
  1416. "expr": "gpustack:worker_node_cpu_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1417. "format": "time_series",
  1418. "intervalFactor": 1,
  1419. "legendFormat": "{{worker_name}}",
  1420. "range": true,
  1421. "refId": "A"
  1422. }
  1423. ],
  1424. "title": "CPU Utilization",
  1425. "type": "timeseries"
  1426. },
  1427. {
  1428. "datasource": {
  1429. "type": "prometheus",
  1430. "uid": "prometheus"
  1431. },
  1432. "fieldConfig": {
  1433. "defaults": {
  1434. "color": {
  1435. "mode": "palette-classic"
  1436. },
  1437. "custom": {
  1438. "axisBorderShow": false,
  1439. "axisCenteredZero": false,
  1440. "axisColorMode": "text",
  1441. "axisLabel": "",
  1442. "axisPlacement": "auto",
  1443. "barAlignment": 0,
  1444. "barWidthFactor": 0.6,
  1445. "drawStyle": "line",
  1446. "fillOpacity": 10,
  1447. "gradientMode": "none",
  1448. "hideFrom": {
  1449. "legend": false,
  1450. "tooltip": false,
  1451. "viz": false
  1452. },
  1453. "insertNulls": false,
  1454. "lineInterpolation": "linear",
  1455. "lineWidth": 1,
  1456. "pointSize": 5,
  1457. "scaleDistribution": {
  1458. "type": "linear"
  1459. },
  1460. "showPoints": "never",
  1461. "showValues": false,
  1462. "spanNulls": true,
  1463. "stacking": {
  1464. "group": "A",
  1465. "mode": "none"
  1466. },
  1467. "thresholdsStyle": {
  1468. "mode": "off"
  1469. }
  1470. },
  1471. "fieldMinMax": false,
  1472. "mappings": [],
  1473. "min": 0,
  1474. "thresholds": {
  1475. "mode": "absolute",
  1476. "steps": [
  1477. {
  1478. "color": "green",
  1479. "value": 0
  1480. },
  1481. {
  1482. "color": "red",
  1483. "value": 80
  1484. }
  1485. ]
  1486. },
  1487. "unit": "bytes"
  1488. },
  1489. "overrides": []
  1490. },
  1491. "gridPos": {
  1492. "h": 13,
  1493. "w": 9,
  1494. "x": 6,
  1495. "y": 28
  1496. },
  1497. "id": 88,
  1498. "options": {
  1499. "dataLinks": [],
  1500. "legend": {
  1501. "calcs": [
  1502. "last",
  1503. "max"
  1504. ],
  1505. "displayMode": "table",
  1506. "placement": "bottom",
  1507. "showLegend": true
  1508. },
  1509. "tooltip": {
  1510. "hideZeros": false,
  1511. "mode": "multi",
  1512. "sort": "none"
  1513. }
  1514. },
  1515. "pluginVersion": "12.2.0",
  1516. "targets": [
  1517. {
  1518. "datasource": {
  1519. "type": "prometheus",
  1520. "uid": "prometheus"
  1521. },
  1522. "editorMode": "code",
  1523. "expr": "gpustack:worker_node_gram_total_bytes{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1524. "format": "time_series",
  1525. "hide": false,
  1526. "intervalFactor": 1,
  1527. "legendFormat": "Total: {{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1528. "range": true,
  1529. "refId": "A"
  1530. },
  1531. {
  1532. "datasource": {
  1533. "type": "prometheus",
  1534. "uid": "prometheus"
  1535. },
  1536. "editorMode": "code",
  1537. "expr": "gpustack:worker_node_gram_used_bytes{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1538. "hide": false,
  1539. "instant": false,
  1540. "legendFormat": "Used: {{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1541. "range": true,
  1542. "refId": "B"
  1543. },
  1544. {
  1545. "datasource": {
  1546. "type": "prometheus",
  1547. "uid": "prometheus"
  1548. },
  1549. "editorMode": "code",
  1550. "expr": "gpustack:worker_node_gram_allocated_bytes{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1551. "hide": false,
  1552. "instant": false,
  1553. "legendFormat": "Allocated: {{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1554. "range": true,
  1555. "refId": "C"
  1556. }
  1557. ],
  1558. "title": "GPU Mem Usage",
  1559. "type": "timeseries"
  1560. },
  1561. {
  1562. "datasource": {
  1563. "type": "prometheus",
  1564. "uid": "prometheus"
  1565. },
  1566. "fieldConfig": {
  1567. "defaults": {
  1568. "color": {
  1569. "mode": "palette-classic"
  1570. },
  1571. "custom": {
  1572. "axisBorderShow": false,
  1573. "axisCenteredZero": false,
  1574. "axisColorMode": "text",
  1575. "axisLabel": "",
  1576. "axisPlacement": "auto",
  1577. "barAlignment": 0,
  1578. "barWidthFactor": 0.6,
  1579. "drawStyle": "line",
  1580. "fillOpacity": 0,
  1581. "gradientMode": "none",
  1582. "hideFrom": {
  1583. "legend": false,
  1584. "tooltip": false,
  1585. "viz": false
  1586. },
  1587. "insertNulls": false,
  1588. "lineInterpolation": "linear",
  1589. "lineWidth": 2,
  1590. "pointSize": 5,
  1591. "scaleDistribution": {
  1592. "type": "linear"
  1593. },
  1594. "showPoints": "never",
  1595. "showValues": false,
  1596. "spanNulls": true,
  1597. "stacking": {
  1598. "group": "A",
  1599. "mode": "none"
  1600. },
  1601. "thresholdsStyle": {
  1602. "mode": "off"
  1603. }
  1604. },
  1605. "mappings": [],
  1606. "thresholds": {
  1607. "mode": "absolute",
  1608. "steps": [
  1609. {
  1610. "color": "green",
  1611. "value": 0
  1612. },
  1613. {
  1614. "color": "red",
  1615. "value": 80
  1616. }
  1617. ]
  1618. },
  1619. "unit": "celsius"
  1620. },
  1621. "overrides": []
  1622. },
  1623. "gridPos": {
  1624. "h": 13,
  1625. "w": 9,
  1626. "x": 15,
  1627. "y": 28
  1628. },
  1629. "id": 87,
  1630. "options": {
  1631. "dataLinks": [],
  1632. "legend": {
  1633. "calcs": [
  1634. "last",
  1635. "max"
  1636. ],
  1637. "displayMode": "table",
  1638. "placement": "bottom",
  1639. "showLegend": true
  1640. },
  1641. "tooltip": {
  1642. "hideZeros": false,
  1643. "mode": "multi",
  1644. "sort": "none"
  1645. }
  1646. },
  1647. "pluginVersion": "12.2.0",
  1648. "targets": [
  1649. {
  1650. "datasource": {
  1651. "type": "prometheus",
  1652. "uid": "prometheus"
  1653. },
  1654. "editorMode": "code",
  1655. "expr": "gpustack:worker_node_gpu_temperature_celsius{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}",
  1656. "format": "time_series",
  1657. "hide": false,
  1658. "intervalFactor": 1,
  1659. "legendFormat": "{{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1660. "range": true,
  1661. "refId": "A"
  1662. }
  1663. ],
  1664. "title": "GPU Temperature",
  1665. "type": "timeseries"
  1666. },
  1667. {
  1668. "datasource": {
  1669. "type": "prometheus",
  1670. "uid": "prometheus"
  1671. },
  1672. "fieldConfig": {
  1673. "defaults": {
  1674. "color": {
  1675. "mode": "palette-classic"
  1676. },
  1677. "decimals": 2,
  1678. "mappings": [],
  1679. "max": 100,
  1680. "min": 0,
  1681. "thresholds": {
  1682. "mode": "absolute",
  1683. "steps": [
  1684. {
  1685. "color": "green",
  1686. "value": 0
  1687. },
  1688. {
  1689. "color": "red",
  1690. "value": 80
  1691. }
  1692. ]
  1693. },
  1694. "unit": "percent"
  1695. },
  1696. "overrides": []
  1697. },
  1698. "gridPos": {
  1699. "h": 14,
  1700. "w": 12,
  1701. "x": 0,
  1702. "y": 41
  1703. },
  1704. "id": 78,
  1705. "options": {
  1706. "displayMode": "lcd",
  1707. "legend": {
  1708. "calcs": [
  1709. "last",
  1710. "max"
  1711. ],
  1712. "displayMode": "table",
  1713. "placement": "bottom",
  1714. "showLegend": true
  1715. },
  1716. "maxVizHeight": 300,
  1717. "minVizHeight": 16,
  1718. "minVizWidth": 8,
  1719. "namePlacement": "auto",
  1720. "orientation": "auto",
  1721. "reduceOptions": {
  1722. "calcs": [
  1723. "lastNotNull"
  1724. ],
  1725. "fields": "",
  1726. "values": false
  1727. },
  1728. "showUnfilled": true,
  1729. "sizing": "auto",
  1730. "valueMode": "color"
  1731. },
  1732. "pluginVersion": "12.2.0",
  1733. "targets": [
  1734. {
  1735. "datasource": {
  1736. "type": "prometheus",
  1737. "uid": "prometheus"
  1738. },
  1739. "editorMode": "code",
  1740. "exemplar": false,
  1741. "expr": "avg by (cluster_name,worker_name,gpu_chip_index,gpu_index) (gpustack:worker_node_gpu_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}) ",
  1742. "instant": false,
  1743. "legendFormat": "{{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1744. "range": true,
  1745. "refId": "A"
  1746. }
  1747. ],
  1748. "title": "GPU Utilization",
  1749. "type": "bargauge"
  1750. },
  1751. {
  1752. "datasource": {
  1753. "type": "prometheus",
  1754. "uid": "prometheus"
  1755. },
  1756. "fieldConfig": {
  1757. "defaults": {
  1758. "color": {
  1759. "mode": "thresholds"
  1760. },
  1761. "decimals": 2,
  1762. "mappings": [],
  1763. "max": 100,
  1764. "min": 0,
  1765. "thresholds": {
  1766. "mode": "absolute",
  1767. "steps": [
  1768. {
  1769. "color": "green",
  1770. "value": 0
  1771. },
  1772. {
  1773. "color": "#EAB839",
  1774. "value": 50
  1775. },
  1776. {
  1777. "color": "#EF843C",
  1778. "value": 60
  1779. },
  1780. {
  1781. "color": "red",
  1782. "value": 80
  1783. }
  1784. ]
  1785. },
  1786. "unit": "percent"
  1787. },
  1788. "overrides": []
  1789. },
  1790. "gridPos": {
  1791. "h": 14,
  1792. "w": 12,
  1793. "x": 12,
  1794. "y": 41
  1795. },
  1796. "id": 79,
  1797. "options": {
  1798. "displayMode": "gradient",
  1799. "legend": {
  1800. "calcs": [
  1801. "last",
  1802. "max"
  1803. ],
  1804. "displayMode": "table",
  1805. "placement": "bottom",
  1806. "showLegend": true
  1807. },
  1808. "maxVizHeight": 300,
  1809. "minVizHeight": 16,
  1810. "minVizWidth": 8,
  1811. "namePlacement": "auto",
  1812. "orientation": "auto",
  1813. "reduceOptions": {
  1814. "calcs": [
  1815. "mean"
  1816. ],
  1817. "fields": "",
  1818. "values": false
  1819. },
  1820. "showUnfilled": true,
  1821. "sizing": "auto",
  1822. "valueMode": "color"
  1823. },
  1824. "pluginVersion": "12.2.0",
  1825. "targets": [
  1826. {
  1827. "datasource": {
  1828. "type": "prometheus",
  1829. "uid": "prometheus"
  1830. },
  1831. "editorMode": "code",
  1832. "expr": "avg by (cluster_name,worker_name, gpu_chip_index,gpu_index) (gpustack:worker_node_gram_utilization_rate{cluster_name=\"$cluster_name\",worker_name=~\"$worker_name\"}) ",
  1833. "instant": false,
  1834. "legendFormat": "{{worker_name}} GPU{{gpu_index}} (chip {{gpu_chip_index}})",
  1835. "range": true,
  1836. "refId": "A"
  1837. }
  1838. ],
  1839. "title": "VRAM Utilization",
  1840. "type": "bargauge"
  1841. }
  1842. ],
  1843. "preload": false,
  1844. "refresh": "30s",
  1845. "schemaVersion": 42,
  1846. "tags": [],
  1847. "templating": {
  1848. "list": [
  1849. {
  1850. "current": {
  1851. "text": "",
  1852. "value": ""
  1853. },
  1854. "datasource": {
  1855. "type": "prometheus",
  1856. "uid": "prometheus"
  1857. },
  1858. "definition": "query_result(gpustack:cluster_info)",
  1859. "includeAll": false,
  1860. "label": "Cluster",
  1861. "name": "cluster_name",
  1862. "options": [],
  1863. "query": {
  1864. "qryType": 1,
  1865. "query": "query_result(gpustack:cluster_info)",
  1866. "refId": "PrometheusVariableQueryEditor-VariableQuery"
  1867. },
  1868. "refresh": 2,
  1869. "regex": "/cluster_name=\"([^\"]+)\"/",
  1870. "sort": 1,
  1871. "type": "query"
  1872. },
  1873. {
  1874. "allValue": ".*",
  1875. "current": {
  1876. "text": "All",
  1877. "value": [
  1878. "$__all"
  1879. ]
  1880. },
  1881. "datasource": {
  1882. "type": "prometheus",
  1883. "uid": "prometheus"
  1884. },
  1885. "definition": "query_result(gpustack:worker_info{cluster_name=\"$cluster_name\"})",
  1886. "includeAll": true,
  1887. "label": "Worker",
  1888. "multi": true,
  1889. "name": "worker_name",
  1890. "options": [],
  1891. "query": {
  1892. "qryType": 1,
  1893. "query": "query_result(gpustack:worker_info{cluster_name=\"$cluster_name\"})",
  1894. "refId": "PrometheusVariableQueryEditor-VariableQuery"
  1895. },
  1896. "refresh": 2,
  1897. "regex": "/worker_name=\"([^\"]+)\"/",
  1898. "type": "query"
  1899. }
  1900. ]
  1901. },
  1902. "time": {
  1903. "from": "now-5m",
  1904. "to": "now"
  1905. },
  1906. "timepicker": {
  1907. "refresh_intervals": [
  1908. "10s",
  1909. "30s",
  1910. "1m",
  1911. "5m",
  1912. "15m",
  1913. "30m",
  1914. "1h",
  1915. "2h"
  1916. ]
  1917. },
  1918. "timezone": "browser",
  1919. "title": "GPUStack Worker",
  1920. "uid": "gpustack-worker",
  1921. "version": 1
  1922. }