runtime.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. from gpustack_runtime.detector import (
  2. detect_devices,
  3. manufacturer_to_backend,
  4. ManufacturerEnum,
  5. )
  6. from gpustack.detectors.base import GPUDetector
  7. from gpustack.schemas import GPUDeviceStatus, GPUDevicesStatus
  8. from gpustack.schemas.workers import GPUCoreInfo, MemoryInfo, GPUNetworkInfo
  9. from gpustack.utils.convert import safe_int
  10. class Runtime(GPUDetector):
  11. """
  12. Detect GPUs using gpustack-runtime.
  13. """
  14. def is_available(self) -> bool:
  15. return True
  16. def gather_gpu_info(self) -> GPUDevicesStatus:
  17. ret: GPUDevicesStatus = []
  18. # Detect devices.
  19. devs = detect_devices(fast=False)
  20. if not devs:
  21. return ret
  22. # Convert to GPUDevicesInfo.
  23. for dev in devs:
  24. gpudev = GPUDeviceStatus(
  25. vendor=dev.manufacturer.value,
  26. type=manufacturer_to_backend(dev.manufacturer),
  27. index=dev.index,
  28. device_index=dev.index,
  29. device_chip_index=0,
  30. name=dev.name,
  31. uuid=dev.uuid,
  32. driver_version=dev.driver_version,
  33. runtime_version=dev.runtime_version,
  34. compute_capability=dev.compute_capability,
  35. core=GPUCoreInfo(
  36. total=dev.cores or 0,
  37. utilization_rate=dev.cores_utilization,
  38. ),
  39. memory=MemoryInfo(
  40. total=dev.memory << 20, # MiB -> Bytes
  41. used=dev.memory_used << 20, # MiB -> Bytes
  42. utilization_rate=dev.memory_utilization,
  43. ),
  44. temperature=dev.temperature,
  45. )
  46. # Correct device_index if possible.
  47. if "card_id" in dev.appendix and dev.appendix["card_id"] is not None:
  48. gpudev.device_index = safe_int(dev.appendix["card_id"])
  49. # Correct device_chip_index if possible.
  50. if "device_id" in dev.appendix and dev.appendix["device_id"] is not None:
  51. gpudev.device_chip_index = safe_int(dev.appendix["device_id"])
  52. # Record architecture if possible.
  53. if (
  54. "arch_family" in dev.appendix
  55. and dev.appendix["arch_family"] is not None
  56. ):
  57. gpudev.arch_family = str(dev.appendix["arch_family"])
  58. # Record network for Ascend devices if possible.
  59. if dev.manufacturer == ManufacturerEnum.ASCEND:
  60. gpudev_network = GPUNetworkInfo(
  61. inet=dev.appendix["roce_ip"] if "roce_ip" in dev.appendix else "",
  62. netmask=(
  63. dev.appendix["roce_mask"] if "roce_mask" in dev.appendix else ""
  64. ),
  65. gateway=(
  66. dev.appendix["roce_gateway"]
  67. if "roce_gateway" in dev.appendix
  68. else ""
  69. ),
  70. )
  71. if gpudev_network.inet:
  72. gpudev_network.status = "up"
  73. gpudev.network = gpudev_network
  74. ret.append(gpudev)
  75. return ret