test_generate_combinations.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. from typing import Dict, Tuple
  2. import pytest
  3. from gpustack.policies.candidate_selectors import GGUFResourceFitSelector
  4. from gpustack.schemas.models import (
  5. GPUSelector,
  6. PlacementStrategyEnum,
  7. )
  8. from tests.fixtures.workers.fixtures import (
  9. linux_nvidia_10_3090_24gx8,
  10. linux_nvidia_8_3090_24gx8,
  11. linux_nvidia_9_3090_24gx8,
  12. )
  13. from tests.utils.model import new_model, new_model_instance
  14. from unittest.mock import patch
  15. @pytest.mark.asyncio
  16. async def test_generate_combinations_for_single_worker_gpus():
  17. workers = [
  18. linux_nvidia_8_3090_24gx8(),
  19. ]
  20. m = new_model(1, "test", 1, "Meta-Llama-3-70B-Instruct-GGUF")
  21. mi = new_model_instance(1, "test", 1)
  22. resource_fit_selector = GGUFResourceFitSelector(m, mi)
  23. resource_fit_selector._worker_id_to_worker = {
  24. worker.id: worker for worker in workers
  25. }
  26. with (
  27. patch(
  28. 'gpustack.policies.utils.get_worker_model_instances',
  29. return_value=[],
  30. ),
  31. ):
  32. allocatable = resource_fit_selector._get_worker_allocatable_resource(workers[0])
  33. actual_combinations_count = {}
  34. for i in range(2, 9):
  35. combinations, _ = (
  36. resource_fit_selector._generate_combinations_for_single_worker_multi_gpus(
  37. allocatable, workers[0], i
  38. )
  39. )
  40. actual_combinations_count[i] = combinations
  41. expected_total = 247
  42. expected_combinations = {
  43. # key: gpu count, value: combinations number
  44. 2: 28,
  45. 3: 56,
  46. 4: 70,
  47. 5: 56,
  48. 6: 28,
  49. 7: 8,
  50. 8: 1,
  51. }
  52. compare_combinations(
  53. actual_combinations_count, expected_combinations, expected_total
  54. )
  55. @pytest.mark.asyncio
  56. async def test_generate_combinations_for_worker_with_rpc_servers_with_manual_selected_gpus():
  57. workers = [
  58. linux_nvidia_8_3090_24gx8(),
  59. linux_nvidia_9_3090_24gx8(),
  60. linux_nvidia_10_3090_24gx8(),
  61. ]
  62. m = new_model(
  63. 1,
  64. "DeepSeek-R1-GGUF",
  65. 1,
  66. huggingface_repo_id="unsloth/DeepSeek-R1-GGUF",
  67. cpu_offloading=False,
  68. gpu_selector=GPUSelector(
  69. gpu_ids=[
  70. # host01
  71. "host01-3090:cuda:0",
  72. "host01-3090:cuda:1",
  73. "host01-3090:cuda:2",
  74. "host01-3090:cuda:3",
  75. "host01-3090:cuda:4",
  76. "host01-3090:cuda:5",
  77. "host01-3090:cuda:6",
  78. "host01-3090:cuda:7",
  79. # host02
  80. "host02-3090:cuda:0",
  81. "host02-3090:cuda:1",
  82. "host02-3090:cuda:2",
  83. "host02-3090:cuda:3",
  84. "host02-3090:cuda:4",
  85. "host02-3090:cuda:5",
  86. "host02-3090:cuda:6",
  87. "host02-3090:cuda:7",
  88. # host03
  89. "host03-3090:cuda:0",
  90. "host03-3090:cuda:1",
  91. "host03-3090:cuda:2",
  92. "host03-3090:cuda:3",
  93. "host03-3090:cuda:4",
  94. "host03-3090:cuda:5",
  95. "host03-3090:cuda:6",
  96. "host03-3090:cuda:7",
  97. ]
  98. ),
  99. placement_strategy=PlacementStrategyEnum.SPREAD,
  100. huggingface_filename="DeepSeek-R1-Q4_K_M/DeepSeek-R1-Q4_K_M-00001-of-00009.gguf",
  101. )
  102. mi = new_model_instance(1, "test", 1)
  103. resource_fit_selector = GGUFResourceFitSelector(m, mi)
  104. resource_fit_selector._non_uma_single_gpu_full_offload_vram = (
  105. 537.09 * 1024 * 1024 * 1024
  106. )
  107. with (
  108. patch(
  109. 'gpustack.policies.utils.get_worker_model_instances',
  110. return_value=[],
  111. ),
  112. ):
  113. resource_fit_selector._set_workers_allocatable_resource(workers)
  114. combinations = (
  115. resource_fit_selector._generate_combinations_for_worker_with_rpcs(workers)
  116. )
  117. expected_total = 1
  118. expected_combinations = {
  119. # key: gpu count, value: combinations number
  120. 17: 1,
  121. }
  122. compare_combinations(combinations, expected_combinations, expected_total)
  123. @pytest.mark.asyncio
  124. async def test_generate_combinations_for_worker_with_rpc_servers_with_auto_selected_gpus():
  125. workers = [
  126. linux_nvidia_8_3090_24gx8(),
  127. linux_nvidia_9_3090_24gx8(),
  128. linux_nvidia_10_3090_24gx8(),
  129. ]
  130. m = new_model(
  131. 1,
  132. "DeepSeek-R1-GGUF",
  133. 1,
  134. huggingface_repo_id="unsloth/DeepSeek-R1-GGUF",
  135. cpu_offloading=False,
  136. placement_strategy=PlacementStrategyEnum.SPREAD,
  137. huggingface_filename="DeepSeek-R1-Q4_K_M/DeepSeek-R1-Q4_K_M-00001-of-00009.gguf",
  138. backend_parameters=[],
  139. )
  140. mi = new_model_instance(1, "test", 1)
  141. resource_fit_selector = GGUFResourceFitSelector(m, mi)
  142. resource_fit_selector._worker_id_to_worker = {
  143. worker.id: worker for worker in workers
  144. }
  145. resource_fit_selector._non_uma_single_gpu_full_offload_vram = (
  146. 537.09 * 1024 * 1024 * 1024
  147. )
  148. with (
  149. patch(
  150. 'gpustack.policies.utils.get_worker_model_instances',
  151. return_value=[],
  152. ),
  153. ):
  154. resource_fit_selector._set_workers_allocatable_resource(workers)
  155. combinations = (
  156. resource_fit_selector._generate_combinations_for_worker_with_rpcs(workers)
  157. )
  158. expected_total = 39202
  159. expected_combinations = {
  160. # key: gpu count, value: combinations number
  161. 2: 16,
  162. 3: 120,
  163. 4: 560,
  164. 5: 1820,
  165. 6: 4368,
  166. 7: 8008,
  167. 8: 11440,
  168. 9: 12870,
  169. }
  170. compare_combinations(combinations, expected_combinations, expected_total)
  171. def compare_combinations(
  172. combinations: dict[Tuple[Tuple[int]]],
  173. expected_combinations: Dict[int, int],
  174. expected_total: int,
  175. ):
  176. actual_total = 0
  177. for e_gpu_count, e_comb_num in expected_combinations.items():
  178. a_comb = combinations[e_gpu_count]
  179. actual_total += len(a_comb)
  180. assert len(a_comb) == e_comb_num
  181. assert actual_total == expected_total