| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392 |
- import argparse
- from gpustack.worker.backends.ascend_mindie import (
- AscendMindIEParameters,
- AscendMindIEServer,
- )
- import pytest
def _expected_parameters(**parallelism) -> AscendMindIEParameters:
    """
    Build an expected ``AscendMindIEParameters`` for the derivation table.

    Every expected value in the table below carries the same three token
    limits (8192), so they are appended here and each case only spells out
    the parallelism-related fields it cares about.
    """
    return AscendMindIEParameters(
        **parallelism,
        max_prefill_tokens=8192,
        max_input_token_len=8192,
        max_iter_times=8192,
    )


@pytest.mark.parametrize(
    "world_size, local_world_size, args, expected",
    (
        # --- Forward derivation ------------------------------------------
        # No world size is given up front (-1 / -1); the expected world
        # size is whatever follows from the parallelism flags on the
        # command line.
        (
            -1,
            -1,
            ["--pipeline-parallel-size=2", "--tensor-parallel-size=8"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                pipeline_parallel_size=2,
                tensor_parallel_size=8,
            ),
        ),
        (
            -1,
            -1,
            ["--tensor-parallel-size=8"],
            _expected_parameters(
                world_size=8,
                tensor_parallel_size=8,
                moe_tensor_parallel_size=8,
            ),
        ),
        (
            -1,
            -1,
            ["--data-parallel-size=2", "--tensor-parallel-size=8"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                data_parallel_size=2,
                tensor_parallel_size=8,
                moe_tensor_parallel_size=16,
            ),
        ),
        (
            -1,
            -1,
            ["--context-parallel-size=2", "--tensor-parallel-size=8"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                context_parallel_size=2,
                tensor_parallel_size=8,
                moe_tensor_parallel_size=16,
                data_parallel_size=1,
            ),
        ),
        (
            -1,
            -1,
            ["--moe-expert-parallel-size=2", "--moe-tensor-parallel-size=8"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                tensor_parallel_size=8,
                moe_expert_parallel_size=2,
                moe_tensor_parallel_size=8,
            ),
        ),
        # --- Backward derivation -----------------------------------------
        # The world size (16) and local world size (8) are fixed up front
        # and only some of the parallelism flags are supplied; the
        # remaining sizes are expected to be filled in to match.
        (
            16,
            8,
            ["--pipeline-parallel-size=2"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                pipeline_parallel_size=2,
                tensor_parallel_size=8,
            ),
        ),
        (
            16,
            8,
            ["--tensor-parallel-size=8"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                tensor_parallel_size=8,
                moe_tensor_parallel_size=16,
            ),
        ),
        (
            16,
            8,
            ["--data-parallel-size=2"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                data_parallel_size=2,
                tensor_parallel_size=8,
                moe_tensor_parallel_size=16,
            ),
        ),
        (
            16,
            8,
            ["--context-parallel-size=2"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                context_parallel_size=2,
                tensor_parallel_size=8,
                moe_tensor_parallel_size=16,
                data_parallel_size=1,
            ),
        ),
        (
            16,
            8,
            ["--moe-expert-parallel-size=2"],
            _expected_parameters(
                world_size=16,
                local_world_size=8,
                moe_expert_parallel_size=2,
                tensor_parallel_size=16,
                moe_tensor_parallel_size=8,
            ),
        ),
    ),
)
@pytest.mark.asyncio
async def test_ascend_mindie_parameters_parallelism_default(
    world_size, local_world_size, args, expected: AscendMindIEParameters
):
    """
    Check the parallelism sizes derived by ``from_args_and_envs``.

    A fresh ``AscendMindIEParameters`` is seeded with the given world sizes,
    fed the command-line ``args``, and must then compare equal to
    ``expected``.
    """
    derived = AscendMindIEParameters(
        world_size=world_size,
        local_world_size=local_world_size,
    )
    derived.from_args_and_envs(args)

    assert derived == expected
@pytest.mark.parametrize(
    "world_size, local_world_size, args, exception_msg",
    (
        # --- Forward derivation ------------------------------------------
        # No world size is given up front (-1 / -1); only the flags
        # themselves are validated.
        (
            -1,
            -1,
            ["--pipeline-parallel-size=-1"],
            "--pipeline-parallel-size must be greater than 0",
        ),
        (
            -1,
            -1,
            ["--tensor-parallel-size=3"],
            "--tensor-parallel-size must be the power of 2",
        ),
        (
            -1,
            -1,
            ["--data-parallel-size=3"],
            "--data-parallel-size must be the power of 2",
        ),
        (
            -1,
            -1,
            ["--context-parallel-size=3"],
            "--context-parallel-size must be the power of 2",
        ),
        (
            -1,
            -1,
            ["--sequence-parallel-size=3"],
            "--sequence-parallel-size must be the power of 2",
        ),
        (
            -1,
            -1,
            ["--moe-tensor-parallel-size=3"],
            "--moe-tensor-parallel-size must be the power of 2",
        ),
        (
            -1,
            -1,
            ["--moe-expert-parallel-size=3"],
            "--moe-expert-parallel-size must be the power of 2",
        ),
        (
            -1,
            -1,
            ["--pipeline-parallel-size=2", "--data-parallel-size=4"],
            "--pipeline-parallel-size 2 and --data-parallel-size 4 are incompatible, set --pipeline-parallel-size to 1 or disable data parallelism",
        ),
        (
            -1,
            -1,
            ["--data-parallel-size=4", "--context-parallel-size=2"],
            "--data-parallel-size 4 and --context-parallel-size 2 are incompatible, set --data-parallel-size to 1 or disable context parallelism",
        ),
        (
            -1,
            -1,
            ["--sequence-parallel-size=4", "--tensor-parallel-size=2"],
            "--sequence-parallel-size 4 must be equal to --tensor-parallel-size 2",
        ),
        # Compatible combinations: an empty message means "must not raise".
        (
            -1,
            -1,
            # DP together with TP is accepted.
            ["--data-parallel-size=4", "--tensor-parallel-size=2"],
            "",
        ),
        (
            -1,
            -1,
            # CP together with TP is accepted.
            ["--context-parallel-size=2", "--tensor-parallel-size=4"],
            "",
        ),
        (
            -1,
            -1,
            # SP equal to TP is accepted.
            ["--sequence-parallel-size=4", "--tensor-parallel-size=4"],
            "",
        ),
        # --- Backward derivation -----------------------------------------
        # The world size is fixed up front and the flags disagree with it.
        # These combinations should not come up in normal operation; if
        # they do, resource selection picked the wrong devices.
        (
            4,
            4,
            ["--pipeline-parallel-size=2", "--tensor-parallel-size=4"],
            "--pipeline-parallel-size 2 and --tensor-parallel-size 4 must be multiples of world size: 4",
        ),
        (
            16,
            4,
            ["--tensor-parallel-size=8"],
            "--tensor-parallel-size 8 must be less or equal to local world size: 4 or equal to world size: 16",
        ),
        (
            32,
            8,
            ["--data-parallel-size=2", "--tensor-parallel-size=8"],
            "--data-parallel-size 2 and --tensor-parallel-size 8 must be multiples of world size: 32",
        ),
        (
            32,
            8,
            ["--context-parallel-size=2", "--tensor-parallel-size=8"],
            "--context-parallel-size 2 and --tensor-parallel-size 8 must be multiples of world size: 32",
        ),
        (
            16,
            4,
            ["--moe-expert-parallel-size=4", "--moe-tensor-parallel-size=8"],
            "--moe-tensor-parallel-size 8 must be less or equal to local world size: 4 or equal to world size: 16",
        ),
        (
            16,
            8,
            ["--moe-expert-parallel-size=4", "--moe-tensor-parallel-size=8"],
            # NOTE(review): the missing space in "4and" presumably mirrors
            # the message raised by the implementation -- confirm before
            # "fixing" it here.
            "--moe-expert-parallel-size 4and --moe-tensor-parallel-size 8 must be multiples of world size: 16",
        ),
        (
            32,
            8,
            ["--moe-tensor-parallel-size=8"],
            "--moe-tensor-parallel-size 8 must be equal to world size: 32",
        ),
    ),
)
@pytest.mark.asyncio
async def test_ascend_mindie_parameters_parallelism_violation(
    world_size,
    local_world_size,
    args,
    exception_msg: str,
):
    """
    Check how ``AscendMindIEParameters.from_args_and_envs`` reacts to
    invalid or conflicting parallelism settings.

    When ``exception_msg`` is non-empty, parsing must raise
    ``argparse.ArgumentTypeError`` whose text matches it (``pytest.raises``
    applies ``match`` as a regular-expression search). When it is empty,
    parsing must complete without raising.
    """

    def build_and_parse():
        # Construction stays inside this helper so that, for the failing
        # cases, it runs within the ``pytest.raises`` scope as well.
        candidate = AscendMindIEParameters(
            world_size=world_size,
            local_world_size=local_world_size,
        )
        candidate.from_args_and_envs(args)

    if exception_msg:
        with pytest.raises(argparse.ArgumentTypeError, match=exception_msg):
            build_and_parse()
    else:
        # Compatible combination: any exception fails the test.
        build_and_parse()
def test_ascend_mindie_parameters_changed_backend_parameters():
    """
    Check ``changed_backend_parameters`` against an argument-free baseline.

    Both objects start from ``max_seq_len=32768``; the second one is then
    given ``--max-seq-len 8192 --dtype float16``. The reported difference is
    expected to list the new sequence length, the three token limits at 8192
    and the dtype, in exactly this order.
    """
    reference = AscendMindIEParameters(max_seq_len=32768)
    reference.from_args_and_envs([])

    overridden = AscendMindIEParameters(max_seq_len=32768)
    user_args = ["--max-seq-len", "8192", "--dtype", "float16"]
    overridden.from_args_and_envs(user_args)

    expected_changes = [
        "--max-seq-len",
        "8192",
        "--max-input-token-len",
        "8192",
        "--max-prefill-tokens",
        "8192",
        "--max-iter-times",
        "8192",
        "--dtype",
        "float16",
    ]
    assert overridden.changed_backend_parameters(reference) == expected_changes
def test_filter_user_defined_parameters():
    """
    Check that flags already supplied by the user are filtered out.

    The user list names ``--max-input-token-len`` (space-separated value)
    and ``--dtype`` (``=``-joined value); both flags, together with their
    values, must be dropped from the candidate list so that only
    ``--max-seq-len 8192`` remains.
    """
    candidate_parameters = [
        "--max-seq-len",
        "8192",
        "--max-input-token-len",
        "8192",
        "--dtype",
        "float16",
    ]
    user_supplied = ["--max-input-token-len", "4096", "--dtype=bfloat16"]

    remaining = AscendMindIEServer._filter_user_defined_parameters(
        candidate_parameters,
        user_supplied,
    )

    assert remaining == ["--max-seq-len", "8192"]
def test_backend_parameter_name_keeps_store_true_no_prefix():
    """
    Check how ``_backend_parameter_name`` treats a leading ``no-``.

    ``--no-metrics`` must keep its ``no-`` prefix, while
    ``--no-enable-split`` must be reduced to ``enable-split``.
    """
    # NOTE(review): per the test name, "--no-metrics" is presumably a
    # store_true flag whose name genuinely starts with "no-" -- confirm
    # against the argument parser definition.
    kept_name = AscendMindIEServer._backend_parameter_name("--no-metrics")
    assert kept_name == "no-metrics"

    stripped_name = AscendMindIEServer._backend_parameter_name("--no-enable-split")
    assert stripped_name == "enable-split"
|