qwen3_5-122b-server.log 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:297]
  2. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:297] █ █ █▄ ▄█
  3. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:297] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.18.0
  4. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:297] █▄█▀ █ █ █ █ model /model/Qwen3.5-122B-A10B
  5. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:297] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀
  6. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:297]
  7. (APIServer pid=8) INFO 03-28 12:05:54 [utils.py:233] non-default args: {'model_tag': '/model/Qwen3.5-122B-A10B', 'host': '0.0.0.0', 'port': 30000, 'api_key': ['lq123456'], 'model': '/model/Qwen3.5-122B-A10B', 'trust_remote_code': True, 'tensor_parallel_size': 2}
  8. (APIServer pid=8) The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  9. (APIServer pid=8) INFO 03-28 12:06:01 [model.py:533] Resolved architecture: Qwen3_5MoeForConditionalGeneration
  10. (APIServer pid=8) INFO 03-28 12:06:01 [model.py:1582] Using max model len 262144
  11. (APIServer pid=8) INFO 03-28 12:06:01 [scheduler.py:231] Chunked prefill is enabled with max_num_batched_tokens=8192.
  12. (APIServer pid=8) INFO 03-28 12:06:01 [config.py:212] Setting attention block size to 2096 tokens to ensure that attention page size is >= mamba page size.
  13. (APIServer pid=8) INFO 03-28 12:06:01 [config.py:243] Padding mamba page size by 0.58% to ensure that mamba page size and attention page size are exactly equal.
  14. (APIServer pid=8) INFO 03-28 12:06:01 [vllm.py:754] Asynchronous scheduling is enabled.
  15. (APIServer pid=8) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
  16. (APIServer pid=8) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
  17. (APIServer pid=8) INFO 03-28 12:06:02 [compilation.py:289] Enabled custom fusions: allreduce_rms
  18. (EngineCore pid=413) INFO 03-28 12:06:14 [core.py:103] Initializing a V1 LLM engine (v0.18.0) with config: model='/model/Qwen3.5-122B-A10B', speculative_config=None, tokenizer='/model/Qwen3.5-122B-A10B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/model/Qwen3.5-122B-A10B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
  19. (EngineCore pid=413) WARNING 03-28 12:06:14 [multiproc_executor.py:997] Reducing Torch parallelism from 88 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
  20. (EngineCore pid=413) INFO 03-28 12:06:14 [multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=172.19.0.5 (local), world_size=2, local_world_size=2
  21. (Worker pid=612) INFO 03-28 12:06:20 [parallel_state.py:1395] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:59153 backend=nccl
  22. (Worker pid=613) INFO 03-28 12:06:20 [parallel_state.py:1395] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:59153 backend=nccl
  23. (Worker pid=612) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
  24. (Worker pid=612) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
  25. (Worker pid=613) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
  26. (Worker pid=613) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
  27. (Worker pid=612) INFO 03-28 12:06:21 [pynccl.py:111] vLLM is using nccl==2.27.5
  28. (Worker pid=612) INFO 03-28 12:06:22 [parallel_state.py:1717] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A
  29. (Worker pid=613) INFO 03-28 12:06:22 [parallel_state.py:1717] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 1, EP rank 1, EPLB rank N/A
  30. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [gpu_model_runner.py:4481] Starting to load model /model/Qwen3.5-122B-A10B...
  31. (Worker_TP1 pid=613) INFO 03-28 12:06:27 [cuda.py:373] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention
  32. (Worker_TP1 pid=613) INFO 03-28 12:06:27 [mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.
  33. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [cuda.py:373] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention
  34. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.
  35. (Worker_TP1 pid=613) INFO 03-28 12:06:27 [qwen3_next.py:191] Using FlashInfer GDN prefill kernel
  36. (Worker_TP1 pid=613) INFO 03-28 12:06:27 [qwen3_next.py:192] FlashInfer GDN prefill kernel is JIT-compiled; first run may take a while to compile. Set `--gdn-prefill-backend triton` to avoid JIT compile time.
  37. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [qwen3_next.py:191] Using FlashInfer GDN prefill kernel
  38. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [qwen3_next.py:192] FlashInfer GDN prefill kernel is JIT-compiled; first run may take a while to compile. Set `--gdn-prefill-backend triton` to avoid JIT compile time.
  39. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [unquantized.py:186] Using TRITON backend for Unquantized MoE
  40. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [cuda.py:317] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
  41. (Worker_TP0 pid=612) INFO 03-28 12:06:27 [flash_attn.py:598] Using FlashAttention version 3
  42. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 0% Completed | 0/39 [00:00<?, ?it/s]
  43. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 3% Completed | 1/39 [00:00<00:17, 2.16it/s]
  44. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 5% Completed | 2/39 [00:00<00:17, 2.06it/s]
  45. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 8% Completed | 3/39 [00:01<00:18, 1.98it/s]
  46. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 10% Completed | 4/39 [00:02<00:18, 1.94it/s]
  47. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 13% Completed | 5/39 [00:02<00:17, 1.92it/s]
  48. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 15% Completed | 6/39 [00:03<00:16, 1.95it/s]
  49. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 18% Completed | 7/39 [00:03<00:16, 1.96it/s]
  50. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 21% Completed | 8/39 [00:04<00:16, 1.94it/s]
  51. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 23% Completed | 9/39 [00:04<00:15, 1.91it/s]
  52. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 26% Completed | 10/39 [00:05<00:15, 1.90it/s]
  53. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 28% Completed | 11/39 [00:05<00:14, 1.90it/s]
  54. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 31% Completed | 12/39 [00:06<00:14, 1.91it/s]
  55. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 33% Completed | 13/39 [00:06<00:13, 1.90it/s]
  56. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 36% Completed | 14/39 [00:07<00:13, 1.90it/s]
  57. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 38% Completed | 15/39 [00:07<00:12, 1.90it/s]
  58. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 41% Completed | 16/39 [00:08<00:12, 1.90it/s]
  59. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 44% Completed | 17/39 [00:08<00:11, 1.90it/s]
  60. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 46% Completed | 18/39 [00:09<00:11, 1.90it/s]
  61. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 49% Completed | 19/39 [00:09<00:10, 1.89it/s]
  62. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 51% Completed | 20/39 [00:10<00:10, 1.89it/s]
  63. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 54% Completed | 21/39 [00:10<00:09, 1.89it/s]
  64. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 56% Completed | 22/39 [00:11<00:09, 1.89it/s]
  65. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 59% Completed | 23/39 [00:12<00:08, 1.89it/s]
  66. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 62% Completed | 24/39 [00:12<00:07, 1.89it/s]
  67. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 64% Completed | 25/39 [00:13<00:09, 1.50it/s]
  68. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 67% Completed | 26/39 [00:14<00:10, 1.27it/s]
  69. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 69% Completed | 27/39 [00:15<00:10, 1.15it/s]
  70. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 72% Completed | 28/39 [00:16<00:10, 1.08it/s]
  71. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 74% Completed | 29/39 [00:17<00:09, 1.05it/s]
  72. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 77% Completed | 30/39 [00:18<00:08, 1.02it/s]
  73. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 79% Completed | 31/39 [00:19<00:08, 1.01s/it]
  74. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 82% Completed | 32/39 [00:20<00:07, 1.03s/it]
  75. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 85% Completed | 33/39 [00:21<00:06, 1.03s/it]
  76. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 87% Completed | 34/39 [00:23<00:05, 1.04s/it]
  77. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 90% Completed | 35/39 [00:24<00:04, 1.05s/it]
  78. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 92% Completed | 36/39 [00:25<00:03, 1.05s/it]
  79. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 95% Completed | 37/39 [00:25<00:01, 1.13it/s]
  80. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 97% Completed | 38/39 [00:26<00:00, 1.22it/s]
  81. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 100% Completed | 39/39 [00:26<00:00, 1.60it/s]
  82. (Worker_TP0 pid=612) Loading safetensors checkpoint shards: 100% Completed | 39/39 [00:26<00:00, 1.47it/s]
  83. (Worker_TP0 pid=612)
  84. (Worker_TP0 pid=612) INFO 03-28 12:06:54 [default_loader.py:384] Loading weights took 26.50 seconds
  85. (Worker_TP0 pid=612) INFO 03-28 12:06:54 [gpu_model_runner.py:4566] Model loading took 114.35 GiB memory and 26.866823 seconds
  86. (Worker_TP1 pid=613) INFO 03-28 12:06:55 [gpu_model_runner.py:5488] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
  87. (Worker_TP0 pid=612) INFO 03-28 12:06:55 [gpu_model_runner.py:5488] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 1 image items of the maximum feature size.
  88. (Worker_TP0 pid=612) INFO 03-28 12:07:06 [backends.py:988] Using cache directory: /root/.cache/vllm/torch_compile_cache/0515f0dd89/rank_0_0/backbone for vLLM's torch.compile
  89. (Worker_TP0 pid=612) INFO 03-28 12:07:06 [backends.py:1048] Dynamo bytecode transform time: 6.97 s
  90. (Worker_TP0 pid=612) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning.
  91. (Worker_TP0 pid=612) return func(*args, **kwargs)
  92. (Worker_TP1 pid=613) INFO 03-28 12:07:09 [backends.py:371] Cache the graph of compile range (1, 8192) for later use
  93. (Worker_TP0 pid=612) INFO 03-28 12:07:09 [backends.py:371] Cache the graph of compile range (1, 8192) for later use
  94. (Worker_TP0 pid=612) INFO 03-28 12:07:27 [backends.py:387] Compiling a graph for compile range (1, 8192) takes 19.40 s
  95. (Worker_TP0 pid=612) INFO 03-28 12:07:28 [decorators.py:627] saved AOT compiled function to /root/.cache/vllm/torch_compile_cache/torch_aot_compile/3b76a4b4777762ad3913c96dc9d4a6c867913e1f0c222ab694fb2ef76ac58c61/rank_0_0/model
  96. (Worker_TP0 pid=612) INFO 03-28 12:07:28 [monitor.py:48] torch.compile took 28.72 s in total
  97. (Worker_TP0 pid=612) WARNING 03-28 12:07:29 [fused_moe.py:1093] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=512,device_name=NVIDIA_H20-3e.json
  98. (Worker_TP0 pid=612) INFO 03-28 12:07:31 [monitor.py:76] Initial profiling/warmup run took 3.17 s
  99. (Worker_TP0 pid=612) INFO 03-28 12:07:38 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512
  100. (Worker_TP1 pid=613) INFO 03-28 12:07:38 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512
  101. (Worker_TP0 pid=612) INFO 03-28 12:07:38 [gpu_model_runner.py:5607] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512)
  102. (Worker_TP1 pid=613) INFO 03-28 12:07:38 [gpu_model_runner.py:5607] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512)
  103. (Worker_TP0 pid=612) INFO 03-28 12:07:41 [custom_all_reduce.py:216] Registering 384 cuda graph addresses
  104. (Worker_TP1 pid=613) INFO 03-28 12:07:41 [custom_all_reduce.py:216] Registering 384 cuda graph addresses
  105. (Worker_TP0 pid=612) INFO 03-28 12:07:42 [gpu_model_runner.py:5686] Estimated CUDA graph memory: 0.75 GiB total
  106. (Worker_TP1 pid=613) INFO 03-28 12:07:42 [gpu_model_runner.py:5686] Estimated CUDA graph memory: 0.75 GiB total
  107. (Worker_TP0 pid=612) INFO 03-28 12:07:42 [gpu_worker.py:456] Available KV cache memory: 4.84 GiB
  108. (Worker_TP0 pid=612) INFO 03-28 12:07:42 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9000 to 0.9054 to maintain the same effective KV cache size.
  109. (Worker_TP1 pid=613) INFO 03-28 12:07:42 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9000 to 0.9053 to maintain the same effective KV cache size.
  110. (EngineCore pid=413) INFO 03-28 12:07:42 [kv_cache_utils.py:1316] GPU KV cache size: 104,800 tokens
  111. (EngineCore pid=413) INFO 03-28 12:07:42 [kv_cache_utils.py:1321] Maximum concurrency for 262,144 tokens per request: 1.56x
  112. (Worker_TP1 pid=613) WARNING 03-28 12:07:42 [compilation.py:1236] Capping cudagraph capture sizes from max 512 to 200 to fit Mamba cache blocks (201 blocks available). This limits the maximum batch size that can use CUDA graphs. To increase this limit, reduce max_num_seqs or increase available GPU memory.
  113. (Worker_TP0 pid=612) WARNING 03-28 12:07:42 [compilation.py:1236] Capping cudagraph capture sizes from max 512 to 200 to fit Mamba cache blocks (201 blocks available). This limits the maximum batch size that can use CUDA graphs. To increase this limit, reduce max_num_seqs or increase available GPU memory.
  114. (Worker_TP1 pid=613) 2026-03-28 12:07:42,463 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
  115. (Worker_TP0 pid=612) 2026-03-28 12:07:42,465 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
  116. (Worker_TP1 pid=613) (Worker_TP0 pid=612) 2026-03-28 12:07:42,688 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends
  117. 2026-03-28 12:07:42,688 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends
  118. (Worker_TP0 pid=612) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/28 [00:00<?, ?it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 4%|▎ | 1/28 [00:00<00:03, 7.17it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 7%|▋ | 2/28 [00:00<00:03, 7.39it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 11%|█ | 3/28 [00:00<00:03, 7.65it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 14%|█▍ | 4/28 [00:00<00:03, 7.77it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 18%|█▊ | 5/28 [00:00<00:03, 7.65it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 21%|██▏ | 6/28 [00:00<00:02, 7.47it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 25%|██▌ | 7/28 [00:00<00:02, 7.45it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 29%|██▊ | 8/28 [00:01<00:02, 7.39it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 32%|███▏ | 9/28 [00:01<00:02, 7.43it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 36%|███▌ | 10/28 [00:01<00:05, 3.06it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 39%|███▉ | 11/28 [00:02<00:04, 3.72it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 43%|████▎ | 12/28 [00:02<00:03, 4.39it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 46%|████▋ | 13/28 [00:02<00:02, 5.00it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 50%|█████ | 14/28 [00:03<00:04, 2.99it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 54%|█████▎ | 15/28 [00:03<00:03, 3.62it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 57%|█████▋ | 16/28 [00:03<00:02, 4.26it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 61%|██████ | 17/28 [00:03<00:02, 4.86it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 64%|██████▍ | 18/28 [00:04<00:03, 2.94it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 68%|██████▊ | 19/28 [00:04<00:02, 3.57it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 71%|███████▏ | 20/28 [00:04<00:01, 4.22it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 75%|███████▌ | 21/28 [00:04<00:01, 4.82it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 79%|███████▊ | 22/28 [00:05<00:01, 3.05it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 82%|████████▏ | 23/28 [00:05<00:01, 3.69it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 86%|████████▌ | 24/28 [00:05<00:00, 4.31it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 89%|████████▉ | 25/28 [00:05<00:00, 4.90it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 93%|█████████▎| 26/28 [00:05<00:00, 5.46it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 96%|█████████▋| 27/28 [00:05<00:00, 5.95it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 28/28 [00:06<00:00, 2.59it/s] Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 28/28 [00:06<00:00, 4.19it/s]
  119. (Worker_TP0 pid=612) Capturing CUDA graphs (decode, FULL): 0%| | 0/28 [00:00<?, ?it/s] Capturing CUDA graphs (decode, FULL): 4%|▎ | 1/28 [00:00<00:06, 3.92it/s] Capturing CUDA graphs (decode, FULL): 7%|▋ | 2/28 [00:00<00:06, 3.99it/s] Capturing CUDA graphs (decode, FULL): 11%|█ | 3/28 [00:00<00:04, 5.18it/s] Capturing CUDA graphs (decode, FULL): 14%|█▍ | 4/28 [00:00<00:03, 6.04it/s] Capturing CUDA graphs (decode, FULL): 18%|█▊ | 5/28 [00:00<00:03, 6.52it/s] Capturing CUDA graphs (decode, FULL): 21%|██▏ | 6/28 [00:01<00:03, 6.94it/s] Capturing CUDA graphs (decode, FULL): 25%|██▌ | 7/28 [00:01<00:02, 7.26it/s] Capturing CUDA graphs (decode, FULL): 29%|██▊ | 8/28 [00:01<00:02, 7.46it/s] Capturing CUDA graphs (decode, FULL): 32%|███▏ | 9/28 [00:01<00:02, 7.64it/s] Capturing CUDA graphs (decode, FULL): 36%|███▌ | 10/28 [00:01<00:02, 7.75it/s] Capturing CUDA graphs (decode, FULL): 39%|███▉ | 11/28 [00:01<00:02, 7.81it/s] Capturing CUDA graphs (decode, FULL): 43%|████▎ | 12/28 [00:01<00:02, 7.87it/s] Capturing CUDA graphs (decode, FULL): 46%|████▋ | 13/28 [00:01<00:01, 7.91it/s] Capturing CUDA graphs (decode, FULL): 50%|█████ | 14/28 [00:02<00:01, 7.93it/s] Capturing CUDA graphs (decode, FULL): 54%|█████▎ | 15/28 [00:02<00:01, 7.96it/s] Capturing CUDA graphs (decode, FULL): 57%|█████▋ | 16/28 [00:02<00:01, 7.96it/s] Capturing CUDA graphs (decode, FULL): 61%|██████ | 17/28 [00:02<00:01, 8.00it/s] Capturing CUDA graphs (decode, FULL): 64%|██████▍ | 18/28 [00:02<00:01, 8.03it/s] Capturing CUDA graphs (decode, FULL): 68%|██████▊ | 19/28 [00:02<00:01, 8.02it/s] Capturing CUDA graphs (decode, FULL): 71%|███████▏ | 20/28 [00:02<00:00, 8.05it/s] Capturing CUDA graphs (decode, FULL): 75%|███████▌ | 21/28 [00:02<00:00, 8.04it/s] Capturing CUDA graphs (decode, FULL): 79%|███████▊ | 22/28 [00:03<00:00, 8.06it/s] Capturing CUDA graphs (decode, FULL): 82%|████████▏ | 23/28 [00:03<00:00, 8.07it/s] Capturing CUDA graphs (decode, FULL): 86%|████████▌ | 24/28 [00:03<00:00, 8.02it/s] Capturing CUDA graphs (decode, FULL): 89%|████████▉ | 25/28 [00:03<00:00, 8.01it/s] Capturing CUDA graphs (decode, FULL): 93%|█████████▎| 26/28 [00:03<00:00, 7.99it/s] Capturing CUDA graphs (decode, FULL): 96%|█████████▋| 27/28 [00:03<00:00, 8.00it/s](Worker_TP1 pid=613) INFO 03-28 12:07:53 [custom_all_reduce.py:216] Registering 5376 cuda graph addresses
  120. Capturing CUDA graphs (decode, FULL): 100%|██████████| 28/28 [00:03<00:00, 6.09it/s] Capturing CUDA graphs (decode, FULL): 100%|██████████| 28/28 [00:03<00:00, 7.21it/s]
  121. (Worker_TP0 pid=612) INFO 03-28 12:07:53 [custom_all_reduce.py:216] Registering 5376 cuda graph addresses
  122. (Worker_TP1 pid=613) INFO 03-28 12:07:54 [gpu_worker.py:617] CUDA graph pool memory: 0.93 GiB (actual), 0.75 GiB (estimated), difference: 0.19 GiB (19.9%).
  123. (Worker_TP0 pid=612) INFO 03-28 12:07:54 [gpu_model_runner.py:5746] Graph capturing finished in 11 secs, took 0.93 GiB
  124. (Worker_TP0 pid=612) INFO 03-28 12:07:54 [gpu_worker.py:617] CUDA graph pool memory: 0.93 GiB (actual), 0.75 GiB (estimated), difference: 0.18 GiB (19.7%).
  125. (EngineCore pid=413) INFO 03-28 12:07:54 [core.py:281] init engine (profile, create kv cache, warmup model) took 59.32 seconds
  126. (EngineCore pid=413) INFO 03-28 12:07:59 [vllm.py:754] Asynchronous scheduling is enabled.
  127. (EngineCore pid=413) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
  128. (EngineCore pid=413) <frozen importlib._bootstrap_external>:1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
  129. (EngineCore pid=413) INFO 03-28 12:07:59 [compilation.py:289] Enabled custom fusions: allreduce_rms
  130. (APIServer pid=8) INFO 03-28 12:07:59 [api_server.py:576] Supported tasks: ['generate']
  131. (APIServer pid=8) WARNING 03-28 12:07:59 [model.py:1376] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_k': 20, 'top_p': 0.95}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`.
  132. (APIServer pid=8) INFO 03-28 12:08:00 [hf.py:320] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
  133. (APIServer pid=8) INFO 03-28 12:08:04 [base.py:216] Multi-modal warmup completed in 3.630s
  134. (APIServer pid=8) INFO 03-28 12:08:04 [api_server.py:580] Starting vLLM server on http://0.0.0.0:30000
  135. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:37] Available routes are:
  136. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /openapi.json, Methods: HEAD, GET
  137. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /docs, Methods: HEAD, GET
  138. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET
  139. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /redoc, Methods: HEAD, GET
  140. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /tokenize, Methods: POST
  141. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /detokenize, Methods: POST
  142. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /load, Methods: GET
  143. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /version, Methods: GET
  144. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /health, Methods: GET
  145. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /metrics, Methods: GET
  146. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/models, Methods: GET
  147. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /ping, Methods: GET
  148. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /ping, Methods: POST
  149. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /invocations, Methods: POST
  150. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/chat/completions, Methods: POST
  151. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/responses, Methods: POST
  152. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET
  153. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST
  154. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/completions, Methods: POST
  155. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/messages, Methods: POST
  156. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST
  157. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /inference/v1/generate, Methods: POST
  158. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /scale_elastic_ep, Methods: POST
  159. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST
  160. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/chat/completions/render, Methods: POST
  161. (APIServer pid=8) INFO 03-28 12:08:04 [launcher.py:46] Route: /v1/completions/render, Methods: POST
  162. (APIServer pid=8) INFO: Started server process [8]
  163. (APIServer pid=8) INFO: Waiting for application startup.
  164. (APIServer pid=8) INFO: Application startup complete.
  165. (APIServer pid=8) INFO: 172.19.0.1:47954 - "POST /v1/chat/completions HTTP/1.1" 200 OK
  166. (APIServer pid=8) INFO 03-28 12:08:34 [loggers.py:259] Engine 000: Avg prompt throughput: 1.6 tokens/s, Avg generation throughput: 5.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
  167. (APIServer pid=8) INFO 03-28 12:08:44 [loggers.py:259] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%