qwen3_5-122b-server.log

[2026-03-27 10:53:30] WARNING server_args.py:1748: Disabling overlap schedule since mamba no_buffer is not compatible with overlap schedule, try to use --disable-radix-cache if overlap schedule is necessary
[2026-03-27 10:53:30] INFO server_args.py:1835: Attention backend not specified. Use fa3 backend by default.
[2026-03-27 10:53:31] server_args=ServerArgs(model_path='/model/Qwen3.5-122B-A10B', tokenizer_path='/model/Qwen3.5-122B-A10B', tokenizer_mode='auto', tokenizer_worker_num=1, skip_tokenizer_init=False, load_format='auto', model_loader_extra_config='{}', trust_remote_code=False, context_length=None, is_embedding=False, enable_multimodal=None, revision=None, model_impl='auto', host='0.0.0.0', port=30000, fastapi_root_path='', grpc_mode=False, skip_server_warmup=False, warmups=None, nccl_port=None, checkpoint_engine_wait_weights_before_ready=False, dtype='auto', quantization=None, quantization_param_path=None, kv_cache_dtype='auto', enable_fp32_lm_head=False, modelopt_quant=None, modelopt_checkpoint_restore_path=None, modelopt_checkpoint_save_path=None, modelopt_export_path=None, quantize_and_serve=False, rl_quant_profile=None, mem_fraction_static=0.83783765625, max_running_requests=None, max_queued_requests=None, max_total_tokens=None, chunked_prefill_size=8192, enable_dynamic_chunking=False, max_prefill_tokens=16384, prefill_max_requests=None, schedule_policy='fcfs', enable_priority_scheduling=False, abort_on_priority_when_disabled=False, schedule_low_priority_values_first=False, priority_scheduling_preemption_threshold=10, schedule_conservativeness=1.0, page_size=1, swa_full_tokens_ratio=0.8, disable_hybrid_swa_memory=False, radix_eviction_policy='lru', enable_prefill_delayer=False, prefill_delayer_max_delay_passes=30, prefill_delayer_token_usage_low_watermark=None, prefill_delayer_forward_passes_buckets=None, prefill_delayer_wait_seconds_buckets=None, device='cuda', tp_size=2, pp_size=1, pp_max_micro_batch_size=None, pp_async_batch_depth=0, stream_interval=1, stream_output=False, random_seed=403232373, constrained_json_whitespace_pattern=None, constrained_json_disable_any_whitespace=False, watchdog_timeout=300, soft_watchdog_timeout=None, dist_timeout=None, download_dir=None, model_checksum=None, base_gpu_id=0, gpu_id_step=1, sleep_on_idle=False, custom_sigquit_handler=None, log_level='info', log_level_http=None, log_requests=False, log_requests_level=2, log_requests_format='text', log_requests_target=None, uvicorn_access_log_exclude_prefixes=[], crash_dump_folder=None, show_time_cost=False, enable_metrics=False, enable_metrics_for_all_schedulers=False, tokenizer_metrics_custom_labels_header='x-custom-labels', tokenizer_metrics_allowed_custom_labels=None, extra_metric_labels=None, bucket_time_to_first_token=None, bucket_inter_token_latency=None, bucket_e2e_request_latency=None, collect_tokens_histogram=False, prompt_tokens_buckets=None, generation_tokens_buckets=None, gc_warning_threshold_secs=0.0, decode_log_interval=40, enable_request_time_stats_logging=False, kv_events_config=None, enable_trace=False, otlp_traces_endpoint='localhost:4317', export_metrics_to_file=False, export_metrics_to_file_dir=None, api_key='lq123456', admin_api_key=None, served_model_name='/model/Qwen3.5-122B-A10B', weight_version='default', chat_template=None, hf_chat_template_name=None, completion_template=None, file_storage_path='sglang_storage', enable_cache_report=False, reasoning_parser=None, tool_call_parser=None, tool_server=None, sampling_defaults='model', dp_size=1, load_balance_method='round_robin', attn_cp_size=1, moe_dp_size=1, dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', preferred_sampling_params=None, enable_lora=None, enable_lora_overlap_loading=None, max_lora_rank=None, lora_target_modules=None, lora_paths=None, max_loaded_loras=None, max_loras_per_batch=8, 
lora_eviction_policy='lru', lora_backend='csgmv', max_lora_chunk_size=16, attention_backend='fa3', decode_attention_backend=None, prefill_attention_backend=None, sampling_backend='flashinfer', grammar_backend='xgrammar', mm_attention_backend=None, fp8_gemm_runner_backend='auto', fp4_gemm_runner_backend='flashinfer_cutlass', nsa_prefill_backend=None, nsa_decode_backend=None, disable_flashinfer_autotune=False, mamba_backend='triton', speculative_algorithm=None, speculative_draft_model_path=None, speculative_draft_model_revision=None, speculative_draft_load_format=None, speculative_num_steps=None, speculative_eagle_topk=None, speculative_num_draft_tokens=None, speculative_accept_threshold_single=1.0, speculative_accept_threshold_acc=1.0, speculative_token_map=None, speculative_attention_mode='prefill', speculative_draft_attention_backend=None, speculative_moe_runner_backend='auto', speculative_moe_a2a_backend=None, speculative_draft_model_quantization=None, speculative_ngram_min_match_window_size=1, speculative_ngram_max_match_window_size=12, speculative_ngram_min_bfs_breadth=1, speculative_ngram_max_bfs_breadth=10, speculative_ngram_match_type='BFS', speculative_ngram_branch_length=18, speculative_ngram_capacity=10000000, enable_multi_layer_eagle=False, ep_size=1, moe_a2a_backend='none', moe_runner_backend='auto', flashinfer_mxfp4_moe_precision='default', enable_flashinfer_allreduce_fusion=False, deepep_mode='auto', ep_num_redundant_experts=0, ep_dispatch_algorithm=None, init_expert_location='trivial', enable_eplb=False, eplb_algorithm='auto', eplb_rebalance_num_iterations=1000, eplb_rebalance_layers_per_chunk=None, eplb_min_rebalancing_utilization_threshold=1.0, expert_distribution_recorder_mode=None, expert_distribution_recorder_buffer_size=1000, enable_expert_distribution_metrics=False, deepep_config=None, moe_dense_tp_size=None, elastic_ep_backend=None, mooncake_ib_device=None, max_mamba_cache_size=None, mamba_ssm_dtype=None, mamba_full_memory_ratio=0.9, mamba_scheduler_strategy='no_buffer', mamba_track_interval=256, enable_hierarchical_cache=False, hicache_ratio=2.0, hicache_size=0, hicache_write_policy='write_through', hicache_io_backend='kernel', hicache_mem_layout='layer_first', disable_hicache_numa_detect=False, hicache_storage_backend=None, hicache_storage_prefetch_policy='best_effort', hicache_storage_backend_extra_config=None, hierarchical_sparse_attention_extra_config=None, enable_lmcache=False, kt_weight_path=None, kt_method='AMXINT4', kt_cpuinfer=None, kt_threadpool_count=2, kt_num_gpu_experts=None, kt_max_deferred_experts_per_token=None, dllm_algorithm=None, dllm_algorithm_config=None, enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, cpu_offload_gb=0, offload_group_size=-1, offload_num_in_group=1, offload_prefetch_step=1, offload_mode='cpu', multi_item_scoring_delimiter=None, disable_radix_cache=False, cuda_graph_max_bs=256, cuda_graph_bs=[1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], disable_cuda_graph=False, disable_cuda_graph_padding=False, enable_profile_cuda_graph=False, enable_cudagraph_gc=False, enable_layerwise_nvtx_marker=False, enable_nccl_nvls=False, enable_symm_mem=False, disable_flashinfer_cutlass_moe_fp4_allgather=False, enable_tokenizer_batch_encode=False, disable_tokenizer_batch_decode=False, disable_outlines_disk_cache=False, 
disable_custom_all_reduce=False, enable_mscclpp=False, enable_torch_symm_mem=False, disable_overlap_schedule=True, enable_mixed_chunk=False, enable_dp_attention=False, enable_dp_lm_head=False, enable_two_batch_overlap=False, enable_single_batch_overlap=False, tbo_token_distribution_threshold=0.48, enable_torch_compile=False, enable_piecewise_cuda_graph=False, enable_torch_compile_debug_mode=False, torch_compile_max_bs=32, piecewise_cuda_graph_max_tokens=8192, piecewise_cuda_graph_tokens=[4, 8, 12, 16, 20, 24, 28, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1280, 1536, 1792, 2048, 2304, 2560, 2816, 3072, 3328, 3584, 3840, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192], piecewise_cuda_graph_compiler='eager', torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, triton_attention_num_kv_splits=8, triton_attention_split_tile_size=None, num_continuous_decode_steps=1, delete_ckpt_after_loading=False, enable_memory_saver=False, enable_weights_cpu_backup=False, enable_draft_weights_cpu_backup=False, allow_auto_truncate=False, enable_custom_logit_processor=False, flashinfer_mla_disable_ragged=False, disable_shared_experts_fusion=False, disable_chunked_prefix_cache=False, disable_fast_image_processor=False, keep_mm_feature_on_device=False, enable_return_hidden_states=False, enable_return_routed_experts=False, scheduler_recv_interval=1, numa_node=None, enable_deterministic_inference=False, rl_on_policy_target=None, enable_attn_tp_input_scattered=False, enable_nsa_prefill_context_parallel=False, nsa_prefill_cp_mode='round-robin-split', enable_fused_qk_norm_rope=False, enable_precise_embedding_interpolation=False, enable_dynamic_batch_tokenizer=False, dynamic_batch_tokenizer_batch_size=32, dynamic_batch_tokenizer_batch_timeout=0.002, debug_tensor_dump_output_folder=None, debug_tensor_dump_layers=None, debug_tensor_dump_input_file=None, debug_tensor_dump_inject=False, disaggregation_mode='null', disaggregation_transfer_backend='mooncake', disaggregation_bootstrap_port=8998, disaggregation_decode_tp=None, disaggregation_decode_dp=None, disaggregation_prefill_pp=1, disaggregation_ib_device=None, disaggregation_decode_enable_offload_kvcache=False, num_reserved_decode_tokens=512, disaggregation_decode_polling_interval=1, encoder_only=False, language_only=False, encoder_transfer_backend='zmq_to_scheduler', encoder_urls=[], custom_weight_loader=[], weight_loader_disable_mmap=False, remote_instance_weight_loader_seed_instance_ip=None, remote_instance_weight_loader_seed_instance_service_port=None, remote_instance_weight_loader_send_weights_group_ports=None, remote_instance_weight_loader_backend='nccl', remote_instance_weight_loader_start_seed_via_transfer_engine=False, enable_pdmux=False, pdmux_config_path=None, sm_group_num=8, mm_max_concurrent_calls=32, mm_per_request_timeout=10.0, enable_broadcast_mm_inputs_process=False, enable_prefix_mm_cache=False, mm_enable_dp_encoder=False, mm_process_config={}, limit_mm_data_per_request=None, decrypted_config_file=None, decrypted_draft_config_file=None, forward_hooks=None)
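
For context, the dump above pins down most of what was passed on the command line (everything else sits at its default). Below is a hedged reconstruction of the launch, assuming the standard `sglang.launch_server` entry point and using only the non-default values visible in the dump (`tp_size=2`, `port=30000`, `api_key`, the model path); the exact original invocation is not recorded in this log:

```python
# Hedged reconstruction of the launch implied by the ServerArgs dump.
# Only flags corresponding to non-default values in the dump are shown;
# the true invocation may have differed.
import subprocess

subprocess.run([
    "python3", "-m", "sglang.launch_server",
    "--model-path", "/model/Qwen3.5-122B-A10B",
    "--tp-size", "2",            # tp_size=2 in the dump
    "--host", "0.0.0.0",
    "--port", "30000",
    "--api-key", "lq123456",     # api_key='lq123456' in the dump
])
```

Note that `attention_backend='fa3'` and `disable_overlap_schedule=True` were chosen by the server itself (the WARNING/INFO lines at the top of the log), not by user flags.
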
[2026-03-27 10:53:32] Ignore import error when loading sglang.srt.multimodal.processors.glmasr: cannot import name 'GlmAsrConfig' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)
[2026-03-27 10:53:34] Using default HuggingFace chat template with detected content format: openai
[2026-03-27 10:53:39 TP0] Mamba selective_state_update backend initialized: triton
[2026-03-27 10:53:39 TP1] Mamba selective_state_update backend initialized: triton
[2026-03-27 10:53:39 TP0] Init torch distributed begin.
[2026-03-27 10:53:39 TP1] Init torch distributed begin.
[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[2026-03-27 10:53:40 TP0] sglang is using nccl==2.28.3
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[2026-03-27 10:53:40 TP0] Init torch distributed ends. elapsed=1.00 s, mem usage=0.90 GB
[2026-03-27 10:53:40 TP1] Init torch distributed ends. elapsed=0.99 s, mem usage=0.90 GB
[2026-03-27 10:53:41 TP1] Ignore import error when loading sglang.srt.models.glm_ocr: No module named 'transformers.models.glm_ocr'
[2026-03-27 10:53:41 TP0] Ignore import error when loading sglang.srt.models.glm_ocr: No module named 'transformers.models.glm_ocr'
[2026-03-27 10:53:41 TP1] Ignore import error when loading sglang.srt.models.glm_ocr_nextn: No module named 'transformers.models.glm_ocr'
[2026-03-27 10:53:41 TP1] Ignore import error when loading sglang.srt.models.glmasr: cannot import name 'GlmAsrConfig' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)
[2026-03-27 10:53:41 TP0] Ignore import error when loading sglang.srt.models.glm_ocr_nextn: No module named 'transformers.models.glm_ocr'
[2026-03-27 10:53:41 TP0] Ignore import error when loading sglang.srt.models.glmasr: cannot import name 'GlmAsrConfig' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)
[2026-03-27 10:53:41 TP0] Load weight begin. avail mem=138.56 GB
[2026-03-27 10:53:41 TP1] Load weight begin. avail mem=138.56 GB
[2026-03-27 10:53:41 TP0] Multimodal attention backend not set. Use fa3.
[2026-03-27 10:53:41 TP0] Using fa3 as multimodal attention backend.
[2026-03-27 10:53:41 TP1] Multimodal attention backend not set. Use fa3.
[2026-03-27 10:53:41 TP1] Using fa3 as multimodal attention backend.
`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
[2026-03-27 10:53:41 TP0] using attn output gate!
[2026-03-27 10:53:41 TP1] using attn output gate!
Loading safetensors checkpoint shards: 0% Completed | 0/39 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 3% Completed | 1/39 [00:00<00:21, 1.78it/s]
Loading safetensors checkpoint shards: 5% Completed | 2/39 [00:01<00:21, 1.70it/s]
Loading safetensors checkpoint shards: 8% Completed | 3/39 [00:01<00:21, 1.68it/s]
Loading safetensors checkpoint shards: 10% Completed | 4/39 [00:02<00:20, 1.68it/s]
Loading safetensors checkpoint shards: 13% Completed | 5/39 [00:02<00:19, 1.76it/s]
Loading safetensors checkpoint shards: 15% Completed | 6/39 [00:03<00:18, 1.81it/s]
Loading safetensors checkpoint shards: 18% Completed | 7/39 [00:03<00:17, 1.84it/s]
Loading safetensors checkpoint shards: 21% Completed | 8/39 [00:04<00:16, 1.86it/s]
Loading safetensors checkpoint shards: 23% Completed | 9/39 [00:04<00:16, 1.87it/s]
Loading safetensors checkpoint shards: 26% Completed | 10/39 [00:05<00:15, 1.88it/s]
Loading safetensors checkpoint shards: 28% Completed | 11/39 [00:06<00:14, 1.89it/s]
Loading safetensors checkpoint shards: 31% Completed | 12/39 [00:06<00:14, 1.89it/s]
Loading safetensors checkpoint shards: 33% Completed | 13/39 [00:07<00:13, 1.89it/s]
Loading safetensors checkpoint shards: 36% Completed | 14/39 [00:07<00:13, 1.88it/s]
Loading safetensors checkpoint shards: 38% Completed | 15/39 [00:08<00:12, 1.88it/s]
Loading safetensors checkpoint shards: 41% Completed | 16/39 [00:08<00:12, 1.87it/s]
Loading safetensors checkpoint shards: 44% Completed | 17/39 [00:09<00:11, 1.87it/s]
Loading safetensors checkpoint shards: 46% Completed | 18/39 [00:09<00:11, 1.87it/s]
Loading safetensors checkpoint shards: 49% Completed | 19/39 [00:10<00:10, 1.87it/s]
Loading safetensors checkpoint shards: 51% Completed | 20/39 [00:10<00:10, 1.87it/s]
Loading safetensors checkpoint shards: 54% Completed | 21/39 [00:11<00:09, 1.86it/s]
Loading safetensors checkpoint shards: 56% Completed | 22/39 [00:11<00:09, 1.88it/s]
Loading safetensors checkpoint shards: 59% Completed | 23/39 [00:12<00:10, 1.58it/s]
Loading safetensors checkpoint shards: 62% Completed | 24/39 [00:13<00:10, 1.40it/s]
Loading safetensors checkpoint shards: 64% Completed | 25/39 [00:14<00:10, 1.29it/s]
Loading safetensors checkpoint shards: 67% Completed | 26/39 [00:15<00:10, 1.23it/s]
Loading safetensors checkpoint shards: 69% Completed | 27/39 [00:16<00:10, 1.19it/s]
Loading safetensors checkpoint shards: 72% Completed | 28/39 [00:17<00:09, 1.16it/s]
Loading safetensors checkpoint shards: 74% Completed | 29/39 [00:18<00:08, 1.14it/s]
Loading safetensors checkpoint shards: 77% Completed | 30/39 [00:19<00:08, 1.12it/s]
Loading safetensors checkpoint shards: 79% Completed | 31/39 [00:20<00:07, 1.11it/s]
Loading safetensors checkpoint shards: 82% Completed | 32/39 [00:20<00:06, 1.10it/s]
Loading safetensors checkpoint shards: 85% Completed | 33/39 [00:21<00:04, 1.29it/s]
Loading safetensors checkpoint shards: 87% Completed | 34/39 [00:22<00:04, 1.24it/s]
Loading safetensors checkpoint shards: 90% Completed | 35/39 [00:23<00:03, 1.19it/s]
Loading safetensors checkpoint shards: 92% Completed | 36/39 [00:23<00:02, 1.28it/s]
Loading safetensors checkpoint shards: 95% Completed | 37/39 [00:24<00:01, 1.69it/s]
Loading safetensors checkpoint shards: 97% Completed | 38/39 [00:24<00:00, 1.79it/s]
Loading safetensors checkpoint shards: 100% Completed | 39/39 [00:25<00:00, 1.82it/s]
Loading safetensors checkpoint shards: 100% Completed | 39/39 [00:25<00:00, 1.56it/s]
[2026-03-27 10:54:07 TP0] Load weight end. elapsed=25.32 s, type=Qwen3_5MoeForConditionalGeneration, dtype=torch.bfloat16, avail mem=24.27 GB, mem usage=114.29 GB.
[2026-03-27 10:54:11 TP1] Load weight end. elapsed=29.60 s, type=Qwen3_5MoeForConditionalGeneration, dtype=torch.bfloat16, avail mem=24.27 GB, mem usage=114.29 GB.
[2026-03-27 10:54:11 TP0] Using KV cache dtype: torch.bfloat16
[2026-03-27 10:54:11 TP1] Mamba Cache is allocated. max_mamba_cache_size: 11, conv_state size: 0.01GB, ssm_state size: 0.84GB
[2026-03-27 10:54:11 TP0] Mamba Cache is allocated. max_mamba_cache_size: 11, conv_state size: 0.01GB, ssm_state size: 0.84GB
[2026-03-27 10:54:11 TP1] KV Cache is allocated. #tokens: 84848, K size: 0.49 GB, V size: 0.49 GB
[2026-03-27 10:54:11 TP1] Memory pool end. avail mem=22.39 GB
[2026-03-27 10:54:11 TP0] KV Cache is allocated. #tokens: 84848, K size: 0.49 GB, V size: 0.49 GB
[2026-03-27 10:54:11 TP0] Memory pool end. avail mem=22.39 GB
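
A quick back-of-envelope check of the pool sizes just logged, as a sketch (the factor of 2 assumes V is the same size as K, which matches the log lines):

```python
# Per-token KV footprint implied by "KV Cache is allocated" above,
# per TP rank. All numbers are read directly from the log.
GiB = 1024 ** 3
num_tokens = 84_848                  # "#tokens: 84848"
kv_bytes = 2 * 0.49 * GiB            # K size + V size, 0.49 GB each
print(f"~{kv_bytes / num_tokens / 1024:.1f} KiB per token per rank")
# -> ~12.1 KiB/token. Small for a 122B model, plausibly because this
#    hybrid GDN architecture keeps full-attention KV for only a subset
#    of layers; the rest live in the separately allocated Mamba
#    conv/ssm states logged just above.
```
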
[2026-03-27 10:54:11 TP1] Using hybrid linear attention backend for hybrid GDN models.
[2026-03-27 10:54:11 TP1] Capture cuda graph begin. This can take up to several minutes. avail mem=22.29 GB
[2026-03-27 10:54:11 TP0] Using hybrid linear attention backend for hybrid GDN models.
[2026-03-27 10:54:11 TP0] CuTe DSL GDN decode enabled: False
[2026-03-27 10:54:11 TP0] Capture cuda graph begin. This can take up to several minutes. avail mem=22.29 GB
[2026-03-27 10:54:11 TP0] Capture cuda graph bs [1, 2, 3]
Capturing batches (bs=3 avail_mem=22.29 GB): 0%| | 0/3 [00:00<?, ?it/s]
[2026-03-27 10:54:15 TP0] Using default MoE kernel config. Performance might be sub-optimal! Config file not found at /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=512,device_name=NVIDIA_H20-3e.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
[2026-03-27 10:54:15 TP0] Using MoE kernel config with down_moe=False. Performance might be sub-optimal! Config file not found at /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=512,device_name=NVIDIA_H20-3e_down.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
[2026-03-27 10:54:15 TP1] Using default MoE kernel config. Performance might be sub-optimal! Config file not found at /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=512,device_name=NVIDIA_H20-3e.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
[2026-03-27 10:54:15 TP1] Using MoE kernel config with down_moe=False. Performance might be sub-optimal! Config file not found at /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_5_1/E=256,N=512,device_name=NVIDIA_H20-3e_down.json, you can create them with https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
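
The four warnings above all point at the same missing tuning file; the filename encodes the lookup key. The sketch below shows how such a name can be assembled (illustrative only; the exact helper lives inside SGLang's fused_moe_triton code, and `E`/`N` here are simply read off the warning text):

```python
# Reproduce the config filename the MoE kernel looked for. E is the
# expert count and N the sharded intermediate size from the warning.
import torch

E, N = 256, 512
device_name = torch.cuda.get_device_name().replace(" ", "_")  # "NVIDIA_H20-3e" on this host
print(f"E={E},N={N},device_name={device_name}.json")
```

Tuned configs for this shape can be generated with the benchmark the warning links to (https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton); until then the kernel falls back to a generic configuration.
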
Capturing batches (bs=3 avail_mem=22.29 GB): 33%|███▎ | 1/3 [00:10<00:20, 10.43s/it]
Capturing batches (bs=2 avail_mem=22.23 GB): 67%|██████▋ | 2/3 [00:12<00:05, 5.24s/it]
Capturing batches (bs=1 avail_mem=22.22 GB): 100%|██████████| 3/3 [00:13<00:00, 4.40s/it]
[2026-03-27 10:54:25 TP0] Registering 291 cuda graph addresses
[2026-03-27 10:54:25 TP0] Capture cuda graph end. Time elapsed: 14.06 s. mem usage=0.08 GB. avail mem=22.21 GB.
[2026-03-27 10:54:25 TP1] Capture cuda graph end. Time elapsed: 14.07 s. mem usage=0.08 GB. avail mem=22.21 GB.
[2026-03-27 10:54:26 TP0] max_total_num_tokens=84848, chunked_prefill_size=8192, max_prefill_tokens=16384, max_running_requests=3, context_len=262144, available_gpu_mem=22.21 GB
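
One detail worth flagging in the summary line above: `max_total_num_tokens` (the KV pool) is far below `context_len`, so the pool, not the model's context window, bounds long requests. A sanity check using only numbers from the log:

```python
# Values copied from the "max_total_num_tokens=..." summary line.
max_total_num_tokens = 84_848    # KV pool capacity (tokens)
context_len = 262_144            # model context window
chunked_prefill_size = 8_192     # prefill proceeds in chunks this size

# A single max-context request would overflow the pool:
print(context_len > max_total_num_tokens)   # True
```

`max_running_requests=3` is likewise a derived limit here (the dump shows `max_running_requests=None` was passed), presumably tied to the small Mamba state cache (`max_mamba_cache_size: 11`) rather than to user flags.
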
[2026-03-27 10:54:28] INFO: Started server process [30]
[2026-03-27 10:54:28] INFO: Waiting for application startup.
[2026-03-27 10:54:28] Using default chat sampling params from model generation config: {'repetition_penalty': 1.0, 'temperature': 0.6, 'top_k': 20, 'top_p': 0.95}
[2026-03-27 10:54:28] INFO: Application startup complete.
[2026-03-27 10:54:28] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)
[2026-03-27 10:54:29] INFO: 127.0.0.1:55608 - "GET /model_info HTTP/1.1" 200 OK
[2026-03-27 10:54:37 TP0] Prefill batch, #new-seq: 1, #new-token: 80, #cached-token: 0, full token usage: 0.00, mamba usage: 0.18, #running-req: 0, #queue-req: 0, input throughput (token/s): 0.00, cuda graph: False
[2026-03-27 10:54:37] INFO: 127.0.0.1:55624 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2026-03-27 10:54:37] The server is fired up and ready to roll!
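
With the server up, the `GET /model_info` seen at 10:54:29 can be reproduced from any client. A minimal probe, assuming the configured `api_key` is enforced as a bearer token on this route:

```python
# Query the /model_info endpoint the startup probe hit above.
import requests

resp = requests.get(
    "http://localhost:30000/model_info",
    headers={"Authorization": "Bearer lq123456"},  # api_key from the dump
)
resp.raise_for_status()
print(resp.json())   # model path, context length, etc.
```
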
[2026-03-27 10:56:06 TP0] Prefill batch, #new-seq: 1, #new-token: 16, #cached-token: 0, full token usage: 0.00, mamba usage: 0.18, #running-req: 0, #queue-req: 0, input throughput (token/s): 0.90, cuda graph: False
[2026-03-27 10:56:06 TP0] Decode batch, #running-req: 1, #full token: 49, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 0.27, #queue-req: 0
[2026-03-27 10:56:07] INFO: 172.19.0.1:40992 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2026-03-27 11:14:58 TP0] Prefill batch, #new-seq: 1, #new-token: 15, #cached-token: 0, full token usage: 0.00, mamba usage: 0.18, #running-req: 0, #queue-req: 0, input throughput (token/s): 0.01, cuda graph: False
[2026-03-27 11:14:58 TP0] Decode batch, #running-req: 1, #full token: 39, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 0.04, #queue-req: 0
[2026-03-27 11:14:59 TP0] Decode batch, #running-req: 1, #full token: 79, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.50, #queue-req: 0
[2026-03-27 11:14:59 TP0] Decode batch, #running-req: 1, #full token: 119, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.85, #queue-req: 0
[2026-03-27 11:14:59 TP0] Decode batch, #running-req: 1, #full token: 159, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.91, #queue-req: 0
[2026-03-27 11:15:00 TP0] Decode batch, #running-req: 1, #full token: 199, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.88, #queue-req: 0
[2026-03-27 11:15:00 TP0] Decode batch, #running-req: 1, #full token: 239, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.80, #queue-req: 0
[2026-03-27 11:15:00 TP0] Decode batch, #running-req: 1, #full token: 279, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.73, #queue-req: 0
[2026-03-27 11:15:01 TP0] Decode batch, #running-req: 1, #full token: 319, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.92, #queue-req: 0
[2026-03-27 11:15:01 TP0] Decode batch, #running-req: 1, #full token: 359, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.75, #queue-req: 0
[2026-03-27 11:15:01 TP0] Decode batch, #running-req: 1, #full token: 399, full token usage: 0.00, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.51, #queue-req: 0
[2026-03-27 11:15:02 TP0] Decode batch, #running-req: 1, #full token: 439, full token usage: 0.01, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.08, #queue-req: 0
[2026-03-27 11:15:02 TP0] Decode batch, #running-req: 1, #full token: 479, full token usage: 0.01, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.48, #queue-req: 0
[2026-03-27 11:15:03 TP0] Decode batch, #running-req: 1, #full token: 519, full token usage: 0.01, mamba num: 2, mamba usage: 0.18, cuda graph: True, gen throughput (token/s): 109.73, #queue-req: 0
[2026-03-27 11:15:03] INFO: 171.217.104.177:54062 - "POST /v1/chat/completions HTTP/1.1" 200 OK
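
The successful `POST /v1/chat/completions` requests above (a steady ~109 tokens/s of single-stream decode) can be reproduced through the OpenAI-compatible API. A minimal client sketch, using the served model name and API key from the args dump; `temperature` and `top_p` mirror the defaults the server reported at startup, so they could also be omitted:

```python
# Chat request against the OpenAI-compatible endpoint logged above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="lq123456")
reply = client.chat.completions.create(
    model="/model/Qwen3.5-122B-A10B",   # served_model_name from the dump
    messages=[{"role": "user", "content": "Hello!"}],
    temperature=0.6,   # startup default
    top_p=0.95,        # startup default
    # top_k=20 is not a standard OpenAI parameter; SGLang accepts it
    # via extra_body={"top_k": 20} if needed.
)
print(reply.choices[0].message.content)
```
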