```
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:325]
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:325] █ █ █▄ ▄█
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:325] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.15.0
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:325] █▄█▀ █ █ █ █ model /model/Qwen3-Reranker-8B
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:325] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:325]
(APIServer pid=8) INFO 03-28 05:05:50 [utils.py:261] non-default args: {'host': '0.0.0.0', 'port': 30000, 'api_key': ['lq123456'], 'model': '/model/Qwen3-Reranker-8B', 'runner': 'pooling', 'trust_remote_code': True, 'gpu_memory_utilization': 0.45}
(APIServer pid=8) The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
(APIServer pid=8) The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
(APIServer pid=8) INFO 03-28 05:05:59 [model.py:871] Resolved `--convert auto` to `--convert embed`. Pass the value explicitly to silence this message.
(APIServer pid=8) INFO 03-28 05:05:59 [model.py:541] Resolved architecture: Qwen3ForCausalLM
(APIServer pid=8) INFO 03-28 05:05:59 [model.py:1561] Using max model len 40960
(APIServer pid=8) INFO 03-28 05:05:59 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=8192.
(APIServer pid=8) INFO 03-28 05:05:59 [vllm.py:624] Asynchronous scheduling is enabled.
(APIServer pid=8) WARNING 03-28 05:05:59 [vllm.py:741] Pooling models do not support full cudagraphs. Overriding cudagraph_mode to PIECEWISE.
```
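For reference, here is a sketch of the `vllm serve` invocation implied by the `non-default args` line above. The flag names are the standard vLLM CLI options and every value comes from the log itself; the API key is the placeholder from this deployment, so adapt it to yours:

```bash
# Sketch of the launch command reconstructed from the logged non-default args.
# All values are taken from the log; nothing else is assumed.
vllm serve /model/Qwen3-Reranker-8B \
  --runner pooling \
  --host 0.0.0.0 \
  --port 30000 \
  --api-key lq123456 \
  --trust-remote-code \
  --gpu-memory-utilization 0.45
```

Per the log, `--convert auto` was resolved to `--convert embed`, so passing `--convert embed` explicitly would silence that message; `--trust-remote-code` can also be dropped, since the server reports it has no effect here.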
```
(EngineCore_DP0 pid=274) INFO 03-28 05:06:04 [core.py:96] Initializing a V1 LLM engine (v0.15.0) with config: model='/model/Qwen3-Reranker-8B', speculative_config=None, tokenizer='/model/Qwen3-Reranker-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/model/Qwen3-Reranker-8B, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=PoolerConfig(pooling_type=None, seq_pooling_type='LAST', tok_pooling_type='ALL', normalize=None, dimensions=None, enable_chunked_processing=None, max_embed_len=None, softmax=None, activation=None, use_activation=None, logit_bias=None, step_tag_id=None, returned_token_ids=None), compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.PIECEWISE: 1>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': True}, 'local_cache_dir': None}
(EngineCore_DP0 pid=274) INFO 03-28 05:06:06 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://172.19.0.3:49759 backend=nccl
(EngineCore_DP0 pid=274) INFO 03-28 05:06:06 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A
(EngineCore_DP0 pid=274) INFO 03-28 05:06:07 [gpu_model_runner.py:4021] Starting to load model /model/Qwen3-Reranker-8B...
(EngineCore_DP0 pid=274) INFO 03-28 05:06:25 [cuda.py:364] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:00<00:00, 903.01it/s]
(EngineCore_DP0 pid=274) INFO 03-28 05:06:27 [default_loader.py:291] Loading weights took 1.56 seconds
(EngineCore_DP0 pid=274) INFO 03-28 05:06:27 [gpu_model_runner.py:4118] Model loading took 14.11 GiB memory and 19.099960 seconds
(EngineCore_DP0 pid=274) INFO 03-28 05:06:33 [backends.py:805] Using cache directory: /root/.cache/vllm/torch_compile_cache/79ebc418d5/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=274) INFO 03-28 05:06:33 [backends.py:865] Dynamo bytecode transform time: 5.97 s
(EngineCore_DP0 pid=274) INFO 03-28 05:06:44 [backends.py:302] Cache the graph of compile range (1, 8192) for later use
(EngineCore_DP0 pid=274) INFO 03-28 05:07:37 [backends.py:319] Compiling a graph for compile range (1, 8192) takes 59.02 s
(EngineCore_DP0 pid=274) INFO 03-28 05:07:37 [monitor.py:34] torch.compile takes 64.99 s in total
(EngineCore_DP0 pid=274) INFO 03-28 05:07:38 [gpu_worker.py:356] Available KV cache memory: 47.39 GiB
(EngineCore_DP0 pid=274) INFO 03-28 05:07:38 [kv_cache_utils.py:1307] GPU KV cache size: 345,056 tokens
(EngineCore_DP0 pid=274) INFO 03-28 05:07:38 [kv_cache_utils.py:1312] Maximum concurrency for 40,960 tokens per request: 8.42x
(EngineCore_DP0 pid=274) 2026-03-28 05:07:38,901 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
(EngineCore_DP0 pid=274) 2026-03-28 05:07:38,917 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:03<00:00, 15.65it/s]
(EngineCore_DP0 pid=274) INFO 03-28 05:07:42 [gpu_model_runner.py:5051] Graph capturing finished in 4 secs, took -0.67 GiB
(EngineCore_DP0 pid=274) INFO 03-28 05:07:43 [core.py:272] init engine (profile, create kv cache, warmup model) took 75.31 seconds
(EngineCore_DP0 pid=274) INFO 03-28 05:07:43 [vllm.py:624] Asynchronous scheduling is enabled.
```
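The reported maximum concurrency is just the KV cache token capacity divided by the maximum model length: 345,056 / 40,960 ≈ 8.42, i.e. roughly eight maximum-length requests can keep their KV cache resident on the GPU at once.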
```
(APIServer pid=8) INFO 03-28 05:07:44 [api_server.py:665] Supported tasks: ['token_embed', 'embed']
(APIServer pid=8) INFO 03-28 05:07:44 [api_server.py:946] Starting vLLM API server 0 on http://0.0.0.0:30000
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:38] Available routes are:
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /openapi.json, Methods: GET, HEAD
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /docs, Methods: GET, HEAD
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /redoc, Methods: GET, HEAD
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /scale_elastic_ep, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /tokenize, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /detokenize, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /inference/v1/generate, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /pause, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /resume, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /is_paused, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /metrics, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /health, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/chat/completions, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/chat/completions/render, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/responses, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/audio/transcriptions, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/audio/translations, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/completions, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/completions/render, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/messages, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/models, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /load, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /version, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /ping, Methods: GET
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /ping, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /invocations, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /classify, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/embeddings, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /score, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/score, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /rerank, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v1/rerank, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /v2/rerank, Methods: POST
(APIServer pid=8) INFO 03-28 05:07:44 [launcher.py:46] Route: /pooling, Methods: POST
(APIServer pid=8) INFO: Started server process [8]
(APIServer pid=8) INFO: Waiting for application startup.
(APIServer pid=8) INFO: Application startup complete.
(APIServer pid=8) WARNING 03-28 05:08:27 [api_router.py:128] To indicate that the rerank API is not part of the standard OpenAI API, we have located it at `/rerank`. Please update your client accordingly. (Note: Conforms to JinaAI rerank API)
(APIServer pid=8) INFO: 172.19.0.1:36202 - "POST /v1/rerank HTTP/1.1" 200 OK
(APIServer pid=8) INFO 03-28 05:08:34 [loggers.py:257] Engine 000: Avg prompt throughput: 1.8 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
(APIServer pid=8) INFO 03-28 05:08:44 [loggers.py:257] Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
```
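The `200 OK` above came from a JinaAI-style rerank request. A minimal client call consistent with this deployment would look roughly like the following; the API key and model path are the values from the log, while the query, documents, and `top_n` are illustrative placeholders:

```bash
# Minimal rerank request against this server (JinaAI-compatible schema).
# query/documents/top_n are illustrative; key and model path come from the log.
curl -s http://localhost:30000/v1/rerank \
  -H "Authorization: Bearer lq123456" \
  -H "Content-Type: application/json" \
  -d '{
        "model": "/model/Qwen3-Reranker-8B",
        "query": "What is the capital of France?",
        "documents": [
          "Paris is the capital of France.",
          "Berlin is the capital of Germany."
        ],
        "top_n": 2
      }'
```

The response should contain a `results` array with an index and relevance score per document. As the warning notes, the same handler is also exposed at `/rerank` and `/v2/rerank`.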