|
|
@@ -16,6 +16,9 @@
|
|
|
|
|
|
# python utils_test/Model_Test/test_model_stress.py --concurrency 150 --count 150 --model shutian_qwen3_6_27b --context-size 8k
|
|
|
|
|
|
+ # 避免服务端 KV 缓存命中(注入随机值)
|
|
|
+ python utils_test/Model_Test/test_model_stress.py --concurrency 10 --count 50 --bust-cache
|
|
|
+
|
|
|
# 自定义参数
|
|
|
python utils_test/Model_Test/test_model_stress.py --concurrency 20 --count 100 --model shutian_qwen3_5_122b
|
|
|
|
|
|
@@ -41,6 +44,7 @@ import asyncio
|
|
|
import argparse
|
|
|
import time
|
|
|
import statistics
|
|
|
+import uuid
|
|
|
from pathlib import Path
|
|
|
from dataclasses import dataclass, field
|
|
|
from typing import List, Optional, Tuple
|
|
|
@@ -191,11 +195,13 @@ def _extract_token_usage(response) -> Tuple[int, int]:
|
|
|
|
|
|
async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
|
|
|
function_name: Optional[str] = None,
|
|
|
- context_size: int = 0) -> RequestResult:
|
|
|
+ context_size: int = 0,
|
|
|
+ bust_cache: bool = False) -> RequestResult:
|
|
|
"""执行单次 LLM 调用并记录延迟和 token 用量
|
|
|
|
|
|
Args:
|
|
|
context_size: 上下文 token 数,>0 时在 user_prompt 前拼接填充文本
|
|
|
+ bust_cache: 在 prompt 开头插入随机值(作为前缀),避免 KV 缓存命中
|
|
|
"""
|
|
|
from foundation.ai.models.model_handler import model_handler
|
|
|
from foundation.ai.models.model_config_loader import get_model_for_function, get_thinking_mode_for_function
|
|
|
@@ -222,6 +228,10 @@ async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
|
|
|
padding = _generate_context_text(context_size)
|
|
|
user_prompt = f"{padding}\n\n---\n\n{TEST_USER_PROMPT}"
|
|
|
|
|
|
+ if bust_cache:
|
|
|
+ rand = uuid.uuid4().hex[:12]
|
|
|
+ user_prompt = f"[noise:{rand}]\n{user_prompt}"
|
|
|
+
|
|
|
messages = [SystemMessage(content=TEST_SYSTEM_PROMPT), HumanMessage(content=user_prompt)]
|
|
|
|
|
|
start = time.perf_counter()
|
|
|
@@ -274,6 +284,7 @@ async def run_stress_test(
|
|
|
total_count: int,
|
|
|
function_name: Optional[str] = None,
|
|
|
context_size: int = 0,
|
|
|
+ bust_cache: bool = False,
|
|
|
) -> StressTestResult:
|
|
|
"""执行压力测试
|
|
|
|
|
|
@@ -304,7 +315,7 @@ async def run_stress_test(
|
|
|
if model_type == "embedding":
|
|
|
return await _run_embedding_request(trace_id, model_name)
|
|
|
else:
|
|
|
- return await _run_llm_request(trace_id, model_name, function_name, context_size)
|
|
|
+ return await _run_llm_request(trace_id, model_name, function_name, context_size, bust_cache)
|
|
|
|
|
|
ctx_label = f" | 上下文: {context_size//1024}k tokens" if context_size > 0 else ""
|
|
|
print(f"\n{'='*60}")
|
|
|
@@ -502,6 +513,10 @@ def parse_args():
|
|
|
"--all-embeddings", action="store_true",
|
|
|
help="逐个测试所有 Embedding 模型",
|
|
|
)
|
|
|
+ parser.add_argument(
|
|
|
+ "--bust-cache", action="store_true",
|
|
|
+ help="在每次请求的 prompt 开头注入随机值,避免服务端 KV 缓存命中",
|
|
|
+ )
|
|
|
return parser.parse_args()
|
|
|
|
|
|
|
|
|
@@ -539,6 +554,7 @@ async def _run_single_model_test(args, model_name: str, function_name: Optional[
|
|
|
total_count=args.count,
|
|
|
function_name=function_name,
|
|
|
context_size=ctx_size,
|
|
|
+ bust_cache=args.bust_cache,
|
|
|
)
|
|
|
summary = print_report(result)
|
|
|
summary["context_display"] = ctx_display
|
|
|
@@ -571,6 +587,7 @@ async def main():
|
|
|
model_type=model_type,
|
|
|
concurrency=args.concurrency,
|
|
|
total_count=args.count,
|
|
|
+ bust_cache=args.bust_cache,
|
|
|
)
|
|
|
summary = print_report(result)
|
|
|
results_summary.append(summary)
|