| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- 测试统一模型调用的思考模式配置开关
- 测试内容:
- 1. 测试 enable_thinking=False(默认)时,Qwen3.5 模型响应时间是否显著缩短
- 2. 测试 enable_thinking=True 时,Qwen3.5 模型响应是否包含思考过程
- 3. 测试非 Qwen3.5 模型不受 enable_thinking 参数影响
- 运行方式:
- cd D:/wx_work/sichuan_luqiao/LQAgentPlatform
- python utils_test/Model_Test/test_thinking_mode.py
- """
- import asyncio
- import time
- import sys
- from pathlib import Path
- # 添加项目根目录到 Python 路径
- project_root = Path(__file__).parent.parent.parent
- sys.path.insert(0, str(project_root))
- from foundation.ai.agent.generate.model_generate import generate_model_client
- # Short prompts shared by the quick-answer test cases below.
- TEST_SYSTEM_PROMPT = "你是一个 helpful 的 AI 助手,请简洁回答。"
- TEST_USER_PROMPT = "请用一句话回答:1+1等于几?"
async def test_qwen35_with_thinking_disabled():
    """Call the Qwen3.5 35B model with thinking mode switched off.

    Sends the shared short prompt with ``enable_thinking=False``, prints the
    latency and the first 100 characters of the reply, and reports whether
    the reply text contains thinking markers.

    Returns:
        tuple: ``(elapsed_seconds, succeeded)``. ``succeeded`` is ``False``
        only when the call, or the handling of its reply, raised. A reply
        that contains thinking markers only produces a printed warning.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("测试 1: Qwen3.5 35B - 禁用思考模式 (enable_thinking=False)")
    print(banner)

    model_name = "qwen3_5_35b_a3b"
    began = time.time()
    try:
        reply = await generate_model_client.get_model_generate_invoke(
            trace_id=f"test_thinking_disabled_{int(time.time())}",
            system_prompt=TEST_SYSTEM_PROMPT,
            user_prompt=TEST_USER_PROMPT,
            model_name=model_name,
            enable_thinking=False,  # explicitly switch thinking mode off
            timeout=120,
        )
        duration = time.time() - began

        print("✅ 调用成功")
        print(f" 模型: {model_name}")
        print(f" 响应时间: {duration:.2f}s")
        print(f" 响应内容: {reply[:100]}...")

        # Heuristic check: either a literal <think> tag, or both of the
        # words "思考" and "过程" appearing somewhere in the reply.
        think_tags_present = any(tag in reply for tag in ("<think>", "</think>"))
        narrates_thinking = all(word in reply for word in ("思考", "过程"))
        if not (think_tags_present or narrates_thinking):
            print(" ✅ 响应不包含思考过程标记")
        else:
            print(" ⚠️ 警告: 响应可能包含思考过程标记")
        return duration, True
    except Exception as exc:
        duration = time.time() - began
        print(f"❌ 调用失败: {exc}")
        return duration, False
async def test_qwen35_with_thinking_enabled():
    """Call the Qwen3.5 35B model with thinking mode switched on.

    Uses a prompt that asks for an explanation, prints the latency, the
    reply length and excerpts of the reply, then prints three heuristic
    indicators of thinking output (think tags, reasoning marker words, and
    a reply longer than 800 characters).

    Returns:
        tuple: ``(elapsed_seconds, succeeded)``. ``succeeded`` is ``False``
        only when the call, or the handling of its reply, raised. The
        heuristic indicators never change the result.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("测试 2: Qwen3.5 35B - 启用思考模式 (enable_thinking=True)")
    print(banner)

    model_name = "qwen3_5_35b_a3b"
    began = time.time()
    # A question that asks for an explanation, to encourage visible reasoning.
    reasoning_prompt = "请详细解释为什么 1+1=2?请展示你的思考过程。"
    try:
        reply = await generate_model_client.get_model_generate_invoke(
            trace_id=f"test_thinking_enabled_{int(time.time())}",
            system_prompt="你是一个善于思考的AI助手,请详细展示你的推理过程。",
            user_prompt=reasoning_prompt,
            model_name=model_name,
            enable_thinking=True,  # explicitly switch thinking mode on
            timeout=300,  # thinking mode may take considerably longer
        )
        duration = time.time() - began

        print("✅ 调用成功")
        print(f" 模型: {model_name}")
        print(f" 响应时间: {duration:.2f}s")
        print(f" 响应长度: {len(reply)} 字符")

        # Show at most the first 500 characters, plus the last 200 when the
        # reply is longer than 700 characters.
        print(f" 响应开头: {reply[:500]}...")
        if len(reply) > 700:
            print(f" 响应结尾: ...{reply[-200:]}")

        # Heuristic indicators that thinking output is present.
        reasoning_words = (
            "思考", "推理", "首先", "然后", "第一步", "第二步",
            "让我", "我需要", "我们来", "分析一下",
        )
        think_tags_present = any(tag in reply for tag in ("<think>", "</think>"))
        has_reasoning_words = any(word in reply for word in reasoning_words)
        is_long_reply = len(reply) > 800  # thinking mode usually yields longer replies

        print("\n 思考模式检测:")
        print(f" - 包含 <think> 标签: {'是' if think_tags_present else '否'}")
        print(f" - 包含推理标记词: {'是' if has_reasoning_words else '否'}")
        print(f" - 响应较长 (>800字符): {'是' if is_long_reply else '否'}")

        looks_like_thinking = think_tags_present or (has_reasoning_words and is_long_reply)
        if looks_like_thinking:
            print(" ✅ 思考模式似乎已生效")
        else:
            print(" ℹ️ 思考模式特征不明显,但调用已返回")
        return duration, True
    except Exception as exc:
        duration = time.time() - began
        print(f"❌ 调用失败: {exc}")
        return duration, False
async def test_non_qwen35_model():
    """Call a non-Qwen3.5 model while passing ``enable_thinking``.

    Sends the shared short prompt to ``doubao-1.5-pro-256k`` with
    ``enable_thinking=False`` and prints the latency and the first 100
    characters of the reply. Any call that returns without raising is
    reported as a pass; no comparison against ``enable_thinking=True`` is
    made here.

    Returns:
        tuple: ``(elapsed_seconds, succeeded)``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("测试 3: 非 Qwen3.5 模型 - enable_thinking 参数不应产生影响")
    print(banner)

    model_name = "doubao-1.5-pro-256k"  # a model outside the Qwen3.5 family
    began = time.time()
    try:
        reply = await generate_model_client.get_model_generate_invoke(
            trace_id=f"test_non_qwen35_{int(time.time())}",
            system_prompt=TEST_SYSTEM_PROMPT,
            user_prompt=TEST_USER_PROMPT,
            model_name=model_name,
            enable_thinking=False,  # expected to be ignored for this model
            timeout=60,
        )
        duration = time.time() - began

        for line in (
            "✅ 调用成功",
            f" 模型: {model_name}",
            f" 响应时间: {duration:.2f}s",
            f" 响应内容: {reply[:100]}...",
            " ✅ 非 Qwen3.5 模型正常响应,enable_thinking 参数被正确忽略",
        ):
            print(line)
        return duration, True
    except Exception as exc:
        duration = time.time() - began
        print(f"❌ 调用失败: {exc}")
        return duration, False
async def test_multiple_calls_consistency():
    """Call the Qwen3.5 35B model several times with thinking disabled.

    Issues three sequential calls with ``enable_thinking=False`` and prints
    the outcome of each, followed by latency statistics.

    Latency statistics (average / fastest / slowest and the 60 s threshold
    check) are computed over successful calls only. A call that fails
    quickly would otherwise pull the average down and make the threshold
    check look healthy even though nothing succeeded.

    Returns:
        bool: ``True`` when every call succeeded.
    """
    print("\n" + "=" * 60)
    print("测试 4: Qwen3.5 多次调用一致性测试 (enable_thinking=False)")
    print("=" * 60)

    model_name = "qwen3_5_35b_a3b"
    num_calls = 3
    success_times = []  # durations of successful calls only

    for i in range(num_calls):
        start_time = time.time()
        try:
            # Only the outcome and the latency matter here; the reply text
            # itself is not inspected.
            await generate_model_client.get_model_generate_invoke(
                trace_id=f"test_consistency_{i}_{int(time.time())}",
                system_prompt=TEST_SYSTEM_PROMPT,
                user_prompt=TEST_USER_PROMPT,
                model_name=model_name,
                enable_thinking=False,
                timeout=120,
            )
        except Exception as e:
            elapsed_time = time.time() - start_time
            print(f" 调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 失败: {e}")
        else:
            elapsed_time = time.time() - start_time
            success_times.append(elapsed_time)
            print(f" 调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 成功")

    success_count = len(success_times)
    print("\n 统计结果:")
    print(f" - 成功次数: {success_count}/{num_calls}")

    if success_times:
        avg_time = sum(success_times) / success_count
        print(f" - 平均响应时间: {avg_time:.2f}s")
        print(f" - 最快: {min(success_times):.2f}s, 最慢: {max(success_times):.2f}s")
        # With thinking disabled, a call is expected to finish within 60 s.
        if avg_time < 60:
            print(" ✅ 平均响应时间在合理范围内(<60s)")
        else:
            print(" ⚠️ 平均响应时间较长(>=60s),可能思考模式未正确禁用")
    else:
        # No successful call means there is no meaningful latency to report.
        print(" ⚠️ 没有成功的调用,无法统计响应时间")

    return success_count == num_calls
async def test_qwen35_122b_model():
    """Call the Qwen3.5 122B model with thinking mode switched off.

    Sends the shared short prompt with ``enable_thinking=False``, prints the
    latency and the first 100 characters of the reply, and prints whether
    the latency stayed under 100 seconds.

    Returns:
        tuple: ``(elapsed_seconds, succeeded)``. The latency check only
        affects the printed message, not the result.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("测试 5: Qwen3.5 122B - 禁用思考模式")
    print(banner)

    model_name = "qwen3_5_122b_a10b"
    began = time.time()
    try:
        reply = await generate_model_client.get_model_generate_invoke(
            trace_id=f"test_122b_disabled_{int(time.time())}",
            system_prompt=TEST_SYSTEM_PROMPT,
            user_prompt=TEST_USER_PROMPT,
            model_name=model_name,
            enable_thinking=False,
            timeout=120,
        )
        duration = time.time() - began

        print("✅ 调用成功")
        print(f" 模型: {model_name}")
        print(f" 响应时间: {duration:.2f}s")
        print(f" 响应内容: {reply[:100]}...")

        if duration >= 100:
            print(" ⚠️ 响应时间较长(>=100s)")
        else:
            print(" ✅ 响应时间合理(<100s),思考模式可能已禁用")
        return duration, True
    except Exception as exc:
        duration = time.time() - began
        print(f"❌ 调用失败: {exc}")
        return duration, False
async def run_all_tests():
    """Run every thinking-mode test in sequence and print a summary.

    Runs the five test coroutines one after another, prints a pass/fail
    line per test, then compares the latency of the thinking-disabled and
    thinking-enabled 35B calls.

    The latency comparison is made only when both of those calls
    succeeded. The elapsed time of a failed call measures how quickly the
    error surfaced, not model latency, so a ratio built from it would be
    misleading.

    Returns:
        bool: ``True`` when every test reported success.
    """
    print("\n" + "=" * 70)
    print(" 统一模型调用 - 思考模式配置开关测试")
    print("=" * 70)
    print("\n测试说明:")
    print("- 本测试验证 generate_model_client.get_model_generate_invoke()")
    print("- 的 enable_thinking 参数是否正确控制 Qwen3.5 模型的思考模式")
    print("- 预期: enable_thinking=False 时响应时间显著缩短(<60s)")

    results = {}

    # Test 1: Qwen3.5 35B with thinking disabled.
    time1, success1 = await test_qwen35_with_thinking_disabled()
    results["qwen35_35b_disabled"] = {"time": time1, "success": success1}

    # Test 2: Qwen3.5 35B with thinking enabled (baseline for the comparison).
    print("\n [对比测试] 启用思考模式 - 预计耗时 60-300s,请耐心等待...")
    time2, success2 = await test_qwen35_with_thinking_enabled()
    results["qwen35_35b_enabled"] = {"time": time2, "success": success2}

    # Test 3: a model outside the Qwen3.5 family.
    time3, success3 = await test_non_qwen35_model()
    results["non_qwen35"] = {"time": time3, "success": success3}

    # Test 4: repeated calls; this test reports success only, no single time.
    success4 = await test_multiple_calls_consistency()
    results["consistency"] = {"success": success4}

    # Test 5: the larger Qwen3.5 122B model.
    time5, success5 = await test_qwen35_122b_model()
    results["qwen35_122b_disabled"] = {"time": time5, "success": success5}

    # Per-test summary.
    print("\n" + "=" * 70)
    print(" 测试结果汇总")
    print("=" * 70)
    for test_name, result in results.items():
        status = "✅ 通过" if result.get("success") else "❌ 失败"
        time_info = f" ({result['time']:.2f}s)" if "time" in result else ""
        print(f" {test_name}: {status}{time_info}")

    # Thinking on/off latency comparison.
    print("\n" + "-" * 70)
    print(" 思考模式性能对比")
    print("-" * 70)
    disabled = results["qwen35_35b_disabled"]
    enabled = results["qwen35_35b_enabled"]
    if disabled["success"] and enabled["success"]:
        disabled_time = disabled["time"]
        enabled_time = enabled["time"]
        # Guard against a zero duration to avoid ZeroDivisionError.
        speedup = enabled_time / disabled_time if disabled_time > 0 else 0
        print(f" 禁用思考模式: {disabled_time:.2f}s")
        print(f" 启用思考模式: {enabled_time:.2f}s")
        print(f" 性能差异: {speedup:.1f}倍")
        if speedup > 3:
            print(f" ✅ 思考模式开关效果显著,禁用后提速 {speedup:.1f} 倍")
        else:
            print(" ℹ️ 性能差异不明显")
    else:
        # At least one of the two calls failed; its timing is not a latency.
        print(" ℹ️ 对比测试未全部成功,跳过性能对比")

    all_passed = all(r.get("success") for r in results.values())
    print("\n" + "=" * 70)
    if all_passed:
        print(" 🎉 所有测试通过!思考模式配置开关工作正常")
    else:
        print(" ⚠️ 部分测试失败,请检查配置")
    print("=" * 70 + "\n")
    return all_passed
if __name__ == "__main__":
    # Script entry point: exit status 0 when every test passed, 1 otherwise
    # (including user interruption and unexpected errors).
    exit_code = 1
    try:
        if asyncio.run(run_all_tests()):
            exit_code = 0
    except KeyboardInterrupt:
        print("\n\n测试被用户中断")
    except Exception as exc:
        print(f"\n\n测试运行出错: {exc}")
        import traceback

        traceback.print_exc()
    sys.exit(exit_code)
|