test_thinking_mode.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 测试统一模型调用的思考模式配置开关
  5. 测试内容:
  6. 1. 测试 enable_thinking=False(默认)时,Qwen3.5 模型响应时间是否显著缩短
  7. 2. 测试 enable_thinking=True 时,Qwen3.5 模型响应是否包含思考过程
  8. 3. 测试非 Qwen3.5 模型不受 enable_thinking 参数影响
  9. 运行方式:
  10. cd D:/wx_work/sichuan_luqiao/LQAgentPlatform
  11. python utils_test/Model_Test/test_thinking_mode.py
  12. """
  13. import asyncio
  14. import time
  15. import sys
  16. from pathlib import Path
  # Repository root. Per the run instructions in the module docstring this
  # script is expected to live at utils_test/Model_Test/, i.e. two directories
  # below the root.
  project_root = Path(__file__).resolve().parent.parent.parent

  # When a script is run directly, Python puts the script's own directory (not
  # the working directory) at the front of sys.path. Add the repository root so
  # the project import below resolves without an externally set PYTHONPATH.
  # NOTE(review): assumes the `foundation` package sits directly under the
  # repository root - confirm.
  if str(project_root) not in sys.path:
      sys.path.insert(0, str(project_root))

  from foundation.ai.agent.generate.model_generate import generate_model_client  # noqa: E402

  # Simple prompts shared by the test cases.
  TEST_SYSTEM_PROMPT = "你是一个 helpful 的 AI 助手,请简洁回答。"
  TEST_USER_PROMPT = "请用一句话回答:1+1等于几?"
  async def test_qwen35_with_thinking_disabled():
      """Call the Qwen3.5 35B model with ``enable_thinking=False``.

      Prints the latency and the first 100 characters of the reply, then
      reports whether the reply appears to contain thinking-process markers.
      The marker check is informational only and does not affect the result.

      Returns:
          tuple: ``(elapsed_seconds, succeeded)``. ``succeeded`` is False when
          the call, or the handling of its reply, raised an exception.
      """
      rule = "=" * 60
      print("\n" + rule)
      print("测试 1: Qwen3.5 35B - 禁用思考模式 (enable_thinking=False)")
      print(rule)

      model_name = "qwen3_5_35b_a3b"
      began = time.time()
      try:
          reply = await generate_model_client.get_model_generate_invoke(
              trace_id=f"test_thinking_disabled_{int(time.time())}",
              system_prompt=TEST_SYSTEM_PROMPT,
              user_prompt=TEST_USER_PROMPT,
              model_name=model_name,
              enable_thinking=False,  # explicitly disable thinking mode
              timeout=120,
          )
          elapsed_time = time.time() - began

          print("✅ 调用成功")
          print(f" 模型: {model_name}")
          print(f" 响应时间: {elapsed_time:.2f}s")
          print(f" 响应内容: {reply[:100]}...")

          # Look for thinking markers: literal <think> tags, or the two words
          # "思考" and "过程" both occurring somewhere in the reply.
          tagged = any(tag in reply for tag in ("<think>", "</think>"))
          mentions_process = all(word in reply for word in ("思考", "过程"))
          if not (tagged or mentions_process):
              print(" ✅ 响应不包含思考过程标记")
          else:
              print(" ⚠️ 警告: 响应可能包含思考过程标记")
          return elapsed_time, True
      except Exception as e:
          # Any failure (call or reply handling) is reported, not propagated.
          elapsed_time = time.time() - began
          print(f"❌ 调用失败: {e}")
          return elapsed_time, False
  async def test_qwen35_with_thinking_enabled():
      """Call the Qwen3.5 35B model with ``enable_thinking=True``.

      Uses a prompt that asks for step-by-step reasoning, prints the latency,
      the reply length and excerpts of the reply, then prints a heuristic
      report on whether the reply looks like thinking-mode output. The
      heuristic is informational only and does not affect the result.

      Returns:
          tuple: ``(elapsed_seconds, succeeded)``. ``succeeded`` is False when
          the call, or the handling of its reply, raised an exception.
      """

      def yes_no(flag):
          # Render a boolean as the Chinese yes/no used in the report.
          return "是" if flag else "否"

      rule = "=" * 60
      print("\n" + rule)
      print("测试 2: Qwen3.5 35B - 启用思考模式 (enable_thinking=True)")
      print(rule)

      model_name = "qwen3_5_35b_a3b"
      began = time.time()
      # A question that calls for reasoning, to provoke a thinking process.
      reasoning_prompt = "请详细解释为什么 1+1=2?请展示你的思考过程。"
      try:
          reply = await generate_model_client.get_model_generate_invoke(
              trace_id=f"test_thinking_enabled_{int(time.time())}",
              system_prompt="你是一个善于思考的AI助手,请详细展示你的推理过程。",
              user_prompt=reasoning_prompt,
              model_name=model_name,
              enable_thinking=True,  # explicitly enable thinking mode
              timeout=300,  # thinking mode may need more time
          )
          elapsed_time = time.time() - began
          reply_len = len(reply)

          print("✅ 调用成功")
          print(f" 模型: {model_name}")
          print(f" 响应时间: {elapsed_time:.2f}s")
          print(f" 响应长度: {reply_len} 字符")

          # Show up to the first 500 characters, plus the last 200 characters
          # when the reply is longer than 700.
          print(f" 响应开头: {reply[:500]}...")
          if reply_len > 700:
              print(f" 响应结尾: ...{reply[-200:]}")

          # Heuristic signals of thinking-mode output.
          marker_words = (
              "思考", "推理", "首先", "然后", "第一步", "第二步",
              "让我", "我需要", "我们来", "分析一下",
          )
          tagged = "<think>" in reply or "</think>" in reply
          has_markers = False
          for word in marker_words:
              if word in reply:
                  has_markers = True
                  break
          lengthy = reply_len > 800  # thinking mode usually yields longer replies

          print("\n 思考模式检测:")
          print(f" - 包含 <think> 标签: {yes_no(tagged)}")
          print(f" - 包含推理标记词: {yes_no(has_markers)}")
          print(f" - 响应较长 (>800字符): {yes_no(lengthy)}")

          if tagged or (has_markers and lengthy):
              print(" ✅ 思考模式似乎已生效")
          else:
              print(" ℹ️ 思考模式特征不明显,但调用已返回")
          return elapsed_time, True
      except Exception as e:
          # Any failure (call or reply handling) is reported, not propagated.
          elapsed_time = time.time() - began
          print(f"❌ 调用失败: {e}")
          return elapsed_time, False
  async def test_non_qwen35_model():
      """Call a non-Qwen3.5 model while passing ``enable_thinking=False``.

      The test only checks that the call returns without raising; it prints
      the latency and the first 100 characters of the reply.

      Returns:
          tuple: ``(elapsed_seconds, succeeded)``. ``succeeded`` is False when
          the call, or the handling of its reply, raised an exception.
      """
      rule = "=" * 60
      print("\n" + rule)
      print("测试 3: 非 Qwen3.5 模型 - enable_thinking 参数不应产生影响")
      print(rule)

      model_name = "doubao-1.5-pro-256k"  # a non-Qwen3.5 model
      began = time.time()
      try:
          reply = await generate_model_client.get_model_generate_invoke(
              trace_id=f"test_non_qwen35_{int(time.time())}",
              system_prompt=TEST_SYSTEM_PROMPT,
              user_prompt=TEST_USER_PROMPT,
              model_name=model_name,
              enable_thinking=False,  # expected to be ignored for this model
              timeout=60,
          )
          elapsed_time = time.time() - began
          for line in (
              "✅ 调用成功",
              f" 模型: {model_name}",
              f" 响应时间: {elapsed_time:.2f}s",
              f" 响应内容: {reply[:100]}...",
              " ✅ 非 Qwen3.5 模型正常响应,enable_thinking 参数被正确忽略",
          ):
              print(line)
          return elapsed_time, True
      except Exception as e:
          # Any failure (call or reply handling) is reported, not propagated.
          elapsed_time = time.time() - began
          print(f"❌ 调用失败: {e}")
          return elapsed_time, False
  async def test_multiple_calls_consistency():
      """Call the Qwen3.5 35B model several times with ``enable_thinking=False``.

      Makes three sequential calls, prints the outcome and latency of each,
      then prints latency statistics. The statistics cover successful calls
      only: a call that fails quickly says nothing about model response time
      and would otherwise drag the average below the 60s threshold.

      Returns:
          bool: True only when every call succeeded.
      """
      print("\n" + "=" * 60)
      print("测试 4: Qwen3.5 多次调用一致性测试 (enable_thinking=False)")
      print("=" * 60)

      model_name = "qwen3_5_35b_a3b"
      num_calls = 3
      success_times = []  # latencies of the calls that returned normally

      for i in range(num_calls):
          start_time = time.time()
          try:
              # The reply content is irrelevant here; only success and
              # latency are recorded.
              await generate_model_client.get_model_generate_invoke(
                  trace_id=f"test_consistency_{i}_{int(time.time())}",
                  system_prompt=TEST_SYSTEM_PROMPT,
                  user_prompt=TEST_USER_PROMPT,
                  model_name=model_name,
                  enable_thinking=False,
                  timeout=120,
              )
          except Exception as e:
              elapsed_time = time.time() - start_time
              print(f" 调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 失败: {e}")
          else:
              elapsed_time = time.time() - start_time
              success_times.append(elapsed_time)
              print(f" 调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 成功")

      success_count = len(success_times)
      print("\n 统计结果:")
      print(f" - 成功次数: {success_count}/{num_calls}")

      if success_times:
          avg_time = sum(success_times) / success_count
          print(f" - 平均响应时间: {avg_time:.2f}s")
          print(f" - 最快: {min(success_times):.2f}s, 最慢: {max(success_times):.2f}s")
          # Sanity check: with thinking disabled the calls are expected to
          # finish within 60s on average.
          if avg_time < 60:
              print(" ✅ 平均响应时间在合理范围内(<60s)")
          else:
              print(" ⚠️ 平均响应时间较长(>=60s),可能思考模式未正确禁用")
      else:
          # No successful call: there is no meaningful latency to report.
          print(" ⚠️ 没有成功的调用,无法统计响应时间")

      return success_count == num_calls
  async def test_qwen35_122b_model():
      """Call the Qwen3.5 122B model with ``enable_thinking=False``.

      Prints the latency and the first 100 characters of the reply, and flags
      the latency as reasonable when it is under 100 seconds (informational
      only).

      Returns:
          tuple: ``(elapsed_seconds, succeeded)``. ``succeeded`` is False when
          the call, or the handling of its reply, raised an exception.
      """
      rule = "=" * 60
      print("\n" + rule)
      print("测试 5: Qwen3.5 122B - 禁用思考模式")
      print(rule)

      model_name = "qwen3_5_122b_a10b"
      began = time.time()
      try:
          reply = await generate_model_client.get_model_generate_invoke(
              trace_id=f"test_122b_disabled_{int(time.time())}",
              system_prompt=TEST_SYSTEM_PROMPT,
              user_prompt=TEST_USER_PROMPT,
              model_name=model_name,
              enable_thinking=False,
              timeout=120,
          )
          elapsed_time = time.time() - began

          print("✅ 调用成功")
          print(f" 模型: {model_name}")
          print(f" 响应时间: {elapsed_time:.2f}s")
          print(f" 响应内容: {reply[:100]}...")

          verdict = (
              " ✅ 响应时间合理(<100s),思考模式可能已禁用"
              if elapsed_time < 100
              else " ⚠️ 响应时间较长(>=100s)"
          )
          print(verdict)
          return elapsed_time, True
      except Exception as e:
          # Any failure (call or reply handling) is reported, not propagated.
          elapsed_time = time.time() - began
          print(f"❌ 调用失败: {e}")
          return elapsed_time, False
  async def run_all_tests():
      """Run every test case in order and print a summary.

      Runs the five test coroutines sequentially, prints a pass/fail line for
      each, then compares the latency of the thinking-disabled and
      thinking-enabled 35B calls. The comparison is made only when both of
      those calls succeeded, because the elapsed time of a failed call does
      not measure model latency and would give a meaningless ratio.

      Returns:
          bool: True when every test case reported success.
      """
      print("\n" + "=" * 70)
      print(" 统一模型调用 - 思考模式配置开关测试")
      print("=" * 70)
      print("\n测试说明:")
      print("- 本测试验证 generate_model_client.get_model_generate_invoke()")
      print("- 的 enable_thinking 参数是否正确控制 Qwen3.5 模型的思考模式")
      print("- 预期: enable_thinking=False 时响应时间显著缩短(<60s)")

      results = {}

      # Test 1: Qwen3.5 35B with thinking disabled.
      time1, success1 = await test_qwen35_with_thinking_disabled()
      results["qwen35_35b_disabled"] = {"time": time1, "success": success1}

      # Test 2: Qwen3.5 35B with thinking enabled (baseline for comparison).
      print("\n [对比测试] 启用思考模式 - 预计耗时 60-300s,请耐心等待...")
      time2, success2 = await test_qwen35_with_thinking_enabled()
      results["qwen35_35b_enabled"] = {"time": time2, "success": success2}

      # Test 3: a non-Qwen3.5 model.
      time3, success3 = await test_non_qwen35_model()
      results["non_qwen35"] = {"time": time3, "success": success3}

      # Test 4: repeated calls; this case reports success only, no timing.
      success4 = await test_multiple_calls_consistency()
      results["consistency"] = {"success": success4}

      # Test 5: Qwen3.5 122B with thinking disabled.
      time5, success5 = await test_qwen35_122b_model()
      results["qwen35_122b_disabled"] = {"time": time5, "success": success5}

      # Summary of all results.
      print("\n" + "=" * 70)
      print(" 测试结果汇总")
      print("=" * 70)
      for test_name, result in results.items():
          status = "✅ 通过" if result.get("success") else "❌ 失败"
          time_info = f" ({result['time']:.2f}s)" if "time" in result else ""
          print(f" {test_name}: {status}{time_info}")

      # Latency comparison between thinking disabled and enabled.
      print("\n" + "-" * 70)
      print(" 思考模式性能对比")
      print("-" * 70)
      disabled = results["qwen35_35b_disabled"]
      enabled = results["qwen35_35b_enabled"]
      if disabled["success"] and enabled["success"]:
          disabled_time = disabled["time"]
          enabled_time = enabled["time"]
          # Guard against a zero duration to avoid ZeroDivisionError.
          speedup = enabled_time / disabled_time if disabled_time > 0 else 0
          print(f" 禁用思考模式: {disabled_time:.2f}s")
          print(f" 启用思考模式: {enabled_time:.2f}s")
          print(f" 性能差异: {speedup:.1f}倍")
          if speedup > 3:
              print(f" ✅ 思考模式开关效果显著,禁用后提速 {speedup:.1f} 倍")
          else:
              print(" ℹ️ 性能差异不明显")
      else:
          # At least one of the two calls failed, so its elapsed time is not
          # a model latency and the ratio would be misleading.
          print(" ℹ️ 对比测试未全部成功,跳过性能对比")

      all_passed = all(r.get("success") for r in results.values())
      print("\n" + "=" * 70)
      if all_passed:
          print(" 🎉 所有测试通过!思考模式配置开关工作正常")
      else:
          print(" ⚠️ 部分测试失败,请检查配置")
      print("=" * 70 + "\n")
      return all_passed
  if __name__ == "__main__":
      # Script entry point: run the suite and map the outcome to an exit code
      # (0 when every test passed, 1 on failure, interruption or error).
      try:
          success = asyncio.run(run_all_tests())
      except KeyboardInterrupt:
          print("\n\n测试被用户中断")
          exit_code = 1
      except Exception as e:
          print(f"\n\n测试运行出错: {e}")
          import traceback

          traceback.print_exc()
          exit_code = 1
      else:
          exit_code = 0 if success else 1
      sys.exit(exit_code)