test_thinking_mode.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. 测试统一模型调用的思考模式配置开关
  5. 测试内容:
  6. 1. 测试 enable_thinking=False(默认)时,Qwen3.5 模型响应时间是否显著缩短
  7. 2. 测试 enable_thinking=True 时,Qwen3.5 模型响应是否包含思考过程
  8. 3. 测试非 Qwen3.5 模型不受 enable_thinking 参数影响
  9. 运行方式:
  10. cd D:/wx_work/sichuan_luqiao/LQAgentPlatform
  11. python utils_test/Model_Test/test_thinking_mode.py
  12. """
  13. import asyncio
  14. import time
  15. import sys
  16. from pathlib import Path
  17. # Put the directory three `.parent` steps above this file at the front of sys.path (presumably the project root, so the `foundation` import below resolves).
  18. project_root = Path(__file__).parent.parent.parent
  19. sys.path.insert(0, str(project_root))
  20. from foundation.ai.agent.generate.model_generate import generate_model_client
  21. # Short system/user prompts shared by the quick-response test cases.
  22. TEST_SYSTEM_PROMPT = "你是一个 helpful 的 AI 助手,请简洁回答。"
  23. TEST_USER_PROMPT = "请用一句话回答:1+1等于几?"
  24. async def test_qwen35_with_thinking_disabled():
  25. """测试 Qwen3.5 模型 - 禁用思考模式(默认)"""
  26. print("\n" + "=" * 60)
  27. print("测试 1: Qwen3.5 35B - 禁用思考模式 (enable_thinking=False)")
  28. print("=" * 60)
  29. model_name = "qwen3_5_35b_a3b"
  30. start_time = time.time()
  31. try:
  32. response = await generate_model_client.get_model_generate_invoke(
  33. trace_id=f"test_thinking_disabled_{int(time.time())}",
  34. system_prompt=TEST_SYSTEM_PROMPT,
  35. user_prompt=TEST_USER_PROMPT,
  36. model_name=model_name,
  37. enable_thinking=False, # 显式禁用思考模式
  38. timeout=120
  39. )
  40. elapsed_time = time.time() - start_time
  41. print(f"✅ 调用成功")
  42. print(f" 模型: {model_name}")
  43. print(f" 响应时间: {elapsed_time:.2f}s")
  44. print(f" 响应内容: {response[:100]}...")
  45. # 验证响应中不包含思考标记(如 <think> 或 思考过程)
  46. has_think_tag = "<think>" in response or "</think>" in response
  47. has_thinking_marker = "思考" in response and "过程" in response
  48. if has_think_tag or has_thinking_marker:
  49. print(f" ⚠️ 警告: 响应可能包含思考过程标记")
  50. else:
  51. print(f" ✅ 响应不包含思考过程标记")
  52. return elapsed_time, True
  53. except Exception as e:
  54. elapsed_time = time.time() - start_time
  55. print(f"❌ 调用失败: {e}")
  56. return elapsed_time, False
  57. async def test_qwen35_with_thinking_enabled():
  58. """测试 Qwen3.5 模型 - 启用思考模式"""
  59. print("\n" + "=" * 60)
  60. print("测试 2: Qwen3.5 35B - 启用思考模式 (enable_thinking=True)")
  61. print("=" * 60)
  62. model_name = "qwen3_5_35b_a3b"
  63. start_time = time.time()
  64. # 使用需要推理的问题来激发思考过程
  65. reasoning_prompt = "请详细解释为什么 1+1=2?请展示你的思考过程。"
  66. try:
  67. response = await generate_model_client.get_model_generate_invoke(
  68. trace_id=f"test_thinking_enabled_{int(time.time())}",
  69. system_prompt="你是一个善于思考的AI助手,请详细展示你的推理过程。",
  70. user_prompt=reasoning_prompt,
  71. model_name=model_name,
  72. enable_thinking=True, # 显式启用思考模式
  73. timeout=300 # 思考模式可能需要更长时间
  74. )
  75. elapsed_time = time.time() - start_time
  76. print(f"✅ 调用成功")
  77. print(f" 模型: {model_name}")
  78. print(f" 响应时间: {elapsed_time:.2f}s")
  79. print(f" 响应长度: {len(response)} 字符")
  80. # 显示响应的前500字符和后200字符
  81. display_len = min(500, len(response))
  82. print(f" 响应开头: {response[:display_len]}...")
  83. if len(response) > 700:
  84. print(f" 响应结尾: ...{response[-200:]}")
  85. # 检查是否包含思考过程特征
  86. has_think_tag = "<think>" in response or "</think>" in response
  87. has_reasoning_markers = any(marker in response for marker in [
  88. "思考", "推理", "首先", "然后", "第一步", "第二步",
  89. "让我", "我需要", "我们来", "分析一下"
  90. ])
  91. is_long_response = len(response) > 800 # 思考模式通常产生更长响应
  92. print(f"\n 思考模式检测:")
  93. print(f" - 包含 <think> 标签: {'是' if has_think_tag else '否'}")
  94. print(f" - 包含推理标记词: {'是' if has_reasoning_markers else '否'}")
  95. print(f" - 响应较长 (>800字符): {'是' if is_long_response else '否'}")
  96. if has_think_tag or (has_reasoning_markers and is_long_response):
  97. print(f" ✅ 思考模式似乎已生效")
  98. else:
  99. print(f" ℹ️ 思考模式特征不明显,但调用已返回")
  100. return elapsed_time, True
  101. except Exception as e:
  102. elapsed_time = time.time() - start_time
  103. print(f"❌ 调用失败: {e}")
  104. return elapsed_time, False
  105. async def test_non_qwen35_model():
  106. """测试非 Qwen3.5 模型(如 Doubao)不受 enable_thinking 影响"""
  107. print("\n" + "=" * 60)
  108. print("测试 3: 非 Qwen3.5 模型 - enable_thinking 参数不应产生影响")
  109. print("=" * 60)
  110. model_name = "doubao-1.5-pro-256k" # 非 Qwen3.5 模型
  111. start_time = time.time()
  112. try:
  113. response = await generate_model_client.get_model_generate_invoke(
  114. trace_id=f"test_non_qwen35_{int(time.time())}",
  115. system_prompt=TEST_SYSTEM_PROMPT,
  116. user_prompt=TEST_USER_PROMPT,
  117. model_name=model_name,
  118. enable_thinking=False, # 对非 Qwen3.5 模型应被忽略
  119. timeout=60
  120. )
  121. elapsed_time = time.time() - start_time
  122. print(f"✅ 调用成功")
  123. print(f" 模型: {model_name}")
  124. print(f" 响应时间: {elapsed_time:.2f}s")
  125. print(f" 响应内容: {response[:100]}...")
  126. print(f" ✅ 非 Qwen3.5 模型正常响应,enable_thinking 参数被正确忽略")
  127. return elapsed_time, True
  128. except Exception as e:
  129. elapsed_time = time.time() - start_time
  130. print(f"❌ 调用失败: {e}")
  131. return elapsed_time, False
  132. async def test_multiple_calls_consistency():
  133. """测试多次调用的一致性(验证 enable_thinking=False 稳定生效)"""
  134. print("\n" + "=" * 60)
  135. print("测试 4: Qwen3.5 多次调用一致性测试 (enable_thinking=False)")
  136. print("=" * 60)
  137. model_name = "qwen3_5_35b_a3b"
  138. call_times = []
  139. success_count = 0
  140. num_calls = 3
  141. for i in range(num_calls):
  142. start_time = time.time()
  143. try:
  144. response = await generate_model_client.get_model_generate_invoke(
  145. trace_id=f"test_consistency_{i}_{int(time.time())}",
  146. system_prompt=TEST_SYSTEM_PROMPT,
  147. user_prompt=TEST_USER_PROMPT,
  148. model_name=model_name,
  149. enable_thinking=False,
  150. timeout=120
  151. )
  152. elapsed_time = time.time() - start_time
  153. call_times.append(elapsed_time)
  154. success_count += 1
  155. print(f" 调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 成功")
  156. except Exception as e:
  157. elapsed_time = time.time() - start_time
  158. call_times.append(elapsed_time)
  159. print(f" 调用 {i+1}/{num_calls}: {elapsed_time:.2f}s - 失败: {e}")
  160. if call_times:
  161. avg_time = sum(call_times) / len(call_times)
  162. min_time = min(call_times)
  163. max_time = max(call_times)
  164. print(f"\n 统计结果:")
  165. print(f" - 成功次数: {success_count}/{num_calls}")
  166. print(f" - 平均响应时间: {avg_time:.2f}s")
  167. print(f" - 最快: {min_time:.2f}s, 最慢: {max_time:.2f}s")
  168. # 验证响应时间合理性(禁用思考模式应在 60s 内完成)
  169. if avg_time < 60:
  170. print(f" ✅ 平均响应时间在合理范围内(<60s)")
  171. else:
  172. print(f" ⚠️ 平均响应时间较长(>=60s),可能思考模式未正确禁用")
  173. return success_count == num_calls
  174. async def test_qwen35_122b_model():
  175. """测试 Qwen3.5 122B 大模型"""
  176. print("\n" + "=" * 60)
  177. print("测试 5: Qwen3.5 122B - 禁用思考模式")
  178. print("=" * 60)
  179. model_name = "qwen3_5_122b_a10b"
  180. start_time = time.time()
  181. try:
  182. response = await generate_model_client.get_model_generate_invoke(
  183. trace_id=f"test_122b_disabled_{int(time.time())}",
  184. system_prompt=TEST_SYSTEM_PROMPT,
  185. user_prompt=TEST_USER_PROMPT,
  186. model_name=model_name,
  187. enable_thinking=False,
  188. timeout=120
  189. )
  190. elapsed_time = time.time() - start_time
  191. print(f"✅ 调用成功")
  192. print(f" 模型: {model_name}")
  193. print(f" 响应时间: {elapsed_time:.2f}s")
  194. print(f" 响应内容: {response[:100]}...")
  195. if elapsed_time < 100:
  196. print(f" ✅ 响应时间合理(<100s),思考模式可能已禁用")
  197. else:
  198. print(f" ⚠️ 响应时间较长(>=100s)")
  199. return elapsed_time, True
  200. except Exception as e:
  201. elapsed_time = time.time() - start_time
  202. print(f"❌ 调用失败: {e}")
  203. return elapsed_time, False
  204. async def run_all_tests():
  205. """运行所有测试"""
  206. print("\n" + "=" * 70)
  207. print(" 统一模型调用 - 思考模式配置开关测试")
  208. print("=" * 70)
  209. print("\n测试说明:")
  210. print("- 本测试验证 generate_model_client.get_model_generate_invoke()")
  211. print("- 的 enable_thinking 参数是否正确控制 Qwen3.5 模型的思考模式")
  212. print("- 预期: enable_thinking=False 时响应时间显著缩短(<60s)")
  213. results = {}
  214. # 测试 1: Qwen3.5 35B 禁用思考模式
  215. time1, success1 = await test_qwen35_with_thinking_disabled()
  216. results["qwen35_35b_disabled"] = {"time": time1, "success": success1}
  217. # 测试 2: Qwen3.5 35B 启用思考模式(对比测试)
  218. print("\n [对比测试] 启用思考模式 - 预计耗时 60-300s,请耐心等待...")
  219. time2, success2 = await test_qwen35_with_thinking_enabled()
  220. results["qwen35_35b_enabled"] = {"time": time2, "success": success2}
  221. # 测试 3: 非 Qwen3.5 模型
  222. time3, success3 = await test_non_qwen35_model()
  223. results["non_qwen35"] = {"time": time3, "success": success3}
  224. # 测试 4: 多次调用一致性
  225. success4 = await test_multiple_calls_consistency()
  226. results["consistency"] = {"success": success4}
  227. # 测试 5: Qwen3.5 122B 大模型
  228. time5, success5 = await test_qwen35_122b_model()
  229. results["qwen35_122b_disabled"] = {"time": time5, "success": success5}
  230. # 汇总结果
  231. print("\n" + "=" * 70)
  232. print(" 测试结果汇总")
  233. print("=" * 70)
  234. for test_name, result in results.items():
  235. status = "✅ 通过" if result.get("success") else "❌ 失败"
  236. time_info = f" ({result['time']:.2f}s)" if "time" in result else ""
  237. print(f" {test_name}: {status}{time_info}")
  238. # 思考模式对比分析
  239. if "qwen35_35b_disabled" in results and "qwen35_35b_enabled" in results:
  240. print("\n" + "-" * 70)
  241. print(" 思考模式性能对比")
  242. print("-" * 70)
  243. disabled_time = results["qwen35_35b_disabled"]["time"]
  244. enabled_time = results["qwen35_35b_enabled"]["time"]
  245. speedup = enabled_time / disabled_time if disabled_time > 0 else 0
  246. print(f" 禁用思考模式: {disabled_time:.2f}s")
  247. print(f" 启用思考模式: {enabled_time:.2f}s")
  248. print(f" 性能差异: {speedup:.1f}倍")
  249. if speedup > 3:
  250. print(f" ✅ 思考模式开关效果显著,禁用后提速 {speedup:.1f} 倍")
  251. else:
  252. print(f" ℹ️ 性能差异不明显")
  253. all_passed = all(r.get("success") for r in results.values())
  254. print("\n" + "=" * 70)
  255. if all_passed:
  256. print(" 🎉 所有测试通过!思考模式配置开关工作正常")
  257. else:
  258. print(" ⚠️ 部分测试失败,请检查配置")
  259. print("=" * 70 + "\n")
  260. return all_passed
  261. if __name__ == "__main__":
  262. try:
  263. success = asyncio.run(run_all_tests())
  264. sys.exit(0 if success else 1)
  265. except KeyboardInterrupt:
  266. print("\n\n测试被用户中断")
  267. sys.exit(1)
  268. except Exception as e:
  269. print(f"\n\n测试运行出错: {e}")
  270. import traceback
  271. traceback.print_exc()
  272. sys.exit(1)