|
|
@@ -0,0 +1,554 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+测试 qwen3.5 模型在思考模式与非思考模式下的输出区别
|
|
|
+
|
|
|
+测试场景:目录完整性审查(catalog_integrity_review)
|
|
|
+模型:shutian_qwen3_5_122b
|
|
|
+
|
|
|
+运行方式:
|
|
|
+ cd D:/wx_work/sichuan_luqiao/LQAgentPlatform
|
|
|
+ python utils_test/Model_Test/test_thinking_vs_nonthinking.py
|
|
|
+"""
|
|
|
+
|
|
|
+import asyncio
|
|
|
+import time
|
|
|
+import sys
|
|
|
+import json
|
|
|
+from pathlib import Path
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
# Add the project root (three directory levels above this file) to the front
# of sys.path. This must run before the project-local `foundation` imports
# below so that they can be resolved.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))
|
|
|
+
|
|
|
+from foundation.ai.agent.generate.model_generate import generate_model_client
|
|
|
+from foundation.observability.logger.loggering import review_logger as logger
|
|
|
+
|
|
|
+
|
|
|
# Actual catalog text used by the test (simulates an OCR recognition result).
# Each entry is one chapter: the chapter title followed by its second-level
# headings; chapters are separated by a single blank line.
TEST_ACTUAL_CATALOG = "\n\n".join((
    "第一章 编制依据\n一、法律法规\n二、标准规范\n三、文件制度",
    "第二章 工程概况\n一、设计概况\n二、工程地质与水文气象\n三、周边环境",
    "第三章 施工计划\n一、施工进度计划\n二、施工材料计划",
    "第四章 施工工艺技术\n一、主要施工方法概述\n二、技术参数\n三、工艺流程",
    "第五章 安全保证措施\n一、安全保证体系\n二、组织保证措施",
    "第六章 质量保证措施\n一、质量保证体系",
    "第七章 施工管理及作业人员配备与分工\n一、施工管理人员",
    "第八章 验收要求\n一、验收标准",
))
|
|
|
+
|
|
|
# Standard catalog template that the actual catalog is compared against.
# Each entry is one chapter: the chapter title followed by its second-level
# headings; chapters are separated by a single blank line.
STANDARD_CATALOG = "\n\n".join((
    "第一章 编制依据\n一、法律法规\n二、标准规范\n三、文件制度\n四、编制原则\n五、编制范围",
    "第二章 工程概况\n一、设计概况\n二、工程地质与水文气象\n三、周边环境\n四、施工平面及立面布置"
    "\n五、施工要求和技术保证条件\n六、风险辨识与分级\n七、参建各方责任主体单位",
    "第三章 施工计划\n一、施工进度计划\n二、施工材料计划\n三、施工设备计划\n四、劳动力计划"
    "\n五、安全生产费用使用计划",
    "第四章 施工工艺技术\n一、主要施工方法概述\n二、技术参数\n三、工艺流程\n四、施工准备"
    "\n五、施工方法及操作要求\n六、检查要求",
    "第五章 安全保证措施\n一、安全保证体系\n二、组织保证措施\n三、技术保证措施\n四、监测监控措施"
    "\n五、应急处置措施",
    "第六章 质量保证措施\n一、质量保证体系\n二、质量目标\n三、工程创优规划\n四、质量控制程序与具体措施",
    "第七章 环境保证措施\n一、环境保证体系\n二、环境保护组织机构\n三、环境保护及文明施工措施",
    "第八章 施工管理及作业人员配备与分工\n一、施工管理人员\n二、专职安全生产管理人员\n三、其他作业人员",
    "第九章 验收要求\n一、验收标准\n二、验收程序\n三、验收内容\n四、验收时间\n五、验收人员",
    "第十章 其他资料\n一、计算书\n二、相关施工图纸\n三、附图附表\n四、编制及审核人员情况",
))
|
|
|
+
|
|
|
# System prompt (updated; thinking is allowed). Passed as `system_prompt` to
# the model client in test_with_mode.
SYSTEM_PROMPT = (
    "你是一位施工方案文档审查专家,负责对比实际目录和标准目录,找出缺失项。"
    "请按JSON格式输出最终结果。"
)
|
|
|
+
|
|
|
# User prompt template (updated; thinking is allowed).
# This is a str.format() template: `{actual_catalog}` and `{standard_catalog}`
# are the two placeholders filled in by build_prompt(); every literal brace of
# the JSON example is doubled (`{{` / `}}`) so that format() emits a single
# brace there.
USER_PROMPT_TEMPLATE = """你是一位施工方案文档审查专家。请对比【实际目录】和【标准目录】,找出缺失项。

## 审查原则
1. **语义匹配**:实际目录与标准目录含义相同即算匹配,不要求文字完全一致
2. **常见同义表述**(示例):
 - "编制依据" ≈ "方案编制依据" ≈ "编制原则及依据"
 - "工程概况" ≈ "工程基本情况" ≈ "项目概况"
 - "施工计划" ≈ "施工进度计划" ≈ "施工安排"
 - "法律法规" ≈ "相关法律" ≈ "法规依据"
3. **容错范围**:
 - 一级标题必须严格对应(如"编制依据"不能变成"引用标准")
 - 二级标题允许一定变通,但核心含义必须一致

## 实际目录(来自OCR识别)
```
{actual_catalog}
```

## 标准目录(必须包含的完整结构)
```
{standard_catalog}
```

## 输出规则
1. **一级缺失判定**:实际目录中完全没有对应的章,或章节标题完全不匹配
2. **二级缺失判定**:只有当父级一级目录**存在**时,才检查其下的二级目录是否缺失
3. **重要**:如果某个一级目录缺失,**不要报告**该章节下的二级目录缺失(避免重复提醒)

## 输出要求
**重要:最终答案只输出 JSON,不要添加 markdown 代码块标记(```json)。**

请直接输出 check_completeness 格式的 JSON 结果:
{{
 "details": {{
 "name": "catalog_check",
 "response": [
 {{
 "check_item": "check_completeness",
 "chapter_code": "catalog",
 "check_item_code": "catalog_check_completeness",
 "check_result": {{
 "issue_point": "【一级缺失】xxx",
 "location": "目录页",
 "suggestion": "建议补充'xxx'章节",
 "reason": "简要说明",
 "risk_level": "高风险"
 }},
 "exist_issue": true,
 "risk_info": {{"risk_level": "high"}}
 }}
 ],
 "review_location_label": "目录完整性审查",
 "chapter_code": "catalog"
 }},
 "success": true
}}

**注意**:
- 一级缺失:risk_level 为 "高风险", risk_info.risk_level 为 "high"
- 二级缺失:risk_level 为 "中风险", risk_info.risk_level 为 "medium"
- 如无缺失,response 中放一条 "issue_point": "【目录完整】一二级目录结构完整", "exist_issue": false
"""
|
|
|
+
|
|
|
+
|
|
|
def build_prompt(actual_catalog: str, standard_catalog: str) -> str:
    """Render the user prompt for one catalog comparison.

    Args:
        actual_catalog: Catalog text substituted for ``{actual_catalog}``.
        standard_catalog: Catalog text substituted for ``{standard_catalog}``.

    Returns:
        USER_PROMPT_TEMPLATE with both placeholders filled in.
    """
    placeholders = {
        "actual_catalog": actual_catalog,
        "standard_catalog": standard_catalog,
    }
    return USER_PROMPT_TEMPLATE.format(**placeholders)
|
|
|
+
|
|
|
+
|
|
|
async def test_with_mode(mode_name: str, enable_thinking: bool, output_dir: Path) -> dict:
    """Call the model once in the given mode and record what came back.

    The prompt is built from TEST_ACTUAL_CATALOG and STANDARD_CATALOG. On
    success the raw response is written to ``<mode_name>_response.txt`` and the
    analyze_response() result to ``<mode_name>_analysis.json`` inside
    ``output_dir``. Any exception raised in that path is caught, printed and
    written to ``<mode_name>_error.txt`` instead.

    Args:
        mode_name: Label for this run; used in console output, in the trace id
            and as the prefix of the output file names.
        enable_thinking: Forwarded to the model client as ``enable_thinking``.
        output_dir: Existing directory that receives the output files.

    Returns:
        A result dict. It always contains ``success``, ``mode``,
        ``enable_thinking`` and ``elapsed_time``. Successful runs add
        ``response_length``, ``analysis`` and ``response_preview``; failed
        runs add ``error``.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print(f" 测试模式: {mode_name} (enable_thinking={enable_thinking})")
    print(banner)

    model_name = "shutian_qwen3_5_122b"
    trace_id = f"test_{mode_name}_{int(time.time())}"

    # Build the prompt before starting the clock so only the call is timed.
    user_prompt = build_prompt(TEST_ACTUAL_CATALOG, STANDARD_CATALOG)

    started_at = time.time()

    try:
        print(f"⏳ 正在调用模型 {model_name}...")
        print(f" enable_thinking={enable_thinking}")
        print(f" trace_id={trace_id}")

        # timeout=180 is passed straight to the client (presumably seconds —
        # TODO confirm against the client implementation).
        response = await generate_model_client.get_model_generate_invoke(
            trace_id=trace_id,
            system_prompt=SYSTEM_PROMPT,
            user_prompt=user_prompt,
            model_name=model_name,
            enable_thinking=enable_thinking,
            timeout=180,
        )

        elapsed_time = time.time() - started_at

        print("✅ 调用成功")
        print(f" 响应时间: {elapsed_time:.2f}s")
        response_length = len(response)
        print(f" 响应长度: {response_length} 字符")

        # Persist the full raw response behind a short metadata header.
        output_file = output_dir / f"{mode_name}_response.txt"
        header = "".join((
            f"模式: {mode_name}\n",
            f"enable_thinking: {enable_thinking}\n",
            f"模型: {model_name}\n",
            f"响应时间: {elapsed_time:.2f}s\n",
            f"响应长度: {response_length} 字符\n",
            f"trace_id: {trace_id}\n",
            "=" * 70 + "\n",
            "原始响应内容:\n",
        ))
        with open(output_file, 'w', encoding='utf-8') as sink:
            sink.write(header)
            sink.write(response)

        print(f" 完整响应已保存: {output_file}")

        # Extract heuristic features of the response text.
        analysis = analyze_response(response)

        # Persist the analysis next to the raw response.
        analysis_file = output_dir / f"{mode_name}_analysis.json"
        analysis_record = {
            "mode": mode_name,
            "enable_thinking": enable_thinking,
            "model": model_name,
            "elapsed_time": elapsed_time,
            "response_length": response_length,
            "trace_id": trace_id,
            "analysis": analysis,
        }
        with open(analysis_file, 'w', encoding='utf-8') as sink:
            json.dump(analysis_record, sink, ensure_ascii=False, indent=2)

        print(f" 分析结果已保存: {analysis_file}")

        # Long responses are truncated to 500 characters plus an ellipsis.
        if response_length > 500:
            preview = response[:500] + "..."
        else:
            preview = response

        return {
            "success": True,
            "mode": mode_name,
            "enable_thinking": enable_thinking,
            "elapsed_time": elapsed_time,
            "response_length": response_length,
            "analysis": analysis,
            "response_preview": preview,
        }

    except Exception as e:
        elapsed_time = time.time() - started_at
        print(f"❌ 调用失败: {e}")

        # Record the failure so it can be inspected after the run.
        error_file = output_dir / f"{mode_name}_error.txt"
        error_text = "".join((
            f"模式: {mode_name}\n",
            f"enable_thinking: {enable_thinking}\n",
            f"错误信息: {str(e)}\n",
            f"响应时间: {elapsed_time:.2f}s\n",
        ))
        with open(error_file, 'w', encoding='utf-8') as sink:
            sink.write(error_text)

        return {
            "success": False,
            "mode": mode_name,
            "enable_thinking": enable_thinking,
            "elapsed_time": elapsed_time,
            "error": str(e),
        }
|
|
|
+
|
|
|
+
|
|
|
def analyze_response(response: str) -> dict:
    """Collect simple, substring-based features of a model response.

    All checks are plain text heuristics; the response is never parsed as JSON.

    Args:
        response: Raw response text returned by the model.

    Returns:
        A dict with:
          - ``has_thinking_process``: a known "thinking" phrase occurs.
          - ``has_answer_marker``: a known "Answer:" style marker occurs.
          - ``has_json_structure``: a ``{`` occurs before the last ``}``.
          - ``thinking_length``: number of characters before the first ``{``
            (0 when no JSON-like span was found).
          - ``json_start_index``: index of the first ``{`` (-1 when no
            JSON-like span was found).
          - ``detected_patterns``: description of the first matching thinking
            phrase and the first matching answer marker, in that order.
          - ``total_lines``: number of ``\\n``-separated lines.
          - ``answer_line_number``: 1-based number of the first line that
            consists solely of an answer marker; only present if one exists.
    """
    thinking_cues = (
        "Thinking Process:",
        "思考过程",
        "1. **Analyze",
        "1. **分析",
        "让我",
        "我需要",
        "第一步",
        "首先",
    )
    answer_cues = (
        "Answer:\n",
        "Final Answer:\n",
        "**Answer:**\n",
        "**Final Answer:**\n",
    )

    # Only the first cue (in list order) that occurs is reported.
    patterns = []
    thinking_hit = next((cue for cue in thinking_cues if cue in response), None)
    if thinking_hit is not None:
        patterns.append(f"包含标记: {thinking_hit}")

    answer_hit = next((cue for cue in answer_cues if cue in response), None)
    if answer_hit is not None:
        patterns.append(f"答案标记: {answer_hit}")

    # A JSON-like span exists when the first '{' precedes the last '}'.
    first_brace = response.find('{')
    last_brace = response.rfind('}')
    looks_like_json = 0 <= first_brace < last_brace

    report = {
        "has_thinking_process": thinking_hit is not None,
        "has_answer_marker": answer_hit is not None,
        "has_json_structure": looks_like_json,
        # Everything before the first '{' is counted as "thinking" text.
        "thinking_length": first_brace if looks_like_json else 0,
        "json_start_index": first_brace if looks_like_json else -1,
        "detected_patterns": patterns,
    }

    rows = response.split('\n')
    report["total_lines"] = len(rows)

    # Look for a line that is nothing but an answer separator.
    separators = ("Answer:", "Final Answer:", "**Answer:**", "**Final Answer:**")
    for number, row in enumerate(rows, start=1):
        if row.strip() in separators:
            report["answer_line_number"] = number
            break

    return report
|
|
|
+
|
|
|
+
|
|
|
def print_comparison(result_thinking: dict, result_non_thinking: dict):
    """Print a side-by-side comparison of the two runs to stdout.

    The basic table (elapsed time, response length) is always printed. The
    response length of a run whose result dict has no ``response_length``
    (which is the case for failed runs) is shown as ``N/A``. The differences,
    the content-feature table, the detected patterns and the response
    previews are printed only when both runs succeeded; otherwise a warning
    is printed instead.

    Args:
        result_thinking: Result dict of the thinking-enabled run, as returned
            by test_with_mode().
        result_non_thinking: Result dict of the thinking-disabled run.
    """

    def _length_cell(result: dict) -> str:
        # Failed runs carry no 'response_length'; show a placeholder instead
        # of raising KeyError.
        length = result.get('response_length')
        return "N/A" if length is None else f"{length:,}"

    def _yes_no(flag) -> str:
        return '是' if flag else '否'

    rule = '=' * 70
    print(f"\n{rule}")
    print(" 对比分析结果")
    print(rule)

    # Basic information
    print("\n【基本信息对比】")
    print(f"{'指标':<30} {'思考模式':>15} {'非思考模式':>15}")
    print("-" * 70)
    print(f"{'响应时间':<30} {result_thinking['elapsed_time']:>14.2f}s {result_non_thinking['elapsed_time']:>14.2f}s")
    print(f"{'响应长度':<30} {_length_cell(result_thinking):>14} {_length_cell(result_non_thinking):>14}")

    if not (result_thinking.get('success') and result_non_thinking.get('success')):
        print("\n⚠️ 部分测试失败,无法完成完整对比")
        return

    time_diff = result_thinking['elapsed_time'] - result_non_thinking['elapsed_time']
    length_diff = result_thinking['response_length'] - result_non_thinking['response_length']

    # Format the numbers first and pad the resulting strings afterwards: the
    # ',' grouping option is only valid for numeric values, not for strings.
    time_text = f"+{time_diff:.2f}s" if time_diff > 0 else f"{time_diff:.2f}s"
    length_text = f"+{length_diff:,}" if length_diff > 0 else f"{length_diff:,}"
    print(f"{'时间差异':<30} {time_text:>15}")
    print(f"{'长度差异':<30} {length_text:>15}")

    # Content features
    print("\n【内容特征对比】")
    analysis_t = result_thinking['analysis']
    analysis_nt = result_non_thinking['analysis']

    print(f"{'指标':<30} {'思考模式':>15} {'非思考模式':>15}")
    print("-" * 70)
    flag_rows = (
        ('包含思考过程', 'has_thinking_process'),
        ('包含答案标记', 'has_answer_marker'),
        ('包含JSON结构', 'has_json_structure'),
    )
    for label, key in flag_rows:
        print(f"{label:<30} {_yes_no(analysis_t[key]):>15} {_yes_no(analysis_nt[key]):>15}")
    print(f"{'思考部分长度':<30} {analysis_t['thinking_length']:>14,} {analysis_nt['thinking_length']:>14,}")
    print(f"{'总行数':<30} {analysis_t.get('total_lines', 0):>14,} {analysis_nt.get('total_lines', 0):>14,}")

    # Detected patterns
    print("\n【思考模式 - 检测到的特征】")
    for pattern in analysis_t.get('detected_patterns', []):
        print(f" - {pattern}")

    print("\n【非思考模式 - 检测到的特征】")
    for pattern in analysis_nt.get('detected_patterns', []):
        print(f" - {pattern}")

    # Response previews (at most 300 characters each)
    print("\n【思考模式 - 响应预览】")
    print(result_thinking.get('response_preview', 'N/A')[:300])

    print("\n【非思考模式 - 响应预览】")
    print(result_non_thinking.get('response_preview', 'N/A')[:300])
|
|
|
+
|
|
|
+
|
|
|
def save_summary_report(result_thinking: dict, result_non_thinking: dict, output_dir: Path):
    """Write the Markdown summary report and print where it was saved.

    The report is written to ``output_dir / "comparison_report.md"``. The
    performance rows, the content-feature table and the conclusion are only
    filled in when both runs succeeded; otherwise the section headings are
    kept and the conclusion asks the reader to check the logs.

    The conclusion states that the answer was separated by an ``Answer:``
    marker only when the thinking run's analysis actually detected one
    (``has_answer_marker``).

    Args:
        result_thinking: Result dict of the thinking-enabled run, as returned
            by test_with_mode().
        result_non_thinking: Result dict of the thinking-disabled run.
        output_dir: Existing directory that receives the report file.
    """

    def _mark(flag) -> str:
        return '✅ 是' if flag else '❌ 否'

    report_file = output_dir / "comparison_report.md"
    both_ok = bool(result_thinking.get('success') and result_non_thinking.get('success'))

    parts = [
        "# Qwen3.5 思考模式 vs 非思考模式 测试报告\n\n",
        f"测试时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "## 测试配置\n\n",
        "- 模型: `shutian_qwen3_5_122b`\n",
        "- 测试场景: 目录完整性审查 (catalog_integrity_review)\n",
        "- Prompt: 允许思考,要求最终输出 JSON\n\n",
        "## 性能对比\n\n",
        "| 指标 | 思考模式 | 非思考模式 | 差异 |\n",
        "|------|----------|------------|------|\n",
    ]

    if both_ok:
        time_t = result_thinking['elapsed_time']
        time_nt = result_non_thinking['elapsed_time']
        len_t = result_thinking['response_length']
        len_nt = result_non_thinking['response_length']
        parts.append(f"| 响应时间 | {time_t:.2f}s | {time_nt:.2f}s | {time_t - time_nt:+.2f}s |\n")
        parts.append(f"| 响应长度 | {len_t:,} | {len_nt:,} | {len_t - len_nt:+,} |\n")

    parts.append("\n## 内容特征对比\n\n")

    if both_ok:
        analysis_t = result_thinking['analysis']
        analysis_nt = result_non_thinking['analysis']

        parts.append("| 特征 | 思考模式 | 非思考模式 |\n")
        parts.append("|------|----------|------------|\n")
        flag_rows = (
            ('包含思考过程', 'has_thinking_process'),
            ('包含答案标记', 'has_answer_marker'),
            ('包含JSON结构', 'has_json_structure'),
        )
        for label, key in flag_rows:
            parts.append(f"| {label} | {_mark(analysis_t[key])} | {_mark(analysis_nt[key])} |\n")
        parts.append(f"| 思考部分长度 | {analysis_t['thinking_length']:,} | {analysis_nt['thinking_length']:,} |\n")

        parts.append("\n### 思考模式检测到的特征\n\n")
        parts.extend(f"- {pattern}\n" for pattern in analysis_t.get('detected_patterns', []))

        parts.append("\n### 非思考模式检测到的特征\n\n")
        parts.extend(f"- {pattern}\n" for pattern in analysis_nt.get('detected_patterns', []))

    parts.append("\n## 结论\n\n")

    if both_ok:
        analysis_t = result_thinking['analysis']

        if not analysis_t['has_thinking_process']:
            parts.append("⚠️ **思考模式未生效**: 模型未输出明显的思考过程。\n\n")
        elif analysis_t.get('has_answer_marker'):
            parts.append("✅ **思考模式生效**: 模型输出了明显的思考过程,然后通过 `Answer:` 标记分隔最终答案。\n\n")
        else:
            # Do not claim an `Answer:` separator that was not detected.
            parts.append("✅ **思考模式生效**: 模型输出了明显的思考过程。\n\n")

        parts.append("### 建议\n\n")
        parts.append("- 思考模式下响应时间更长,但可能获得更好的推理质量\n")
        parts.append("- 非思考模式下响应更快,适合对延迟敏感的场景\n")
        parts.append("- 当前 `_extract_json` 增强逻辑可以正确处理思考模式的输出\n")
    else:
        parts.append("⚠️ 部分测试失败,请检查日志。\n")

    with open(report_file, 'w', encoding='utf-8') as f:
        f.writelines(parts)

    print(f"\n📄 汇总报告已保存: {report_file}")
|
|
|
+
|
|
|
+
|
|
|
async def main():
    """Run both modes in sequence, print the comparison and save the report.

    A timestamped directory is created under
    ``<project_root>/utils_test/Model_Test/output`` to hold all output files.

    Returns:
        True when both runs reported success, otherwise False.
    """
    rule = "=" * 70
    print(rule)
    print(" Qwen3.5 思考模式 vs 非思考模式 输出对比测试")
    print(rule)
    print("\n测试场景: 目录完整性审查 (catalog_integrity_review)")
    print("模型: shutian_qwen3_5_122b")
    print("Prompt: 允许思考,最终输出 JSON")

    # One output directory per invocation, named after the start time.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path(project_root).joinpath(
        "utils_test", "Model_Test", "output", f"thinking_test_{timestamp}"
    )
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n输出目录: {output_dir}")

    # Thinking mode first.
    result_thinking = await test_with_mode("thinking_enabled", True, output_dir)

    # Short pause between the runs, intended to avoid trace_id conflicts.
    await asyncio.sleep(1)

    # Then non-thinking mode.
    result_non_thinking = await test_with_mode("thinking_disabled", False, output_dir)

    print_comparison(result_thinking, result_non_thinking)
    save_summary_report(result_thinking, result_non_thinking, output_dir)

    print(f"\n{rule}")
    print(" 测试完成")
    print(rule)
    print(f"\n所有结果已保存到: {output_dir}")

    both_succeeded = result_thinking['success'] and result_non_thinking['success']
    return both_succeeded
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Exit status: 0 only when main() reports that both runs succeeded;
    # 1 on failure, user interruption or an unexpected error.
    exit_code = 1
    try:
        exit_code = 0 if asyncio.run(main()) else 1
    except KeyboardInterrupt:
        print("\n\n测试被用户中断")
    except Exception as e:
        print(f"\n\n测试运行出错: {e}")
        import traceback
        traceback.print_exc()
    sys.exit(exit_code)
|