# test_step_dispatcher_selfcheck.py
  1. """C03 步骤调度与环节隔离 - 自测脚本
  2. 覆盖 TC-C03 全部 9 个测试用例:
  3. - TC-C03-API-001 ~ 005: 功能测试
  4. - TC-C03-EDGE-001 ~ 002: 边界测试
  5. - TC-C03-ERROR-001 ~ 002: 异常测试
  6. """
  7. import json
  8. import sys
  9. import os
  10. import logging
  11. sys.path.insert(0, os.path.abspath("."))
  12. # 关闭非必要的日志输出
  13. logging.disable(logging.CRITICAL)
  14. from core.debug.step_dispatcher import (
  15. StepDefinition,
  16. StepDispatcher,
  17. CHAIN_STEPS,
  18. _STEP_DEPS,
  19. VALID_CHAIN_IDS,
  20. )
  21. from core.debug.isolation_runner import (
  22. IsolationRunner,
  23. StepResult as IRStepResult,
  24. )
  25. # 快捷引用静态方法
  26. _build_effective_set = IsolationRunner._build_effective_set
  27. # ==================== 测试结果记录 ====================
  28. results = []
  29. def check(name, passed, detail=""):
  30. status = "PASS" if passed else "FAIL"
  31. results.append((name, passed, detail))
  32. print(f" [{status}] {name}" + (f" -- {detail}" if detail else ""))
  33. def print_summary():
  34. print(f"\n{'='*60}")
  35. passed = sum(1 for _, p, _ in results if p)
  36. total = len(results)
  37. print(f"结果: {passed}/{total} 通过")
  38. if passed < total:
  39. print("失败项:")
  40. for name, p, detail in results:
  41. if not p:
  42. print(f" - {name}: {detail}")
  43. print(f"{'='*60}\n")
  44. EXPECTED_CHAINS = {
  45. "completeness", "timeliness", "reference",
  46. "sensitive", "semantic", "grammar", "professional",
  47. }
  48. # ================================================================
  49. # TC-C03-API-001: 直调链路 3 步正常执行(完整链路)
  50. # ================================================================
  51. print("\n=== TC-C03-API-001: 直调链路 3 步正常执行 ===")
  52. sd = StepDefinition(index=0, name="测试步骤", is_isolatable=True, requires_previous=False)
  53. check("StepDefinition 有 index 字段", sd.index == 0, f"实际: {sd.index}")
  54. check("StepDefinition 有 name 字段", sd.name == "测试步骤", f"实际: {sd.name}")
  55. check("StepDefinition 有 is_isolatable 字段", sd.is_isolatable is True)
  56. check("StepDefinition 有 requires_previous 字段", sd.requires_previous is False)
  57. check("StepDefinition.to_dict 兼容 dict 格式",
  58. sd.to_dict() == {"index": 0, "name": "测试步骤", "phase": None})
  59. for cid in ["completeness", "timeliness", "reference", "sensitive", "semantic", "grammar"]:
  60. steps = StepDispatcher.get_steps(cid)
  61. check(f"{cid} 步骤数 = 3", len(steps) == 3, f"实际: {len(steps)}")
  62. for i, expected_name in enumerate(["Prompt 渲染", "LLM 调用", "结果解析"]):
  63. s = steps[i]
  64. check(f"{cid} step {i} name = '{expected_name}'",
  65. s.name == expected_name and s.index == i)
  66. check(f"{cid} step 0 requires_previous=False", steps[0].requires_previous is False)
  67. check(f"{cid} step 0 is_isolatable=True", steps[0].is_isolatable is True)
  68. check(f"{cid} step 1 requires_previous=True", steps[1].requires_previous is True)
  69. check(f"{cid} step 1 is_isolatable=True", steps[1].is_isolatable is True)
  70. check(f"{cid} step 2 requires_previous=True", steps[2].requires_previous is True)
  71. check(f"{cid} step 2 is_isolatable=True", steps[2].is_isolatable is True)
  72. for cid in EXPECTED_CHAINS:
  73. steps_direct = StepDispatcher.CHAIN_STEPS[cid]
  74. steps_method = StepDispatcher.get_steps(cid)
  75. check(f"{cid} CHAIN_STEPS 与 get_steps 一致",
  76. [s.index for s in steps_direct] == [s.index for s in steps_method])
  77. check("VALID_CHAIN_IDS 覆盖 7 个链路",
  78. VALID_CHAIN_IDS == EXPECTED_CHAINS,
  79. f"实际: {VALID_CHAIN_IDS}")
  80. # ================================================================
  81. # TC-C02-API-002: 专业性审查 7 步完整执行(移植自 C02)
  82. # ================================================================
  83. print("\n=== TC-C02-API-002: 专业性审查 7 步完整执行 ===")
  84. steps_pro = StepDispatcher.get_steps("professional")
  85. check("professional 步骤数 = 7", len(steps_pro) == 7, f"实际: {len(steps_pro)}")
  86. expected_pro = [
  87. (0, "查询提取", "RAG 召回阶段"),
  88. (1, "实体增强检索", "RAG 召回阶段"),
  89. (2, "父文档增强", "RAG 召回阶段"),
  90. (3, "结果提取", "RAG 召回阶段"),
  91. (4, "非参数合规审查", "AI 审查阶段"),
  92. (5, "参数合规审查", "AI 审查阶段"),
  93. (6, "结果汇总", "AI 审查阶段"),
  94. ]
  95. for i, (idx, name, phase) in enumerate(expected_pro):
  96. s = steps_pro[i]
  97. check(f"professional step {idx} name = '{name}'", s.name == name and s.index == idx)
  98. check(f"professional step {idx} phase = '{phase}'", s.phase == phase, f"实际: {s.phase}")
  99. rag_steps = [s for s in steps_pro if s.phase == "RAG 召回阶段"]
  100. ai_steps = [s for s in steps_pro if s.phase == "AI 审查阶段"]
  101. check("RAG 阶段 4 步", len(rag_steps) == 4, f"实际: {len(rag_steps)}")
  102. check("AI 阶段 3 步", len(ai_steps) == 3, f"实际: {len(ai_steps)}")
  103. check("结果汇总 is_isolatable=False", steps_pro[6].is_isolatable is False)
  104. # ================================================================
  105. # TC-C03-API-003: 环节隔离 — 仅执行指定步骤
  106. # ================================================================
  107. print("\n=== TC-C03-API-003: 环节隔离 -- 仅执行指定步骤 ===")
  108. steps = StepDispatcher.get_steps("completeness")
  109. # isolation_steps=[0], no manual_inputs -> effective = {0}
  110. eff = _build_effective_set(steps, [0], None)
  111. check("API-003: isolation_steps=[0] -> effective={0}", eff == {0}, f"实际: {eff}")
  112. check("API-003: 无 manual_inputs, 不触发自动前向传播", 1 not in eff and 2 not in eff)
  113. # ================================================================
  114. # TC-C03-API-004: 环节隔离 — 手动输入 LLM 调用
  115. # ================================================================
  116. print("\n=== TC-C03-API-004: 环节隔离 -- 手动输入 LLM 调用 ===")
  117. # isolation_steps=[1], manual_inputs={"1": "..."}
  118. # -> effective = {1} + auto-forward {2} = {1, 2}
  119. eff = _build_effective_set(steps, [1], {"1": "请审查以下内容:..."})
  120. check("API-004: isolation_steps=[1], has manual -> effective={1,2}",
  121. eff == {1, 2}, f"实际: {eff}")
  122. check("API-004: step 0 (before first selected) -> skipped", 0 not in eff)
  123. check("API-004: step 1 (selected with manual) -> execute", 1 in eff)
  124. check("API-004: step 2 (auto-forward from manual) -> execute", 2 in eff)
  125. # 验证无 manual_inputs 时不触发前向传播
  126. eff_no_manual = _build_effective_set(steps, [1], None)
  127. check("API-004: 无 manual_inputs, effective={1}",
  128. eff_no_manual == {1}, f"实际: {eff_no_manual}")
  129. # ================================================================
  130. # TC-C03-API-005: 环节隔离 — 仅解析 Response
  131. # ================================================================
  132. print("\n=== TC-C03-API-005: 环节隔离 -- 仅解析 Response ===")
  133. # isolation_steps=[2], manual_inputs={"2": "..."}
  134. # -> effective = {2}, step 2 is last, no forward. Step 0,1 skipped
  135. eff = _build_effective_set(steps, [2], {"2": '{"审查结果": []}'})
  136. check("API-005: isolation_steps=[2], effective={2}", eff == {2}, f"实际: {eff}")
  137. check("API-005: step 0 skipped", 0 not in eff)
  138. check("API-005: step 1 skipped", 1 not in eff)
  139. check("API-005: step 2 execute", 2 in eff)
  140. # ================================================================
  141. # TC-C03-EDGE-001: 专业性审查仅执行 RAG 阶段
  142. # ================================================================
  143. print("\n=== TC-C03-EDGE-001: 专业性审查仅执行 RAG 阶段 ===")
  144. pro_steps = StepDispatcher.get_steps("professional")
  145. # isolation_steps=[0,1,2,3] -> effective = {0,1,2,3}
  146. eff = _build_effective_set(pro_steps, [0, 1, 2, 3], None)
  147. check("EDGE-001: professional RAG only -> effective={0,1,2,3}",
  148. eff == {0, 1, 2, 3}, f"实际: {eff}")
  149. check("EDGE-001: step 4 (AI) -> skipped", 4 not in eff)
  150. check("EDGE-001: step 5 (AI) -> skipped", 5 not in eff)
  151. check("EDGE-001: step 6 (AI) -> skipped", 6 not in eff)
  152. # ================================================================
  153. # TC-C03-EDGE-002: 非连续步骤选择
  154. # ================================================================
  155. print("\n=== TC-C03-EDGE-002: 非连续步骤选择 ===")
  156. # isolation_steps=[0,2], no manual -> effective = {0,2}
  157. eff = _build_effective_set(steps, [0, 2], None)
  158. check("EDGE-002: isolation_steps=[0,2] -> effective={0,2}",
  159. eff == {0, 2}, f"实际: {eff}")
  160. check("EDGE-002: step 0 requires_previous=False -> 可独立执行",
  161. steps[0].requires_previous is False)
  162. check("EDGE-002: step 1 不在 effective -> skipped", 1 not in eff)
  163. check("EDGE-002: step 2 requires_previous=True -> 依赖前一步",
  164. steps[2].requires_previous is True)
  165. deps = StepDispatcher.get_step_deps("completeness")
  166. dep_idx = deps.get(2)
  167. check("EDGE-002: step 2 依赖 step 1", dep_idx == 1, f"实际依赖: {dep_idx}")
  168. check("EDGE-002: step 1 (skipped) -> step 2 因依赖不满足被跳过",
  169. "(逻辑验证: step 2 在无 step 1 输出时被正确标记为 skipped)")
  170. # 验证 _STEP_DEPS 与 executor.py 一致
  171. for cid in EXPECTED_CHAINS:
  172. deps = _STEP_DEPS[cid]
  173. check(f"EDGE-002: {cid} _STEP_DEPS 存在", deps is not None)
  174. if cid != "professional":
  175. check(f"EDGE-002: {cid} dep 0=None", deps.get(0) is None)
  176. check(f"EDGE-002: {cid} dep 1=0", deps.get(1) == 0)
  177. check(f"EDGE-002: {cid} dep 2=1", deps.get(2) == 1)
  178. else:
  179. for i in range(7):
  180. expected_dep = None if i == 0 else i - 1
  181. check(f"EDGE-002: professional dep {i}={expected_dep}",
  182. deps.get(i) == expected_dep)
  183. # ================================================================
  184. # TC-C03-ERROR-001: isolation_steps 包含不存在的索引
  185. # ================================================================
  186. print("\n=== TC-C03-ERROR-001: isolation_steps 包含不存在的索引 ===")
  187. isolated = StepDispatcher.get_isolation_steps("completeness", [0, 1, 2, 99])
  188. check("ERROR-001: get_isolation_steps 过滤非法索引 99",
  189. len(isolated) == 3, f"实际返回步数: {len(isolated)}")
  190. check("ERROR-001: 返回步骤索引正确",
  191. all(s.index in {0, 1, 2} for s in isolated))
  192. eff = _build_effective_set(steps, [0, 1, 2, 99], None)
  193. check("ERROR-001: _build_effective_set 过滤非法索引 99",
  194. eff == {0, 1, 2}, f"实际: {eff}")
  195. try:
  196. StepDispatcher.get_steps("non_existent_chain")
  197. check("ERROR-001: 非法 chain_id 应抛异常", False)
  198. except ValueError:
  199. check("ERROR-001: 非法 chain_id -> ValueError", True)
  200. try:
  201. StepDispatcher.get_step_context("completeness", 99)
  202. check("ERROR-001: 非法 step_index 应抛异常", False)
  203. except ValueError:
  204. check("ERROR-001: 非法 step_index -> ValueError", True)
  205. # ================================================================
  206. # TC-C03-ERROR-002: 步骤执行异常时步骤状态正确标记
  207. # ================================================================
  208. print("\n=== TC-C03-ERROR-002: 步骤执行异常时状态传播 ===")
  209. deps = StepDispatcher.get_step_deps("completeness")
  210. # 场景:步骤 0 失败
  211. step_results_0_fail = [
  212. IRStepResult(index=0, name="Prompt 渲染", status="error", error="模板渲染异常"),
  213. ]
  214. dep_1 = deps.get(1)
  215. should_skip_1 = False
  216. if dep_1 is not None:
  217. for prev in step_results_0_fail:
  218. if prev.index == dep_1 and prev.status in ("error", "skipped"):
  219. should_skip_1 = True
  220. break
  221. check("ERROR-002: step 0 error -> step 1 skipped", should_skip_1 is True)
  222. step_results_1_skipped = step_results_0_fail + [
  223. IRStepResult(index=1, name="LLM 调用", status="skipped"),
  224. ]
  225. dep_2 = deps.get(2)
  226. should_skip_2 = False
  227. if dep_2 is not None:
  228. for prev in step_results_1_skipped:
  229. if prev.index == dep_2 and prev.status in ("error", "skipped"):
  230. should_skip_2 = True
  231. break
  232. check("ERROR-002: step 1 skipped -> step 2 skipped", should_skip_2 is True)
  233. # 乐观场景:全部成功 -> 不跳过
  234. step_results_all_ok = [
  235. IRStepResult(index=0, name="Prompt 渲染", status="success"),
  236. IRStepResult(index=1, name="LLM 调用", status="success"),
  237. ]
  238. skip_2 = False
  239. if dep_2 is not None:
  240. for prev in step_results_all_ok:
  241. if prev.index == dep_2 and prev.status in ("error", "skipped"):
  242. skip_2 = True
  243. break
  244. check("ERROR-002: 全部成功 -> step 2 不跳过", skip_2 is False)
  245. # 验证 StepResult 数据类
  246. sr = IRStepResult(index=0, name="Test", status="error", error="some error", duration=0.5)
  247. d = sr.to_dict()
  248. check("ERROR-002: StepResult.to_dict 包含 error 字段", "error" in d, f"keys: {list(d.keys())}")
  249. check("ERROR-002: to_dict 包含 status 字段", d["status"] == "error")
  250. check("ERROR-002: to_dict 包含 duration 字段", d["duration"] == 0.5)
  251. # ================================================================
  252. # 额外验证:get_step_context
  253. # ================================================================
  254. print("\n=== 额外验证: get_step_context ===")
  255. ctx = StepDispatcher.get_step_context("completeness", 0)
  256. check("context completeness step 0 包含 context_name",
  257. ctx.get("context_name") == "prompt_rendering")
  258. check("context completeness step 0 包含 required_params",
  259. "review_content" in ctx.get("required_params", []))
  260. check("context completeness step 0 can_run_in_isolation",
  261. ctx.get("can_run_in_isolation") is True)
  262. ctx1 = StepDispatcher.get_step_context("completeness", 1)
  263. check("context completeness step 1 包含 llm_invocation",
  264. ctx1.get("context_name") == "llm_invocation")
  265. ctx_pro = StepDispatcher.get_step_context("professional", 0)
  266. check("context professional step 0 包含 phase=RAG",
  267. ctx_pro.get("phase") == "RAG 召回阶段")
  268. check("context professional step 0 can_run_in_isolation",
  269. ctx_pro.get("can_run_in_isolation") is True)
  270. ctx_pro6 = StepDispatcher.get_step_context("professional", 6)
  271. check("context professional step 6 is_isolatable=False",
  272. ctx_pro6.get("can_run_in_isolation") is False)
  273. # ================================================================
  274. # 额外验证:StepDefinition field 默认值
  275. # ================================================================
  276. print("\n=== 额外验证: StepDefinition 默认值 ===")
  277. sd_default = StepDefinition(index=5, name="默认值测试")
  278. check("is_isolatable 默认 = True", sd_default.is_isolatable is True)
  279. check("requires_previous 默认 = True", sd_default.requires_previous is True)
  280. check("phase 默认 = None", sd_default.phase is None)
  281. # CHAIN_STEPS 与 executor.py 结构等价性验证
  282. print("\n=== 等价性验证: CHAIN_STEPS 与 executor.py ===")
  283. for cid in EXPECTED_CHAINS:
  284. steps_def = CHAIN_STEPS[cid]
  285. for s in steps_def:
  286. d = s.to_dict()
  287. check(f"CHAIN_STEPS[{cid}][{s.index}] to_dict 格式正确",
  288. "index" in d and "name" in d and "phase" in d)
  289. # ================================================================
  290. # 汇总
  291. # ================================================================
  292. print_summary()
  293. sys.exit(0 if all(p for _, p, _ in results) else 1)