# File: run_full_scan.py (3.6 KB)
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """全量 chunk 词句语法审查 — 保存所有原始响应用于人工分析"""
  4. import sys, os, json, asyncio, time
  5. sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)).split('utils_test')[0])
  6. RESULT_JSON = os.path.join(
  7. os.path.dirname(os.path.abspath(__file__)).split('utils_test')[0],
  8. "temp", "construction_review", "final_result",
  9. "67d45692fb97aeef8f896e78475ce539-1779781589.json"
  10. )
  11. OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "full_scan_results")
  12. async def main():
  13. from core.construction_review.component.reviewers.grammar_check_reviewer import GrammarCheckReviewer
  14. with open(RESULT_JSON, 'r', encoding='utf-8') as f:
  15. data = json.load(f)
  16. chunks = data['document_result']['structured_content']['chunks']
  17. os.makedirs(OUTPUT_DIR, exist_ok=True)
  18. reviewer = GrammarCheckReviewer()
  19. all_results = []
  20. for i, chunk in enumerate(chunks):
  21. content = chunk['content']
  22. section = chunk.get('section_label', f'chunk_{i}')
  23. chapter = chunk.get('chapter_classification', 'unknown')
  24. trace_id = f"full_scan_{i}_{int(time.time())}"
  25. print(f"[{i:02d}/{len(chunks)}] {chapter}/{section[:40]}... (len={len(content)})")
  26. start = time.time()
  27. try:
  28. result = await reviewer.check_grammar(
  29. trace_id=trace_id,
  30. review_content=content,
  31. state=None, stage_name=None,
  32. enable_thinking=False,
  33. )
  34. wall_time = time.time() - start
  35. response_text = result.details.get('response', '')
  36. success = result.success
  37. error = result.error_message
  38. except Exception as e:
  39. wall_time = time.time() - start
  40. response_text = ""
  41. success = False
  42. error = str(e)
  43. print(f" ERROR: {e}")
  44. record = {
  45. "chunk_index": i,
  46. "chapter": chapter,
  47. "section": section,
  48. "content_length": len(content),
  49. "content_preview": content[:200],
  50. "success": success,
  51. "error": error,
  52. "wall_time": round(wall_time, 2),
  53. "response_length": len(response_text),
  54. "raw_response": response_text,
  55. }
  56. all_results.append(record)
  57. is_no_issue = '无明显问题' in response_text and len(response_text) < 50
  58. status = "NO_ISSUE" if is_no_issue else f"ISSUES(response_len={len(response_text)})"
  59. print(f" {wall_time:.2f}s | {status}")
  60. # 保存汇总
  61. summary_path = os.path.join(OUTPUT_DIR, "all_results.json")
  62. with open(summary_path, 'w', encoding='utf-8') as f:
  63. json.dump(all_results, f, ensure_ascii=False, indent=2)
  64. print(f"\nSaved {len(all_results)} results to {summary_path}")
  65. # 保存每个 chunk 的独立文件(方便逐条阅读)
  66. for record in all_results:
  67. idx = record["chunk_index"]
  68. chunk_path = os.path.join(OUTPUT_DIR, f"chunk_{idx:02d}_{record['chapter']}.json")
  69. with open(chunk_path, 'w', encoding='utf-8') as f:
  70. json.dump(record, f, ensure_ascii=False, indent=2)
  71. print(f"Saved individual files to {OUTPUT_DIR}/")
  72. # 打印统计
  73. no_issue_count = sum(1 for r in all_results if '无明显问题' in r['raw_response'] and len(r['raw_response']) < 50)
  74. issue_count = len(all_results) - no_issue_count
  75. error_count = sum(1 for r in all_results if not r['success'])
  76. print(f"\nStats: {no_issue_count} no-issue, {issue_count} has-issues, {error_count} errors")
  77. asyncio.run(main())