@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Batch verification test for the document-splitting fix.
+
+Goal: for each PDF in the test set, verify that the last chapter is extracted
+correctly and that no content leaks across chapter boundaries. Each file is
+checked for (1) a chunk matching the last level-1 TOC chapter, (2) a plausible
+start page for that chapter, and (3) no last-chapter text inside the preceding
+chapter's final chunk.
+"""
+
+import json
+import os
+import sys
+import traceback
+from datetime import datetime
+from pathlib import Path
+
+# Add the project root to sys.path so the core.* packages can be imported.
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from core.construction_review.component.doc_worker.pipeline import PipelineComponents, DefaultDocumentPipeline, DefaultFileParseFacade
+from core.construction_review.component.doc_worker.config.provider import default_config_provider
+from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
+from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
+from core.construction_review.component.doc_worker.pdf_worker.fulltext_extractor import PdfFullTextExtractor
+from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
+from core.construction_review.component.doc_worker.pdf_worker.json_writer import PdfJsonResultWriter
+
+
+TEST_DIR = Path("D:/wx_work/sichuan_luqiao/lu_sgsc_testfile")
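+# Local test corpus on the development machine; TEST_FILES entries that do not
+# exist on the current machine are skipped by locate_existing_files() below.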
+
+TEST_FILES = [
+    # Must always be included
+    Path("utils_test/Chunk_Split_Test/标准结构测试文件.pdf").resolve(),
+    # Representative construction-scheme PDFs (in recommended priority order)
+    TEST_DIR / "测试模版-四川路桥专项施工方案框架以及编制说明(2025修订第三版)- v0.2.pdf",
+    TEST_DIR / "成渝扩容桥梁下部结构专项施工方案(正式版)(1).pdf",
+    TEST_DIR / "达州绕西高速西段RX2标段人工挖孔桩施工方案(2).pdf",
+    TEST_DIR / "高处作业安全带、防坠器系挂方案.2026.1.5改.pdf",
+    TEST_DIR / "四川智能建造科技股份有限公司G999线大源至中和高速公路TJ5项目经理部龙泉山左线特大桥T梁安装专项施工方案.pdf",
+    TEST_DIR / "主线天桥现浇箱梁支模体系(满堂支架)安全专项施工方案(1).pdf",
+]
+
+
+def build_test_facade():
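+    """Assemble a DefaultFileParseFacade from the PDF worker components;
+    chunk_classifier is left as None because chunk classification is not
+    needed for these splitting checks."""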
+    components = PipelineComponents(
+        config=default_config_provider,
+        toc_extractor=PdfTOCExtractor(),
+        classifier=HierarchyClassifier(),
+        fulltext_extractor=PdfFullTextExtractor(),
+        splitter=PdfTextSplitter(),
+        writers=[PdfJsonResultWriter()],
+        chunk_classifier=None,
+    )
+    pipeline = DefaultDocumentPipeline(components)
+    return DefaultFileParseFacade(pipeline)
+
+
+def locate_existing_files() -> list[Path]:
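+    """Return the TEST_FILES entries that exist on disk; missing files are
+    reported with a [SKIP] message and dropped."""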
+    existing = []
+    for p in TEST_FILES:
+        if p.exists():
+            existing.append(p)
+        else:
+            print(f"[SKIP] 文件不存在,跳过: {p}")
+    return existing
+
+
+def run_pipeline(file_path: Path, facade) -> dict:
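+    """Process a single PDF through the facade using the pipeline's default
+    splitting parameters (all overrides passed as None) and return the raw
+    result dict."""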
+    print(f"\n[INFO] 正在处理: {file_path.name}")
+    result = facade.process_file(
+        file_path=file_path,
+        target_level=None,
+        max_chunk_size=None,
+        min_chunk_size=None,
+        output_dir=None,
+    )
+    return result
+
+
+def analyze_file(file_path: Path, result: dict) -> dict:
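+    """Check one pipeline result against the splitting-fix criteria:
+    (1) the last level-1 TOC chapter has a matching chunk (fuzzy title match),
+    (2) that chapter's page number lies clearly beyond the TOC page, and
+    (3) the preceding level-1 chapter's last chunk does not contain the last
+        chapter's title (cross-chapter leakage).
+    Returns a per-file report dict whose return_code is 0 (pass) or 1 (fail)."""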
+    chunks = result.get("chunks") or []
+    toc_info = result.get("toc_info") or {}
+    toc_items = toc_info.get("toc_items") or []
+
+    section_labels = sorted({c.get("section_label", "UNKNOWN") for c in chunks})
+
+    # First-level chapter labels: the part of section_label before "->".
+    # Collected from the chunks in encounter order (assumed to be document
+    # order), so that "the second-to-last chapter" below means reading order
+    # rather than the alphabetical order of section_labels.
+    first_level_labels = []
+    for c in chunks:
+        label = c.get("section_label", "UNKNOWN")
+        if "->" in label:
+            first = label.split("->")[0].strip()
+        else:
+            first = label.strip()
+        if first not in first_level_labels:
+            first_level_labels.append(first)
+
+    # Last level-1 chapter listed in the extracted TOC.
+    level1_items = [item for item in toc_items if item.get("level") == 1]
+    last_level1_item = level1_items[-1] if level1_items else None
+    last_level1_title = last_level1_item.get("title", "").strip() if last_level1_item else ""
+    last_level1_page = last_level1_item.get("page") if last_level1_item else None
+
+    # Does the last chapter have a corresponding chunk? (fuzzy title match)
+    def normalize(t: str) -> str:
+        return t.replace(" ", "").replace("\u3000", "").strip()
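+    # Illustrative only (hypothetical titles): normalize("第十章  应急预案") and
+    # normalize("第十章 应急预案") both become "第十章应急预案", so ASCII and
+    # full-width spaces never break the containment check below.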
+
+    last_chapter_found = False
+    matched_label = None
+    if last_level1_title:
+        norm_target = normalize(last_level1_title)
+        for label in first_level_labels:
+            if norm_target in normalize(label) or normalize(label) in norm_target:
+                last_chapter_found = True
+                matched_label = label
+                break
+
+    # Is the last chapter's page clearly past the TOC page range?
+    # (simple heuristic: page > toc_page + 2)
+    toc_page = toc_info.get("toc_page") or 1
+    try:
+        toc_page = int(toc_page)
+    except (ValueError, TypeError):
+        toc_page = 1
+    page_reasonable = False
+    if last_level1_page is not None:
+        try:
+            page_reasonable = int(last_level1_page) > toc_page + 2
+        except (ValueError, TypeError):
+            page_reasonable = False
+
+    # Cross-chapter leakage check.
+    leak_detected = False
+    leak_details = []
+    if len(first_level_labels) >= 2 and last_level1_title:
+        # The second-to-last first-level chapter.
+        prev_first = first_level_labels[-2] if len(first_level_labels) >= 2 else None
+        if prev_first:
+            # Last chunk belonging to that chapter (including its sub-sections).
+            prev_chunks = [c for c in chunks if c.get("section_label", "").startswith(prev_first)]
+            if prev_chunks:
+                last_prev_chunk = prev_chunks[-1]
+                content = (last_prev_chunk.get("review_chunk_content", "") or "") + (last_prev_chunk.get("content", "") or "")
+                # Check whether keywords from the last chapter's title bled into it.
+                keywords = [k for k in last_level1_title.split() if len(k) >= 2]
+                if not keywords:
+                    keywords = [last_level1_title]
+                for kw in keywords:
+                    if kw in content:
+                        leak_detected = True
+                        leak_details.append({
+                            "chunk_id": last_prev_chunk.get("chunk_id"),
+                            "section_label": last_prev_chunk.get("section_label"),
+                            "keyword": kw,
+                        })
+
+    # Special case: if no chapter headings were recognised at all (only the
+    # fallback "正文" chunk), the toc_extractor probably misjudged a body page
+    # as a TOC page, causing title_matcher to filter out every match. This is
+    # unrelated to the "Chapter 10 gets swallowed" fix under test, so flag it
+    # separately.
+    if len(chunks) == 1 and len(section_labels) == 1 and section_labels[0] == "正文":
+        return {
+            "filename": file_path.name,
+            "total_chunks": len(chunks),
+            "total_level1": 0,
+            "last_level1_title": last_level1_title,
+            "last_level1_page": last_level1_page,
+            "last_chapter_found": False,
+            "last_chapter_label": None,
+            "page_reasonable": False,
+            "toc_page": toc_page,
+            "leak_detected": False,
+            "leak_details": [],
+            "section_labels": section_labels,
+            "return_code": 1,
+            "reasons": ["未能识别任何章节标题(可能目录页范围误判),无法评估切分修复效果"],
+        }
+
+    # Decide the per-file return code.
+    ret = 0
+    reasons = []
+    if not last_chapter_found:
+        ret = 1
+        reasons.append("最后一章未找到对应 chunk")
+    if not page_reasonable:
+        ret = 1
+        reasons.append("最后一章页码可能异常(落在目录页附近)")
+    if leak_detected:
+        ret = 1
+        reasons.append("发现跨章节内容泄漏")
+
+    return {
+        "filename": file_path.name,
+        "total_chunks": len(chunks),
+        "total_level1": len(first_level_labels),
+        "last_level1_title": last_level1_title,
+        "last_level1_page": last_level1_page,
+        "last_chapter_found": last_chapter_found,
+        "last_chapter_label": matched_label,
+        "page_reasonable": page_reasonable,
+        "toc_page": toc_page,
+        "leak_detected": leak_detected,
+        "leak_details": leak_details,
+        "section_labels": section_labels,
+        "return_code": ret,
+        "reasons": reasons,
+    }
+
+
+def print_summary(reports: list[dict]) -> str:
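+    """Print a PASS/FAIL summary of all per-file reports to stdout and return
+    the same text."""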
+    lines = []
+    lines.append("\n" + "=" * 80)
+    lines.append("批量切分测试汇总")
+    lines.append("=" * 80)
+
+    passed = 0
+    failed = 0
+    for r in reports:
+        status = "PASS" if r["return_code"] == 0 else "FAIL"
+        if r["return_code"] == 0:
+            passed += 1
+        else:
+            failed += 1
+        lines.append(f"\n文件: {r['filename']}")
+        lines.append(f"  状态: {status}")
+        lines.append(f"  总 chunk 数: {r['total_chunks']}")
+        lines.append(f"  总一级章节数: {r['total_level1']}")
+        lines.append(f"  最后一章标题: {r['last_level1_title']}")
+        lines.append(f"  最后一章页码: {r['last_level1_page']}")
+        lines.append(f"  最后一章提取成功: {r['last_chapter_found']} ({r['last_chapter_label'] or 'N/A'})")
+        lines.append(f"  页码合理: {r['page_reasonable']} (目录页={r['toc_page']})")
+        lines.append(f"  跨章节泄漏: {r['leak_detected']}")
+        if r["leak_details"]:
+            for d in r["leak_details"]:
+                lines.append(f"    -> {d['chunk_id']} ({d['section_label']}) 包含 '{d['keyword']}'")
+        if r["reasons"]:
+            lines.append(f"  不通过原因: {'; '.join(r['reasons'])}")
+
+    lines.append("\n" + "-" * 80)
+    lines.append(f"汇总: {passed} 通过, {failed} 失败 / 总计 {len(reports)} 个文件")
+    lines.append("=" * 80)
+    summary = "\n".join(lines)
+    print(summary)
+    return summary
+
+
+def main() -> int:
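+    """Run the batch test end to end: process every available test file, print
+    the summary, and write batch_test_result.json and batch_test_report.md
+    next to this script."""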
+    files = locate_existing_files()
+    if not files:
+        print("[ERROR] 没有可用的测试文件。")
+        return 1
+
+    facade = build_test_facade()
+    reports = []
+    errors = []
+
+    for fp in files:
+        try:
+            result = run_pipeline(fp, facade)
+            report = analyze_file(fp, result)
+            reports.append(report)
+        except Exception as e:
+            print(f"[ERROR] 处理失败: {fp.name} -> {e}")
+            traceback.print_exc()
+            errors.append({"filename": fp.name, "error": str(e)})
+
+    summary = print_summary(reports)
+
+    # Write the intermediate JSON result and the Markdown report.
+    out_dir = Path(__file__).parent
+    md_path = out_dir / "batch_test_report.md"
+    json_path = out_dir / "batch_test_result.json"
+
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump({
+            "timestamp": datetime.now().isoformat(),
+            "reports": reports,
+            "errors": errors,
+        }, f, ensure_ascii=False, indent=2)
+    print(f"[INFO] JSON 结果已保存: {json_path}")
+
+    md_content = f"""# 文档切分修复批量测试报告
+
+生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+## 测试文件列表
+
+"""
+    for fp in files:
+        md_content += f"- `{fp.name}`\n"
+
+    md_content += "\n## 详细结果\n\n"
+    for r in reports:
+        status = "PASS" if r["return_code"] == 0 else "FAIL"
+        md_content += f"### {r['filename']} — {status}\n\n"
+        md_content += f"- 总 chunk 数: {r['total_chunks']}\n"
+        md_content += f"- 总一级章节数: {r['total_level1']}\n"
+        md_content += f"- 最后一章标题: {r['last_level1_title']}\n"
+        md_content += f"- 最后一章页码: {r['last_level1_page']}\n"
+        md_content += f"- 最后一章提取成功: {'是' if r['last_chapter_found'] else '否'} (`{r['last_chapter_label'] or 'N/A'}`)\n"
+        md_content += f"- 页码合理: {'是' if r['page_reasonable'] else '否'} (目录页={r['toc_page']})\n"
+        md_content += f"- 跨章节泄漏: {'是' if r['leak_detected'] else '否'}\n"
+        if r["leak_details"]:
+            md_content += "  泄漏详情:\n"
+            for d in r["leak_details"]:
+                md_content += f"  - `{d['chunk_id']}` (`{d['section_label']}`) 包含关键词 `{d['keyword']}`\n"
+        if r["reasons"]:
+            md_content += f"- 不通过原因: **{';'.join(r['reasons'])}**\n"
+        md_content += "\n"
+
+    if errors:
+        md_content += "## 运行错误\n\n"
+        for e in errors:
+            md_content += f"- `{e['filename']}`: {e['error']}\n"
+        md_content += "\n"
+
+    total = len(reports)
+    passed = sum(1 for r in reports if r["return_code"] == 0)
+    failed = total - passed
+    md_content += f"""## 汇总
+
+- 通过: {passed}
+- 失败: {failed}
+- 总计: {total}
+- 运行错误: {len(errors)}
+"""
+
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write(md_content)
+    print(f"[INFO] Markdown 报告已保存: {md_path}")
+
+    # Let the exit code reflect the batch outcome: non-zero when any file
+    # failed its checks or raised an error.
+    return 0 if failed == 0 and not errors else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())