|
@@ -240,11 +240,15 @@ class SimpleDocumentProcessor:
|
|
|
self._merge_tertiary_to_unified(unified, chunks)
|
|
self._merge_tertiary_to_unified(unified, chunks)
|
|
|
|
|
|
|
|
# 原始元数据
|
|
# 原始元数据
|
|
|
|
|
+ chapters = structure.get("chapters", {})
|
|
|
|
|
+ quality_check = chapters.get("quality_check", {})
|
|
|
|
|
+ logger.info(f"[_build_unified_doc] 从 chapters 获取 quality_check: {quality_check}")
|
|
|
unified.raw_metadata = {
|
|
unified.raw_metadata = {
|
|
|
"processing_info": {
|
|
"processing_info": {
|
|
|
"chunks_count": len(chunks),
|
|
"chunks_count": len(chunks),
|
|
|
"pages_count": structure.get("total_pages", 0),
|
|
"pages_count": structure.get("total_pages", 0),
|
|
|
- }
|
|
|
|
|
|
|
+ },
|
|
|
|
|
+ "quality_check": quality_check
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
# 设置目录结构(YOLO检测+OCR提取)
|
|
# 设置目录结构(YOLO检测+OCR提取)
|
|
@@ -461,7 +465,7 @@ class SimpleDocumentProcessor:
|
|
|
l2_threshold: float = 0.73,
|
|
l2_threshold: float = 0.73,
|
|
|
) -> None:
|
|
) -> None:
|
|
|
"""
|
|
"""
|
|
|
- 检查文档提取质量,如果低于阈值则在 chapters 中添加质量字段。
|
|
|
|
|
|
|
+ 检查文档提取质量,无论是否低于阈值都在 chapters 中添加质量字段。
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
structure: PDF 提取结构
|
|
structure: PDF 提取结构
|
|
@@ -493,34 +497,44 @@ class SimpleDocumentProcessor:
|
|
|
l1_alert = l1_rate < l1_threshold
|
|
l1_alert = l1_rate < l1_threshold
|
|
|
l2_alert = l2_rate < l2_threshold
|
|
l2_alert = l2_rate < l2_threshold
|
|
|
|
|
|
|
|
- if l1_alert or l2_alert:
|
|
|
|
|
- quality_result: Dict[str, Any] = {}
|
|
|
|
|
-
|
|
|
|
|
- if l1_alert:
|
|
|
|
|
- quality_result["l1_chapter_quality"] = {
|
|
|
|
|
- "extracted_count": l1_count,
|
|
|
|
|
- "expected_count": default_total_chapters,
|
|
|
|
|
- "extraction_rate": round(l1_rate * 100, 2),
|
|
|
|
|
- "threshold": round(l1_threshold * 100, 2),
|
|
|
|
|
- }
|
|
|
|
|
- quality_result["l1_system_alerts"] = "该文档一级章节提取可能存在缺失,请检查文档标题格式是否符合标准。"
|
|
|
|
|
- logger.warning(
|
|
|
|
|
- f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
|
|
|
|
|
- f"({l1_count}/{default_total_chapters})"
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # 构建质量检查结果(始终添加)
|
|
|
|
|
+ quality_result: Dict[str, Any] = {}
|
|
|
|
|
|
|
|
- if l2_alert:
|
|
|
|
|
- quality_result["l2_Subsection_quality"] = {
|
|
|
|
|
- "extracted_count": l2_count,
|
|
|
|
|
- "expected_count": default_total_subsections,
|
|
|
|
|
- "extraction_rate": round(l2_rate * 100, 2),
|
|
|
|
|
- "threshold": round(l2_threshold * 100, 2),
|
|
|
|
|
- }
|
|
|
|
|
- quality_result["l2_system_alerts"] = "该文档二级小节提取可能存在缺失,请检查文档标题格式是否符合标准。"
|
|
|
|
|
- logger.warning(
|
|
|
|
|
- f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
|
|
|
|
|
- f"({l2_count}/{default_total_subsections})"
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ # 一级章节质量
|
|
|
|
|
+ quality_result["l1_chapter_quality"] = {
|
|
|
|
|
+ "extracted_count": l1_count,
|
|
|
|
|
+ "expected_count": default_total_chapters,
|
|
|
|
|
+ "extraction_rate": round(l1_rate * 100, 2),
|
|
|
|
|
+ "threshold": round(l1_threshold * 100, 2),
|
|
|
|
|
+ "exist_issue": l1_alert,
|
|
|
|
|
+ }
|
|
|
|
|
+ quality_result["l1_system_alerts"] = (
|
|
|
|
|
+ "该文档一级章节提取可能存在缺失,请检查文档标题格式是否符合标准。"
|
|
|
|
|
+ if l1_alert else ""
|
|
|
|
|
+ )
|
|
|
|
|
+ if l1_alert:
|
|
|
|
|
+ logger.warning(
|
|
|
|
|
+ f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
|
|
|
|
|
+ f"({l1_count}/{default_total_chapters})"
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 二级小节质量
|
|
|
|
|
+ quality_result["l2_Subsection_quality"] = {
|
|
|
|
|
+ "extracted_count": l2_count,
|
|
|
|
|
+ "expected_count": default_total_subsections,
|
|
|
|
|
+ "extraction_rate": round(l2_rate * 100, 2),
|
|
|
|
|
+ "threshold": round(l2_threshold * 100, 2),
|
|
|
|
|
+ "exist_issue": l2_alert,
|
|
|
|
|
+ }
|
|
|
|
|
+ quality_result["l2_system_alerts"] = (
|
|
|
|
|
+ "该文档二级小节提取可能存在缺失,请检查文档标题格式是否符合标准。"
|
|
|
|
|
+ if l2_alert else ""
|
|
|
|
|
+ )
|
|
|
|
|
+ if l2_alert:
|
|
|
|
|
+ logger.warning(
|
|
|
|
|
+ f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
|
|
|
|
|
+ f"({l2_count}/{default_total_subsections})"
|
|
|
|
|
+ )
|
|
|
|
|
|
|
|
- # 将质量检查结果添加到 chapters 中
|
|
|
|
|
- chapters["_quality_check"] = quality_result
|
|
|
|
|
|
|
+ # 将质量检查结果添加到 chapters 中
|
|
|
|
|
+ chapters["quality_check"] = quality_result
|