|
|
@@ -132,18 +132,7 @@ class SimpleDocumentProcessor:
|
|
|
# 文档提取质量检查
|
|
|
self._check_extraction_quality(structure, file_name)
|
|
|
|
|
|
- catalog = structure.get("catalog") # 获取YOLO检测+OCR提取的目录
|
|
|
-
|
|
|
- # 对 catalog 进行分类(如果存在)
|
|
|
- if catalog and catalog.get("chapters"):
|
|
|
- try:
|
|
|
- catalog = await self._classify_catalog(catalog)
|
|
|
- logger.info(f"[SimpleProcessor] Catalog分类完成")
|
|
|
- # 验证一级分类是否写入
|
|
|
- for ch in catalog.get("chapters", [])[:2]:
|
|
|
- logger.info(f"[SimpleProcessor] Catalog章节验证: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
|
|
|
- except Exception as e:
|
|
|
- logger.warning(f"[SimpleProcessor] Catalog分类失败: {e}")
|
|
|
+ catalog = structure.get("catalog") # 获取YOLO检测+OCR提取的纯净目录
|
|
|
|
|
|
await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
|
|
|
|
|
|
@@ -181,12 +170,6 @@ class SimpleDocumentProcessor:
|
|
|
logger.info("[SimpleProcessor] 跳过三级分类(已由LLM直接完整性审查替代)")
|
|
|
await self._emit_progress(progress_callback, "文档分类", 90, "文档处理完成")
|
|
|
|
|
|
- # 验证返回前的catalog
|
|
|
- if catalog:
|
|
|
- logger.info(f"[SimpleProcessor] 返回前Catalog验证: {len(catalog.get('chapters', []))} 章")
|
|
|
- for ch in catalog.get("chapters", [])[:2]:
|
|
|
- logger.info(f"[SimpleProcessor] 返回前章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
|
|
|
-
|
|
|
return structure, primary_result, secondary_result, chunks, catalog
|
|
|
|
|
|
async def _emit_progress(
|
|
|
@@ -258,11 +241,9 @@ class SimpleDocumentProcessor:
|
|
|
"quality_check": quality_check
|
|
|
}
|
|
|
|
|
|
- # 设置目录结构(YOLO检测+OCR提取)
|
|
|
+ # 设置纯净目录结构(仅来自 YOLO 检测 + OCR 提取)
|
|
|
if catalog:
|
|
|
logger.info(f"[_build_unified_doc] 设置catalog: {len(catalog.get('chapters', []))} 章")
|
|
|
- for ch in catalog.get("chapters", [])[:2]:
|
|
|
- logger.info(f"[_build_unified_doc] catalog章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
|
|
|
unified.catalog = catalog
|
|
|
|
|
|
# 设置原始切分结果(来自 PDF 提取器)
|