1 vecka sedan · 8de76d6042
--- a/.gitignore
+++ b/.gitignore
@@ -83,6 +83,10 @@ output/
 # 云同步冲突文件
 *.sync-conflict-*
 
+# ---> Node.js / Frontend
+node_modules/
+frontend/dist/
+
 # 环境文件
 .env
 venv/
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor.py
@@ -170,28 +170,6 @@ class PdfStructureExtractor:
         doc = fitz.open(stream=file_content)
         try:
             structure = self._extract_from_doc(doc, progress_callback)
-            if result.get("catalog"):
-                # 正文抽取和目录检测是两条独立链路：
-                # 1. 正文抽取更容易拿到连续 content
-                # 2. 目录检测更容易保留顺序和层级
-                # 这里先用目录骨架对齐正文，再按标题边界重建内容，尽量减少漏标题造成的结构缺失。
-                structure["chapters"] = self._reconcile_structure_with_catalog(
-                    structure.get("chapters", {}),
-                    result["catalog"],
-                )
-                rebuilt_chapters = self._rebuild_section_contents_from_catalog(
-                    structure.get("chapters", {}),
-                    result["catalog"],
-                    structure.get("_body_lines", []),
-                )
-                if rebuilt_chapters:
-                    structure["chapters"] = rebuilt_chapters
-                enriched_catalog = self._enrich_catalog_with_structure(
-                    result["catalog"],
-                    structure.get("chapters", {}),
-                )
-                if enriched_catalog:
-                    result["catalog"] = enriched_catalog
             structure.pop("_body_lines", None)
             result["chapters"] = structure.get("chapters", {})
             result["total_pages"] = len(doc)
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -132,18 +132,7 @@ class SimpleDocumentProcessor:
         # 文档提取质量检查
         self._check_extraction_quality(structure, file_name)
 
-        catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
-
-        # 对 catalog 进行分类（如果存在）
-        if catalog and catalog.get("chapters"):
-            try:
-                catalog = await self._classify_catalog(catalog)
-                logger.info(f"[SimpleProcessor] Catalog分类完成")
-                # 验证一级分类是否写入
-                for ch in catalog.get("chapters", [])[:2]:
-                    logger.info(f"[SimpleProcessor] Catalog章节验证: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
-            except Exception as e:
-                logger.warning(f"[SimpleProcessor] Catalog分类失败: {e}")
+        catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的纯净目录
 
         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
 
@@ -181,12 +170,6 @@ class SimpleDocumentProcessor:
         logger.info("[SimpleProcessor] 跳过三级分类（已由LLM直接完整性审查替代）")
         await self._emit_progress(progress_callback, "文档分类", 90, "文档处理完成")
 
-        # 验证返回前的catalog
-        if catalog:
-            logger.info(f"[SimpleProcessor] 返回前Catalog验证: {len(catalog.get('chapters', []))} 章")
-            for ch in catalog.get("chapters", [])[:2]:
-                logger.info(f"[SimpleProcessor] 返回前章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
-
         return structure, primary_result, secondary_result, chunks, catalog
 
     async def _emit_progress(
@@ -258,11 +241,9 @@ class SimpleDocumentProcessor:
             "quality_check": quality_check
         }
 
-        # 设置目录结构（YOLO检测+OCR提取）
+        # 设置纯净目录结构（仅来自 YOLO 检测 + OCR 提取）
         if catalog:
             logger.info(f"[_build_unified_doc] 设置catalog: {len(catalog.get('chapters', []))} 章")
-            for ch in catalog.get("chapters", [])[:2]:
-                logger.info(f"[_build_unified_doc] catalog章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
         unified.catalog = catalog
 
         # 设置原始切分结果（来自 PDF 提取器）