Przeglądaj źródła

Merge branch 'dev_sgsc_wxm' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev

WangXuMing 1 tydzień temu
rodzic
commit
8de76d6042

+ 4 - 0
.gitignore

@@ -83,6 +83,10 @@ output/
 # 云同步冲突文件
 *.sync-conflict-*
 
+# ---> Node.js / Frontend
+node_modules/
+frontend/dist/
+
 # 环境文件
 .env
 venv/

+ 0 - 22
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -170,28 +170,6 @@ class PdfStructureExtractor:
         doc = fitz.open(stream=file_content)
         try:
             structure = self._extract_from_doc(doc, progress_callback)
-            if result.get("catalog"):
-                # 正文抽取和目录检测是两条独立链路:
-                # 1. 正文抽取更容易拿到连续 content
-                # 2. 目录检测更容易保留顺序和层级
-                # 这里先用目录骨架对齐正文,再按标题边界重建内容,尽量减少漏标题造成的结构缺失。
-                structure["chapters"] = self._reconcile_structure_with_catalog(
-                    structure.get("chapters", {}),
-                    result["catalog"],
-                )
-                rebuilt_chapters = self._rebuild_section_contents_from_catalog(
-                    structure.get("chapters", {}),
-                    result["catalog"],
-                    structure.get("_body_lines", []),
-                )
-                if rebuilt_chapters:
-                    structure["chapters"] = rebuilt_chapters
-                enriched_catalog = self._enrich_catalog_with_structure(
-                    result["catalog"],
-                    structure.get("chapters", {}),
-                )
-                if enriched_catalog:
-                    result["catalog"] = enriched_catalog
             structure.pop("_body_lines", None)
             result["chapters"] = structure.get("chapters", {})
             result["total_pages"] = len(doc)

+ 2 - 21
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -132,18 +132,7 @@ class SimpleDocumentProcessor:
         # 文档提取质量检查
         self._check_extraction_quality(structure, file_name)
 
-        catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的目录
-
-        # 对 catalog 进行分类(如果存在)
-        if catalog and catalog.get("chapters"):
-            try:
-                catalog = await self._classify_catalog(catalog)
-                logger.info(f"[SimpleProcessor] Catalog分类完成")
-                # 验证一级分类是否写入
-                for ch in catalog.get("chapters", [])[:2]:
-                    logger.info(f"[SimpleProcessor] Catalog章节验证: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
-            except Exception as e:
-                logger.warning(f"[SimpleProcessor] Catalog分类失败: {e}")
+        catalog = structure.get("catalog")  # 获取YOLO检测+OCR提取的纯净目录
 
         await self._emit_progress(progress_callback, "文档提取", 10, "PDF结构提取完成")
 
@@ -181,12 +170,6 @@ class SimpleDocumentProcessor:
         logger.info("[SimpleProcessor] 跳过三级分类(已由LLM直接完整性审查替代)")
         await self._emit_progress(progress_callback, "文档分类", 90, "文档处理完成")
 
-        # 验证返回前的catalog
-        if catalog:
-            logger.info(f"[SimpleProcessor] 返回前Catalog验证: {len(catalog.get('chapters', []))} 章")
-            for ch in catalog.get("chapters", [])[:2]:
-                logger.info(f"[SimpleProcessor] 返回前章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
-
         return structure, primary_result, secondary_result, chunks, catalog
 
     async def _emit_progress(
@@ -258,11 +241,9 @@ class SimpleDocumentProcessor:
             "quality_check": quality_check
         }
 
-        # 设置目录结构(YOLO检测+OCR提取)
+        # 设置纯净目录结构(仅来自 YOLO 检测 + OCR 提取)
         if catalog:
             logger.info(f"[_build_unified_doc] 设置catalog: {len(catalog.get('chapters', []))} 章")
-            for ch in catalog.get("chapters", [])[:2]:
-                logger.info(f"[_build_unified_doc] catalog章节: '{ch.get('title')}' -> code={ch.get('chapter_classification')}")
         unified.catalog = catalog
 
         # 设置原始切分结果(来自 PDF 提取器)