Prechádzať zdrojové kódy

feat: 添加文档提取质量检查功能

- 新增 _check_extraction_quality 方法,统计一/二级章节提取率
- 默认阈值:一级章节提取率 70% (按 10 章基准计算),二级小节提取率 73% (按 41 小节基准计算)
- 始终输出 quality_check 字段,包含 exist_issue 标记和告警信息
- 在 document_workflow 中通过 SSE issues 推送质量检查结果
- 更新 chunk_assembler 跳过 quality_check 字段避免解析错误
- 更新文档响应结构体定义
WangXuMing 5 dní pred
rodič
commit
6a9589b89d

+ 39 - 9
config/config.ini.template

@@ -1,17 +1,14 @@
 
 
 [model]
-MODEL_TYPE=qwen3_5_35b_a3b
-
-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
+# 注意:模型配置已迁移到 model_setting.yaml
+# 请通过 config/model_config_loader.py 获取模型配置
+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed, shutian_qwen3_embed
+EMBEDDING_MODEL_TYPE=shutian_qwen3_embed
 
 # Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
 RERANK_MODEL_TYPE=lq_rerank_model
 
-# 完整性审查模型类型 (用于 llm_content_classifier_v2)
-COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
-
 
 [deepseek]
 DEEPSEEK_SERVER_URL=https://api.deepseek.com
@@ -61,6 +58,9 @@ REDIS_PASSWORD=123456
 REDIS_MAX_CONNECTIONS=50
 
 [ocr]
+# 是否启用 OCR 表格识别(true/false)
+enable = true
+
 # OCR 引擎选择(以下写法都支持):
 # GLM-OCR: glm_ocr | glm-ocr | glmocr
 # MinerU:  mineru | mineru-ocr | mineru_ocr
@@ -148,8 +148,8 @@ LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
 MYSQL_HOST=192.168.92.61
 MYSQL_PORT=13306
 MYSQL_USER=root
-MYSQL_PASSWORD=lq@123
-MYSQL_DB=lq_db
+MYSQL_PASSWORD=Lq123456!
+MYSQL_DB=lq_db_dev
 MYSQL_MIN_SIZE=1
 MYSQL_MAX_SIZE=5
 MYSQL_AUTO_COMMIT=True
@@ -162,6 +162,33 @@ PGVECTOR_DB=vector_db
 PGVECTOR_USER=vector_user
 PGVECTOR_PASSWORD=pg16@123
 
+# 蜀天AI模型服务器配置(183.220.37.46)
+[shutian]
+# Qwen3.5-122B-A10B 模型(端口25423)
+SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
+SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
+SHUTIAN_122B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-8B 模型(端口25424)
+SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
+SHUTIAN_8B_MODEL_ID=/model/Qwen3-8B
+SHUTIAN_8B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3.5-35B 模型(端口25427)
+SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
+SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
+SHUTIAN_35B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-Embedding-8B 嵌入模型(端口25425)
+SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
+SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
+SHUTIAN_EMBED_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
+# Qwen3-Reranker-8B 重排序模型(端口25426)
+SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
+SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
+SHUTIAN_RERANK_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+
 
 [milvus]
 MILVUS_HOST=192.168.92.96
@@ -212,5 +239,8 @@ STREAM=false
 TEMPERATURE=0.3
 MAX_TOKENS=1024
 
+[construction_review]
+MAX_CELERY_TASKS=1
+
 
 

+ 2 - 1
core/construction_review/component/doc_worker/models/document_structure.py

@@ -493,7 +493,8 @@ class UnifiedDocumentStructure:
                 "primary_count": self.primary_count,
                 "secondary_count": self.secondary_count,
                 "tertiary_count": self.tertiary_count,
-            }
+            },
+            "quality_check": self.raw_metadata.get("quality_check", {})
         }
 
         # 添加目录结构(如果存在)

+ 1 - 84
core/construction_review/component/document_processor.py

@@ -163,15 +163,7 @@ class DocumentProcessor:
 
         except Exception as e:
             logger.error(f"{file_type.upper()}解析失败: {str(e)}", exc_info=True)
-            # 如果智能处理失败,尝试基础处理
-            try:
-                logger.info("尝试使用基础处理模式")
-                return await self._fallback_processing(file_content, file_type)
-            except Exception as fallback_error:
-                logger.error(f"基础处理模式也失败: {str(fallback_error)}", exc_info=True)
-                raise RuntimeError(
-                    f"文档处理完全失败: {file_type.upper()}智能处理({str(e)}) + 基础处理({str(fallback_error)})"
-                ) from e
+            raise
 
     def _build_unified_structure(
         self,
@@ -624,78 +616,3 @@ class DocumentProcessor:
             except Exception as e:
                 logger.warning(f"分类进度推送失败: {e}")
 
-    async def _fallback_processing(self, file_content: bytes, file_type: str) -> UnifiedDocumentStructure:
-        """
-        统一的基础处理模式(当智能处理失败时使用)
-
-        Args:
-            file_content: 文件内容
-            file_type: 文件类型(仅支持 pdf)
-
-        Returns:
-            UnifiedDocumentStructure: 基础处理结果
-        """
-        return await self._fallback_pdf_processing(file_content)
-
-    async def _fallback_pdf_processing(self, file_content: bytes) -> UnifiedDocumentStructure:
-        """PDF基础处理模式(当智能处理失败时使用)"""
-        try:
-            from langchain_community.document_loaders import PyPDFLoader
-            from langchain_text_splitters import RecursiveCharacterTextSplitter
-
-            logger.info("使用基础PDF处理模式")
-
-            # PyPDFLoader需要文件路径,创建临时文件
-            with tempfile.NamedTemporaryFile(delete=True, suffix='.pdf') as temp_file:
-                temp_file.write(file_content)
-                temp_file.flush()
-                temp_file_path = temp_file.name
-
-                loader = PyPDFLoader(temp_file_path)
-                documents = loader.load()
-
-                # 文本分块
-                text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=1000,
-                    chunk_overlap=200,
-                    separators=["\n\n", "\n", " ", ""]
-                )
-                splits = text_splitter.split_documents(documents)
-
-                # 过滤空内容切块
-                valid_splits = []
-                for split in splits:
-                    content = split.page_content.strip()
-                    if content:
-                        valid_splits.append(split)
-
-                logger.info(f"基础处理完成,有效分块数量: {len(valid_splits)}")
-
-                # 构建基础版统一文档结构
-                secondary_list = []
-                for i, split in enumerate(valid_splits, 1):
-                    secondary_list.append(SecondaryClassification(
-                        first_seq=1,
-                        first_code="unknown",
-                        first_name="未分类",
-                        second_seq=i,
-                        second_code=f"chunk_{i}",
-                        second_name=f"内容块{i}",
-                        second_content=split.page_content,
-                        page_start=split.metadata.get("page", 0),
-                        page_end=split.metadata.get("page", 0),
-                    ))
-
-                unified_doc = UnifiedDocumentStructure(
-                    document_id=str(uuid.uuid4()),
-                    document_name="基础处理文档.pdf",
-                    total_pages=len(documents),
-                    secondary_classifications=secondary_list,
-                )
-
-                return unified_doc
-
-        except Exception as e:
-            logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
-            raise
-

+ 6 - 0
core/construction_review/component/minimal_pipeline/chunk_assembler.py

@@ -59,6 +59,12 @@ def assemble_chunks(
     chunk_index = 0
 
     for chapter_title, sections in structure.get("chapters", {}).items():
+        # 跳过质量检查字段
+        if chapter_title == "quality_check":
+            continue
+        # 确保 sections 是字典(章节数据)
+        if not isinstance(sections, dict):
+            continue
         primary_info = _get_primary_info(chapter_title, primary_map)
         first_code = primary_info["code"] or "non_standard"
         first_name = primary_info["name"] or "非标准项"

+ 45 - 31
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -240,11 +240,15 @@ class SimpleDocumentProcessor:
         self._merge_tertiary_to_unified(unified, chunks)
 
         # 原始元数据
+        chapters = structure.get("chapters", {})
+        quality_check = chapters.get("quality_check", {})
+        logger.info(f"[_build_unified_doc] 从 chapters 获取 quality_check: {quality_check}")
         unified.raw_metadata = {
             "processing_info": {
                 "chunks_count": len(chunks),
                 "pages_count": structure.get("total_pages", 0),
-            }
+            },
+            "quality_check": quality_check
         }
 
         # 设置目录结构(YOLO检测+OCR提取)
@@ -461,7 +465,7 @@ class SimpleDocumentProcessor:
         l2_threshold: float = 0.73,
     ) -> None:
         """
-        检查文档提取质量,如果低于阈值则在 chapters 中添加质量字段。
+        检查文档提取质量,无论是否低于阈值都在 chapters 中添加质量字段。
 
         Args:
             structure: PDF 提取结构
@@ -493,34 +497,44 @@ class SimpleDocumentProcessor:
         l1_alert = l1_rate < l1_threshold
         l2_alert = l2_rate < l2_threshold
 
-        if l1_alert or l2_alert:
-            quality_result: Dict[str, Any] = {}
-
-            if l1_alert:
-                quality_result["l1_chapter_quality"] = {
-                    "extracted_count": l1_count,
-                    "expected_count": default_total_chapters,
-                    "extraction_rate": round(l1_rate * 100, 2),
-                    "threshold": round(l1_threshold * 100, 2),
-                }
-                quality_result["l1_system_alerts"] = "该文档一级章节提取可能存在缺失,请检查文档标题格式是否符合标准。"
-                logger.warning(
-                    f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
-                    f"({l1_count}/{default_total_chapters})"
-                )
+        # 构建质量检查结果(始终添加)
+        quality_result: Dict[str, Any] = {}
 
-            if l2_alert:
-                quality_result["l2_Subsection_quality"] = {
-                    "extracted_count": l2_count,
-                    "expected_count": default_total_subsections,
-                    "extraction_rate": round(l2_rate * 100, 2),
-                    "threshold": round(l2_threshold * 100, 2),
-                }
-                quality_result["l2_system_alerts"] = "该文档二级小节提取可能存在缺失,请检查文档标题格式是否符合标准。"
-                logger.warning(
-                    f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
-                    f"({l2_count}/{default_total_subsections})"
-                )
+        # 一级章节质量
+        quality_result["l1_chapter_quality"] = {
+            "extracted_count": l1_count,
+            "expected_count": default_total_chapters,
+            "extraction_rate": round(l1_rate * 100, 2),
+            "threshold": round(l1_threshold * 100, 2),
+            "exist_issue": l1_alert,
+        }
+        quality_result["l1_system_alerts"] = (
+            "该文档一级章节提取可能存在缺失,请检查文档标题格式是否符合标准。"
+            if l1_alert else ""
+        )
+        if l1_alert:
+            logger.warning(
+                f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
+                f"({l1_count}/{default_total_chapters})"
+            )
+
+        # 二级小节质量
+        quality_result["l2_Subsection_quality"] = {
+            "extracted_count": l2_count,
+            "expected_count": default_total_subsections,
+            "extraction_rate": round(l2_rate * 100, 2),
+            "threshold": round(l2_threshold * 100, 2),
+            "exist_issue": l2_alert,
+        }
+        quality_result["l2_system_alerts"] = (
+            "该文档二级小节提取可能存在缺失,请检查文档标题格式是否符合标准。"
+            if l2_alert else ""
+        )
+        if l2_alert:
+            logger.warning(
+                f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
+                f"({l2_count}/{default_total_subsections})"
+            )
 
-            # 将质量检查结果添加到 chapters 中
-            chapters["_quality_check"] = quality_result
+        # 将质量检查结果添加到 chapters 中
+        chapters["quality_check"] = quality_result

+ 7 - 2
core/construction_review/component/minimal_pipeline/toc_builder.py

@@ -20,7 +20,10 @@ def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str,
     """
     toc_items: List[Dict[str, Any]] = []
     for chapter_title, sections in structure.get("chapters", {}).items():
-        page_start = min(s["page_start"] for s in sections.values()) if sections else 1
+        # 安全获取 page_start,默认值为 1
+        page_starts = [s.get("page_start", 1) for s in sections.values() if isinstance(s, dict)]
+        page_start = min(page_starts) if page_starts else 1
+
         toc_items.append({
             "title": chapter_title,
             "page": page_start,
@@ -30,9 +33,11 @@ def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str,
         for section_title, section_data in sections.items():
             if section_title == "章节标题":
                 continue
+            # 安全获取 page_start
+            sec_page_start = section_data.get("page_start", 1) if isinstance(section_data, dict) else 1
             toc_items.append({
                 "title": section_title,
-                "page": section_data["page_start"],
+                "page": sec_page_start,
                 "level": 2,
                 "original": section_title,
             })

+ 21 - 1
core/construction_review/workflows/document_workflow.py

@@ -116,6 +116,25 @@ class DocumentWorkflow:
                 except asyncio.CancelledError:
                     pass
 
+            # 提取 quality_check 信息
+            quality_check = {}
+            if hasattr(structured_content, 'raw_metadata') and structured_content.raw_metadata:
+                quality_check = structured_content.raw_metadata.get('quality_check', {})
+                logger.info(f"[DocumentWorkflow] 从 raw_metadata 提取 quality_check: {quality_check}")
+            else:
+                logger.warning(f"[DocumentWorkflow] raw_metadata 不存在或为空: {getattr(structured_content, 'raw_metadata', None)}")
+
+            # 构建 issues 列表(包含 quality_check)
+            issues = []
+            if quality_check:
+                issues.append({
+                    "type": "quality_check",
+                    "data": quality_check
+                })
+                logger.info(f"[DocumentWorkflow] 构建 issues 列表: {issues}")
+            else:
+                logger.info(f"[DocumentWorkflow] quality_check 为空,不添加到 issues")
+
             if self.progress_manager:
                 await self.progress_manager.update_stage_progress(
                     callback_task_id=self.callback_task_id,
@@ -123,7 +142,8 @@ class DocumentWorkflow:
                     current=100,
                     status="docu_ans_completed",
                     message="文档解析完成",
-                    event_type="processing"
+                    event_type="processing",
+                    issues=issues if issues else None
                 )
 
             # 转换为旧版字典格式以保持兼容性