5 dní pred · 6a9589b89d
--- a/config/config.ini.template
+++ b/config/config.ini.template
@@ -1,17 +1,14 @@
 
				 
			
 
				 
			
 
				 [model]
			
 
				-MODEL_TYPE=qwen3_5_35b_a3b
			
 
				-
			
 
				-# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed
			
 
				-EMBEDDING_MODEL_TYPE=lq_qwen3_8b_emd
			
 
				+# 注意：LLM 模型配置（原 MODEL_TYPE、COMPLETENESS_REVIEW_MODEL_TYPE）已迁移到 model_setting.yaml；本节仅保留 Embedding / Rerank 模型类型选择
			
 
				+# 请通过 config/model_config_loader.py 获取模型配置
			
 
				+# Embedding模型类型选择: lq_qwen3_8b_emd, siliconflow_embed, shutian_qwen3_embed
			
 
				+EMBEDDING_MODEL_TYPE=shutian_qwen3_embed
			
 
				 
			
 
				 # Rerank模型类型选择: bge_rerank_model, lq_rerank_model, silicoflow_rerank_model
			
 
				 RERANK_MODEL_TYPE=lq_rerank_model
			
 
				 
			
 
				-# 完整性审查模型类型 (用于 llm_content_classifier_v2)
			
 
				-COMPLETENESS_REVIEW_MODEL_TYPE=qwen3_5_122b_a10b
			
 
				-
			
 
				 
			
 
				 [deepseek]
			
 
				 DEEPSEEK_SERVER_URL=https://api.deepseek.com
			
@@ -61,6 +58,9 @@ REDIS_PASSWORD=123456
 
				 REDIS_MAX_CONNECTIONS=50
			
 
				 
			
 
				 [ocr]
			
 
				+# 是否启用 OCR 表格识别（true/false）
			
 
				+enable = true
			
 
				+
			
 
				 # OCR 引擎选择（以下写法都支持）：
			
 
				 # GLM-OCR: glm_ocr | glm-ocr | glmocr
			
 
				 # MinerU:  mineru | mineru-ocr | mineru_ocr
			
@@ -148,8 +148,8 @@ LQ_QWEN3_8B_LQ_LORA_API_KEY=dummy
 
				 MYSQL_HOST=192.168.92.61
			
 
				 MYSQL_PORT=13306
			
 
				 MYSQL_USER=root
			
 
				-MYSQL_PASSWORD=lq@123
			
 
				-MYSQL_DB=lq_db
			
 
				+MYSQL_PASSWORD=Lq123456!
			
 
				+MYSQL_DB=lq_db_dev
			
 
				 MYSQL_MIN_SIZE=1
			
 
				 MYSQL_MAX_SIZE=5
			
 
				 MYSQL_AUTO_COMMIT=True
			
@@ -162,6 +162,33 @@ PGVECTOR_DB=vector_db
 
				 PGVECTOR_USER=vector_user
			
 
				 PGVECTOR_PASSWORD=pg16@123
			
 
				 
			
 
				+# 蜀天AI模型服务器配置（183.220.37.46）；NOTE(review): 下方各 *_API_KEY 带 sk_prod_ 前缀，疑似真实生产密钥，模板文件中建议改为占位符并轮换已提交的密钥
			
 
				+[shutian]
			
 
				+# Qwen3.5-122B-A10B 模型（端口25423）
			
 
				+SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
			
 
				+SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
			
 
				+SHUTIAN_122B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
			
 
				+
			
 
				+# Qwen3-8B 模型（端口25424）
			
 
				+SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
			
 
				+SHUTIAN_8B_MODEL_ID=/model/Qwen3-8B
			
 
				+SHUTIAN_8B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
			
 
				+
			
 
				+# Qwen3.5-35B 模型（端口25427）
			
 
				+SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
			
 
				+SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
			
 
				+SHUTIAN_35B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
			
 
				+
			
 
				+# Qwen3-Embedding-8B 嵌入模型（端口25425）
			
 
				+SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
			
 
				+SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
			
 
				+SHUTIAN_EMBED_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
			
 
				+
			
 
				+# Qwen3-Reranker-8B 重排序模型（端口25426）
			
 
				+SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
			
 
				+SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
			
 
				+SHUTIAN_RERANK_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
			
 
				+
			
 
				 
			
 
				 [milvus]
			
 
				 MILVUS_HOST=192.168.92.96
			
@@ -212,5 +239,8 @@ STREAM=false
 
				 TEMPERATURE=0.3
			
 
				 MAX_TOKENS=1024
			
 
				 
			
 
				+[construction_review]
			
 
				+MAX_CELERY_TASKS=1
			
 
				+
			
 
				 
			
 
				 
			
--- a/core/construction_review/component/doc_worker/models/document_structure.py
+++ b/core/construction_review/component/doc_worker/models/document_structure.py
@@ -493,7 +493,8 @@ class UnifiedDocumentStructure:
 
				                 "primary_count": self.primary_count,
			
 
				                 "secondary_count": self.secondary_count,
			
 
				                 "tertiary_count": self.tertiary_count,
			
 
				-            }
			
 
				+            },
			
 
				+            "quality_check": self.raw_metadata.get("quality_check", {})
			
 
				         }
			
 
				 
			
 
				         # 添加目录结构（如果存在）
			
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -163,15 +163,7 @@ class DocumentProcessor:
 
				 
			
 
				         except Exception as e:
			
 
				             logger.error(f"{file_type.upper()}解析失败: {str(e)}", exc_info=True)
			
 
				-            # 如果智能处理失败，尝试基础处理
			
 
				-            try:
			
 
				-                logger.info("尝试使用基础处理模式")
			
 
				-                return await self._fallback_processing(file_content, file_type)
			
 
				-            except Exception as fallback_error:
			
 
				-                logger.error(f"基础处理模式也失败: {str(fallback_error)}", exc_info=True)
			
 
				-                raise RuntimeError(
			
 
				-                    f"文档处理完全失败: {file_type.upper()}智能处理({str(e)}) + 基础处理({str(fallback_error)})"
			
 
				-                ) from e
			
 
				+            raise
			
 
				 
			
 
				     def _build_unified_structure(
			
 
				         self,
			
@@ -624,78 +616,3 @@ class DocumentProcessor:
 
				             except Exception as e:
			
 
				                 logger.warning(f"分类进度推送失败: {e}")
			
 
				 
			
 
				-    async def _fallback_processing(self, file_content: bytes, file_type: str) -> UnifiedDocumentStructure:
			
 
				-        """
			
 
				-        统一的基础处理模式（当智能处理失败时使用）
			
 
				-
			
 
				-        Args:
			
 
				-            file_content: 文件内容
			
 
				-            file_type: 文件类型（仅支持 pdf）
			
 
				-
			
 
				-        Returns:
			
 
				-            UnifiedDocumentStructure: 基础处理结果
			
 
				-        """
			
 
				-        return await self._fallback_pdf_processing(file_content)
			
 
				-
			
 
				-    async def _fallback_pdf_processing(self, file_content: bytes) -> UnifiedDocumentStructure:
			
 
				-        """PDF基础处理模式（当智能处理失败时使用）"""
			
 
				-        try:
			
 
				-            from langchain_community.document_loaders import PyPDFLoader
			
 
				-            from langchain_text_splitters import RecursiveCharacterTextSplitter
			
 
				-
			
 
				-            logger.info("使用基础PDF处理模式")
			
 
				-
			
 
				-            # PyPDFLoader需要文件路径，创建临时文件
			
 
				-            with tempfile.NamedTemporaryFile(delete=True, suffix='.pdf') as temp_file:
			
 
				-                temp_file.write(file_content)
			
 
				-                temp_file.flush()
			
 
				-                temp_file_path = temp_file.name
			
 
				-
			
 
				-                loader = PyPDFLoader(temp_file_path)
			
 
				-                documents = loader.load()
			
 
				-
			
 
				-                # 文本分块
			
 
				-                text_splitter = RecursiveCharacterTextSplitter(
			
 
				-                    chunk_size=1000,
			
 
				-                    chunk_overlap=200,
			
 
				-                    separators=["\n\n", "\n", " ", ""]
			
 
				-                )
			
 
				-                splits = text_splitter.split_documents(documents)
			
 
				-
			
 
				-                # 过滤空内容切块
			
 
				-                valid_splits = []
			
 
				-                for split in splits:
			
 
				-                    content = split.page_content.strip()
			
 
				-                    if content:
			
 
				-                        valid_splits.append(split)
			
 
				-
			
 
				-                logger.info(f"基础处理完成，有效分块数量: {len(valid_splits)}")
			
 
				-
			
 
				-                # 构建基础版统一文档结构
			
 
				-                secondary_list = []
			
 
				-                for i, split in enumerate(valid_splits, 1):
			
 
				-                    secondary_list.append(SecondaryClassification(
			
 
				-                        first_seq=1,
			
 
				-                        first_code="unknown",
			
 
				-                        first_name="未分类",
			
 
				-                        second_seq=i,
			
 
				-                        second_code=f"chunk_{i}",
			
 
				-                        second_name=f"内容块{i}",
			
 
				-                        second_content=split.page_content,
			
 
				-                        page_start=split.metadata.get("page", 0),
			
 
				-                        page_end=split.metadata.get("page", 0),
			
 
				-                    ))
			
 
				-
			
 
				-                unified_doc = UnifiedDocumentStructure(
			
 
				-                    document_id=str(uuid.uuid4()),
			
 
				-                    document_name="基础处理文档.pdf",
			
 
				-                    total_pages=len(documents),
			
 
				-                    secondary_classifications=secondary_list,
			
 
				-                )
			
 
				-
			
 
				-                return unified_doc
			
 
				-
			
 
				-        except Exception as e:
			
 
				-            logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
			
 
				-            raise
			
 
				-
			
--- a/core/construction_review/component/minimal_pipeline/chunk_assembler.py
+++ b/core/construction_review/component/minimal_pipeline/chunk_assembler.py
@@ -59,6 +59,12 @@ def assemble_chunks(
 
				     chunk_index = 0
			
 
				 
			
 
				     for chapter_title, sections in structure.get("chapters", {}).items():
			
 
				+        # 跳过质量检查字段
			
 
				+        if chapter_title == "quality_check":
			
 
				+            continue
			
 
				+        # 确保 sections 是字典（章节数据）
			
 
				+        if not isinstance(sections, dict):
			
 
				+            continue
			
 
				         primary_info = _get_primary_info(chapter_title, primary_map)
			
 
				         first_code = primary_info["code"] or "non_standard"
			
 
				         first_name = primary_info["name"] or "非标准项"
			
--- a/core/construction_review/component/minimal_pipeline/simple_processor.py
+++ b/core/construction_review/component/minimal_pipeline/simple_processor.py
@@ -240,11 +240,15 @@ class SimpleDocumentProcessor:
 
				         self._merge_tertiary_to_unified(unified, chunks)
			
 
				 
			
 
				         # 原始元数据
			
 
				+        chapters = structure.get("chapters", {})
			
 
				+        quality_check = chapters.get("quality_check", {})
			
 
				+        logger.info(f"[_build_unified_doc] 从 chapters 获取 quality_check: {quality_check}")
			
 
				         unified.raw_metadata = {
			
 
				             "processing_info": {
			
 
				                 "chunks_count": len(chunks),
			
 
				                 "pages_count": structure.get("total_pages", 0),
			
 
				-            }
			
 
				+            },
			
 
				+            "quality_check": quality_check
			
 
				         }
			
 
				 
			
 
				         # 设置目录结构（YOLO检测+OCR提取）
			
@@ -461,7 +465,7 @@ class SimpleDocumentProcessor:
 
				         l2_threshold: float = 0.73,
			
 
				     ) -> None:
			
 
				         """
			
 
				-        检查文档提取质量，如果低于阈值则在 chapters 中添加质量字段。
			
 
				+        检查文档提取质量，无论是否低于阈值都在 chapters 中添加质量字段。
			
 
				 
			
 
				         Args:
			
 
				             structure: PDF 提取结构
			
@@ -493,34 +497,44 @@ class SimpleDocumentProcessor:
 
				         l1_alert = l1_rate < l1_threshold
			
 
				         l2_alert = l2_rate < l2_threshold
			
 
				 
			
 
				-        if l1_alert or l2_alert:
			
 
				-            quality_result: Dict[str, Any] = {}
			
 
				-
			
 
				-            if l1_alert:
			
 
				-                quality_result["l1_chapter_quality"] = {
			
 
				-                    "extracted_count": l1_count,
			
 
				-                    "expected_count": default_total_chapters,
			
 
				-                    "extraction_rate": round(l1_rate * 100, 2),
			
 
				-                    "threshold": round(l1_threshold * 100, 2),
			
 
				-                }
			
 
				-                quality_result["l1_system_alerts"] = "该文档一级章节提取可能存在缺失，请检查文档标题格式是否符合标准。"
			
 
				-                logger.warning(
			
 
				-                    f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
			
 
				-                    f"({l1_count}/{default_total_chapters})"
			
 
				-                )
			
 
				+        # 构建质量检查结果（始终添加）
			
 
				+        quality_result: Dict[str, Any] = {}
			
 
				 
			
 
				-            if l2_alert:
			
 
				-                quality_result["l2_Subsection_quality"] = {
			
 
				-                    "extracted_count": l2_count,
			
 
				-                    "expected_count": default_total_subsections,
			
 
				-                    "extraction_rate": round(l2_rate * 100, 2),
			
 
				-                    "threshold": round(l2_threshold * 100, 2),
			
 
				-                }
			
 
				-                quality_result["l2_system_alerts"] = "该文档二级小节提取可能存在缺失，请检查文档标题格式是否符合标准。"
			
 
				-                logger.warning(
			
 
				-                    f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
			
 
				-                    f"({l2_count}/{default_total_subsections})"
			
 
				-                )
			
 
				+        # 一级章节质量
			
 
				+        quality_result["l1_chapter_quality"] = {
			
 
				+            "extracted_count": l1_count,
			
 
				+            "expected_count": default_total_chapters,
			
 
				+            "extraction_rate": round(l1_rate * 100, 2),
			
 
				+            "threshold": round(l1_threshold * 100, 2),
			
 
				+            "exist_issue": l1_alert,
			
 
				+        }
			
 
				+        quality_result["l1_system_alerts"] = (
			
 
				+            "该文档一级章节提取可能存在缺失，请检查文档标题格式是否符合标准。"
			
 
				+            if l1_alert else ""
			
 
				+        )
			
 
				+        if l1_alert:
			
 
				+            logger.warning(
			
 
				+                f"[质量检查] 一级章节提取率 {l1_rate*100:.1f}% 低于阈值 {l1_threshold*100:.1f}% "
			
 
				+                f"({l1_count}/{default_total_chapters})"
			
 
				+            )
			
 
				+
			
 
				+        # 二级小节质量
			
 
				+        quality_result["l2_Subsection_quality"] = {
			
 
				+            "extracted_count": l2_count,
			
 
				+            "expected_count": default_total_subsections,
			
 
				+            "extraction_rate": round(l2_rate * 100, 2),
			
 
				+            "threshold": round(l2_threshold * 100, 2),
			
 
				+            "exist_issue": l2_alert,
			
 
				+        }
			
 
				+        quality_result["l2_system_alerts"] = (
			
 
				+            "该文档二级小节提取可能存在缺失，请检查文档标题格式是否符合标准。"
			
 
				+            if l2_alert else ""
			
 
				+        )
			
 
				+        if l2_alert:
			
 
				+            logger.warning(
			
 
				+                f"[质量检查] 二级小节提取率 {l2_rate*100:.1f}% 低于阈值 {l2_threshold*100:.1f}% "
			
 
				+                f"({l2_count}/{default_total_subsections})"
			
 
				+            )
			
 
				 
			
 
				-            # 将质量检查结果添加到 chapters 中
			
 
				-            chapters["_quality_check"] = quality_result
			
 
				+        # 将质量检查结果添加到 chapters 中
			
 
				+        chapters["quality_check"] = quality_result
			
--- a/core/construction_review/component/minimal_pipeline/toc_builder.py
+++ b/core/construction_review/component/minimal_pipeline/toc_builder.py
@@ -20,7 +20,10 @@ def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str,
 
				     """
			
 
				     toc_items: List[Dict[str, Any]] = []
			
 
				     for chapter_title, sections in structure.get("chapters", {}).items():
			
 
				-        page_start = min(s["page_start"] for s in sections.values()) if sections else 1
			
 
				+        # 安全获取 page_start，默认值为 1
			
 
				+        page_starts = [s.get("page_start", 1) for s in sections.values() if isinstance(s, dict)]
			
 
				+        page_start = min(page_starts) if page_starts else 1
			
 
				+
			
 
				         toc_items.append({
			
 
				             "title": chapter_title,
			
 
				             "page": page_start,
			
@@ -30,9 +33,11 @@ def build_toc_items_from_structure(structure: Dict[str, Any]) -> List[Dict[str,
 
				         for section_title, section_data in sections.items():
			
 
				             if section_title == "章节标题":
			
 
				                 continue
			
 
				+            # 安全获取 page_start
			
 
				+            sec_page_start = section_data.get("page_start", 1) if isinstance(section_data, dict) else 1
			
 
				             toc_items.append({
			
 
				                 "title": section_title,
			
 
				-                "page": section_data["page_start"],
			
 
				+                "page": sec_page_start,
			
 
				                 "level": 2,
			
 
				                 "original": section_title,
			
 
				             })
			
--- a/core/construction_review/workflows/document_workflow.py
+++ b/core/construction_review/workflows/document_workflow.py
@@ -116,6 +116,25 @@ class DocumentWorkflow:
 
				                 except asyncio.CancelledError:
			
 
				                     pass
			
 
				 
			
 
				+            # 提取 quality_check 信息
			
 
				+            quality_check = {}
			
 
				+            if hasattr(structured_content, 'raw_metadata') and structured_content.raw_metadata:
			
 
				+                quality_check = structured_content.raw_metadata.get('quality_check', {})
			
 
				+                logger.info(f"[DocumentWorkflow] 从 raw_metadata 提取 quality_check: {quality_check}")
			
 
				+            else:
			
 
				+                logger.warning(f"[DocumentWorkflow] raw_metadata 不存在或为空: {getattr(structured_content, 'raw_metadata', None)}")
			
 
				+
			
 
				+            # 构建 issues 列表（包含 quality_check）
			
 
				+            issues = []
			
 
				+            if quality_check:
			
 
				+                issues.append({
			
 
				+                    "type": "quality_check",
			
 
				+                    "data": quality_check
			
 
				+                })
			
 
				+                logger.info(f"[DocumentWorkflow] 构建 issues 列表: {issues}")
			
 
				+            else:
			
 
				+                logger.info(f"[DocumentWorkflow] quality_check 为空，不添加到 issues")
			
 
				+
			
 
				             if self.progress_manager:
			
 
				                 await self.progress_manager.update_stage_progress(
			
 
				                     callback_task_id=self.callback_task_id,
			
@@ -123,7 +142,8 @@ class DocumentWorkflow:
 
				                     current=100,
			
 
				                     status="docu_ans_completed",
			
 
				                     message="文档解析完成",
			
 
				-                    event_type="processing"
			
 
				+                    event_type="processing",
			
 
				+                    issues=issues if issues else None
			
 
				                 )
			
 
				 
			
 
				             # 转换为旧版字典格式以保持兼容性