Преглед изворног кода

refactor: 精简 core 审查引擎与工作流 — 移除冗余代码路径

简化 AIReviewEngine、DocumentProcessor、ChunkClassifier、Relevance 等组件,
合并重复逻辑,删除 ai_review_core_fun 中的死代码。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
WangXuMing пре 1 недеља
родитељ
комит
ab7e86e93b

+ 57 - 160
core/construction_review/component/ai_review_engine.py

@@ -5,7 +5,7 @@
 @Project   : lq-agent-api
 @File      : construction_review/ai_review_engine.py
 @IDE       : VsCode
-@Author    : 王旭明
+@Author    : wandaan
 @Date      : 2025-12-01 11:07:12
 @Description: AI审查引擎核心组件,负责执行各类文档审查任务,支持并发处理和多种审查模式
 
@@ -149,7 +149,6 @@ class AIReviewEngine(BaseReviewer):
 
         self.max_concurrent_reviews = max_concurrent_reviews
         self.semaphore = asyncio.Semaphore(max_concurrent_reviews)
-        self.milvus_collection = config_handler.get('milvus', 'MILVUS_COLLECTION', 'default')
 
         # [新增] 数据库连接池
         self.db_pool = db_pool
@@ -699,41 +698,29 @@ class AIReviewEngine(BaseReviewer):
                 model_client=getattr(self, 'model_client', None)
             )
             
-            # 从state获取outline和原始chunks(如果有
+            # 从state获取outline(仅用于目录审查)
             outline = None
-            all_chunks = []
             if state and isinstance(state, dict):
-                structured = state.get('structured_content', {})
-                outline = structured.get('outline')
-                all_chunks = structured.get('chunks', [])
+                outline = state.get('structured_content', {}).get('outline')
 
-            # 从传入的chunks中提取chapter_code和章节信息
+            # 提取chunk信息(与其他审查一致,使用单个chunk)
             chapter_code = "all"
             chapter_name = ""
-            if review_content and isinstance(review_content, list):
-                first_chunk = review_content[0]
-                if isinstance(first_chunk, dict):
-                    chapter_code = first_chunk.get('chapter_classification', 'all')
-                    chapter_name = first_chunk.get('chapter', '') or first_chunk.get('section_label', '')
-                    # 只保留章级标题("->" 之前的部分),避免二级标题污染 location
-                    if chapter_name and '->' in chapter_name:
-                        chapter_name = chapter_name.split('->')[0]
-
-            # 获取该章节的所有原始chunks用于完整性审查(包含所有分类代码)
-            chapter_chunks = review_content
-            if chapter_code != "all" and all_chunks:
-                # 从state中获取该章节的所有原始chunks
-                chapter_chunks = [
-                    c for c in all_chunks
-                    if c.get('chapter_classification') == chapter_code
-                ]
-                logger.info(f"[{name}] 章节 '{chapter_code}' 从state获取 {len(chapter_chunks)} 个原始chunks进行完整性审查")
-
-            # 执行检查(传入当前章节分类,只检查该章节下的三级分类)
+            secondary_code = ""
+            chunk = review_content[0] if (review_content and isinstance(review_content, list)) else {}
+            if isinstance(chunk, dict):
+                chapter_code = chunk.get('chapter_classification', 'all')
+                secondary_code = chunk.get('secondary_category_code', '')
+                chapter_name = chunk.get('chapter', '') or chunk.get('section_label', '')
+                if chapter_name and '->' in chapter_name:
+                    chapter_name = chapter_name.split('->')[0]
+
+            # 执行检查(传入单个chunk + 二级分类,只检查该二级下的标准项)
             result = await checker.check(
-                chunks=chapter_chunks,
+                chunks=[chunk] if isinstance(chunk, dict) and chunk else review_content,
                 outline=outline,
-                chapter_classification=chapter_code if chapter_code != "all" else None
+                chapter_classification=chapter_code if chapter_code != "all" else None,
+                secondary_classification=secondary_code if secondary_code else None,
             )
             
             # 转换为字典
@@ -779,21 +766,28 @@ class AIReviewEngine(BaseReviewer):
                     response_item["second_seq"] = rec['second_seq']
                 if 'third_seq' in rec:
                     response_item["third_seq"] = rec['third_seq']
-                # 继续添加其他字段
+                # 继续添加其他字段(方案B增强:含 evidence / confidence)
+                check_result_data = {
+                    "issue_point": issue_point,
+                    "location": location,
+                    "suggestion": rec.get('suggestion', ''),
+                    "reason": rec.get('reason', ''),
+                    "risk_level": risk_level,
+                }
+                # 方案B特有字段
+                if rec.get('evidence'):
+                    check_result_data["evidence"] = rec['evidence']
+                if rec.get('confidence'):
+                    check_result_data["confidence"] = rec['confidence']
+
                 response_item.update({
                     "check_item_code": f"{chapter_code if chapter_code != 'all' else 'unknown'}_completeness_check",
-                    "check_result": {
-                        "issue_point": issue_point,
-                        "location": location,
-                        "suggestion": rec.get('suggestion', ''),
-                        "reason": rec.get('reason', ''),
-                        "risk_level": risk_level
-                    },
+                    "check_result": check_result_data,
                     "exist_issue": True,
                     "risk_info": {"risk_level": risk_level_en}
                 })
                 response_items.append(response_item)
-            
+
             # 如果没有缺失项,显示完整度
             if not response_items:
                 completeness_rate = result_dict.get('tertiary_completeness', {}).get('completeness_rate', '0%')
@@ -812,27 +806,38 @@ class AIReviewEngine(BaseReviewer):
                     "exist_issue": False,
                     "risk_info": {"risk_level": "low"}
                 })
-            
+
             execution_time = time.time() - start_time
-            
-            # 构建与原有格式兼容的结果
+
+            # 统计LLM调用信息
+            llm_calls = result_dict.get('direct_llm_call_count', 0)
+            direct_items_count = len(result_dict.get('direct_check_items', []))
+
+            # 构建与原有格式兼容的结果(方案B增强)
             check_result = {
                 "details": {
                     "name": "completeness_check",
                     "response": response_items,
-                    "review_location_label": "三级完整性审查",
+                    "review_location_label": "三级完整性审查(LLM直接解释)",
                     "chapter_code": chapter_code,
-                    "original_content": f"标准三级分类: {result_dict.get('tertiary_completeness', {}).get('total', 0)}个, "
-                                        f"有内容: {result_dict.get('tertiary_completeness', {}).get('present', 0)}个, "
-                                        f"缺失: {result_dict.get('tertiary_completeness', {}).get('missing', 0)}个",
-                    # 保留完整的轻量级审查结果供前端使用
-                    "lightweight_result": result_dict
+                    "original_content": (
+                        f"标准三级分类: {result_dict.get('tertiary_completeness', {}).get('total', 0)}个, "
+                        f"有内容: {result_dict.get('tertiary_completeness', {}).get('present', 0)}个, "
+                        f"缺失: {result_dict.get('tertiary_completeness', {}).get('missing', 0)}个"
+                    ),
+                    # 保留完整的审查结果供前端使用
+                    "lightweight_result": result_dict,
+                    # 方案B特有:LLM逐项判断详情
+                    "direct_check_items": result_dict.get('direct_check_items', []),
+                    "direct_llm_call_count": llm_calls,
+                    "review_method": "direct_llm",
                 },
                 "success": True,
                 "execution_time": execution_time
             }
-            
-            logger.info(f"[{name}] 审查完成,耗时: {execution_time:.2f}s, "
+
+            logger.info(f"[{name}] 审查完成(方案B), 耗时: {execution_time:.2f}s, "
+                       f"LLM调用: {llm_calls}次, 直接检查项: {direct_items_count}项, "
                        f"三级完整率: {result_dict.get('tertiary_completeness', {}).get('completeness_rate', 'N/A')}")
             
             return check_result, trace_id_idx
@@ -1075,7 +1080,7 @@ class AIReviewEngine(BaseReviewer):
             combined_content = review_content
 
         return await self.review("non_parameter_compliance_check", trace_id, reviewer_type, prompt_name, combined_content, review_references,
-                               reference_source, state, stage_name, timeout=45, function_name="completeness_review_classify")
+                               reference_source, state, stage_name, timeout=45, function_name="non_parameter_compliance_check")
 
     async def check_parameter_compliance(self, trace_id_idx: str, review_content: str, review_references: str,
                                         reference_source: str, state: str, stage_name: str,
@@ -1108,7 +1113,7 @@ class AIReviewEngine(BaseReviewer):
             combined_content = review_content
 
         return await self.review("parameter_compliance_check", trace_id, reviewer_type, prompt_name, combined_content, review_references,
-                               reference_source, state, stage_name, timeout=45, function_name="completeness_review_classify")
+                               reference_source, state, stage_name, timeout=45, function_name="parameter_compliance_check")
 
     async def reference_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,
                                 state: dict = None, stage_name: str = None) -> Dict[str, Any]:
@@ -1249,114 +1254,6 @@ class AIReviewEngine(BaseReviewer):
                     "error_message": error_msg
                 }
             }
-        
-    async def timeliness_content_reviewer(self, review_data: Dict[str, Any], trace_id: str,
-                                state: dict = None, stage_name: str = None) -> Dict[str, Any]:
-        """
-        执行三级分类内容时效性审查:检查tertiary_classification_details中引用的规范是否过时
-
-        Args:
-            review_data: 待审查数据,包含tertiary_classification_details
-            trace_id: 追踪ID
-            state: 状态字典
-            stage_name: 阶段名称
-
-        Returns:
-            审查结果字典,包含内容时效性审查结果
-        """
-        start_time = time.time()
-        try:
-            logger.info(f"开始三级分类内容时效性审查,trace_id: {trace_id}")
-
-            # 提取三级分类详情
-            tertiary_details = review_data.get('tertiary_classification_details', [])
-            max_concurrent = review_data.get('max_concurrent', 4)
-
-            if not tertiary_details:
-                logger.warning("三级分类详情为空,将跳过内容时效性审查")
-                return {
-                    "timeliness_content_review_results": {
-                        "review_results": [],
-                        "total_items": 0,
-                        "issue_items": 0,
-                        "execution_time": time.time() - start_time,
-                        "error_message": None,
-                        "message": "未找到三级分类详情,跳过内容时效性审查"
-                    }
-                }
-
-            logger.info(f"提取到 {len(tertiary_details)} 个三级分类详情")
-
-            # 调用内容时效性审查
-            try:
-                # 使用信号量控制并发
-                async with self.semaphore:
-                    # 从state中获取progress_manager和callback_task_id
-                    progress_manager = state.get('progress_manager') if state else None
-                    callback_task_id = state.get('callback_task_id') if state else None
-
-                    # 调用内容时效性审查器(使用新的统一入口)
-                    from core.construction_review.component.reviewers.timeliness_reviewer import TimelinessReviewService
-                    async with TimelinessReviewService(max_concurrent=max_concurrent, db_pool=self.db_pool) as reviewer:
-                        # 从 tertiary_details 提取内容
-                        contents = []
-                        for detail in tertiary_details:
-                            content = detail.get("content", "") if isinstance(detail, dict) else str(detail)
-                            if content:
-                                contents.append(content)
-                        full_content = "\n".join(contents)
-
-                        timeliness_content_results = await reviewer.review_from_content(
-                            content=full_content,
-                            chapter_code="content",
-                            collection_name="first_bfp_collection_status"
-                        )
-
-                    logger.info(f"内容时效性审查完成,发现问题数量: {len(timeliness_content_results)}")
-
-                    # 统计审查结果
-                    total_items = len(timeliness_content_results)
-                    issue_items = sum(1 for item in timeliness_content_results if item.get('exist_issue', False))
-
-                    logger.info(f"审查统计 - 总规范引用: {total_items}, 问题项: {issue_items}")
-
-            except Exception as e:
-                logger.error(f"内容时效性审查失败: {str(e)}")
-                return {
-                    "timeliness_content_review_results": {
-                        "review_results": [],
-                        "total_items": 0,
-                        "issue_items": 0,
-                        "execution_time": time.time() - start_time,
-                        "error_message": f"内容时效性审查失败: {str(e)}"
-                    }
-                }
-
-            # 返回完整结果
-            return {
-                "timeliness_content_review_results": {
-                    "review_results": timeliness_content_results,
-                    "total_items": total_items,
-                    "issue_items": issue_items,
-                    "execution_time": time.time() - start_time,
-                    "error_message": None
-                }
-            }
-
-        except Exception as e:
-            execution_time = time.time() - start_time
-            error_msg = f"内容时效性审查失败: {str(e)}"
-            logger.error(error_msg, exc_info=True)
-
-            return {
-                "timeliness_content_review_results": {
-                    "review_results": [],
-                    "total_items": 0,
-                    "issue_items": 0,
-                    "execution_time": execution_time,
-                    "error_message": error_msg
-                }
-            }
 
     async def timeliness_basis_reviewer(self, review_data: Dict[str, Any], trace_id: str,
                                 state: dict = None, stage_name: str = None) -> Dict[str, Any]:

+ 2 - 386
core/construction_review/component/doc_worker/classification/chunk_classifier.py

@@ -1,7 +1,7 @@
 """
-内容块分类模块(二级和三级分类)
+内容块分类模块(二级分类)
 
-对已经完成一级分类的内容块进行二级和三级分类
+对已经完成一级分类的内容块进行二级分类
 """
 
 from __future__ import annotations
@@ -22,22 +22,6 @@ from ..config.provider import default_config_provider
 from ..utils.prompt_loader import PromptLoader
 
 
-# 延迟导入新的三级分类器(避免循环导入)
-_LLM_CONTENT_CLASSIFIER = None
-
-
-def _get_llm_content_classifier():
-    """延迟导入 LLMContentClassifier"""
-    global _LLM_CONTENT_CLASSIFIER
-    if _LLM_CONTENT_CLASSIFIER is None:
-        from ...reviewers.utils.llm_content_classifier_v2 import (
-            LLMContentClassifier,
-            ClassifierConfig
-        )
-        _LLM_CONTENT_CLASSIFIER = (LLMContentClassifier, ClassifierConfig)
-    return _LLM_CONTENT_CLASSIFIER
-
-
 def _extract_json(text: str) -> Optional[Dict[str, Any]]:
     """从字符串中提取第一个有效 JSON 对象"""
     for pattern in [r"```json\s*(\{.*?})\s*```", r"```\s*(\{.*?})\s*```"]:
@@ -144,353 +128,6 @@ class ChunkClassifier:
 
         return "\n".join(standards_lines) if standards_lines else "(无二级分类标准)", index_mapping
 
-    def _build_tertiary_standards(self, first_category_code: str, second_category_code: str) -> tuple[str, dict]:
-        """
-        构建三级分类标准文本
-
-        返回:
-            (标准文本, 索引映射字典)
-        """
-        if first_category_code not in self.classification_tree:
-            return "(无三级分类标准)", {}
-
-        if second_category_code not in self.classification_tree[first_category_code]:
-            return "(无三级分类标准)", {}
-
-        third_items = self.classification_tree[first_category_code][second_category_code]["third_items"]
-
-        if not third_items:
-            return "(无三级分类标准)", {}
-
-        standards_lines = ["    0. 非标准项 - 不符合以下任何类别"]
-        index_mapping = {0: ("非标准项", "non_standard")}
-
-        for idx, third_item in enumerate(third_items, 1):
-            third_cn = third_item["third_cn"]
-            third_code = third_item["third_code"]
-            third_focus = third_item["third_focus"]
-
-            # 保存索引映射
-            index_mapping[idx] = (third_cn, third_code)
-
-            if third_focus and third_focus != "NULL":
-                standards_lines.append(f"    {idx}. {third_cn} - 关注点:{third_focus}")
-            else:
-                standards_lines.append(f"    {idx}. {third_cn}")
-
-        return "\n".join(standards_lines), index_mapping
-
-    # 默认模型(三级分类会从 model_setting.yaml 动态加载)
-    DEFAULT_MODEL = "qwen3_5_122b_a10b"
-
-    # 二级分类模型(从 model_setting.yaml 动态加载,配置 key: doc_classification_secondary)
-    @property
-    def SECONDARY_MODEL(self) -> str:
-        """二级分类模型,从 model_setting.yaml 读取配置"""
-        try:
-            from foundation.ai.models.model_config_loader import get_model_for_function
-            model = get_model_for_function("doc_classification_secondary")
-            if model:
-                return model
-        except Exception as e:
-            logger.debug(f"加载二级分类模型配置失败: {e}")
-        return "qwen3_5_35b_a3b"  # 兜底默认值
-
-    async def _call_llm_once(
-        self,
-        system_prompt: str,
-        user_prompt: str,
-        model_name: Optional[str] = None
-    ) -> Optional[Dict[str, Any]]:
-        """
-        单次异步 LLM 调用(使用统一的 GenerateModelClient)
-
-        参数:
-            system_prompt: 系统提示词
-            user_prompt: 用户提示词
-            model_name: 模型名称,默认使用 DEFAULT_MODEL
-
-        失败返回 None,由调用方决定处理逻辑
-        """
-        if model_name is None:
-            model_name = self.DEFAULT_MODEL
-
-        try:
-            content = await generate_model_client.get_model_generate_invoke(
-                trace_id="chunk_classifier",
-                system_prompt=system_prompt,
-                user_prompt=user_prompt,
-                model_name=model_name,
-            )
-            result = _extract_json(content)
-            return result if result is not None else {"raw_content": content}
-        except Exception as e:
-            logger.error(f"[ChunkClassifier] LLM 调用失败: {e}")
-            return None
-
-    async def _batch_call_llm(
-        self,
-        requests: List[tuple],  # [(system_prompt, user_prompt), ...]
-        model_name: Optional[str] = None,
-    ) -> List[Optional[Dict[str, Any]]]:
-        """
-        并发批量调用 LLM(带信号量控制)
-
-        参数:
-            requests: 请求列表,每个元素是 (system_prompt, user_prompt) 元组
-            model_name: 指定模型名称,None则使用默认模型
-
-        返回:
-            结果列表,与输入请求一一对应
-        """
-        semaphore = asyncio.Semaphore(self._concurrency)
-
-        async def bounded_call(system_prompt: str, user_prompt: str):
-            async with semaphore:
-                return await self._call_llm_once(system_prompt, user_prompt, model_name)
-
-        tasks = [bounded_call(sp, up) for sp, up in requests]
-        return list(await asyncio.gather(*tasks))
-
-    async def classify_chunks_secondary_async(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        异步对chunks进行二级分类(全部走LLM,移除本地规则)
-
-        参数:
-            chunks: 已完成一级分类的chunk列表
-
-        返回:
-            添加了二级分类字段的chunk列表
-        """
-        logger.info(f"正在对 {len(chunks)} 个内容块进行二级分类(LLM全量)...")
-
-        # 准备LLM请求
-        llm_requests = []
-        valid_chunks = []
-        index_mappings = []  # 保存每个请求对应的索引映射
-
-        for chunk in chunks:
-            first_category_code = chunk.get("chapter_classification", "")
-            chunk_title = chunk.get("section_label", "")
-            hierarchy_path = " -> ".join(chunk.get("hierarchy_path", []))
-            content = chunk.get("review_chunk_content", "")
-            content_preview = content[:300] if content else ""
-
-            # 获取一级分类的中文名称
-            first_category_cn = self._get_first_category_cn(first_category_code)
-
-            # 构建二级分类标准(返回标准文本和索引映射)
-            secondary_standards, index_mapping = self._build_secondary_standards(first_category_code)
-
-            if secondary_standards == "(无二级分类标准)":
-                # 如果没有二级分类标准,跳过
-                chunk["secondary_category_cn"] = "无"
-                chunk["secondary_category_code"] = "none"
-                continue
-
-            # 渲染提示词
-            prompt = self.prompt_loader.render(
-                "chunk_secondary_classification",
-                first_category=first_category_cn,
-                chunk_title=chunk_title,
-                hierarchy_path=hierarchy_path,
-                content_preview=content_preview,
-                secondary_standards=secondary_standards
-            )
-
-            llm_requests.append((prompt["system"], prompt["user"]))
-            valid_chunks.append(chunk)
-            index_mappings.append(index_mapping)
-
-        if not llm_requests:
-            logger.info("所有内容块都没有二级分类标准,跳过二级分类")
-            return chunks
-
-        # 全部走LLM分类
-        logger.info(f"[二级分类] 全部 {len(valid_chunks)} 个内容块走LLM分类")
-
-        llm_results = await self._batch_call_llm(llm_requests, model_name=self.SECONDARY_MODEL)
-
-        # 处理LLM结果
-        for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
-            if llm_result and isinstance(llm_result, dict):
-                category_index = llm_result.get("category_index")
-
-                if isinstance(category_index, int) and category_index in index_mapping:
-                    secondary_cn, secondary_code = index_mapping[category_index]
-                    chunk["secondary_category_code"] = secondary_code
-                    chunk["secondary_category_cn"] = secondary_cn
-                else:
-                    # LLM返回无效,使用非标准项
-                    chunk["secondary_category_code"] = "non_standard"
-                    chunk["secondary_category_cn"] = "非标准项"
-            else:
-                # LLM调用失败
-                chunk["secondary_category_code"] = "non_standard"
-                chunk["secondary_category_cn"] = "非标准项"
-
-        logger.info("二级分类完成!")
-        return chunks
-
-    async def classify_chunks_tertiary_async(
-        self,
-        chunks: List[Dict[str, Any]],
-        use_enhanced_classifier: bool = True,
-        classifier_config: Optional[Any] = None,
-        progress_callback: Optional[Any] = None
-    ) -> List[Dict[str, Any]]:
-        """
-        异步对chunks进行三级分类
-
-        参数:
-            chunks: 已完成二级分类的chunk列表
-            use_enhanced_classifier: 是否使用增强型分类器(行级细粒度、多分类、Embedding优化)
-                - True: 使用新的 llm_content_classifier_v2(推荐)
-                - False: 使用原有逐chunk分类方式
-            classifier_config: 增强型分类器的配置对象(ClassifierConfig),为None时使用默认配置
-            progress_callback: 进度回调函数 (completed, total, section_name, success) -> None,支持 async
-
-        返回:
-            添加了三级分类字段的chunk列表
-
-        新增字段(use_enhanced_classifier=True时):
-            - tertiary_category_code: 三级分类代码
-            - tertiary_category_cn: 三级分类名称
-            - tertiary_classification_details: 行级分类详情列表,每个条目包含:
-                - third_category_code: 三级分类代码
-                - third_category_name: 三级分类名称
-                - start_line: 起始行号
-                - end_line: 结束行号
-                - content: 原文内容
-        """
-        if use_enhanced_classifier:
-            return await self._classify_chunks_tertiary_enhanced(chunks, classifier_config, progress_callback)
-        else:
-            return await self._classify_chunks_tertiary_legacy(chunks)
-
-    async def _classify_chunks_tertiary_enhanced(
-        self,
-        chunks: List[Dict[str, Any]],
-        config: Optional[Any] = None,
-        progress_callback: Optional[Any] = None
-    ) -> List[Dict[str, Any]]:
-        """
-        使用增强型分类器进行三级分类
-
-        特点:
-        - 行级细粒度分类
-        - 支持一个段落包含多个三级分类
-        - Embedding 相似度优化(跳过明显对应的段落)
-        - 全局行号支持
-        """
-        logger.info(f"正在使用增强型分类器对 {len(chunks)} 个内容块进行三级分类... 特点: 行级细粒度 | 多分类支持 | Embedding优化")
-
-        try:
-            LLMContentClassifier, ClassifierConfig = _get_llm_content_classifier()
-        except ImportError as e:
-            logger.warning(f"无法导入增强型分类器,回退到传统方式: {e}")
-            return await self._classify_chunks_tertiary_legacy(chunks)
-
-        # 创建分类器实例
-        if config is None:
-            config = ClassifierConfig()
-            # 使用与二级分类相同的并发度
-            config.max_concurrent_requests = self._concurrency
-
-            # 从全局配置加载模型和thinking模式
-            try:
-                from foundation.ai.models.model_config_loader import get_model_for_function, get_thinking_mode_for_function
-                config.model = get_model_for_function("doc_classification_tertiary")
-                config.enable_thinking = get_thinking_mode_for_function("doc_classification_tertiary") or False
-                logger.info(f"三级分类配置 - 并发度: {config.max_concurrent_requests}, 模型: {config.model}, thinking: {config.enable_thinking}")
-            except Exception as e:
-                logger.warning(f"加载模型配置失败,使用默认配置: {e}")
-                config.model = "qwen3_5_35b_a3b"
-                config.enable_thinking = False
-
-        classifier = LLMContentClassifier(config)
-
-        # 调用增强型分类器
-        updated_chunks = await classifier.classify_chunks(chunks, progress_callback=progress_callback)
-
-        return updated_chunks
-
-    async def _classify_chunks_tertiary_legacy(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """
-        传统三级分类方式(逐chunk分类)
-
-        每个chunk只能属于一个三级分类
-        """
-        logger.info(f"正在对 {len(chunks)} 个内容块进行三级分类...")
-
-        # 准备LLM请求
-        llm_requests = []
-        valid_chunks = []
-        index_mappings = []  # 保存每个请求对应的索引映射
-
-        for chunk in chunks:
-            first_category_code = chunk.get("chapter_classification", "")
-            second_category_code = chunk.get("secondary_category_code", "")
-            second_category_cn = chunk.get("secondary_category_cn", "")
-            chunk_title = chunk.get("section_label", "")
-            content = chunk.get("review_chunk_content", "")
-            content_preview = content[:300] if content else ""
-
-            # 获取一级分类的中文名称
-            first_category_cn = self._get_first_category_cn(first_category_code)
-
-            # 构建三级分类标准(返回标准文本和索引映射)
-            tertiary_standards, index_mapping = self._build_tertiary_standards(first_category_code, second_category_code)
-
-            if tertiary_standards == "(无三级分类标准)":
-                # 如果没有三级分类标准,跳过
-                chunk["tertiary_category_cn"] = "无"
-                chunk["tertiary_category_code"] = "none"
-                continue
-
-            # 渲染提示词
-            prompt = self.prompt_loader.render(
-                "chunk_tertiary_classification",
-                first_category=first_category_cn,
-                secondary_category=second_category_cn,
-                chunk_title=chunk_title,
-                content_preview=content_preview,
-                tertiary_standards=tertiary_standards
-            )
-
-            llm_requests.append((prompt["system"], prompt["user"]))
-            valid_chunks.append(chunk)
-            index_mappings.append(index_mapping)
-
-        if not llm_requests:
-            logger.info("所有内容块都没有三级分类标准,跳过三级分类")
-            return chunks
-
-        # 批量异步调用LLM API
-        llm_results = await self._batch_call_llm(llm_requests)
-
-        # 处理分类结果
-        for chunk, llm_result, index_mapping in zip(valid_chunks, llm_results, index_mappings):
-            if llm_result and isinstance(llm_result, dict):
-                category_index = llm_result.get("category_index")
-
-                # 验证索引并映射到类别
-                if isinstance(category_index, int) and category_index in index_mapping:
-                    tertiary_cn, tertiary_code = index_mapping[category_index]
-                    chunk["tertiary_category_cn"] = tertiary_cn
-                    chunk["tertiary_category_code"] = tertiary_code
-                else:
-                    # 索引无效,归类为非标准项
-                    logger.warning(f"LLM返回的索引 {category_index} 无效,归类为'非标准项'")
-                    chunk["tertiary_category_cn"] = "非标准项"
-                    chunk["tertiary_category_code"] = "non_standard"
-            else:
-                chunk["tertiary_category_cn"] = "非标准项"
-                chunk["tertiary_category_code"] = "non_standard"
-
-        logger.info("三级分类完成!")
-        return chunks
-
     def _get_first_category_cn(self, first_category_code: str) -> str:
         """获取一级分类的中文名称"""
         category_mapping = {
@@ -515,24 +152,3 @@ class ChunkClassifier:
         except RuntimeError:
             raise RuntimeError("请使用 await classify_chunks_secondary_async")
 
-    def classify_chunks_tertiary(
-        self,
-        chunks: List[Dict[str, Any]],
-        use_enhanced_classifier: bool = True,
-        classifier_config: Optional[Any] = None
-    ) -> List[Dict[str, Any]]:
-        """同步包装:三级分类
-
-        Args:
-            chunks: 已完成二级分类的chunk列表
-            use_enhanced_classifier: 是否使用增强型分类器(默认True)
-            classifier_config: 增强型分类器配置(可选)
-        """
-        try:
-            return asyncio.run(self.classify_chunks_tertiary_async(
-                chunks,
-                use_enhanced_classifier=use_enhanced_classifier,
-                classifier_config=classifier_config
-            ))
-        except RuntimeError:
-            raise RuntimeError("请使用 await classify_chunks_tertiary_async")

+ 1 - 112
core/construction_review/component/document_processor.py

@@ -157,7 +157,6 @@ class DocumentProcessor:
 
             # 缓存结果
             await self._cache_unified_structure(unified_doc)
-            await self._cache_tertiary_results(unified_doc, [])
 
             return unified_doc
 
@@ -286,18 +285,7 @@ class DocumentProcessor:
                 else:
                     group["second_content"] = content
 
-            # 收集三级分类详情
-            details = chunk.get("tertiary_classification_details", [])
-            for idx, detail in enumerate(details, 1):
-                group["third_items"].append(TertiaryItem(
-                    third_seq=len(group["third_items"]) + 1,
-                    third_code=detail.get("third_category_code", ""),
-                    third_name=detail.get("third_category_name", ""),
-                    line_start=detail.get("start_line", 0),
-                    line_end=detail.get("end_line", 0),
-                    content=detail.get("content", ""),
-                    confidence=1.0
-                ))
+            # 三级分类已由完整性审查(LLM直接解释)替代,不再从chunk收集
 
         # 构建tertiary_classifications列表
         tertiary_list = []
@@ -478,47 +466,6 @@ class DocumentProcessor:
             logger.warning(f"二级分类失败: {str(e)},跳过二级分类", exc_info=True)
         return chunks
 
-    async def _classify_chunks_tertiary(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """对chunks进行三级分类,返回处理后的chunks"""
-        logger.info(f"{StageName.TERTIARY_CLASSIFICATION.value}: 对内容块进行三级分类")
-        await self._push_classification_progress(
-            stage="文档分类",
-            current=60,
-            message=f"正在进行三级分类,共 {len(chunks)} 个内容块..."
-        )
-
-        try:
-            cache.save(
-                data=chunks,
-                subdir="document_temp",
-                filename="三级分类输入结果",
-                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
-            )
-            logger.info("[三级分类] 输入结果已保存到缓存: temp/construction_review/document_temp/三级分类输入结果.json")
-        except Exception as e:
-            logger.warning(f"[三级分类] 保存缓存失败: {e}")
-        try:
-            chunk_classifier = self._get_chunk_classifier()
-
-            async def _tertiary_progress(completed: int, total: int, section_name: str, success: bool):
-                """将三级分类的 section 级进度映射到 60%→90%"""
-                if total > 0:
-                    current = 60 + int(completed / total * 30)
-                    status = "完成" if success else "失败"
-                    await self._push_classification_progress(
-                        stage="文档分类",
-                        current=current,
-                        message=f"三级分类中:{section_name} {status} [{completed}/{total}]"
-                    )
-
-            chunks = await chunk_classifier.classify_chunks_tertiary_async(
-                chunks, progress_callback=_tertiary_progress
-            )
-            logger.info("三级分类完成")
-        except Exception as e:
-            logger.warning(f"三级分类失败: {str(e)},跳过三级分类", exc_info=True)
-        return chunks
-
     async def _cache_unified_structure(self, unified_doc: UnifiedDocumentStructure) -> None:
         """
         缓存统一文档结构(二级分类后、三级分类前)
@@ -539,64 +486,6 @@ class DocumentProcessor:
         except Exception as e:
             logger.warning(f"[缓存] 保存统一文档结构失败: {e}", exc_info=True)
 
-    async def _cache_tertiary_results(
-        self,
-        unified_doc: UnifiedDocumentStructure,
-        chunks: List[Dict[str, Any]]
-    ) -> None:
-        """
-        缓存三级分类结果
-
-        保存路径:
-        - temp/construction_review/document_temp/三级分类结果.json
-        - temp/construction_review/document_temp/三级分类_chunks.json
-        """
-        try:
-            # 缓存统一文档结构
-            cache_path = cache.save(
-                data=unified_doc.to_dict(),
-                subdir='document_temp',
-                filename='三级分类结果',
-                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
-            )
-
-            logger.info(f"[缓存] 三级分类结果已保存: {cache_path}")
-            logger.info(f"[缓存] 包含 {unified_doc.secondary_count} 个二级分类, {unified_doc.tertiary_count} 个三级分类")
-
-            # 详细统计
-            for t in unified_doc.tertiary_classifications:
-                logger.info(f"[缓存] 三级分类 {t.second_code}: {len(t.third_items)} 个细项")
-
-            # 缓存chunks(简化版,只保留关键字段)
-            # 如果外部未传入 chunks,从 legacy_dict 中提取
-            source_chunks = chunks if chunks else unified_doc.to_legacy_dict().get("chunks", [])
-            chunks_summary = []
-            for chunk in source_chunks:
-                summary = {
-                    "chunk_id": chunk.get("chunk_id"),
-                    "chapter_classification": chunk.get("chapter_classification"),
-                    "secondary_category_code": chunk.get("secondary_category_code"),
-                    "section_label": chunk.get("section_label"),
-                    "content_length": len(chunk.get("review_chunk_content", "") or chunk.get("content", "")),
-                    "tertiary_classification_details": chunk.get("tertiary_classification_details", []),
-                }
-                chunks_summary.append(summary)
-
-            chunks_cache_path = cache.save(
-                data={
-                    "total_chunks": len(source_chunks),
-                    "chunks": chunks_summary
-                },
-                subdir='document_temp',
-                filename='三级分类_chunks',
-                base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW
-            )
-
-            logger.info(f"[缓存] 三级分类chunks已保存: {chunks_cache_path}")
-
-        except Exception as e:
-            logger.warning(f"[缓存] 保存三级分类结果失败: {e}", exc_info=True)
-
     async def _push_classification_progress(self, stage: str, current: int, message: str) -> None:
         """推送分类阶段进度,并同步更新心跳共享状态"""
         if self._progress_state is not None:

+ 13 - 18
core/construction_review/component/infrastructure/relevance.py

@@ -1,29 +1,24 @@
 import asyncio
 import json
 import re
-import requests
+
+from foundation.ai.models.model_handler import model_handler
 
 
 # ===============================
-# 1) 最小 async LLM 调用(等价 curl
+# 1) LLM 调用(通过统一模型管理,使用 蜀天122B)
 # ===============================
+def _build_messages(prompt: str):
+    """构建 LangChain 消息格式"""
+    from langchain_core.messages import HumanMessage
+    return [HumanMessage(content=prompt)]
+
+
 async def qwen_chat_async(prompt: str) -> str:
-    def _call():
-        url = "http://192.168.91.253:8003/v1/chat/completions"
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": "Bearer sk-123456",
-        }
-        payload = {
-            "model": "qwen3-30b",
-            "messages": [{"role": "user", "content": prompt}],
-        }
-        resp = requests.post(url, json=payload, headers=headers, timeout=60)
-        resp.raise_for_status()
-        return resp.json()["choices"][0]["message"]["content"]
-
-    loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(None, _call)
+    llm = model_handler.get_model_by_function("relevance_judge")
+    messages = _build_messages(prompt)
+    response = await llm.ainvoke(messages)
+    return response.content if hasattr(response, 'content') else str(response)
 
 
 # ===============================

+ 4 - 26
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -177,21 +177,9 @@ class SimpleDocumentProcessor:
             return structure, primary_result, secondary_result, chunks, catalog
         await self._emit_progress(progress_callback, "文档切分", 50, f"组装 {len(chunks)} 个内容块")
 
-        # 5. 三级分类
-        async def _tertiary_progress(completed: int, total: int, section_name: str, success: bool):
-            if total > 0:
-                current = 60 + int(completed / total * 30)
-                status = "完成" if success else "失败"
-                await self._emit_progress(
-                    progress_callback, "文档分类", current,
-                    f"三级分类中:{section_name} {status} [{completed}/{total}]"
-                )
-
-        chunks = await self.chunk_classifier.classify_chunks_tertiary_async(
-            chunks, progress_callback=_tertiary_progress
-        )
-        logger.info("[SimpleProcessor] 三级分类完成")
-        await self._emit_progress(progress_callback, "文档分类", 90, "三级分类完成")
+        # 5. 三级分类已移除——完整性审查改为LLM直接解释,不再需要预分类
+        logger.info("[SimpleProcessor] 跳过三级分类(已由LLM直接完整性审查替代)")
+        await self._emit_progress(progress_callback, "文档分类", 90, "文档处理完成")
 
         # 验证返回前的catalog
         if catalog:
@@ -316,17 +304,7 @@ class SimpleDocumentProcessor:
                 else:
                     group["second_content"] = content
 
-            details = chunk.get("tertiary_classification_details", [])
-            for detail in details:
-                group["third_items"].append(TertiaryItem(
-                    third_seq=len(group["third_items"]) + 1,
-                    third_code=detail.get("third_category_code", ""),
-                    third_name=detail.get("third_category_name", ""),
-                    line_start=detail.get("start_line", 0),
-                    line_end=detail.get("end_line", 0),
-                    content=detail.get("content", ""),
-                    confidence=1.0,
-                ))
+            # 三级分类已由LLM直接完整性审查替代,不再从chunk收集
 
         tertiary_list = []
         second_seq = 0

+ 1 - 2
core/construction_review/component/outline_catalogue_matcher.py

@@ -8,7 +8,6 @@
 """
 
 import difflib
-import logging
 import re
 from typing import Dict, List, Optional, Set, Tuple, Any
 from collections import defaultdict
@@ -16,7 +15,7 @@ from pathlib import Path
 
 import pandas as pd
 
-logger = logging.getLogger(__name__)
+from foundation.observability.logger.loggering import review_logger as logger
 
 
 class OutlineCatalogueMatcher:

+ 1 - 23
core/construction_review/component/reviewers/reference_basis_reviewer/punctuation_result_processor.py

@@ -9,10 +9,6 @@ import re
 from typing import Dict, List, Literal, Optional
 
 from pydantic import BaseModel, Field, ValidationError
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser
-from langchain_openai import ChatOpenAI
-
 # 多模型投票已移除,格式建议直接返回基础建议
 
 
@@ -107,25 +103,7 @@ HUMAN = """
 /no_think
 """
 
-# ===== 3) Output Parser =====
-parser = PydanticOutputParser(pydantic_object=PunctuationIssueResults)
-
-# ===== 4) Prompt =====
-prompt = ChatPromptTemplate.from_messages([
-    ("system", SYSTEM),
-    ("human", HUMAN)
-])
-
-# ===== 5) LLM =====
-llm = ChatOpenAI(
-    model="qwen3-30b",
-    base_url="http://192.168.91.253:8003/v1",
-    api_key="sk-123456",
-    temperature=0,
-)
-
-
-# ===== 6) 提取第一个 JSON =====
+# ===== 3) 提取第一个 JSON =====
 def extract_first_json(text: str) -> dict:
     """从任意模型输出中提取第一个完整 JSON 对象 { ... }"""
     start = text.find("{")

+ 2 - 2
core/construction_review/component/reviewers/timeliness_reviewer.py

@@ -3,7 +3,7 @@
 """
 统一时效性审查模块
 
-整合原 standard_timeliness_reviewer、timeliness_basis_reviewer、timeliness_content_reviewer 的功能,
+整合原 standard_timeliness_reviewer、timeliness_basis_reviewer 的功能,
 提供统一的时效性审查入口。
 
 主要组件:
@@ -447,7 +447,7 @@ class StandardTimelinessReviewer:
                 trace_id=f"timeliness_mismatch_{self.callback_task_id or 'default'}_{result.seq_no}",
                 system_prompt=system_prompt,
                 user_prompt=user_prompt,
-                model_name="shutian_qwen3_5_122b",
+                function_name="timeliness_review",
                 enable_thinking=False
             )
             payload = self._extract_first_json(raw)

+ 1 - 5
core/construction_review/workflows/ai_review_workflow.py

@@ -346,14 +346,11 @@ class AIReviewWorkflow:
             else:
                 outline_content_str = self._build_outline_text(original_outline)
 
-            # 筛选与合并章节内容
+            # 筛选章节内容(完整性审查与其他审查共用同一chunk,无需合并标记)
             filtered_chunks = [
                 chunk for chunk in original_chunks
                 if chunk.get("chapter_classification") in review_item_dict_sorted.keys()
             ]
-            filtered_chunks = self.core_fun._merge_chunks_for_completeness_check(
-                filtered_chunks, review_item_dict_sorted
-            )
             cache.filtered_chunks(filtered_chunks, base_cache_dir=CacheBaseDir.CONSTRUCTION_REVIEW)
 
             # 计算总任务数
@@ -377,7 +374,6 @@ class AIReviewWorkflow:
                 "chapter": "目录",
                 "title": "目录",
                 "original_content": outline_content_str,
-                "is_complete_field": True
             }]
 
             # ===== Phase 3: 分章节执行审查 =====

+ 11 - 120
core/construction_review/workflows/core_functions/ai_review_core_fun.py

@@ -262,15 +262,14 @@ class AIReviewCoreFun:
         rag_enhanced_content = None  # 初始化变量,避免作用域错误
         basis_content = None  # 初始化变量,避免作用域错误
         rows_df = None
-        is_complete_field = chunk.get('is_complete_field', False)
-        logger.info(f"检查is_complete_field值是否正常: {is_complete_field}")
+
         # 只有非完整性审查的chunk才执行RAG检索(注意括号位置,确保运算符优先级正确)
-        if ('check_parameter_compliance' in func_names or 'check_non_parameter_compliance' in func_names) and not is_complete_field:
+        if 'check_parameter_compliance' in func_names or 'check_non_parameter_compliance' in func_names:
             logger.debug("开始执行RAG检索增强")
             rag_enhanced_content = self.ai_review_engine.rag_enhanced_check(chunk.get('content', ''))
 
         if ('reference_basis_reviewer' in func_names or 'timeliness_reviewer' in func_names or
-            'timeliness_basis_reviewer' in func_names or 'timeliness_content_reviewer' in func_names) and not is_complete_field:
+            'timeliness_basis_reviewer' in func_names):
             logger.debug("开始执行编制依据/时效性预处理")
             # 预处理编制依据/时效性审查所需内容
             basis_content = await directory_extraction.extract_basis(
@@ -376,11 +375,10 @@ class AIReviewCoreFun:
 
         # 获取块内容
         review_content = chunk.get("content", "")
-        is_complete_field = chunk.get("is_complete_field", False)
         logger.debug(f"执行审查: {trace_id} -> {func_name}")
 
         # 根据func_name构建对应的参数并调用
-        if func_name == "grammar_check" and not is_complete_field:
+        if func_name == "grammar_check":
             raw_result = await method(trace_id, review_content, state, stage_name)
             # 基础审查方法,放入 basic_compliance
             return UnitReviewResult(
@@ -393,7 +391,7 @@ class AIReviewCoreFun:
                 is_sse_push=True
             )
 
-        elif func_name == "check_semantic_logic" and not is_complete_field:
+        elif func_name == "check_semantic_logic":
             raw_result = await method(trace_id, review_content, state, stage_name)
             # 基础审查方法,放入 basic_compliance
             return UnitReviewResult(
@@ -406,7 +404,7 @@ class AIReviewCoreFun:
                 is_sse_push=True
             )
 
-        elif func_name == "check_sensitive" and not is_complete_field:
+        elif func_name == "check_sensitive":
             raw_result = await method(trace_id, review_content, state, stage_name)
             # 基础审查方法,放入 basic_compliance
             return UnitReviewResult(
@@ -419,7 +417,7 @@ class AIReviewCoreFun:
                 is_sse_push=True
             )
 
-        elif func_name == "check_completeness" and is_complete_field:
+        elif func_name == "check_completeness":
             # check_completeness 需要列表类型,将单个 chunk 包装成列表
             completeness_result, trace_id_idx = await method(trace_id, [chunk], state, stage_name)
 
@@ -470,7 +468,7 @@ class AIReviewCoreFun:
                 overall_risk=self._calculate_single_result_risk(outline_result),
                 is_sse_push=True
             )
-        elif func_name == "check_non_parameter_compliance" and not is_complete_field:
+        elif func_name == "check_non_parameter_compliance":
             # 技术审查方法需要从 RAG 检索结果中获取 references
             raw_result = await self._execute_technical_review(
                 method, trace_id, review_content, chunk, state, stage_name, rag_enhanced_content, func_name
@@ -486,7 +484,7 @@ class AIReviewCoreFun:
                 is_sse_push=True
             )
 
-        elif func_name == "check_parameter_compliance" and not is_complete_field:
+        elif func_name == "check_parameter_compliance":
             # 技术审查方法需要从 RAG 检索结果中获取 references
             raw_result = await self._execute_technical_review(
                 method, trace_id, review_content, chunk, state, stage_name, rag_enhanced_content, func_name
@@ -505,7 +503,7 @@ class AIReviewCoreFun:
 
 
         # reference_basis_reviewer:规范性审查(逐块处理,支持basis和其他章节)
-        elif func_name == "reference_basis_reviewer" and not is_complete_field:
+        elif func_name == "reference_basis_reviewer":
             review_data = {
                 "content": review_content,  # 原始文本内容
                 "basis_items": basis_content,  # 提取的 BasisItems 对象(basis章节使用)
@@ -530,7 +528,7 @@ class AIReviewCoreFun:
             )
 
         # timeliness_reviewer:统一的时效性审查入口(支持basis和content两种来源)
-        elif func_name in ("timeliness_basis_reviewer", "timeliness_content_reviewer", "timeliness_reviewer") and not is_complete_field:
+        elif func_name in ("timeliness_basis_reviewer", "timeliness_reviewer"):
             review_data = {
                 "content": review_content,  # 原始文本内容
                 "basis_items": basis_content,  # 提取的 BasisItems 对象(可能为None)
@@ -553,21 +551,7 @@ class AIReviewCoreFun:
             )
 
         else:
-            # 处理 check_completeness 但 is_complete_field=False 的情况
-            if func_name == "check_completeness" and not is_complete_field:
-                logger.debug(f"跳过 {func_name},当前 chunk 不是完整性审查类型")
-                return UnitReviewResult(
-                    unit_index=chunk_index,
-                    unit_content=chunk,
-                    basic_compliance={},
-                    technical_compliance={},
-                    rag_enhanced={},
-                    overall_risk="low",
-                    is_sse_push=False  # 不推送,因为跳过了
-                )
-
             logger.warning(f"未知的审查方法: {func_name}")
-            logger.warning(f"is_complete_field: {is_complete_field}")
             return UnitReviewResult(
                 unit_index=chunk_index,
                 unit_content=chunk,
@@ -1221,96 +1205,3 @@ class AIReviewCoreFun:
             review_item_dict_sorted[key] = review_item_dict[key]
         return review_item_dict_sorted
 
-    def _merge_chunks_for_completeness_check(
-        self,
-        chunks: List[Dict[str, Any]],
-        review_item_dict: Dict[str, List[str]]
-    ) -> List[Dict[str, Any]]:
-        """
-        筛选包含完整性审查的分类,标记该章节的第一个chunk进行完整性审查
-
-        Args:
-            chunks: 筛选后的chunks列表
-            review_item_dict: 审查项字典 {chapter_code: [func_names]}
-
-        Returns:
-            List[Dict[str, Any]]: 标记后的chunks列表,并按页码排序
-
-        Note:
-            标记规则:
-            1. 找出包含 'check_completeness' 的章节分类
-            2. 给所有chunk添加 is_complete_field=False
-            3. 对每个需要完整性审查的章节,标记第一个chunk的 is_complete_field=True
-            4. 按页码排序
-            5. 完整性审查时,check_completeness方法会从state获取该章节的所有原始chunks
-        """
-        try:
-            # 1. 找出包含完整性审查的章节分类
-            completeness_chapters = set()
-            for chapter_code, func_names in review_item_dict.items():
-                if 'check_completeness' in func_names or 'outline_check' in func_names:
-                    completeness_chapters.add(chapter_code)
-
-            if not completeness_chapters:
-                logger.info("没有包含完整性审查的章节,无需合并")
-                return chunks
-
-            logger.info(f"包含完整性审查的章节分类: {completeness_chapters}")
-
-            # 2. 筛选出需要合并的chunks(属于完整性审查章节的)
-            chunks_to_merge = []
-            for chunk in chunks:
-                chapter_code = chunk.get("chapter_classification", "")
-                if chapter_code in completeness_chapters:
-                    chunks_to_merge.append(chunk)
-
-            if not chunks_to_merge:
-                logger.info("没有找到需要合并的chunks")
-                return chunks
-
-            # 3. 按章节分组(章节定义:去除->及其之后的内容)
-            chapter_groups = {}
-            for chunk in chunks_to_merge:
-                chapter_full = chunk.get("chapter", chunk.get("section_label", ""))
-                # 提取章节名:去除->及其之后的内容
-                chapter_name = chapter_full.split("->")[0].strip() if "->" in chapter_full else chapter_full
-
-                if chapter_name not in chapter_groups:
-                    chapter_groups[chapter_name] = []
-                chapter_groups[chapter_name].append(chunk)
-
-            logger.info(f"按章节分组完成,共 {len(chapter_groups)} 个章节需要合并")
-
-            # 4. 标记完整性审查章节的第一个chunk
-            # 给所有原chunk添加 is_complete_field: False
-            result_chunks = []
-            for chunk in chunks:
-                chunk_copy = chunk.copy()
-                chunk_copy["is_complete_field"] = False
-                result_chunks.append(chunk_copy)
-
-            for chapter_name, chapter_chunk_list in chapter_groups.items():
-                # 按page升序排列
-                chapter_chunk_list.sort(key=lambda x: int(x.get("page", 0)) if str(x.get("page", 0)).isdigit() else x.get("page", 0))
-
-                # 找到该章节在result_chunks中的第一个chunk并标记
-                first_chunk_id = chapter_chunk_list[0].get('chunk_id')
-                for rc in result_chunks:
-                    if rc.get('chunk_id') == first_chunk_id:
-                        rc["is_complete_field"] = True
-                        logger.info(f"[完整性审查] 章节 '{chapter_name}' 标记第一个chunk (chunk_id={first_chunk_id}) 用于完整性审查")
-                        break
-
-            # 5. 按页码排序
-            result_chunks.sort(
-                key=lambda x: int(x.get("page", 0)) if str(x.get("page", 0)).isdigit() else x.get("page", 0)
-            )
-
-            logger.info(f"完整性审查标记完成: 共 {len(result_chunks)} 个chunk,章节数: {len(chapter_groups)}")
-
-            return result_chunks
-
-        except Exception as e:
-            logger.error(f"合并chunks失败: {str(e)}", exc_info=True)
-            # 出错时返回原始列表
-            return chunks