18 stundas atpakaļ · ba0a68f8ed
--- a/core/construction_review/component/minimal_pipeline/chunk_assembler.py
+++ b/core/construction_review/component/minimal_pipeline/chunk_assembler.py
@@ -8,6 +8,7 @@ import re
 
				 from typing import Dict, Any, List
			
 
				 
			
 
				 from foundation.observability.logger.loggering import review_logger as logger
			
 
				+from core.construction_review.component.reviewers.utils.text_preprocessor import preprocess_review_text
			
 
				 
			
 
				 
			
 
				 def assemble_chunks(
			
@@ -71,7 +72,7 @@ def assemble_chunks(
 
				         title_number = _extract_chapter_number(chapter_title)
			
 
				 
			
 
				         for section_title, section_data in sections.items():
			
 
				-            content = section_data.get("content", "")
			
 
				+            content = preprocess_review_text(section_data.get("content", ""))
			
 
				             if not content.strip():
			
 
				                 continue
			
 
				 
			
--- a/core/construction_review/component/reviewers/utils/text_preprocessor.py
+++ b/core/construction_review/component/reviewers/utils/text_preprocessor.py
@@ -0,0 +1,95 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- coding: utf-8 -*-
			
 
				+
			
 
				+"""
			
 
				+@Project   : lq-agent-api
			
 
				+@File      : text_preprocessor.py
			
 
				+@IDE       : Cursor
			
 
				+@Author    : AI Assistant
			
 
				+@Date      : 2026-05-27
			
 
				+@Description: 审查文本预处理 — 合并 PDF 物理折行，消除排版换行导致的审查误报
			
 
				+"""
			
 
				+
			
 
				+import re
			
 
				+
			
 
				+
			
 
				# Sentence-final punctuation: a line break that follows one of these
				# characters is a sentence/paragraph boundary and is kept.
				_SENTENCE_END_RE = re.compile(r'[。！？]$')

				# A line that is empty or contains only whitespace.
				_BLANK_RE = re.compile(r'^\s*$')

				# Structural numbering: a line that starts with any of these patterns keeps
				# its own line break (chapter / clause / figure / table headings).
				_STRUCTURAL_START_PATTERNS = [
				    re.compile(source)
				    for source in (
				        r'^\d+\.\d',                              # dotted numbers: 4.9.2., 3.1.2
				        r'^第[一二三四五六七八九十百千]+[、章节条]',    # 第一章, 第一节
				        r'^[一二三四五六七八九十]+[、.]',              # 一、 二、 三.
				        r'^[（(][一二三四五六七八九十\d]+[)）]',       # （1）, (2), （一）
				        r'^[（(][a-zA-Z][)）]',                     # （a）, (b)
				        r'^[①②③④⑤⑥⑦⑧⑨⑩]',                       # circled digits
				        r'^图\s*\d',                                # figure caption: 图4.9.2-1
				        r'^表\s*\d',                                # table caption: 表3-1
				    )
				]
			
 
				+
			
 
				+
			
 
				def _is_structural_line(line: str) -> bool:
				    """Return True when *line* begins with a structural numbering marker.

				    Surrounding whitespace is ignored. An empty or whitespace-only line is
				    never structural. Otherwise the stripped text is matched against each
				    entry of ``_STRUCTURAL_START_PATTERNS`` (chapter headings, numbered
				    clauses, figure/table captions) and the first hit wins.
				    """
				    candidate = line.strip()
				    if candidate:
				        for pattern in _STRUCTURAL_START_PATTERNS:
				            # Patterns are anchored with ^, so match() at position 0 suffices.
				            if pattern.match(candidate):
				                return True
				    return False
			
 
				+
			
 
				+
			
 
				def preprocess_review_text(text: str) -> str:
				    """Merge physical PDF line wraps inside paragraphs.

				    Text extracted from a PDF is hard-wrapped at the page width, so a single
				    word can end up split across two lines. Those breaks would otherwise be
				    passed straight into the review prompt and be mistaken for typos.

				    A line break is kept when any of the following holds:

				    - the current line is blank (paragraph separator);
				    - the previous line is blank, so the paragraph separator is not
				      swallowed by the text that follows it;
				    - the previous line, ignoring trailing whitespace, ends with
				      sentence-final punctuation (one of 。！？);
				    - the current line starts with structural numbering (chapter heading,
				      numbered clause, figure/table caption).

				    Every other break is treated as a physical wrap: the newline is removed
				    and the two lines are concatenated directly, with no separator inserted.

				    Additional normalisation applied before merging:

				    - CRLF and bare CR line endings are converted to LF;
				    - runs of three or more newlines are collapsed to two.

				    Args:
				        text: Raw section text. Falsy input (empty string, None) is
				            returned unchanged.

				    Returns:
				        The text with intra-paragraph line wraps removed.
				    """
				    if not text:
				        return text

				    # Normalise line endings so the split below sees only '\n'.
				    text = text.replace('\r\n', '\n').replace('\r', '\n')

				    # Collapse runs of blank lines: 3+ newlines become exactly 2.
				    text = re.sub(r'\n{3,}', '\n\n', text)

				    lines = text.split('\n')
				    if len(lines) <= 1:
				        return text

				    result = [lines[0]]
				    for line in lines[1:]:
				        prev = result[-1]

				        keep_break = (
				            # Current line is blank: paragraph separator.
				            _BLANK_RE.match(line)
				            # Previous line is blank: merging into it would erase the
				            # separator, silently turning a blank line into one newline.
				            or _BLANK_RE.match(prev)
				            # Previous line ends a sentence: real sentence boundary.
				            or _SENTENCE_END_RE.search(prev.rstrip())
				            # Current line is a heading / numbered clause / caption.
				            or _is_structural_line(line)
				        )

				        if keep_break:
				            result.append(line)
				        else:
				            # Physical PDF wrap: glue the continuation onto the previous line.
				            result[-1] = prev + line

				    return '\n'.join(result)