فهرست منبع

Merge branch 'dev' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev_sgsc_wxm

WangXuMing 1 هفته پیش
والد
کامیت
9ab7fa134d

+ 6 - 6
config/config.ini

@@ -70,7 +70,7 @@ ENGINE=glm-ocr
 # GLM-OCR 配置
 # GLM-OCR 配置
 GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
 GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
 GLM_OCR_TIMEOUT=600
 GLM_OCR_TIMEOUT=600
-GLM_OCR_API_KEY=2026_Unified_Secure_Key
+GLM_OCR_API_KEY=sk_prod_sXgHYxfVvZdw7O-cki6i7Cp2TbguOvbA_f4beb12a
 
 
 # MinerU 配置  
 # MinerU 配置  
 MINERU_API_URL=http://183.220.37.46:25428/file_parse
 MINERU_API_URL=http://183.220.37.46:25428/file_parse
@@ -167,7 +167,7 @@ PGVECTOR_PASSWORD=pg16@123
 # Qwen3.5-122B-A10B 模型(端口25423)
 # Qwen3.5-122B-A10B 模型(端口25423)
 SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
 SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
 SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
 SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
-SHUTIAN_122B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_122B_API_KEY=sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
 
 
 # Qwen3-8B 模型(端口25424)
 # Qwen3-8B 模型(端口25424)
 SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
 SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
@@ -177,22 +177,22 @@ SHUTIAN_8B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
 # Qwen3.6-27B 模型(端口25424)
 # Qwen3.6-27B 模型(端口25424)
 SHUTIAN_27B_SERVER_URL=http://183.220.37.46:25424/v1
 SHUTIAN_27B_SERVER_URL=http://183.220.37.46:25424/v1
 SHUTIAN_27B_MODEL_ID=/model/Qwen3.6-27B
 SHUTIAN_27B_MODEL_ID=/model/Qwen3.6-27B
-SHUTIAN_27B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_27B_API_KEY=sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
 
 
 # Qwen3.5-35B 模型(端口25427)
 # Qwen3.5-35B 模型(端口25427)
 SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
 SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
 SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
 SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
-SHUTIAN_35B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_35B_API_KEY=sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
 
 
 # Qwen3-Embedding-8B 嵌入模型(端口25425)
 # Qwen3-Embedding-8B 嵌入模型(端口25425)
 SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
 SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
 SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
 SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
-SHUTIAN_EMBED_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_EMBED_API_KEY=sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
 
 
 # Qwen3-Reranker-8B 重排序模型(端口25426)
 # Qwen3-Reranker-8B 重排序模型(端口25426)
 SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
 SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
 SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
 SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
-SHUTIAN_RERANK_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_RERANK_API_KEY=sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
 
 
 
 
 [milvus]
 [milvus]

+ 2 - 2
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -2151,7 +2151,7 @@ class PdfStructureExtractor:
         # 1. 提取标题少于 5 个字时,必须与标准目录名完全相等。
         # 1. 提取标题少于 5 个字时,必须与标准目录名完全相等。
         # 2. 提取标题超过 15 个字时,直接判定为非标准目录标题。
         # 2. 提取标题超过 15 个字时,直接判定为非标准目录标题。
         # 3. 提取标题 5 到 15 个字时,允许一定 OCR/抽取误差:
         # 3. 提取标题 5 到 15 个字时,允许一定 OCR/抽取误差:
-        #    只要提取标题中至少 80% 的字出现在标准目录名中即可,字符顺序不作要求。
+        #    只要提取标题中至少 65% 的字出现在标准目录名中即可,字符顺序不作要求。
         extracted_len = len(extracted)
         extracted_len = len(extracted)
         if extracted_len < 5:
         if extracted_len < 5:
             return extracted == standard
             return extracted == standard
@@ -2159,7 +2159,7 @@ class PdfStructureExtractor:
             return False
             return False
 
 
         overlap_count = sum((Counter(extracted) & Counter(standard)).values())
         overlap_count = sum((Counter(extracted) & Counter(standard)).values())
-        return (overlap_count / max(extracted_len, 1)) >= 0.8
+        return (overlap_count / max(extracted_len, 1)) >= 0.65
 
 
     @classmethod
     @classmethod
     def _normalize_catalog_name(cls, text: str) -> str:
     def _normalize_catalog_name(cls, text: str) -> str:

+ 94 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -36,6 +36,7 @@ if str(REPO_ROOT) not in sys.path:
 
 
 SPECIAL_SECTION_KEYS = {"章节标题", "默认部分"}
 SPECIAL_SECTION_KEYS = {"章节标题", "默认部分"}
 STAT_FILE_NAME = "static.text"
 STAT_FILE_NAME = "static.text"
+UNMATCHED_STANDARD_CATALOG_FILE_NAME = "unmatched_standard_catalog_titles.md"
 TOC_LINE_PATTERN = re.compile(r"(?:[.\u2026·•…]{2,}|-{3,}).{0,30}\d+\s*$")
 TOC_LINE_PATTERN = re.compile(r"(?:[.\u2026·•…]{2,}|-{3,}).{0,30}\d+\s*$")
 TOC_PAGE_SUFFIX_PATTERN = re.compile(
 TOC_PAGE_SUFFIX_PATTERN = re.compile(
     r"(?:[.\u2026\u00b7\u2022·•…]{2,}|-{3,})[-\u2013\u2014 ]*(?:-\s*)?\d{1,3}(?:\s*-)?\s*$"
     r"(?:[.\u2026\u00b7\u2022·•…]{2,}|-{3,})[-\u2013\u2014 ]*(?:-\s*)?\d{1,3}(?:\s*-)?\s*$"
@@ -883,6 +884,85 @@ def append_static_record(
         )
         )
 
 
 
 
+def collect_unmatched_standard_catalog_titles(
+    pdf_path: Path,
+    extractor: Any,
+    extractor_result: Dict[str, Any],
+) -> List[Dict[str, Any]]:
+    rule_name = extractor_result.get("body_rule")
+    rule_set = getattr(extractor, "RULE_LIB", {}).get(rule_name)
+    if not rule_name or not rule_set:
+        return []
+
+    records: List[Dict[str, Any]] = []
+    seen: set[Tuple[str, int]] = set()
+
+    try:
+        with fitz.open(pdf_path) as doc:
+            body_lines, _ = extractor._extract_body_lines(doc)
+    except Exception:
+        return []
+
+    current_l1_title = ""
+    for item in body_lines:
+        original = str(getattr(item, "text", "") or "").strip()
+        if not original or original.isdigit():
+            continue
+
+        line = extractor._strip_leading_page_number_from_heading(original)
+        if not line:
+            continue
+
+        source = "目录候选" if extractor._looks_like_toc_candidate(line) else "正文标题"
+        candidates: List[int] = []
+
+        if rule_set["l1"].match(line):
+            if source == "目录候选" or extractor._is_valid_heading_strict(line, is_l1=True):
+                current_l1_title = extractor._normalize_catalog_name(line)
+                if not extractor._match_standard_catalog_chapter(line):
+                    candidates.append(1)
+
+        if rule_set["l2"].match(line):
+            if source == "目录候选" or extractor._is_valid_heading_strict(line, is_l1=False):
+                if not extractor._match_standard_catalog_section(line, None):
+                    candidates.append(2)
+
+        for level in candidates:
+            normalized = extractor._normalize_catalog_name(line)
+            key = (normalized, level)
+            if not normalized or key in seen:
+                continue
+            seen.add(key)
+            records.append({
+                "normalized": normalized,
+                "level": "一级" if level == 1 else "二级",
+                "parent_l1": normalized if level == 1 else current_l1_title,
+            })
+
+    return records
+
+
+def append_unmatched_standard_catalog_titles(
+    md_path: Path,
+    records: List[Dict[str, Any]],
+) -> None:
+    if not records:
+        return
+
+    md_path.parent.mkdir(parents=True, exist_ok=True)
+    needs_header = not md_path.exists() or md_path.stat().st_size == 0
+    with md_path.open("a", encoding="utf-8", newline="") as file:
+        if needs_header:
+            file.write("| 一级标题 | 标题名称 | 级别 |\n")
+            file.write("|---|---|---|\n")
+        for record in records:
+            parent_l1 = str(record.get("parent_l1", "") or "").strip()
+            title = str(record.get("normalized", "") or "").strip()
+            level = str(record.get("level", "") or "").strip()
+            if title:
+                file.write(f"| {parent_l1} | {title} | {level} |\n")
+
+
 def sanitize_filename_component(value: str) -> str:
 def sanitize_filename_component(value: str) -> str:
     sanitized = value.strip()
     sanitized = value.strip()
     for char in '<>:"/\\|?*':
     for char in '<>:"/\\|?*':
@@ -1017,6 +1097,15 @@ def process_pdf(
         catalog_quality_rate_text=catalog_quality_rate_text,
         catalog_quality_rate_text=catalog_quality_rate_text,
         content_quality_rate_text=content_quality_rate_text,
         content_quality_rate_text=content_quality_rate_text,
     )
     )
+    unmatched_records = collect_unmatched_standard_catalog_titles(
+        pdf_path=pdf_path,
+        extractor=extractor,
+        extractor_result=extractor_result,
+    )
+    append_unmatched_standard_catalog_titles(
+        md_path=output_path.parent / UNMATCHED_STANDARD_CATALOG_FILE_NAME,
+        records=unmatched_records,
+    )
     return output_path, quality_rate_text
     return output_path, quality_rate_text
 
 
 
 
@@ -1052,6 +1141,11 @@ def main() -> int:
     print(f"Extractor: {args.extractor}")
     print(f"Extractor: {args.extractor}")
     print("=" * 80)
     print("=" * 80)
 
 
+    if output_dir is not None:
+        unmatched_path = output_dir / UNMATCHED_STANDARD_CATALOG_FILE_NAME
+        if unmatched_path.exists():
+            unmatched_path.unlink()
+
     success_count = 0
     success_count = 0
     for index, pdf_path in enumerate(pdf_files, 1):
     for index, pdf_path in enumerate(pdf_files, 1):
         print(f"[{index}/{len(pdf_files)}] Processing: {pdf_path.name}")
         print(f"[{index}/{len(pdf_files)}] Processing: {pdf_path.name}")

+ 2 - 2
core/construction_write/component/outline_generator.py

@@ -1111,7 +1111,7 @@ class OutlineGenerator:
 只能输出参考模板中已经存在的内容,并在原句范围内做轻微润色。
 只能输出参考模板中已经存在的内容,并在原句范围内做轻微润色。
 
 
 【硬性禁止】
 【硬性禁止】
-1. 禁止新增任何标题、编号、段落、条目、表格、图片说明或结尾语。
+1. 禁止新增任何标题、编号、段落、条目、表格、图片说明、文字删除线或结尾语。
 2. 禁止扩写施工方法、技术措施、工程概况、注意事项等模板中没有的事实内容。
 2. 禁止扩写施工方法、技术措施、工程概况、注意事项等模板中没有的事实内容。
 3. 禁止为了“完整”“专业”“丰富”而补充示例、解释、背景或过渡句。
 3. 禁止为了“完整”“专业”“丰富”而补充示例、解释、背景或过渡句。
 4. 禁止输出“以下是”“根据模板”“已润色”等说明性文字。
 4. 禁止输出“以下是”“根据模板”“已润色”等说明性文字。
@@ -1120,7 +1120,7 @@ class OutlineGenerator:
 7. 禁止输出完整电话号码,电话必须打码处理。
 7. 禁止输出完整电话号码,电话必须打码处理。
 
 
 【允许修改】
 【允许修改】
-1. 仅可对已有句子做轻微语病修正、标点修正、措辞顺滑和专业化表达。
+1. 仅可对已有句子做轻微语病修正、标点修正、措辞顺滑、文字删除线去除和专业化表达。
 2. 可将模板中的“xx”等占位符替换为项目信息中明确给出的内容;没有明确值时保留原文。
 2. 可将模板中的“xx”等占位符替换为项目信息中明确给出的内容;没有明确值时保留原文。
 3. 人名占位符不得替换为真实姓名,必须保留或改为“XX”;电话号码必须保留打码形式,不得输出完整号码。
 3. 人名占位符不得替换为真实姓名,必须保留或改为“XX”;电话号码必须保留打码形式,不得输出完整号码。
 4. 表格、编号、标题层级、段落数量、段落顺序必须与参考模板一致。
 4. 表格、编号、标题层级、段落数量、段落顺序必须与参考模板一致。

+ 19 - 2
utils_test/Model_Test/test_model_stress.py

@@ -16,6 +16,9 @@
      
      
     # python utils_test/Model_Test/test_model_stress.py --concurrency 150 --count 150 --model shutian_qwen3_6_27b --context-size 8k
     # python utils_test/Model_Test/test_model_stress.py --concurrency 150 --count 150 --model shutian_qwen3_6_27b --context-size 8k
 
 
+    # 避免服务端 KV 缓存命中(注入随机值)
+    python utils_test/Model_Test/test_model_stress.py --concurrency 10 --count 50 --bust-cache
+
     # 自定义参数
     # 自定义参数
     python utils_test/Model_Test/test_model_stress.py --concurrency 20 --count 100 --model shutian_qwen3_5_122b
     python utils_test/Model_Test/test_model_stress.py --concurrency 20 --count 100 --model shutian_qwen3_5_122b
 
 
@@ -41,6 +44,7 @@ import asyncio
 import argparse
 import argparse
 import time
 import time
 import statistics
 import statistics
+import uuid
 from pathlib import Path
 from pathlib import Path
 from dataclasses import dataclass, field
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
 from typing import List, Optional, Tuple
@@ -191,11 +195,13 @@ def _extract_token_usage(response) -> Tuple[int, int]:
 
 
 async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
 async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
                            function_name: Optional[str] = None,
                            function_name: Optional[str] = None,
-                           context_size: int = 0) -> RequestResult:
+                           context_size: int = 0,
+                           bust_cache: bool = False) -> RequestResult:
     """执行单次 LLM 调用并记录延迟和 token 用量
     """执行单次 LLM 调用并记录延迟和 token 用量
 
 
     Args:
     Args:
         context_size: 上下文 token 数,>0 时在 user_prompt 前拼接填充文本
         context_size: 上下文 token 数,>0 时在 user_prompt 前拼接填充文本
+        bust_cache: 为 True 时在 user_prompt 开头插入随机标记(形如 [noise:xxxxxxxxxxxx]),避免 KV 缓存命中
     """
     """
     from foundation.ai.models.model_handler import model_handler
     from foundation.ai.models.model_handler import model_handler
     from foundation.ai.models.model_config_loader import get_model_for_function, get_thinking_mode_for_function
     from foundation.ai.models.model_config_loader import get_model_for_function, get_thinking_mode_for_function
@@ -222,6 +228,10 @@ async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
         padding = _generate_context_text(context_size)
         padding = _generate_context_text(context_size)
         user_prompt = f"{padding}\n\n---\n\n{TEST_USER_PROMPT}"
         user_prompt = f"{padding}\n\n---\n\n{TEST_USER_PROMPT}"
 
 
+    if bust_cache:
+        rand = uuid.uuid4().hex[:12]
+        user_prompt = f"[noise:{rand}]\n{user_prompt}"
+
     messages = [SystemMessage(content=TEST_SYSTEM_PROMPT), HumanMessage(content=user_prompt)]
     messages = [SystemMessage(content=TEST_SYSTEM_PROMPT), HumanMessage(content=user_prompt)]
 
 
     start = time.perf_counter()
     start = time.perf_counter()
@@ -274,6 +284,7 @@ async def run_stress_test(
     total_count: int,
     total_count: int,
     function_name: Optional[str] = None,
     function_name: Optional[str] = None,
     context_size: int = 0,
     context_size: int = 0,
+    bust_cache: bool = False,
 ) -> StressTestResult:
 ) -> StressTestResult:
     """执行压力测试
     """执行压力测试
 
 
@@ -304,7 +315,7 @@ async def run_stress_test(
             if model_type == "embedding":
             if model_type == "embedding":
                 return await _run_embedding_request(trace_id, model_name)
                 return await _run_embedding_request(trace_id, model_name)
             else:
             else:
-                return await _run_llm_request(trace_id, model_name, function_name, context_size)
+                return await _run_llm_request(trace_id, model_name, function_name, context_size, bust_cache)
 
 
     ctx_label = f" | 上下文: {context_size//1024}k tokens" if context_size > 0 else ""
     ctx_label = f" | 上下文: {context_size//1024}k tokens" if context_size > 0 else ""
     print(f"\n{'='*60}")
     print(f"\n{'='*60}")
@@ -502,6 +513,10 @@ def parse_args():
         "--all-embeddings", action="store_true",
         "--all-embeddings", action="store_true",
         help="逐个测试所有 Embedding 模型",
         help="逐个测试所有 Embedding 模型",
     )
     )
+    parser.add_argument(
+        "--bust-cache", action="store_true",
+        help="在每次请求的 prompt 末尾注入随机值,避免服务端 KV 缓存命中",
+    )
     return parser.parse_args()
     return parser.parse_args()
 
 
 
 
@@ -539,6 +554,7 @@ async def _run_single_model_test(args, model_name: str, function_name: Optional[
                 total_count=args.count,
                 total_count=args.count,
                 function_name=function_name,
                 function_name=function_name,
                 context_size=ctx_size,
                 context_size=ctx_size,
+                bust_cache=args.bust_cache,
             )
             )
             summary = print_report(result)
             summary = print_report(result)
             summary["context_display"] = ctx_display
             summary["context_display"] = ctx_display
@@ -571,6 +587,7 @@ async def main():
                     model_type=model_type,
                     model_type=model_type,
                     concurrency=args.concurrency,
                     concurrency=args.concurrency,
                     total_count=args.count,
                     total_count=args.count,
+                    bust_cache=args.bust_cache,
                 )
                 )
                 summary = print_report(result)
                 summary = print_report(result)
                 results_summary.append(summary)
                 results_summary.append(summary)