Просмотр исходного кода

Merge branch 'dev' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev_sgsc_wxm

WangXuMing 1 неделя назад
Родитель
Коммит
9ab7fa134d

+ 6 - 6
config/config.ini

@@ -70,7 +70,7 @@ ENGINE=glm-ocr
 # GLM-OCR 配置
 GLM_OCR_API_URL=http://183.220.37.46:25429/v1/chat/completions
 GLM_OCR_TIMEOUT=600
-GLM_OCR_API_KEY=2026_Unified_Secure_Key
+GLM_OCR_API_KEY=sk_prod_sXgHYxfVvZdw7O-cki6i7Cp2TbguOvbA_f4beb12a
 
 # MinerU 配置  
 MINERU_API_URL=http://183.220.37.46:25428/file_parse
@@ -167,7 +167,7 @@ PGVECTOR_PASSWORD=pg16@123
 # Qwen3.5-122B-A10B 模型(端口25423)
 SHUTIAN_122B_SERVER_URL=http://183.220.37.46:25423/v1
 SHUTIAN_122B_MODEL_ID=/model/Qwen3.5-122B-A10B
-SHUTIAN_122B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_122B_API_KEY=sk-prod_ojkjwcO4TTd9TL3vK6uo8a2Dvcdoz64u_9a89845f
 
 # Qwen3-8B 模型(端口25424)
 SHUTIAN_8B_SERVER_URL=http://183.220.37.46:25424/v1
@@ -177,22 +177,22 @@ SHUTIAN_8B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
 # Qwen3.6-27B 模型(端口25424)
 SHUTIAN_27B_SERVER_URL=http://183.220.37.46:25424/v1
 SHUTIAN_27B_MODEL_ID=/model/Qwen3.6-27B
-SHUTIAN_27B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_27B_API_KEY=sk_prod_HH21x5WB9Pm7IM9Bf808BoJPEn_4bPX5_f2c5f3f6
 
 # Qwen3.5-35B 模型(端口25427)
 SHUTIAN_35B_SERVER_URL=http://183.220.37.46:25427/v1
 SHUTIAN_35B_MODEL_ID=/model/Qwen3.5-35B
-SHUTIAN_35B_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_35B_API_KEY=sk_prod_0NuLZt1a2UrD80F9iB-GTxOIuAkJSZxH_5522d7ae
 
 # Qwen3-Embedding-8B 嵌入模型(端口25425)
 SHUTIAN_EMBED_SERVER_URL=http://183.220.37.46:25425/v1
 SHUTIAN_EMBED_MODEL_ID=/model/Qwen3-Embedding-8B
-SHUTIAN_EMBED_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_EMBED_API_KEY=sk_prod_3HDoVka8mU8Jqj9Xnmfkn8bxk5kmzKrz_700c186f
 
 # Qwen3-Reranker-8B 重排序模型(端口25426)
 SHUTIAN_RERANK_SERVER_URL=http://183.220.37.46:25426/v1/rerank
 SHUTIAN_RERANK_MODEL_ID=/model/Qwen3-Reranker-8B
-SHUTIAN_RERANK_API_KEY=sk_prod_SELVoIV1d3gku28koH_ONg8L_B2cQis__71f55615
+SHUTIAN_RERANK_API_KEY=sk_prod_dvgYHKWFoQlYAKmkIvBSyuguNSQGeNh0_23c65608
 
 
 [milvus]

+ 2 - 2
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -2151,7 +2151,7 @@ class PdfStructureExtractor:
         # 1. 提取标题少于 5 个字时,必须与标准目录名完全相等。
         # 2. 提取标题超过 15 个字时,直接判定为非标准目录标题。
         # 3. 提取标题 5 到 15 个字时,允许一定 OCR/抽取误差:
-        #    只要提取标题中至少 80% 的字出现在标准目录名中即可,字符顺序不作要求。
+        #    只要提取标题中至少 65% 的字出现在标准目录名中即可,字符顺序不作要求。
         extracted_len = len(extracted)
         if extracted_len < 5:
             return extracted == standard
@@ -2159,7 +2159,7 @@ class PdfStructureExtractor:
             return False
 
         overlap_count = sum((Counter(extracted) & Counter(standard)).values())
-        return (overlap_count / max(extracted_len, 1)) >= 0.8
+        return (overlap_count / max(extracted_len, 1)) >= 0.65
 
     @classmethod
     def _normalize_catalog_name(cls, text: str) -> str:

+ 94 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor_batch_runner.py

@@ -36,6 +36,7 @@ if str(REPO_ROOT) not in sys.path:
 
 SPECIAL_SECTION_KEYS = {"章节标题", "默认部分"}
 STAT_FILE_NAME = "static.text"
+UNMATCHED_STANDARD_CATALOG_FILE_NAME = "unmatched_standard_catalog_titles.md"
 TOC_LINE_PATTERN = re.compile(r"(?:[.\u2026·•…]{2,}|-{3,}).{0,30}\d+\s*$")
 TOC_PAGE_SUFFIX_PATTERN = re.compile(
     r"(?:[.\u2026\u00b7\u2022·•…]{2,}|-{3,})[-\u2013\u2014 ]*(?:-\s*)?\d{1,3}(?:\s*-)?\s*$"
@@ -883,6 +884,85 @@ def append_static_record(
         )
 
 
+def collect_unmatched_standard_catalog_titles(  # Collect de-duplicated level-1/level-2 heading lines of a PDF that fail the extractor's standard-catalog match.
+    pdf_path: Path,  # PDF to open with fitz
+    extractor: Any,  # object providing RULE_LIB and the private helper methods called below
+    extractor_result: Dict[str, Any],  # only the "body_rule" key is read here
+) -> List[Dict[str, Any]]:  # each record has the keys "normalized", "level" and "parent_l1"
+    rule_name = extractor_result.get("body_rule")
+    rule_set = getattr(extractor, "RULE_LIB", {}).get(rule_name)  # an extractor without RULE_LIB behaves like an empty rule library
+    if not rule_name or not rule_set:
+        return []  # no usable heading rule, so nothing can be classified
+
+    records: List[Dict[str, Any]] = []
+    seen: set[Tuple[str, int]] = set()  # (normalized title, level) pairs already recorded
+
+    try:
+        with fitz.open(pdf_path) as doc:
+            body_lines, _ = extractor._extract_body_lines(doc)  # second element of the returned pair is unused here
+    except Exception:  # best effort: any failure while opening or extracting yields an empty result and is not reported
+        return []
+
+    current_l1_title = ""  # normalized text of the most recently accepted level-1 heading
+    for item in body_lines:
+        original = str(getattr(item, "text", "") or "").strip()  # a missing or None text attribute is treated as ""
+        if not original or original.isdigit():  # skip blank lines and lines made only of digits
+            continue
+
+        line = extractor._strip_leading_page_number_from_heading(original)
+        if not line:
+            continue
+
+        source = "目录候选" if extractor._looks_like_toc_candidate(line) else "正文标题"  # label: TOC candidate vs. body heading
+        candidates: List[int] = []  # heading levels (1 and/or 2) at which this line is unmatched
+
+        if rule_set["l1"].match(line):  # presumably a compiled regex for level-1 headings; TODO confirm
+            if source == "目录候选" or extractor._is_valid_heading_strict(line, is_l1=True):  # TOC candidates bypass the strict heading check
+                current_l1_title = extractor._normalize_catalog_name(line)  # updated whether or not the chapter matches the standard catalog
+                if not extractor._match_standard_catalog_chapter(line):
+                    candidates.append(1)
+
+        if rule_set["l2"].match(line):  # independent of the l1 check, so one line may be recorded at both levels
+            if source == "目录候选" or extractor._is_valid_heading_strict(line, is_l1=False):
+                if not extractor._match_standard_catalog_section(line, None):  # NOTE(review): meaning of the None argument is not visible here; confirm against the extractor
+                    candidates.append(2)
+
+        for level in candidates:
+            normalized = extractor._normalize_catalog_name(line)
+            key = (normalized, level)
+            if not normalized or key in seen:  # drop empty titles and repeats of the same title at the same level
+                continue
+            seen.add(key)
+            records.append({
+                "normalized": normalized,
+                "level": "一级" if level == 1 else "二级",  # level-1 / level-2 label
+                "parent_l1": normalized if level == 1 else current_l1_title,  # a level-1 entry is its own parent; may be "" when no level-1 heading was seen yet
+            })
+
+    return records
+
+
+def append_unmatched_standard_catalog_titles(  # Append the given records as rows of a Markdown table stored at md_path.
+    md_path: Path,  # target file; its parent directory is created when missing
+    records: List[Dict[str, Any]],  # dicts read via the keys "parent_l1", "normalized" and "level"
+) -> None:
+    if not records:  # nothing to write, and the file is not created either
+        return
+
+    md_path.parent.mkdir(parents=True, exist_ok=True)
+    needs_header = not md_path.exists() or md_path.stat().st_size == 0  # write the table header only into a new or empty file
+    with md_path.open("a", encoding="utf-8", newline="") as file:  # newline="" disables newline translation, so "\n" is written as-is
+        if needs_header:
+            file.write("| 一级标题 | 标题名称 | 级别 |\n")  # columns: level-1 title, title, level
+            file.write("|---|---|---|\n")
+        for record in records:
+            parent_l1 = str(record.get("parent_l1", "") or "").strip()  # missing or falsy values become ""
+            title = str(record.get("normalized", "") or "").strip()
+            level = str(record.get("level", "") or "").strip()
+            if title:  # records with an empty title are skipped
+                file.write(f"| {parent_l1} | {title} | {level} |\n")  # NOTE(review): values are not escaped; a "|" inside a title would add a table column
+
+
 def sanitize_filename_component(value: str) -> str:
     sanitized = value.strip()
     for char in '<>:"/\\|?*':
@@ -1017,6 +1097,15 @@ def process_pdf(
         catalog_quality_rate_text=catalog_quality_rate_text,
         content_quality_rate_text=content_quality_rate_text,
     )
+    unmatched_records = collect_unmatched_standard_catalog_titles(
+        pdf_path=pdf_path,
+        extractor=extractor,
+        extractor_result=extractor_result,
+    )
+    append_unmatched_standard_catalog_titles(
+        md_path=output_path.parent / UNMATCHED_STANDARD_CATALOG_FILE_NAME,
+        records=unmatched_records,
+    )
     return output_path, quality_rate_text
 
 
@@ -1052,6 +1141,11 @@ def main() -> int:
     print(f"Extractor: {args.extractor}")
     print("=" * 80)
 
+    if output_dir is not None:
+        unmatched_path = output_dir / UNMATCHED_STANDARD_CATALOG_FILE_NAME
+        if unmatched_path.exists():
+            unmatched_path.unlink()
+
     success_count = 0
     for index, pdf_path in enumerate(pdf_files, 1):
         print(f"[{index}/{len(pdf_files)}] Processing: {pdf_path.name}")

+ 2 - 2
core/construction_write/component/outline_generator.py

@@ -1111,7 +1111,7 @@ class OutlineGenerator:
 只能输出参考模板中已经存在的内容,并在原句范围内做轻微润色。
 
 【硬性禁止】
-1. 禁止新增任何标题、编号、段落、条目、表格、图片说明或结尾语。
+1. 禁止新增任何标题、编号、段落、条目、表格、图片说明、文字删除线或结尾语。
 2. 禁止扩写施工方法、技术措施、工程概况、注意事项等模板中没有的事实内容。
 3. 禁止为了“完整”“专业”“丰富”而补充示例、解释、背景或过渡句。
 4. 禁止输出“以下是”“根据模板”“已润色”等说明性文字。
@@ -1120,7 +1120,7 @@ class OutlineGenerator:
 7. 禁止输出完整电话号码,电话必须打码处理。
 
 【允许修改】
-1. 仅可对已有句子做轻微语病修正、标点修正、措辞顺滑和专业化表达。
+1. 仅可对已有句子做轻微语病修正、标点修正、措辞顺滑、文字删除线去除和专业化表达。
 2. 可将模板中的“xx”等占位符替换为项目信息中明确给出的内容;没有明确值时保留原文。
 3. 人名占位符不得替换为真实姓名,必须保留或改为“XX”;电话号码必须保留打码形式,不得输出完整号码。
 4. 表格、编号、标题层级、段落数量、段落顺序必须与参考模板一致。

+ 19 - 2
utils_test/Model_Test/test_model_stress.py

@@ -16,6 +16,9 @@
      
     # python utils_test/Model_Test/test_model_stress.py --concurrency 150 --count 150 --model shutian_qwen3_6_27b --context-size 8k
 
+    # 避免服务端 KV 缓存命中(注入随机值)
+    python utils_test/Model_Test/test_model_stress.py --concurrency 10 --count 50 --bust-cache
+
     # 自定义参数
     python utils_test/Model_Test/test_model_stress.py --concurrency 20 --count 100 --model shutian_qwen3_5_122b
 
@@ -41,6 +44,7 @@ import asyncio
 import argparse
 import time
 import statistics
+import uuid
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
@@ -191,11 +195,13 @@ def _extract_token_usage(response) -> Tuple[int, int]:
 
 async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
                            function_name: Optional[str] = None,
-                           context_size: int = 0) -> RequestResult:
+                           context_size: int = 0,
+                           bust_cache: bool = False) -> RequestResult:
     """执行单次 LLM 调用并记录延迟和 token 用量
 
     Args:
         context_size: 上下文 token 数,>0 时在 user_prompt 前拼接填充文本
+        bust_cache: 在 user_prompt 开头插入随机值,避免 KV 缓存命中
     """
     from foundation.ai.models.model_handler import model_handler
     from foundation.ai.models.model_config_loader import get_model_for_function, get_thinking_mode_for_function
@@ -222,6 +228,10 @@ async def _run_llm_request(trace_id: str, model_name: Optional[str] = None,
         padding = _generate_context_text(context_size)
         user_prompt = f"{padding}\n\n---\n\n{TEST_USER_PROMPT}"
 
+    if bust_cache:
+        rand = uuid.uuid4().hex[:12]
+        user_prompt = f"[noise:{rand}]\n{user_prompt}"
+
     messages = [SystemMessage(content=TEST_SYSTEM_PROMPT), HumanMessage(content=user_prompt)]
 
     start = time.perf_counter()
@@ -274,6 +284,7 @@ async def run_stress_test(
     total_count: int,
     function_name: Optional[str] = None,
     context_size: int = 0,
+    bust_cache: bool = False,
 ) -> StressTestResult:
     """执行压力测试
 
@@ -304,7 +315,7 @@ async def run_stress_test(
             if model_type == "embedding":
                 return await _run_embedding_request(trace_id, model_name)
             else:
-                return await _run_llm_request(trace_id, model_name, function_name, context_size)
+                return await _run_llm_request(trace_id, model_name, function_name, context_size, bust_cache)
 
     ctx_label = f" | 上下文: {context_size//1024}k tokens" if context_size > 0 else ""
     print(f"\n{'='*60}")
@@ -502,6 +513,10 @@ def parse_args():
         "--all-embeddings", action="store_true",
         help="逐个测试所有 Embedding 模型",
     )
+    parser.add_argument(
+        "--bust-cache", action="store_true",
+        help="在每次请求的 prompt 末尾注入随机值,避免服务端 KV 缓存命中",
+    )
     return parser.parse_args()
 
 
@@ -539,6 +554,7 @@ async def _run_single_model_test(args, model_name: str, function_name: Optional[
                 total_count=args.count,
                 function_name=function_name,
                 context_size=ctx_size,
+                bust_cache=args.bust_cache,
             )
             summary = print_report(result)
             summary["context_display"] = ctx_display
@@ -571,6 +587,7 @@ async def main():
                     model_type=model_type,
                     concurrency=args.concurrency,
                     total_count=args.count,
+                    bust_cache=args.bust_cache,
                 )
                 summary = print_report(result)
                 results_summary.append(summary)