před 4 týdny · 7237fd9378
--- a/src/app/scripts/first_bfp_collection_entity_import.py
+++ b/src/app/scripts/first_bfp_collection_entity_import.py
@@ -0,0 +1,1198 @@
 
				+"""
			
 
				+编制依据实体抽取与导入脚本（jieba 版）
			
 
				+
			
 
				+功能：
			
 
				+1. 实体抽取：使用 jieba 分词 + 词性标注 + TF-IDF 关键词提取 + 规则补充
			
 
				+2. 关系抽取：基于规则模式匹配
			
 
				+3. 背景信息：废止状态、管理单位、适用范围
			
 
				+
			
 
				+字段结构：
			
 
				+- text: 实体文本（用于 BM25 检索）
			
 
				+- dense: 实体向量
			
 
				+- content: 与 text 内容相同
			
 
				+- metadata: JSON 字符串 {uuid, file, title, backgrounds}
			
 
				+  - backgrounds 不能为空
			
 
				+
			
 
				+依赖：
			
 
				+    uv add jieba
			
 
				+
			
 
				+用法:
			
 
				+    uv run -m src.app.scripts.first_bfp_collection_entity_import
			
 
				+"""
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import json
			
 
				+import re
			
 
				+import uuid
			
 
				+import warnings
			
 
				+from pathlib import Path
			
 
				+from typing import Any, Dict, List, Optional, Tuple, Set
			
 
				+from dataclasses import dataclass, asdict
			
 
				+from collections import Counter
			
 
				+import math
			
 
				+
			
 
				+from app.config.embeddings import get_embeddings
			
 
				+from app.config.milvus_client import get_milvusclient
			
 
				+
			
 
				+# Collection 名称
			
 
				+COLLECTION_NAME = "first_bfp_collection_entity"
			
 
				+
			
 
				+# 源文件夹路径
			
 
				+ROOT_FOLDER = r"F:\第二阶段编制依据及施工方案数据治理-20260206\133"
			
 
				+
			
 
				+# 批量插入大小
			
 
				+BATCH_SIZE = 100
			
 
				+
			
 
				+# jieba 依赖
			
 
				+try:
			
 
				+    import jieba
			
 
				+    import jieba.posseg as pseg
			
 
				+    JIEBA_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    JIEBA_AVAILABLE = False
			
 
				+    warnings.warn("jieba not installed. Using rule-based extraction only. Run: uv add jieba")
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class Entity:
			
 
				+    """实体数据结构"""
			
 
				+    text: str
			
 
				+    entity_type: str
			
 
				+    position: int
			
 
				+    context: str = ""  # 实体出现的上下文
			
 
				+    source: str = "rule"  # 来源: jieba / rule / combined
			
 
				+    weight: float = 1.0  # 权重（TF-IDF 分数）
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class Relationship:
			
 
				+    """关系数据结构"""
			
 
				+    source: str
			
 
				+    relation_type: str
			
 
				+    target: str
			
 
				+    context: str = ""
			
 
				+    confidence: float = 1.0  # 置信度
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class BackgroundInfo:
			
 
				+    """背景信息结构"""
			
 
				+    abolish_status: List[str]
			
 
				+    manage_orgs: Dict[str, List[str]]
			
 
				+    scope: Dict[str, List[str]]
			
 
				+    relations: List[Relationship]
			
 
				+
			
 
				+
			
 
				+# ==================== jieba 工具类 ====================
			
 
				+
			
 
				+class JiebaExtractor:
			
 
				+    """基于 jieba 的专业 NLP 抽取器"""
			
 
				+    
			
 
				+    _instance = None
			
 
				+    _initialized = False
			
 
				+    
			
 
				+    # 停用词表
			
 
				+    STOP_WORDS = {
			
 
				+        '的', '了', '在', '是', '和', '与', '及', '或', '等', '本', '第', '之', '为', '有',
			
 
				+        '而', '于', '以', '及其', '该', '这', '那', '此', '其', '个', '中', '上', '下',
			
 
				+        '后', '前', '内', '外', '将', '应', '可', '按', '根据', '按照', '依据', '有关',
			
 
				+        '相关', '规定', '要求', '所述', '所示', '所述', '其中', '如下', '如下所述',
			
 
				+        '分别', '不得', '必须', '需要', '应当', '可以', '不得', '禁止', '允许',
			
 
				+    }
			
 
				+    
			
 
				+    # 专业领域词典（可扩展）
			
 
				+    DOMAIN_WORDS = {
			
 
				+        '混凝土', '钢筋', '预应力', '桥梁', '隧道', '路基', '路面', '涵洞',
			
 
				+        '边坡', '基坑', '桩基', '墩柱', '梁体', '支座', '伸缩缝', '挡土墙',
			
 
				+        '施工', '检测', '监测', '设计', '验收', '养护', '抗震', '承载力',
			
 
				+        '稳定性', '变形', '沉降', '抗剪', '抗弯', '裂缝', '焊接', '浇筑',
			
 
				+        '张拉', '压浆', '注浆', '爆破', '开挖', '支护', '地基处理',
			
 
				+        '安全检查', '脚手架', '模板', '高处作业', '临时用电', '起重机械',
			
 
				+        '文明施工', '扬尘治理', '绿色施工', '质量管理', '安全生产',
			
 
				+    }
			
 
				+    
			
 
				+    # 词性到实体类型的映射
			
 
				+    POS_MAPPING = {
			
 
				+        'n': 'noun',           # 名词
			
 
				+        'nr': 'person',        # 人名
			
 
				+        'ns': 'location',      # 地名
			
 
				+        'nt': 'organization',  # 机构名
			
 
				+        'nz': 'term',          # 其他专名
			
 
				+        'vn': 'verb_noun',     # 名动词
			
 
				+        'an': 'adj_noun',      # 名形词
			
 
				+        's': 'space',          # 处所词
			
 
				+        'f': 'direction',      # 方位词
			
 
				+        't': 'time',           # 时间词
			
 
				+    }
			
 
				+    
			
 
				+    def __new__(cls):
			
 
				+        if cls._instance is None:
			
 
				+            cls._instance = super().__new__(cls)
			
 
				+        return cls._instance
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        if not self._initialized and JIEBA_AVAILABLE:
			
 
				+            self._load_models()
			
 
				+            self._initialized = True
			
 
				+    
			
 
				+    def _load_models(self):
			
 
				+        """加载 jieba 词典和配置"""
			
 
				+        try:
			
 
				+            print("🔄 加载 jieba 分词器...")
			
 
				+            
			
 
				+            # 添加专业领域词汇
			
 
				+            for word in self.DOMAIN_WORDS:
			
 
				+                jieba.add_word(word, freq=1000)
			
 
				+            
			
 
				+            # 启用paddle模式（如果可用）
			
 
				+            try:
			
 
				+                jieba.enable_paddle()
			
 
				+                print("✅ jieba 加载完成（启用 Paddle 模式）")
			
 
				+            except:
			
 
				+                print("✅ jieba 加载完成（基础模式）")
			
 
				+                
			
 
				+        except Exception as e:
			
 
				+            print(f"⚠️ jieba 加载失败: {e}")
			
 
				+    
			
 
				+    @property
			
 
				+    def is_ready(self) -> bool:
			
 
				+        return JIEBA_AVAILABLE
			
 
				+    
			
 
				+    def extract_entities(self, text: str, topk: int = 50) -> List[Entity]:
			
 
				+        """
			
 
				+        使用 jieba 抽取实体
			
 
				+        结合分词+词性标注+TF-IDF权重
			
 
				+        自动清理编号前缀
			
 
				+        """
			
 
				+        if not self.is_ready:
			
 
				+            return []
			
 
				+        
			
 
				+        entities = []
			
 
				+        seen: Set[str] = set()
			
 
				+        
			
 
				+        try:
			
 
				+            # 先对文本进行编号前缀清理，用于提取更干净的实体
			
 
				+            cleaned_text = clean_number_prefix(text)
			
 
				+            
			
 
				+            # 1. 词性标注分词
			
 
				+            words_pos = list(pseg.cut(text))
			
 
				+            
			
 
				+            # 2. 提取命名实体和名词短语
			
 
				+            i = 0
			
 
				+            while i < len(words_pos):
			
 
				+                word, flag = words_pos[i]
			
 
				+                
			
 
				+                # 跳过停用词和短词
			
 
				+                if len(word) < 2 or word in self.STOP_WORDS:
			
 
				+                    i += 1
			
 
				+                    continue
			
 
				+                
			
 
				+                # 提取命名实体（人名、地名、机构名等）
			
 
				+                if flag in ['nr', 'ns', 'nt', 'nz', 's', 'f', 't']:
			
 
				+                    ent_type = self.POS_MAPPING.get(flag, 'term')
			
 
				+                    # 清理实体文本的编号前缀
			
 
				+                    cleaned_word = clean_number_prefix(word)
			
 
				+                    if not cleaned_word:
			
 
				+                        cleaned_word = word
			
 
				+                    position = text.find(word)
			
 
				+                    if position >= 0 and cleaned_word not in seen:
			
 
				+                        seen.add(cleaned_word)
			
 
				+                        entities.append(Entity(
			
 
				+                            text=cleaned_word,
			
 
				+                            entity_type=ent_type,
			
 
				+                            position=position,
			
 
				+                            context=self._get_context(text, position, 30),
			
 
				+                            source="jieba_ner",
			
 
				+                            weight=2.0  # 命名实体权重更高
			
 
				+                        ))
			
 
				+                
			
 
				+                # 提取连续的名词短语（n + vn + an）
			
 
				+                if flag.startswith(('n', 'vn', 'an')):
			
 
				+                    phrase = [word]
			
 
				+                    j = i + 1
			
 
				+                    while j < len(words_pos) and words_pos[j][1].startswith(('n', 'vn', 'an', 'v')):
			
 
				+                        next_word = words_pos[j][0]
			
 
				+                        if len(next_word) >= 1 and next_word not in self.STOP_WORDS:
			
 
				+                            phrase.append(next_word)
			
 
				+                        j += 1
			
 
				+                    
			
 
				+                    if len(phrase) >= 2:
			
 
				+                        phrase_text = ''.join(phrase)
			
 
				+                        # 清理短语中的编号前缀
			
 
				+                        cleaned_phrase = clean_number_prefix(phrase_text)
			
 
				+                        if not cleaned_phrase:
			
 
				+                            cleaned_phrase = phrase_text
			
 
				+                        if 4 <= len(cleaned_phrase) <= 30 and cleaned_phrase not in seen:
			
 
				+                            seen.add(cleaned_phrase)
			
 
				+                            position = text.find(phrase_text)
			
 
				+                            if position >= 0:
			
 
				+                                entities.append(Entity(
			
 
				+                                    text=cleaned_phrase,
			
 
				+                                    entity_type="technical_term",
			
 
				+                                    position=position,
			
 
				+                                    context=self._get_context(text, position, 30),
			
 
				+                                    source="jieba_phrase",
			
 
				+                                    weight=1.5
			
 
				+                                ))
			
 
				+                    i = j if j > i + 1 else i + 1
			
 
				+                else:
			
 
				+                    i += 1
			
 
				+            
			
 
				+            # 3. TF-IDF 关键词提取
			
 
				+            keywords = self.extract_keywords(text, topk=topk)
			
 
				+            for word, weight in keywords:
			
 
				+                # 清理关键词的编号前缀
			
 
				+                cleaned_word = clean_number_prefix(word)
			
 
				+                if not cleaned_word:
			
 
				+                    cleaned_word = word
			
 
				+                if cleaned_word not in seen and len(cleaned_word) >= 2 and cleaned_word not in self.STOP_WORDS:
			
 
				+                    seen.add(cleaned_word)
			
 
				+                    position = text.find(word)
			
 
				+                    if position >= 0:
			
 
				+                        entities.append(Entity(
			
 
				+                            text=cleaned_word,
			
 
				+                            entity_type="keyword",
			
 
				+                            position=position,
			
 
				+                            context=self._get_context(text, position, 30),
			
 
				+                            source="jieba_tfidf",
			
 
				+                            weight=weight
			
 
				+                        ))
			
 
				+            
			
 
				+            # 4. 使用 TextRank 提取关键词作为补充
			
 
				+            textrank_words = self.extract_textrank(text, topk=topk//2)
			
 
				+            for word, weight in textrank_words:
			
 
				+                # 清理关键词的编号前缀
			
 
				+                cleaned_word = clean_number_prefix(word)
			
 
				+                if not cleaned_word:
			
 
				+                    cleaned_word = word
			
 
				+                if cleaned_word not in seen and len(cleaned_word) >= 2 and cleaned_word not in self.STOP_WORDS:
			
 
				+                    seen.add(cleaned_word)
			
 
				+                    position = text.find(word)
			
 
				+                    if position >= 0:
			
 
				+                        entities.append(Entity(
			
 
				+                            text=cleaned_word,
			
 
				+                            entity_type="keyword",
			
 
				+                            position=position,
			
 
				+                            context=self._get_context(text, position, 30),
			
 
				+                            source="jieba_textrank",
			
 
				+                            weight=weight
			
 
				+                        ))
			
 
				+                        
			
 
				+        except Exception as e:
			
 
				+            print(f"⚠️ jieba 实体抽取失败: {e}")
			
 
				+        
			
 
				+        # 按权重排序
			
 
				+        entities.sort(key=lambda x: x.weight, reverse=True)
			
 
				+        return entities
			
 
				+    
			
 
				+    def extract_keywords(self, text: str, topk: int = 20) -> List[Tuple[str, float]]:
			
 
				+        """
			
 
				+        使用 TF-IDF 算法提取关键词
			
 
				+        返回: [(word, weight), ...]
			
 
				+        """
			
 
				+        if not self.is_ready:
			
 
				+            return []
			
 
				+        
			
 
				+        try:
			
 
				+            # 分词
			
 
				+            words = list(jieba.cut(text))
			
 
				+            
			
 
				+            # 过滤停用词和短词
			
 
				+            filtered_words = [
			
 
				+                w for w in words 
			
 
				+                if len(w) >= 2 and w not in self.STOP_WORDS and not w.isdigit()
			
 
				+            ]
			
 
				+            
			
 
				+            if not filtered_words:
			
 
				+                return []
			
 
				+            
			
 
				+            # 计算 TF
			
 
				+            word_count = Counter(filtered_words)
			
 
				+            total_words = len(filtered_words)
			
 
				+            tf_scores = {word: count / total_words for word, count in word_count.items()}
			
 
				+            
			
 
				+            # 计算 IDF（简化版，使用语料库统计）
			
 
				+            idf_scores = self._calculate_idf(filtered_words)
			
 
				+            
			
 
				+            # 计算 TF-IDF
			
 
				+            tfidf_scores = {}
			
 
				+            for word in tf_scores:
			
 
				+                tfidf_scores[word] = tf_scores[word] * idf_scores.get(word, 1.0)
			
 
				+            
			
 
				+            # 返回 topk
			
 
				+            sorted_words = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
			
 
				+            return sorted_words[:topk]
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"⚠️ TF-IDF 提取失败: {e}")
			
 
				+            return []
			
 
				+    
			
 
				+    def extract_textrank(self, text: str, topk: int = 10) -> List[Tuple[str, float]]:
			
 
				+        """
			
 
				+        使用 TextRank 算法提取关键词
			
 
				+        基于词共现图的关键词提取
			
 
				+        """
			
 
				+        if not self.is_ready:
			
 
				+            return []
			
 
				+        
			
 
				+        try:
			
 
				+            # 分词和过滤
			
 
				+            words = list(jieba.cut(text))
			
 
				+            filtered_words = [
			
 
				+                w for w in words 
			
 
				+                if len(w) >= 2 and w not in self.STOP_WORDS and not w.isdigit()
			
 
				+            ]
			
 
				+            
			
 
				+            if len(filtered_words) < 3:
			
 
				+                return []
			
 
				+            
			
 
				+            # 构建共现图（滑动窗口大小为5）
			
 
				+            window_size = 5
			
 
				+            word_graph = {}
			
 
				+            word_set = set(filtered_words)
			
 
				+            
			
 
				+            for word in word_set:
			
 
				+                word_graph[word] = {}
			
 
				+            
			
 
				+            # 统计共现关系
			
 
				+            for i in range(len(filtered_words)):
			
 
				+                for j in range(i + 1, min(i + window_size, len(filtered_words))):
			
 
				+                    w1, w2 = filtered_words[i], filtered_words[j]
			
 
				+                    if w1 != w2:
			
 
				+                        word_graph[w1][w2] = word_graph[w1].get(w2, 0) + 1
			
 
				+                        word_graph[w2][w1] = word_graph[w2].get(w1, 0) + 1
			
 
				+            
			
 
				+            # TextRank 迭代计算
			
 
				+            damping = 0.85
			
 
				+            max_iter = 30
			
 
				+            min_diff = 0.0001
			
 
				+            
			
 
				+            # 初始化权重
			
 
				+            ranks = {word: 1.0 for word in word_set}
			
 
				+            
			
 
				+            for _ in range(max_iter):
			
 
				+                new_ranks = {}
			
 
				+                max_diff = 0
			
 
				+                
			
 
				+                for word in word_set:
			
 
				+                    rank = (1 - damping)
			
 
				+                    for neighbor, weight in word_graph[word].items():
			
 
				+                        neighbor_sum = sum(word_graph[neighbor].values())
			
 
				+                        if neighbor_sum > 0:
			
 
				+                            rank += damping * weight * ranks[neighbor] / neighbor_sum
			
 
				+                    
			
 
				+                    new_ranks[word] = rank
			
 
				+                    max_diff = max(max_diff, abs(rank - ranks[word]))
			
 
				+                
			
 
				+                ranks = new_ranks
			
 
				+                
			
 
				+                if max_diff < min_diff:
			
 
				+                    break
			
 
				+            
			
 
				+            # 返回排序结果
			
 
				+            sorted_words = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
			
 
				+            return sorted_words[:topk]
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"⚠️ TextRank 提取失败: {e}")
			
 
				+            return []
			
 
				+    
			
 
				+    def _calculate_idf(self, words: List[str]) -> Dict[str, float]:
			
 
				+        """
			
 
				+        计算简化版 IDF
			
 
				+        由于没有大规模语料库，使用基于词频的近似
			
 
				+        """
			
 
				+        # 基于词的长度惩罚 + 词性奖励的简化 IDF
			
 
				+        idf_scores = {}
			
 
				+        for word in set(words):
			
 
				+            # 基础分数
			
 
				+            base_idf = 1.0
			
 
				+            
			
 
				+            # 长度奖励（2-6字最佳）
			
 
				+            if 2 <= len(word) <= 6:
			
 
				+                base_idf *= 1.2
			
 
				+            elif len(word) > 10:
			
 
				+                base_idf *= 0.8
			
 
				+            
			
 
				+            # 专业词汇奖励
			
 
				+            if word in self.DOMAIN_WORDS:
			
 
				+                base_idf *= 1.5
			
 
				+            
			
 
				+            idf_scores[word] = base_idf
			
 
				+        
			
 
				+        return idf_scores
			
 
				+    
			
 
				+    def _get_context(self, text: str, position: int, window: int = 30) -> str:
			
 
				+        """获取上下文"""
			
 
				+        start = max(0, position - window)
			
 
				+        end = min(len(text), position + window)
			
 
				+        return text[start:end]
			
 
				+
			
 
				+
			
 
				+# 全局 jieba 抽取器实例
			
 
				+_jieba_extractor: Optional[JiebaExtractor] = None
			
 
				+
			
 
				+def get_jieba_extractor() -> Optional[JiebaExtractor]:
			
 
				+    """获取 jieba 抽取器（懒加载）"""
			
 
				+    global _jieba_extractor
			
 
				+    if _jieba_extractor is None and JIEBA_AVAILABLE:
			
 
				+        _jieba_extractor = JiebaExtractor()
			
 
				+    return _jieba_extractor
			
 
				+
			
 
				+
			
 
				+# ==================== 规则抽取模块 ====================
			
 
				+
			
 
				+# 标准名称模式
			
 
				+STANDARD_NAME_PATTERN = r'《([^》]{2,100}?)》'
			
 
				+
			
 
				+# 标准编号模式
			
 
				+STANDARD_NUMBER_PATTERNS = [
			
 
				+    r'GB\s*/?T?\s*\d+[\./\-]?\d*(?:[-–—]\d{4})?',
			
 
				+    r'JTG\s*[TD]?\s*\d+[\./\-]?\d*[a-zA-Z]?',
			
 
				+    r'JTJ\s*\d+[\./\-]?\d*',
			
 
				+    r'JGJ\s*\d+[\./\-]?\d*',
			
 
				+    r'CJJ\s*\d+[\./\-]?\d*',
			
 
				+    r'TB\s*\d+[\./\-]?\d*',
			
 
				+    r'SL\s*\d+[\./\-]?\d*',
			
 
				+    r'DL\s*/?T?\s*\d+[\./\-]?\d*',
			
 
				+    r'NB\s*/?T?\s*\d+[\./\-]?\d*',
			
 
				+    r'HG\s*/?T?\s*\d+[\./\-]?\d*',
			
 
				+    r'CECS\s*\d+[:：]?\d*',
			
 
				+    r'T/[A-Z]+\s*\d+[\./\-]?\d*',
			
 
				+    r'DB\d{2,3}[/\-]T?\s*\d+[\./\-]?\d*',
			
 
				+    r'Q/[A-Z]+\s*\d+[\./\-]?\d*',
			
 
				+    r'建标\s*\d+[\./\-]?\d*',
			
 
				+]
			
 
				+
			
 
				+# 条款引用模式
			
 
				+CLAUSE_PATTERNS = [
			
 
				+    r'第\s*[一二三四五六七八九十百千]+\s*条',
			
 
				+    r'第\s*\d+\s*条',
			
 
				+    r'第\s*\d+\.\d+\s*条',
			
 
				+    r'第\s*\d+\.\d+\.\d+\s*条',
			
 
				+    r'[\(（]\s*\d+\s*[\)）]',
			
 
				+]
			
 
				+
			
 
				+# 工程领域专业术语模式
			
 
				+TECH_TERM_PATTERNS = [
			
 
				+    # 工程类型
			
 
				+    r'(?:公路|桥梁|隧道|路基|路面|涵洞|边坡|基坑|桩基|墩柱|梁体|涵洞|挡土墙|护坡|排水|支挡)[\w\s]{0,10}?(?:工程|结构|设施|系统)',
			
 
				+    # 设计/计算相关
			
 
				+    r'(?:抗震|承载力|稳定性|变形|沉降|承载能力|抗剪|抗弯|抗冲切|局部稳定|疲劳|裂缝)[\w\s]{0,10}?(?:计算|设计|验算|分析|控制|校核)',
			
 
				+    r'(?:设计|计算|验算)[\w\s]{0,10}?(?:公式|方法|模型|参数|标准|规范|准则|规定)',
			
 
				+    # 材料相关
			
 
				+    r'(?:混凝土|钢筋|钢材|沥青|水泥|砂石|外加剂|掺合料|预应力筋)[\w\s]{0,10}?(?:强度|等级|性能|配比|用量|标号|规格)',
			
 
				+    # 工艺/施工方法
			
 
				+    r'(?:浇筑|张拉|压浆|焊接|检测|监测|养护|支护|开挖|爆破|注浆|灌浆)[\w\s]{0,10}?(?:工艺|方法|标准|要求|技术|规范)',
			
 
				+    # 地基/基础
			
 
				+    r'(?:地基|基础|支挡|防护|排水|围堰|支护)[\w\s]{0,10}?(?:设计|处理|加固|施工|工程)',
			
 
				+    # 地质灾害
			
 
				+    r'(?:液化|沉陷|滑坡|崩塌|泥石流|地震|岩溶|采空区|软土|湿陷性黄土)[\w\s]{0,10}?(?:处理|防治|评价|分析|地段)',
			
 
				+    # 结构构件
			
 
				+    r'(?:梁|板|柱|墙|拱|索|缆|锚|支座|伸缩缝|护栏|标线|标志)[\w\s]{0,5}?(?:结构|构件|部件|构造)',
			
 
				+]
			
 
				+
			
 
				+# 安全事故类型
			
 
				+SAFETY_TERM_PATTERNS = [
			
 
				+    r'(?:特大|重大|较大|一般)?(?:交通|火灾|瓦斯爆炸|透水|坍塌|冒顶片帮|放炮|火药爆炸|锅炉爆炸|容器爆炸|其他爆炸|中毒和窒息|高处坠落|物体打击|机械伤害|起重伤害|触电|淹溺|灼烫|其他)?(?:安全)?事故',
			
 
				+]
			
 
				+
			
 
				+# 管理机构/单位模式
			
 
				+ORG_PATTERNS = [
			
 
				+    r'(?:交通运输部?|住建部?|水利部?|工信部?|发改委|质检总局?|应急管理部?|自然资源部?)',
			
 
				+    r'(?:中国|中交|中铁|中建|中冶|中水|中港)[\w\s]{2,20}?(?:研究院|设计院|工程局|公司|集团)',
			
 
				+    r'(?:各省|自治区|直辖市)交通运输厅?',
			
 
				+    r'[\u4e00-\u9fa5]{2,8}(?:省|市|自治区)\s*(?:交通运输厅|住建厅|水利厅)',
			
 
				+]
			
 
				+
			
 
				+# 发布关系
			
 
				+PUBLISH_PATTERNS = [
			
 
				+    (r'由\s*([^，。\n]{2,30}?)\s*(?:发布|制定|颁发|出台)', '由发布'),
			
 
				+    (r'根据\s*《([^》]+)》\s*(?:制定|编制|发布)', '根据制定'),
			
 
				+]
			
 
				+
			
 
				+# 替代/废止关系
			
 
				+REPLACE_PATTERNS = [
			
 
				+    (r'代替\s*《([^》]+)》', '代替标准'),
			
 
				+    (r'(?:自\s*[\d年月日\-]+\s*起)?\s*废止', '已废止'),
			
 
				+    (r'已被\s*《([^》]+)》\s*代替', '被标准代替'),
			
 
				+    (r'被\s*(GB[/T]?\s*\d+[\-]\d*)\s*代替', '被编号标准代替'),
			
 
				+]
			
 
				+
			
 
				+# 管理关系
			
 
				+MANAGE_PATTERNS = [
			
 
				+    (r'(?:主编单位|主编部门)[：:]\s*([^，。\n]{2,50})', '主编单位'),
			
 
				+    (r'(?:参编单位|参编部门)[：:]\s*([^，。\n]{2,50})', '参编单位'),
			
 
				+    (r'(?:解释单位|解释部门|技术归口)[：:]\s*([^，。\n]{2,50})', '解释单位'),
			
 
				+    (r'由\s*([^，。]{2,30})\s*负责解释', '负责解释'),
			
 
				+    (r'归口单位[：:]\s*([^，。\n]{2,50})', '归口单位'),
			
 
				+]
			
 
				+
			
 
				+# 引用关系
			
 
				+REFERENCE_PATTERNS = [
			
 
				+    (r'应符合\s*《([^》]+)》\s*(?:GB[/T]?\s*\d+[\-]?\d*)?\s*的?规定', '应符合'),
			
 
				+    (r'应遵守\s*《([^》]+)》\s*(?:GB[/T]?\s*\d+[\-]?\d*)?', '应遵守'),
			
 
				+    (r'参照\s*《([^》]+)》\s*(?:JTG[/T]?\s*\d+[\-]?\d*)?', '参照'),
			
 
				+    (r'引用\s*《([^》]+)》', '引用'),
			
 
				+    (r'依据\s*《([^》]+)》', '依据'),
			
 
				+]
			
 
				+
			
 
				+# 适用范围
			
 
				+SCOPE_PATTERNS = {
			
 
				+    '工程类型': [
			
 
				+        r'适用(?:于)?\s*(?:新建|改建|扩建)?\s*([公路桥梁隧道路基路面]{2,8}\s*工程?)',
			
 
				+        r'([公路桥梁隧道路基路面涵洞]{2,6})\s*(?:的)?\s*(?:设计|施工|验收|检测)',
			
 
				+    ],
			
 
				+    '地区': [
			
 
				+        r'适用(?:于)?\s*(全国|各省|自治区|直辖市)',
			
 
				+        r'适用(?:于)?\s*([\u4e00-\u9fa5]{2,8}省|[\u4e00-\u9fa5]{2,8}市|[^。，]{2,10}地区)',
			
 
				+    ],
			
 
				+    '阶段': [
			
 
				+        r'(设计|施工|验收|勘察|检测|养护|监理|招投标)\s*阶段',
			
 
				+        r'适用(?:于)?\s*([^。，]{2,10})\s*(?:的)?\s*(设计|施工|验收|勘察)',
			
 
				+    ],
			
 
				+}
			
 
				+
			
 
				+# 废止状态
			
 
				+ABOLISH_PATTERNS = [
			
 
				+    r'自\s*(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{2}-\d{2})\s*起\s*废止',
			
 
				+    r'已被?\s*《([^》]+)》\s*代替',
			
 
				+    r'代替\s*《([^》]+)》',
			
 
				+    r'已\s*废止',
			
 
				+    r'自\s*[\d年月日]+\s*起\s*实施[^。]*原[^。]*(?:废止|代替)',
			
 
				+]
			
 
				+
			
 
				+
			
 
				+def extract_entities_rule_based(text: str) -> List[Entity]:
			
 
				+    """基于规则的实体抽取（自动清理编号前缀）"""
			
 
				+    entities = []
			
 
				+    seen: Set[str] = set()
			
 
				+    
			
 
				+    def add_entity(text_content: str, e_type: str, pos: int, source: str = "rule", weight: float = 1.0):
			
 
				+        # 清理编号前缀（标准名称、编号、条款引用除外）
			
 
				+        if e_type not in ['standard_name', 'standard_number', 'clause']:
			
 
				+            text_content = clean_number_prefix(text_content)
			
 
				+        
			
 
				+        if not text_content:
			
 
				+            return
			
 
				+            
			
 
				+        key = f"{e_type}:{text_content}"
			
 
				+        if key not in seen and len(text_content) >= 2:
			
 
				+            seen.add(key)
			
 
				+            context = get_context(text, pos, 40)
			
 
				+            entities.append(Entity(
			
 
				+                text=text_content,
			
 
				+                entity_type=e_type,
			
 
				+                position=pos,
			
 
				+                context=context,
			
 
				+                source=source,
			
 
				+                weight=weight
			
 
				+            ))
			
 
				+    
			
 
				+    # 1. 标准名称
			
 
				+    for match in re.finditer(STANDARD_NAME_PATTERN, text):
			
 
				+        name = match.group(1).strip()
			
 
				+        if 2 <= len(name) <= 100:
			
 
				+            add_entity(f"《{name}》", "standard_name", match.start(), weight=3.0)
			
 
				+    
			
 
				+    # 2. 标准编号
			
 
				+    for pattern in STANDARD_NUMBER_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text, re.IGNORECASE):
			
 
				+            number = re.sub(r'\s+', '', match.group(0)).upper()
			
 
				+            if len(number) >= 3:
			
 
				+                add_entity(number, "standard_number", match.start(), weight=2.5)
			
 
				+    
			
 
				+    # 3. 条款引用
			
 
				+    for pattern in CLAUSE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            clause = re.sub(r'\s+', '', match.group(0))
			
 
				+            if clause and len(clause) < 50:
			
 
				+                add_entity(clause, "clause", match.start(), weight=1.5)
			
 
				+    
			
 
				+    # 4. 专业术语（清理编号前缀）
			
 
				+    for pattern in TECH_TERM_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            term = re.sub(r'\s+', '', match.group(0))
			
 
				+            if 4 <= len(term) <= 50:
			
 
				+                add_entity(term, "technical_term", match.start(), weight=2.0)
			
 
				+    
			
 
				+    # 5. 安全事故类型（清理编号前缀）
			
 
				+    for pattern in SAFETY_TERM_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            term = match.group(0).strip()
			
 
				+            if 4 <= len(term) <= 30:
			
 
				+                add_entity(term, "safety_term", match.start(), weight=1.8)
			
 
				+    
			
 
				+    # 6. 管理机构/单位（清理编号前缀）
			
 
				+    for pattern in ORG_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            org = match.group(0).strip()
			
 
				+            if 4 <= len(org) <= 50:
			
 
				+                add_entity(org, "organization", match.start(), weight=2.0)
			
 
				+    
			
 
				+    return entities
			
 
				+
			
 
				+
			
 
				+def merge_entities(jieba_entities: List[Entity], rule_entities: List[Entity]) -> List[Entity]:
			
 
				+    """合并 jieba 和规则抽取的实体，去重并加权"""
			
 
				+    seen: Set[str] = set()
			
 
				+    merged = []
			
 
				+    
			
 
				+    # 合并两个列表，按权重排序
			
 
				+    all_entities = jieba_entities + rule_entities
			
 
				+    all_entities.sort(key=lambda x: x.weight, reverse=True)
			
 
				+    
			
 
				+    for ent in all_entities:
			
 
				+        # 使用小写文本作为去重 key
			
 
				+        key = f"{ent.text.lower()}:{ent.entity_type}"
			
 
				+        if key not in seen:
			
 
				+            seen.add(key)
			
 
				+            merged.append(ent)
			
 
				+    
			
 
				+    # 按位置排序
			
 
				+    merged.sort(key=lambda x: x.position)
			
 
				+    return merged
			
 
				+
			
 
				+
			
 
				+def extract_relationships(text: str) -> List[Relationship]:
			
 
				+    """抽取关系（基于规则）"""
			
 
				+    relations = []
			
 
				+    
			
 
				+    # 1. 规则匹配
			
 
				+    # 发布关系
			
 
				+    for pattern, rel_type in PUBLISH_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            target = match.group(1) if match.groups() else match.group(0)
			
 
				+            relations.append(Relationship("本标准", rel_type, target, match.group(0), 0.9))
			
 
				+    
			
 
				+    # 替代/废止关系
			
 
				+    for pattern, rel_type in REPLACE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            target = match.group(1) if match.groups() else "未知"
			
 
				+            relations.append(Relationship("本标准", rel_type, target, match.group(0), 0.95))
			
 
				+    
			
 
				+    # 管理关系
			
 
				+    for pattern, rel_type in MANAGE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            target = match.group(1).strip() if match.groups() else "未知"
			
 
				+            relations.append(Relationship("本标准", rel_type, target, match.group(0), 0.9))
			
 
				+    
			
 
				+    # 引用关系
			
 
				+    for pattern, rel_type in REFERENCE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            target = match.group(1).strip() if match.groups() else "未知"
			
 
				+            relations.append(Relationship("本标准", rel_type, f"《{target}》", match.group(0), 0.85))
			
 
				+    
			
 
				+    return relations
			
 
				+
			
 
				+
			
 
				+def extract_background_info(text: str) -> BackgroundInfo:
			
 
				+    """抽取背景信息"""
			
 
				+    # 废止状态
			
 
				+    abolish_status = []
			
 
				+    for pattern in ABOLISH_PATTERNS:
			
 
				+        matches = re.findall(pattern, text)
			
 
				+        for m in matches:
			
 
				+            if isinstance(m, tuple):
			
 
				+                abolish_status.extend([x for x in m if x])
			
 
				+            elif m:
			
 
				+                abolish_status.append(m)
			
 
				+    
			
 
				+    # 管理单位
			
 
				+    manage_orgs = {"主编单位": [], "参编单位": [], "解释单位": [], "归口单位": []}
			
 
				+    for pattern, org_type in MANAGE_PATTERNS:
			
 
				+        matches = re.findall(pattern, text)
			
 
				+        for m in matches:
			
 
				+            org_name = m[0] if isinstance(m, tuple) else m
			
 
				+            if org_name and org_type in manage_orgs:
			
 
				+                manage_orgs[org_type].append(org_name.strip())
			
 
				+    
			
 
				+    # 适用范围
			
 
				+    scope = {"工程类型": [], "地区": [], "阶段": []}
			
 
				+    for scope_type, patterns in SCOPE_PATTERNS.items():
			
 
				+        for pattern in patterns:
			
 
				+            matches = re.findall(pattern, text)
			
 
				+            for m in matches:
			
 
				+                if isinstance(m, tuple):
			
 
				+                    scope[scope_type].extend([x for x in m if x])
			
 
				+                elif m:
			
 
				+                    scope[scope_type].append(m)
			
 
				+    
			
 
				+    # 关系
			
 
				+    relations = extract_relationships(text)
			
 
				+    
			
 
				+    return BackgroundInfo(abolish_status, manage_orgs, scope, relations)
			
 
				+
			
 
				+
			
 
				+def get_context(text: str, position: int, window: int = 40) -> str:
			
 
				+    """获取实体上下文"""
			
 
				+    start = max(0, position - window)
			
 
				+    end = min(len(text), position + window)
			
 
				+    context = text[start:end]
			
 
				+    context = re.sub(r'\s+', ' ', context).strip()
			
 
				+    return context[:200]
			
 
				+
			
 
				+
			
 
				+def clean_number_prefix(text: str) -> str:
			
 
				+    """
			
 
				+    清理文本开头的编号前缀
			
 
				+    
			
 
				+    处理的编号格式：
			
 
				+    - 数字编号：1.  2.0.1  3.1.2.1  10.
			
 
				+    - 括号编号：(1)  (2)  (a)  (A)
			
 
				+    - 中文编号：一、 二、 三、  （一） （二）
			
 
				+    - 混合编号：1)  2)  a)  A)
			
 
				+    """
			
 
				+    if not text:
			
 
				+        return text
			
 
				+    
			
 
				+    original_text = text
			
 
				+    
			
 
				+    # 1. 清理多级数字编号 (如: 2.0.1  3.1.2  1.2.3.4)
			
 
				+    text = re.sub(r'^\s*\d+(?:\.\d+)+\.?\s*', '', text)
			
 
				+    
			
 
				+    # 2. 清理简单数字编号 (如: 1.  10.  99.)
			
 
				+    text = re.sub(r'^\s*\d+\.\s*', '', text)
			
 
				+    
			
 
				+    # 3. 清理括号数字编号 (如: (1)  (2)  (10))
			
 
				+    text = re.sub(r'^\s*[\(（]\d+[\)）]\s*', '', text)
			
 
				+    
			
 
				+    # 4. 清理括号字母编号 (如: (a)  (b)  (A)  (B))
			
 
				+    text = re.sub(r'^\s*[\(（][a-zA-Z][\)）]\s*', '', text)
			
 
				+    
			
 
				+    # 5. 清理右括号编号 (如: 1)  2)  a)  A))
			
 
				+    text = re.sub(r'^\s*[\d]+\)\s*', '', text)
			
 
				+    text = re.sub(r'^\s*[a-zA-Z]\)\s*', '', text)
			
 
				+    
			
 
				+    # 6. 清理中文编号（一）（二）（三）
			
 
				+    text = re.sub(r'^\s*[（(][一二三四五六七八九十百千]+[）)]\s*', '', text)
			
 
				+    
			
 
				+    # 7. 清理中文顿号编号（一、二、三、）
			
 
				+    text = re.sub(r'^[一二三四五六七八九十百千]+[、．.]\s*', '', text)
			
 
				+    
			
 
				+    # 8. 如果清理后内容太短，可能是误清理，返回原文
			
 
				+    if len(text.strip()) < 3 and len(original_text.strip()) > 3:
			
 
				+        return original_text.strip()
			
 
				+    
			
 
				+    return text.strip()
			
 
				+
			
 
				+
			
 
				+def clean_markdown_content(text: str) -> str:
			
 
				+    """
			
 
				+    清理 Markdown 内容，剔除代码块、链接、标题符号等非核心内容
			
 
				+    转换 md 为纯文本，保留语义核心
			
 
				+    
			
 
				+    清理规则：
			
 
				+    1. 移除代码块 (```...``` 和 ~~~...~~~)
			
 
				+    2. 移除行内代码 (`...`)
			
 
				+    3. 移除链接，保留链接文本 [text](url) -> text
			
 
				+    4. 移除图片 ![alt](url)
			
 
				+    5. 移除 HTML 标签
			
 
				+    6. 移除标题符号 (# ## ### 等)
			
 
				+    7. 移除强调符号 (** * __ _)
			
 
				+    8. 移除表格分隔符 (| --- |)
			
 
				+    9. 移除引用符号 (>)
			
 
				+    10. 移除列表符号 (- * + 1.)
			
 
				+    11. 清理编号前缀 (2.0.1  (1)  一、 等)
			
 
				+    12. 清理多余空行
			
 
				+    """
			
 
				+    if not text:
			
 
				+        return ""
			
 
				+    
			
 
				+    # 1. 移除代码块 (```...``` 和 ~~~...~~~)
			
 
				+    text = re.sub(r'```[\s\S]*?```', '', text)
			
 
				+    text = re.sub(r'~~~[\s\S]*?~~~', '', text)
			
 
				+    
			
 
				+    # 2. 移除行内代码 (`...`)
			
 
				+    text = re.sub(r'`[^`]*`', '', text)
			
 
				+    
			
 
				+    # 3. 处理链接：保留链接文本，移除 URL
			
 
				+    # [text](url "title") -> text
			
 
				+    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
			
 
				+    # 处理裸链接 <url>
			
 
				+    text = re.sub(r'<[^>]+>', '', text)
			
 
				+    
			
 
				+    # 4. 移除图片 ![alt](url)
			
 
				+    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
			
 
				+    
			
 
				+    # 5. 移除 HTML 标签
			
 
				+    text = re.sub(r'<[^>]+>', '', text)
			
 
				+    
			
 
				+    # 6. 移除标题符号 (# ## ### 等)
			
 
				+    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
			
 
				+    
			
 
				+    # 7. 移除强调符号 (** * __ _)
			
 
				+    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # **bold**
			
 
				+    text = re.sub(r'__([^_]+)__', r'\1', text)       # __bold__
			
 
				+    text = re.sub(r'\*([^*]+)\*', r'\1', text)       # *italic*
			
 
				+    text = re.sub(r'_([^_]+)_', r'\1', text)         # _italic_
			
 
				+    text = re.sub(r'~~([^~]+)~~', r'\1', text)       # ~~strikethrough~~
			
 
				+    
			
 
				+    # 8. 移除表格分隔符行 (| --- | --- |)
			
 
				+    text = re.sub(r'\|?[\s\-:]+\|', '', text)
			
 
				+    text = re.sub(r'\|', ' ', text)  # 将表格分隔符替换为空格
			
 
				+    
			
 
				+    # 9. 移除引用符号 (>)
			
 
				+    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
			
 
				+    
			
 
				+    # 10. 移除列表符号 (- * + 1.)
			
 
				+    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
			
 
				+    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
			
 
				+    
			
 
				+    # 11. 清理每行的编号前缀（处理多行文本）
			
 
				+    lines = text.split('\n')
			
 
				+    cleaned_lines = []
			
 
				+    for line in lines:
			
 
				+        cleaned_line = clean_number_prefix(line)
			
 
				+        if cleaned_line:
			
 
				+            cleaned_lines.append(cleaned_line)
			
 
				+    text = '\n'.join(cleaned_lines)
			
 
				+    
			
 
				+    # 12. 清理多余空行和空格
			
 
				+    text = re.sub(r'\n{3,}', '\n\n', text)  # 多于2个换行合并为2个
			
 
				+    text = re.sub(r'[ \t]+', ' ', text)      # 多个空格/制表符合并为一个
			
 
				+    text = text.strip()
			
 
				+    
			
 
				+    return text
			
 
				+
			
 
				+
			
 
				+def split_document(md_content: str) -> List[Tuple[str, str]]:
			
 
				+    """将文档拆分为段落，返回 (标题, 内容) 列表"""
			
 
				+    chunks = []
			
 
				+    
			
 
				+    # 按标题分割
			
 
				+    sections = re.split(r'\n(?=##+\s)', md_content)
			
 
				+    
			
 
				+    for section in sections:
			
 
				+        section = section.strip()
			
 
				+        if not section:
			
 
				+            continue
			
 
				+        
			
 
				+        # 提取标题
			
 
				+        title_match = re.match(r'##+\s+(.+)\n', section)
			
 
				+        if title_match:
			
 
				+            # 获取原始标题并清理编号前缀
			
 
				+            raw_title = title_match.group(1).strip()
			
 
				+            title = clean_number_prefix(raw_title)
			
 
				+            # 如果清理后标题为空，使用原文
			
 
				+            if not title:
			
 
				+                title = raw_title
			
 
				+        else:
			
 
				+            title = "前言/总则"
			
 
				+        
			
 
				+        # 清理内容 - 先移除标题行
			
 
				+        content = re.sub(r'^#+\s+.+\n?', '', section, flags=re.MULTILINE)
			
 
				+        # 清理 Markdown 格式（包含编号前缀清理）
			
 
				+        content = clean_markdown_content(content)
			
 
				+        # 清理多余换行
			
 
				+        content = re.sub(r'\n+', '\n', content).strip()
			
 
				+        
			
 
				+        if len(content) < 10:
			
 
				+            continue
			
 
				+        
			
 
				+        chunks.append((title, content))
			
 
				+    
			
 
				+    # 如果没有分块，整个文档作为一个块
			
 
				+    if not chunks and md_content.strip():
			
 
				+        cleaned = clean_markdown_content(md_content)
			
 
				+        if cleaned.strip():
			
 
				+            chunks.append(("全文", cleaned))
			
 
				+    
			
 
				+    return chunks
			
 
				+
			
 
				+
			
 
				+def extract_title_from_filename(file_name: str) -> str:
			
 
				+    """从文件名提取标准名称"""
			
 
				+    title = re.sub(r'^\d+', '', file_name)
			
 
				+    title = re.sub(r'\.md$', '', title, re.IGNORECASE)
			
 
				+    return title.strip()
			
 
				+
			
 
				+
			
 
				+def build_backgrounds(bg_info: BackgroundInfo, doc_title: str, file_name: str) -> List[str]:
			
 
				+    """构建 backgrounds 列表，确保不为空"""
			
 
				+    backgrounds = []
			
 
				+    
			
 
				+    # 1. 废止状态（最高优先级）
			
 
				+    if bg_info.abolish_status:
			
 
				+        for status in bg_info.abolish_status[:2]:
			
 
				+            if isinstance(status, str):
			
 
				+                backgrounds.append(f"废止状态：{status}")
			
 
				+    
			
 
				+    # 2. 管理单位
			
 
				+    for org_type, orgs in bg_info.manage_orgs.items():
			
 
				+        if orgs:
			
 
				+            unique_orgs = list(dict.fromkeys(orgs))[:2]
			
 
				+            backgrounds.append(f"{org_type}：{', '.join(unique_orgs)}")
			
 
				+    
			
 
				+    # 3. 适用范围
			
 
				+    if bg_info.scope["工程类型"]:
			
 
				+        types = list(dict.fromkeys(bg_info.scope["工程类型"]))[:2]
			
 
				+        backgrounds.append(f"适用工程类型：{', '.join(types)}")
			
 
				+    
			
 
				+    if bg_info.scope["阶段"]:
			
 
				+        stages = list(dict.fromkeys(bg_info.scope["阶段"]))[:2]
			
 
				+        backgrounds.append(f"适用阶段：{', '.join(stages)}")
			
 
				+    
			
 
				+    # 4. 关系信息
			
 
				+    important_rels = [r for r in bg_info.relations if r.relation_type in ['代替标准', '被标准代替', '应符合', '根据制定']]
			
 
				+    for rel in important_rels[:2]:
			
 
				+        target = rel.target[:50] + "..." if len(rel.target) > 50 else rel.target
			
 
				+        backgrounds.append(f"{rel.relation_type}：{target}")
			
 
				+    
			
 
				+    # 5. 兜底填充
			
 
				+    if not backgrounds:
			
 
				+        backgrounds.append(f"编制依据文件：{doc_title}")
			
 
				+        backgrounds.append(f"来源文档：{file_name}")
			
 
				+    
			
 
				+    return backgrounds[:5]
			
 
				+
			
 
				+
			
 
				+def extract_all_entities(text: str) -> List[Entity]:
			
 
				+    """综合抽取实体（jieba + 规则）"""
			
 
				+    # jieba 抽取
			
 
				+    jieba_entities = []
			
 
				+    jieba_ext = get_jieba_extractor()
			
 
				+    if jieba_ext and jieba_ext.is_ready:
			
 
				+        jieba_entities = jieba_ext.extract_entities(text)
			
 
				+    
			
 
				+    # 规则抽取
			
 
				+    rule_entities = extract_entities_rule_based(text)
			
 
				+    
			
 
				+    # 合并去重
			
 
				+    return merge_entities(jieba_entities, rule_entities)
			
 
				+
			
 
				+
			
 
				+def import_single_file(md_path: Path, embeddings) -> List[Dict[str, Any]]:
			
 
				+    """导入单个 MD 文件（同一文件内实体去重）"""
			
 
				+    file_name = md_path.name
			
 
				+    doc_title = extract_title_from_filename(file_name)
			
 
				+    
			
 
				+    try:
			
 
				+        with open(md_path, "r", encoding="utf-8") as f:
			
 
				+            md_content = f.read()
			
 
				+    except Exception as e:
			
 
				+        print(f"   读取失败: {e}")
			
 
				+        return []
			
 
				+    
			
 
				+    if not md_content.strip():
			
 
				+        return []
			
 
				+    
			
 
				+    # 抽取文档级背景信息
			
 
				+    doc_bg_info = extract_background_info(md_content)
			
 
				+    doc_backgrounds = build_backgrounds(doc_bg_info, doc_title, file_name)
			
 
				+    
			
 
				+    # 分块处理
			
 
				+    chunks = split_document(md_content)
			
 
				+    
			
 
				+    all_rows = []
			
 
				+    # 用于同一文件内实体去重：key = (entity_text, entity_type)
			
 
				+    file_entity_seen: Set[Tuple[str, str]] = set()
			
 
				+    
			
 
				+    for chunk_title, chunk_text in chunks:
			
 
				+        # 综合抽取实体
			
 
				+        entities = extract_all_entities(chunk_text)
			
 
				+        
			
 
				+        # 抽取段落级背景信息
			
 
				+        chunk_bg_info = extract_background_info(chunk_text)
			
 
				+        chunk_backgrounds = build_backgrounds(chunk_bg_info, chunk_title, file_name)
			
 
				+        
			
 
				+        # 合并背景信息
			
 
				+        final_backgrounds = chunk_backgrounds if len(chunk_backgrounds) > 1 else doc_backgrounds
			
 
				+        
			
 
				+        if entities:
			
 
				+            for entity in entities:
			
 
				+                entity_text = entity.text.strip()
			
 
				+                entity_type = entity.entity_type
			
 
				+                
			
 
				+                # 同一文件内实体去重
			
 
				+                dedup_key = (entity_text.lower(), entity_type)
			
 
				+                if dedup_key in file_entity_seen:
			
 
				+                    continue
			
 
				+                file_entity_seen.add(dedup_key)
			
 
				+                
			
 
				+                # 生成向量
			
 
				+                try:
			
 
				+                    vector = embeddings.embed_query(entity_text)
			
 
				+                except Exception as e:
			
 
				+                    print(f"   向量生成失败: {e}")
			
 
				+                    continue
			
 
				+                
			
 
				+                # 构造 metadata
			
 
				+                metadata = {
			
 
				+                    "uuid": str(uuid.uuid4()),
			
 
				+                    "file": file_name,
			
 
				+                    "title": chunk_title,
			
 
				+                    "backgrounds": final_backgrounds,
			
 
				+                }
			
 
				+                
			
 
				+                all_rows.append({
			
 
				+                    "text": entity_text,
			
 
				+                    "dense": vector,
			
 
				+                    "content": entity_text,
			
 
				+                    "metadata": json.dumps(metadata, ensure_ascii=False),
			
 
				+                })
			
 
				+        else:
			
 
				+            # 无实体时，用段落标题作为实体
			
 
				+            chunk_title_clean = chunk_title.strip()
			
 
				+            dedup_key = (chunk_title_clean.lower(), "chunk_title")
			
 
				+            if dedup_key not in file_entity_seen:
			
 
				+                file_entity_seen.add(dedup_key)
			
 
				+                
			
 
				+                try:
			
 
				+                    vector = embeddings.embed_query(chunk_title_clean)
			
 
				+                except Exception:
			
 
				+                    continue
			
 
				+                
			
 
				+                metadata = {
			
 
				+                    "uuid": str(uuid.uuid4()),
			
 
				+                    "file": file_name,
			
 
				+                    "title": chunk_title_clean,
			
 
				+                    "backgrounds": final_backgrounds,
			
 
				+                }
			
 
				+                
			
 
				+                all_rows.append({
			
 
				+                    "text": chunk_title_clean[:200],
			
 
				+                    "dense": vector,
			
 
				+                    "content": chunk_title_clean[:200],
			
 
				+                    "metadata": json.dumps(metadata, ensure_ascii=False),
			
 
				+                })
			
 
				+    
			
 
				+    return all_rows
			
 
				+
			
 
				+
			
 
				+def batch_insert(client, rows: List[Dict[str, Any]]) -> Tuple[int, List[Dict[str, Any]]]:
			
 
				+    """批量插入数据"""
			
 
				+    if not rows:
			
 
				+        return 0, []
			
 
				+    
			
 
				+    inserted = 0
			
 
				+    failed_rows = []
			
 
				+    
			
 
				+    for i in range(0, len(rows), BATCH_SIZE):
			
 
				+        batch = rows[i:i + BATCH_SIZE]
			
 
				+        try:
			
 
				+            client.insert(collection_name=COLLECTION_NAME, data=batch)
			
 
				+            inserted += len(batch)
			
 
				+        except Exception as e:
			
 
				+            print(f"   插入失败: {e}")
			
 
				+            failed_rows.extend(batch)
			
 
				+    
			
 
				+    return inserted, failed_rows
			
 
				+
			
 
				+
			
 
				+def import_from_folder(root_folder: str):
			
 
				+    """从文件夹批量导入（只扫描指定目录下的md文件，不递归子目录）"""
			
 
				+    root = Path(root_folder)
			
 
				+    if not root.exists():
			
 
				+        print(f"文件夹不存在: {root}")
			
 
				+        return
			
 
				+    
			
 
				+    print(f"扫描文件夹: {root}（不递归子目录）")
			
 
				+    
			
 
				+    # 只扫描当前目录下的 .md 文件，不递归子目录
			
 
				+    md_files = [f for f in root.glob("*.md") if f.is_file()]
			
 
				+    print(f"发现 {len(md_files)} 个 MD 文件")
			
 
				+    
			
 
				+    if not md_files:
			
 
				+        return
			
 
				+    
			
 
				+    # 初始化 jieba
			
 
				+    jieba_ext = get_jieba_extractor()
			
 
				+    if jieba_ext and jieba_ext.is_ready:
			
 
				+        print("jieba 已启用")
			
 
				+    else:
			
 
				+        print("jieba 未启用，使用纯规则抽取")
			
 
				+        print("建议安装: uv add jieba")
			
 
				+    
			
 
				+    # 初始化 Milvus
			
 
				+    client = get_milvusclient()
			
 
				+    embeddings = get_embeddings()
			
 
				+    
			
 
				+    if not client.has_collection(collection_name=COLLECTION_NAME):
			
 
				+        print(f"Collection 不存在: {COLLECTION_NAME}")
			
 
				+        print(f"运行: uv run -m src.app.scripts.first_bfp_collection_entity_create")
			
 
				+        return
			
 
				+    
			
 
				+    client.load_collection(collection_name=COLLECTION_NAME)
			
 
				+    
			
 
				+    # 统计
			
 
				+    total_entities = 0
			
 
				+    total_inserted = 0
			
 
				+    entity_source_stats: Dict[str, int] = {}
			
 
				+    entity_type_stats: Dict[str, int] = {}
			
 
				+    failed_files = []
			
 
				+    
			
 
				+    for idx, md_path in enumerate(md_files, 1):
			
 
				+        print(f"\n[{idx}/{len(md_files)}] 处理: {md_path.name}")
			
 
				+        
			
 
				+        try:
			
 
				+            rows = import_single_file(md_path, embeddings)
			
 
				+            
			
 
				+            if rows:
			
 
				+                # 统计
			
 
				+                for row in rows:
			
 
				+                    text = row.get("text", "")
			
 
				+                    # 推断实体类型
			
 
				+                    if text.startswith("《"):
			
 
				+                        etype = "standard_name"
			
 
				+                    elif re.match(r'^[A-Z]{2,}', text):
			
 
				+                        etype = "standard_number"
			
 
				+                    elif re.match(r'^第', text):
			
 
				+                        etype = "clause"
			
 
				+                    else:
			
 
				+                        etype = "term"
			
 
				+                    entity_type_stats[etype] = entity_type_stats.get(etype, 0) + 1
			
 
				+                
			
 
				+                print(f"   抽取 {len(rows)} 个实体")
			
 
				+                
			
 
				+                # 插入
			
 
				+                inserted, failed = batch_insert(client, rows)
			
 
				+                total_inserted += inserted
			
 
				+                if failed:
			
 
				+                    print(f"   {len(failed)} 条插入失败")
			
 
				+                else:
			
 
				+                    print(f"   插入 {inserted} 条")
			
 
				+            else:
			
 
				+                print(f"   无有效实体")
			
 
				+            
			
 
				+            total_entities += len(rows)
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"   处理失败: {e}")
			
 
				+            failed_files.append(md_path.name)
			
 
				+    
			
 
				+    # 汇总
			
 
				+    print("\n" + "=" * 70)
			
 
				+    print("导入完成")
			
 
				+    print("=" * 70)
			
 
				+    print(f"处理文件: {len(md_files)}")
			
 
				+    print(f"抽取实体: {total_entities}")
			
 
				+    print(f"成功插入: {total_inserted}")
			
 
				+    if failed_files:
			
 
				+        print(f"失败文件: {len(failed_files)}")
			
 
				+    print("\n实体类型分布:")
			
 
				+    for etype, count in sorted(entity_type_stats.items(), key=lambda x: -x[1]):
			
 
				+        print(f"   - {etype}: {count}")
			
 
				+    print("=" * 70)
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    print("=" * 70)
			
 
				+    print("编制依据实体抽取与导入（jieba 版）")
			
 
				+    print("=" * 70)
			
 
				+    print("实体抽取: jieba 分词 + TF-IDF + TextRank + 规则补充")
			
 
				+    print("关系抽取: 规则模式匹配")
			
 
				+    print("字段结构: text, dense, content(=text), metadata")
			
 
				+    print("=" * 70)
			
 
				+    
			
 
				+    try:
			
 
				+        import_from_folder(ROOT_FOLDER)
			
 
				+    except Exception as e:
			
 
				+        print(f"\n导入失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+
			
 
				+
			
 
# Run the import only when this module is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
			
--- a/src/app/scripts/first_bfp_collection_entity_import_v2.py
+++ b/src/app/scripts/first_bfp_collection_entity_import_v2.py
@@ -0,0 +1,1219 @@
 
				+"""
			
 
				+编制依据实体抽取与导入脚本（V2 优化版）
			
 
				+
			
 
				+主要改进：
			
 
				+1. 层级路径标题：支持 "1.总则 > 1.1术语定义" 格式的层级标题
			
 
				+2. LLM辅助实体抽取：可配置启用大模型进行专业术语识别
			
 
				+3. 移除硬编码工程术语模式：使用通用NLP方法 + 可选LLM增强
			
 
				+4. 改进background信息抽取：更准确识别文档元数据
			
 
				+
			
 
				+字段结构：
			
 
				+- text: 实体文本（用于 BM25 检索）
			
 
				+- dense: 实体向量
			
 
				+- content: 与 text 内容相同
			
 
				+- metadata: JSON 字符串 {uuid, file, title(层级路径), backgrounds}
			
 
				+
			
 
				+依赖：
			
 
				+    uv add jieba
			
 
				+
			
 
				+用法:
			
 
				+    uv run -m src.app.scripts.first_bfp_collection_entity_import_v2
			
 
				+"""
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import json
			
 
				+import re
			
 
				+import uuid
			
 
				+import warnings
			
 
				+import os
			
 
				+from pathlib import Path
			
 
				+from typing import Any, Dict, List, Optional, Tuple, Set
			
 
				+from dataclasses import dataclass, asdict, field
			
 
				+from collections import Counter
			
 
				+import math
			
 
				+
			
 
				+from app.config.embeddings import get_embeddings
			
 
				+from app.config.milvus_client import get_milvus_client
			
 
				+
			
 
# ==================== Configuration ====================

# Collection name
COLLECTION_NAME = "first_bfp_collection_entity"

# Source folder path (absolute, machine-specific Windows path)
ROOT_FOLDER = r"F:\第二阶段编制依据及施工方案数据治理-20260206\133"

# Batch insert size
BATCH_SIZE = 100

# LLM settings (optional feature). api_key / api_base / model are read from the
# LLM_API_KEY / LLM_API_BASE / LLM_MODEL environment variables, with the
# literal values below as defaults.
LLM_CONFIG = {
      "enabled": True,  # <- LLM is enabled
      "api_key": os.getenv("LLM_API_KEY", "lm-studio"),
      "api_base": os.getenv("LLM_API_BASE", "http://localhost:1234/v1"),
      "model": os.getenv("LLM_MODEL", "Qwen2.5-7B-Instruct-Uncensored.Q4_K_M"),
      "batch_size": 5,  # <- keep this small: GGUF models have limited concurrency
      "max_entities_per_chunk": 8,
}
			
 
				+
			
 
# Optional jieba dependency: JIEBA_AVAILABLE records whether the import
# succeeded; when it did not, a warning is emitted at import time.
try:
    import jieba
    import jieba.posseg as pseg
    JIEBA_AVAILABLE = True
except ImportError:
    JIEBA_AVAILABLE = False
    warnings.warn("jieba not installed. Using rule-based extraction only. Run: uv add jieba")
			
 
				+
			
 
				+
			
 
				+# ==================== 数据模型 ====================
			
 
				+
			
 
@dataclass
class Entity:
    """An extracted entity."""
    text: str
    entity_type: str
    position: int
    context: str = ""  # context in which the entity appears
    source: str = "rule"  # origin: jieba / rule / llm / combined
    weight: float = 1.0  # weight
    confidence: float = 1.0  # confidence
			
 
				+
			
 
				+
			
 
@dataclass
class Relationship:
    """A relationship of type ``relation_type`` from ``source`` to ``target``."""
    source: str
    relation_type: str
    target: str
    context: str = ""
    confidence: float = 1.0
			
 
				+
			
 
				+
			
 
@dataclass
class BackgroundInfo:
    """Background information about a document; every field defaults to empty."""
    abolish_status: List[str] = field(default_factory=list)
    manage_orgs: Dict[str, List[str]] = field(default_factory=dict)
    scope: Dict[str, List[str]] = field(default_factory=dict)
    relations: List[Relationship] = field(default_factory=list)
    doc_type: str = ""  # document type: standard / code / measures, etc.
    publish_date: str = ""  # publication date
    effective_date: str = ""  # effective (implementation) date
			
 
				+
			
 
				+
			
 
@dataclass
class DocumentChunk:
    """One chunk of a split document."""
    content: str  # cleaned content
    level: int  # heading level: 0 = document level, 1 = #, 2 = ##, ...
    title: str  # current heading (original text)
    hierarchy_path: str  # hierarchy path, e.g. "1.总则 > 1.1术语"
    position: int  # position within the document
			
 
				+
			
 
				+
			
 
				+# ==================== LLM 实体抽取模块（可选）====================
			
 
				+
			
 
				+class LLMEntityExtractor:
			
 
				+    """基于大语言模型的专业实体抽取器"""
			
 
				+    
			
 
				+    _instance = None
			
 
				+    
			
 
				+    def __new__(cls):
			
 
				+        if cls._instance is None:
			
 
				+            cls._instance = super().__new__(cls)
			
 
				+        return cls._instance
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        self.enabled = LLM_CONFIG.get("enabled", False)
			
 
				+        self.cache: Dict[str, List[Entity]] = {}  # 简单缓存
			
 
				+    
			
 
				+    def is_available(self) -> bool:
			
 
				+        """检查LLM是否可用"""
			
 
				+        if not self.enabled:
			
 
				+            return False
			
 
				+        try:
			
 
				+            import openai
			
 
				+            return True
			
 
				+        except ImportError:
			
 
				+            return False
			
 
				+    
			
 
				+    def extract_entities(self, text: str, context: str = "") -> List[Entity]:
			
 
				+        """
			
 
				+        使用LLM抽取专业术语实体
			
 
				+        返回高置信度的专业术语列表
			
 
				+        """
			
 
				+        if not self.is_available() or len(text) < 20:
			
 
				+            return []
			
 
				+        
			
 
				+        # 检查缓存
			
 
				+        cache_key = hash(text[:100])
			
 
				+        if cache_key in self.cache:
			
 
				+            return self.cache[cache_key]
			
 
				+        
			
 
				+        try:
			
 
				+            import openai
			
 
				+            
			
 
				+            client = openai.OpenAI(
			
 
				+                api_key=LLM_CONFIG["api_key"],
			
 
				+                base_url=LLM_CONFIG["api_base"]
			
 
				+            )
			
 
				+            
			
 
				+            prompt = f"""从以下工程/技术文档段落中提取专业术语和重要概念。
			
 
				+
			
 
				+要求：
			
 
				+1. 提取真正的专业术语（如"混凝土强度等级"、"承载力计算"、"抗震设防烈度"）
			
 
				+2. 不要提取通用词汇（如"规定"、"要求"、"方法"）
			
 
				+3. 每个术语标注类型：technical_term(技术术语) / material(材料) / process(工艺) / standard(标准) / parameter(参数)
			
 
				+4. 最多返回10个最重要的术语
			
 
				+
			
 
				+文档上下文：{context if context else "工程技术标准文档"}
			
 
				+
			
 
				+待分析文本：
			
 
				+{text[:800]}
			
 
				+
			
 
				+请以JSON格式返回：
			
 
				+{{"entities": [{{"term": "术语", "type": "类型", "importance": "high/medium/low"}}]}}
			
 
				+"""
			
 
				+            
			
 
				+            response = client.chat.completions.create(
			
 
				+                model=LLM_CONFIG["model"],
			
 
				+                messages=[
			
 
				+                    {"role": "system", "content": "你是工程文档分析专家，擅长提取专业术语。只返回JSON格式结果。"},
			
 
				+                    {"role": "user", "content": prompt}
			
 
				+                ],
			
 
				+                temperature=0.1,
			
 
				+                max_tokens=500
			
 
				+            )
			
 
				+            
			
 
				+            result_text = response.choices[0].message.content
			
 
				+            
			
 
				+            # 解析JSON结果
			
 
				+            entities = []
			
 
				+            try:
			
 
				+                # 尝试提取JSON部分
			
 
				+                json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
			
 
				+                if json_match:
			
 
				+                    result = json.loads(json_match.group())
			
 
				+                    for item in result.get("entities", []):
			
 
				+                        term = item.get("term", "").strip()
			
 
				+                        etype = item.get("type", "technical_term")
			
 
				+                        importance = item.get("importance", "medium")
			
 
				+                        
			
 
				+                        if term and len(term) >= 2:
			
 
				+                            weight = {"high": 3.0, "medium": 2.0, "low": 1.0}.get(importance, 2.0)
			
 
				+                            entities.append(Entity(
			
 
				+                                text=term,
			
 
				+                                entity_type=etype,
			
 
				+                                position=text.find(term) if term in text else 0,
			
 
				+                                context=text[:100],
			
 
				+                                source="llm",
			
 
				+                                weight=weight,
			
 
				+                                confidence=0.9
			
 
				+                            ))
			
 
				+            except Exception as e:
			
 
				+                print(f"  LLM结果解析失败: {e}")
			
 
				+            
			
 
				+            # 缓存结果
			
 
				+            self.cache[cache_key] = entities
			
 
				+            return entities
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"  LLM调用失败: {e}")
			
 
				+            return []
			
 
				+
			
 
				+
			
 
				+# ==================== jieba NLP模块 ====================
			
 
				+
			
 
				+class JiebaExtractor:
			
 
				+    """基于jieba的NLP抽取器"""
			
 
				+    
			
 
				+    _instance = None
			
 
				+    _initialized = False
			
 
				+    
			
 
				+    # 停用词表
			
 
				+    STOP_WORDS = {
			
 
				+        '的', '了', '在', '是', '和', '与', '及', '或', '等', '本', '第', '之', '为', '有',
			
 
				+        '而', '于', '以', '及其', '该', '这', '那', '此', '其', '个', '中', '上', '下',
			
 
				+        '后', '前', '内', '外', '将', '应', '可', '按', '根据', '按照', '依据', '有关',
			
 
				+        '相关', '规定', '要求', '所述', '所示', '其中', '如下', '分别', '不得', '必须',
			
 
				+        '需要', '应当', '可以', '禁止', '允许', '分别', '其他', '以及', '或者', '并且',
			
 
				+        '进行', '予以', '予以', '采用', '使用', '提出', '作出', '超过', '低于', '高于',
			
 
				+        '符合', '满足', '达到', '完成', '形成', '产生', '引起', '导致', '造成',
			
 
				+    }
			
 
				+    
			
 
				+    # 词性到实体类型的映射
			
 
				+    POS_MAPPING = {
			
 
				+        'n': 'noun',           # 名词
			
 
				+        'nr': 'person',        # 人名
			
 
				+        'ns': 'location',      # 地名
			
 
				+        'nt': 'organization',  # 机构名
			
 
				+        'nz': 'term',          # 其他专名
			
 
				+        'vn': 'verb_noun',     # 名动词
			
 
				+        'an': 'adj_noun',      # 名形词
			
 
				+        's': 'space',          # 处所词
			
 
				+        'f': 'direction',      # 方位词
			
 
				+        't': 'time',           # 时间词
			
 
				+    }
			
 
				+    
			
 
				+    def __new__(cls):
			
 
				+        if cls._instance is None:
			
 
				+            cls._instance = super().__new__(cls)
			
 
				+        return cls._instance
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        if not self._initialized and JIEBA_AVAILABLE:
			
 
				+            self._load_models()
			
 
				+            self._initialized = True
			
 
				+    
			
 
				+    def _load_models(self):
			
 
				+        """加载jieba配置"""
			
 
				+        try:
			
 
				+            print("🔄 加载jieba分词器...")
			
 
				+            # 启用paddle模式（如果可用）
			
 
				+            try:
			
 
				+                jieba.enable_paddle()
			
 
				+                print("✅ jieba加载完成（启用Paddle模式）")
			
 
				+            except:
			
 
				+                print("✅ jieba加载完成（基础模式）")
			
 
				+        except Exception as e:
			
 
				+            print(f"⚠️ jieba加载失败: {e}")
			
 
				+    
			
 
				+    @property
			
 
				+    def is_ready(self) -> bool:
			
 
				+        return JIEBA_AVAILABLE
			
 
				+    
			
 
    def extract_entities(self, text: str, topk: int = 30) -> List[Entity]:
        """Extract entities from ``text`` using jieba.

        Four passes share one de-duplication set:

        1. POS-tagged tokens flagged ``nr``/``ns``/``nt``/``nz`` (weight 2.0).
        2. Runs of adjacent noun-like tokens joined into phrases of 4-20
           characters (weight 1.5); a phrase is kept only if it occurs
           verbatim in ``text``.
        3. TF-IDF keywords from ``extract_keywords``.
        4. TextRank keywords from ``extract_textrank`` (``topk // 2`` of them).

        Args:
            text: Plain text to analyse.
            topk: Maximum number of entities to return; also the TF-IDF
                keyword budget.

        Returns:
            At most ``topk`` entities sorted by descending weight. An empty
            list when jieba is unavailable. Entities collected before an
            unexpected failure are still returned.
        """
        if not self.is_ready:
            return []

        entities: List[Entity] = []
        seen: Set[str] = set()

        ner_flags = ('nr', 'ns', 'nt', 'nz')
        phrase_start = ('n', 'vn', 'an')
        phrase_continue = ('n', 'vn', 'an', 'v')

        def record(surface: str, entity_type: str, source: str, weight: float,
                   must_occur: bool = False) -> None:
            """Add one entity unless its surface form was already recorded."""
            if surface in seen:
                return
            seen.add(surface)
            offset = text.find(surface)
            if offset < 0:
                if must_occur:
                    return
                # Keep position and context window consistent when not found.
                offset = 0
            entities.append(Entity(
                text=surface,
                entity_type=entity_type,
                position=offset,
                context=self._get_context(text, offset, 30),
                source=source,
                weight=weight,
            ))

        try:
            # 1. POS tagging. jieba yields ``pair`` objects that can be
            #    unpacked but not indexed, so normalise to plain tuples before
            #    any positional access.
            tagged: List[Tuple[str, str]] = [tuple(item) for item in pseg.cut(text)]
            total = len(tagged)

            # 2. Named entities and noun phrases.
            idx = 0
            while idx < total:
                word, flag = tagged[idx]

                # Skip stop words and single-character tokens.
                if len(word) < 2 or word in self.STOP_WORDS:
                    idx += 1
                    continue

                if flag in ner_flags:
                    record(word, self.POS_MAPPING.get(flag, 'term'), "jieba_ner", 2.0)

                if not flag.startswith(phrase_start):
                    idx += 1
                    continue

                # Greedily absorb following noun/verb-like tokens into a phrase.
                parts = [word]
                nxt = idx + 1
                while nxt < total and tagged[nxt][1].startswith(phrase_continue):
                    follower = tagged[nxt][0]
                    if follower and follower not in self.STOP_WORDS:
                        parts.append(follower)
                    nxt += 1

                if len(parts) >= 2:
                    phrase = ''.join(parts)
                    if 4 <= len(phrase) <= 20:
                        record(phrase, "noun_phrase", "jieba_phrase", 1.5, must_occur=True)

                # Resume after the tokens consumed by the phrase scan.
                idx = nxt

            # 3. TF-IDF keywords.
            for word, weight in self.extract_keywords(text, topk=topk):
                if len(word) >= 2:
                    record(word, "keyword", "jieba_tfidf", weight)

            # 4. TextRank keywords.
            for word, weight in self.extract_textrank(text, topk=topk // 2):
                if len(word) >= 2:
                    record(word, "keyword", "jieba_textrank", weight)

        except Exception as e:
            print(f"⚠️ jieba实体抽取失败: {e}")

        # Highest weight first, then cap the result size.
        entities.sort(key=lambda x: x.weight, reverse=True)
        return entities[:topk]
			
 
				+    
			
 
    def extract_keywords(self, text: str, topk: int = 20) -> List[Tuple[str, float]]:
        """Rank tokens of ``text`` by a simplified TF-IDF score.

        Tokens come from ``jieba.cut``; tokens shorter than two characters,
        stop words and pure digit strings are discarded. Term frequency is the
        token count divided by the number of retained tokens; the IDF part is
        the length-based factor from ``_calculate_idf``.

        Args:
            text: Plain text to analyse.
            topk: Maximum number of ``(word, score)`` pairs to return.

        Returns:
            Pairs sorted by descending score; an empty list when jieba is
            unavailable, nothing survives filtering, or extraction fails.
        """
        if not self.is_ready:
            return []

        try:
            candidates = [
                token for token in jieba.cut(text)
                if len(token) >= 2 and token not in self.STOP_WORDS and not token.isdigit()
            ]
            if not candidates:
                return []

            # Term frequency over the retained tokens only.
            total = len(candidates)
            tf_scores = {word: count / total for word, count in Counter(candidates).items()}

            # Simplified IDF (no corpus statistics involved).
            idf_scores = self._calculate_idf(candidates)

            tfidf_scores = {
                word: tf * idf_scores.get(word, 1.0)
                for word, tf in tf_scores.items()
            }
            ranked = sorted(tfidf_scores.items(), key=lambda item: item[1], reverse=True)
            return ranked[:topk]

        except Exception as e:
            # Best effort: report the failure instead of hiding it, then degrade.
            print(f"⚠️ jieba关键词提取失败: {e}")
            return []
			
 
				+    
			
 
    def extract_textrank(self, text: str, topk: int = 10) -> List[Tuple[str, float]]:
        """Rank tokens of ``text`` with a TextRank iteration over a co-occurrence graph.

        Tokens are filtered like in ``extract_keywords``. Two distinct tokens
        are linked (edge weight = co-occurrence count) when they appear within
        four positions of each other. Ranks start at 1.0 and are updated with
        damping 0.85 for at most 30 rounds, stopping early once the largest
        change in a round drops below 1e-4.

        Args:
            text: Plain text to analyse.
            topk: Maximum number of ``(word, rank)`` pairs to return.

        Returns:
            Pairs sorted by descending rank; an empty list when jieba is
            unavailable, fewer than three tokens survive filtering, or
            extraction fails.
        """
        if not self.is_ready:
            return []

        try:
            tokens = [
                token for token in jieba.cut(text)
                if len(token) >= 2 and token not in self.STOP_WORDS and not token.isdigit()
            ]
            if len(tokens) < 3:
                return []

            # Build the undirected co-occurrence graph. Nodes are inserted in
            # first-occurrence order so ties in the final ranking are resolved
            # the same way on every run.
            window_size = 5
            word_graph: Dict[str, Dict[str, int]] = {word: {} for word in dict.fromkeys(tokens)}
            token_count = len(tokens)
            for i, w1 in enumerate(tokens):
                for j in range(i + 1, min(i + window_size, token_count)):
                    w2 = tokens[j]
                    if w1 != w2:
                        word_graph[w1][w2] = word_graph[w1].get(w2, 0) + 1
                        word_graph[w2][w1] = word_graph[w2].get(w1, 0) + 1

            # Total edge weight per node never changes during the iteration,
            # so compute it once instead of once per edge per round.
            out_weight = {word: sum(edges.values()) for word, edges in word_graph.items()}

            damping = 0.85
            max_iter = 30
            min_diff = 0.0001
            ranks = {word: 1.0 for word in word_graph}

            for _ in range(max_iter):
                new_ranks = {}
                max_diff = 0
                for word, edges in word_graph.items():
                    rank = (1 - damping)
                    for neighbor, weight in edges.items():
                        neighbor_sum = out_weight[neighbor]
                        if neighbor_sum > 0:
                            rank += damping * weight * ranks[neighbor] / neighbor_sum
                    new_ranks[word] = rank
                    max_diff = max(max_diff, abs(rank - ranks[word]))

                ranks = new_ranks
                if max_diff < min_diff:
                    break

            ranked = sorted(ranks.items(), key=lambda item: item[1], reverse=True)
            return ranked[:topk]

        except Exception as e:
            # Best effort: report the failure instead of hiding it, then degrade.
            print(f"⚠️ jieba TextRank提取失败: {e}")
            return []
			
 
				+    
			
 
				+    def _calculate_idf(self, words: List[str]) -> Dict[str, float]:
			
 
				+        """简化IDF计算"""
			
 
				+        idf_scores = {}
			
 
				+        for word in set(words):
			
 
				+            base_idf = 1.0
			
 
				+            # 长度奖励（2-6字最佳）
			
 
				+            if 2 <= len(word) <= 6:
			
 
				+                base_idf *= 1.2
			
 
				+            elif len(word) > 10:
			
 
				+                base_idf *= 0.8
			
 
				+            idf_scores[word] = base_idf
			
 
				+        return idf_scores
			
 
				+    
			
 
				+    def _get_context(self, text: str, position: int, window: int = 30) -> str:
			
 
				+        """获取上下文"""
			
 
				+        start = max(0, position - window)
			
 
				+        end = min(len(text), position + window)
			
 
				+        return text[start:end]
			
 
				+
			
 
				+
			
 
# ==================== Rule-based extraction (generic patterns) ====================

# Standard title: 2-100 characters enclosed in 《》 (non-greedy, captured without the brackets).
STANDARD_NAME_PATTERN = r'《([^》]{2,100}?)》'

# Standard number patterns, one regex per code prefix (GB, JTG, JTJ, ...).
# Each allows optional whitespace after the prefix and an optional
# '.', '/' or '-' separated numeric suffix.
STANDARD_NUMBER_PATTERNS = [
    r'GB\s*/?T?\s*\d+[\./\-]?\d*(?:[-–—]\d{4})?',
    r'JTG\s*[TD]?\s*\d+[\./\-]?\d*[a-zA-Z]?',
    r'JTJ\s*\d+[\./\-]?\d*',
    r'JGJ\s*\d+[\./\-]?\d*',
    r'CJJ\s*\d+[\./\-]?\d*',
    r'TB\s*\d+[\./\-]?\d*',
    r'SL\s*\d+[\./\-]?\d*',
    r'DL\s*/?T?\s*\d+[\./\-]?\d*',
    r'NB\s*/?T?\s*\d+[\./\-]?\d*',
    r'HG\s*/?T?\s*\d+[\./\-]?\d*',
    r'CECS\s*\d+[:：]?\d*',
    r'T/[A-Z]+\s*\d+[\./\-]?\d*',
    r'DB\d{2,3}[/\-]T?\s*\d+[\./\-]?\d*',
    r'Q/[A-Z]+\s*\d+[\./\-]?\d*',
    r'建标\s*\d+[\./\-]?\d*',
]

# Clause reference patterns: "第…条" with Chinese or Arabic numerals
# (including dotted numbers), plus a parenthesised number such as (1) / （1）.
CLAUSE_PATTERNS = [
    r'第\s*[一二三四五六七八九十百千]+\s*条',
    r'第\s*\d+\s*条',
    r'第\s*\d+\.\d+\s*条',
    r'第\s*\d+\.\d+\.\d+\s*条',
    r'[\(（]\s*\d+\s*[\)）]',
]

# Date patterns: "YYYY年M月D日" and "YYYY-MM-DD"; year, month and day are captured.
DATE_PATTERNS = [
    r'(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日',
    r'(\d{4})-(\d{2})-(\d{2})',
]

# Managing-organisation patterns; group 1 is the organisation name.
# NOTE: extract_background_info picks the organisation category by looking
# for the keywords 主编 / 参编 / 解释 / 归口 / 批准 inside the pattern source,
# so each pattern must keep one of those keywords.
ORG_PATTERNS = [
    r'(?:主编单位|主编部门)[：:]\s*([^，。\n]{2,50})',
    r'(?:参编单位|参编部门)[：:]\s*([^，。\n]{2,50})',
    r'(?:解释单位|技术归口)[：:]\s*([^，。\n]{2,50})',
    r'由\s*([^，。]{2,30})\s*负责解释',
    r'批准部门[：:]\s*([^，。\n]{2,50})',
]

# Abolish / replace relations as (pattern, relation label) pairs.
# The third pattern has no capture group; extract_relationships falls back
# to a placeholder target in that case.
ABOLISH_PATTERNS = [
    (r'代替\s*《([^》]+)》', '代替标准'),
    (r'被\s*《([^》]+)》\s*代替', '被标准代替'),
    (r'(?:自\s*[\d年月日\-]+\s*起)?\s*废止', '已废止'),
    (r'原\s*([^\s]+)\s*同时废止', '原标准废止'),
]

# Applicability-scope patterns; group 1 is the 3-30 character subject that
# follows "适用(于)".
SCOPE_PATTERNS = [
    r'适用(?:于)?\s*([^。，]{3,30}?)(?:的)?\s*(?:设计|施工|验收|检测|勘察|监理)',
    r'适用(?:于)?\s*([^。，]{3,30}?)\s*工程',
    r'适用(?:于)?\s*([^。，]{3,30}?)(?:建设|管理)',
]
			
 
				+
			
 
				+
			
 
				+def extract_entities_rule_based(text: str) -> List[Entity]:
			
 
				+    """基于规则的实体抽取（通用模式）"""
			
 
				+    entities = []
			
 
				+    seen: Set[str] = set()
			
 
				+    
			
 
				+    def add_entity(text_content: str, e_type: str, pos: int, weight: float = 1.0):
			
 
				+        key = f"{e_type}:{text_content.lower()}"
			
 
				+        if key not in seen and len(text_content) >= 2:
			
 
				+            seen.add(key)
			
 
				+            entities.append(Entity(
			
 
				+                text=text_content,
			
 
				+                entity_type=e_type,
			
 
				+                position=pos,
			
 
				+                context=get_context(text, pos, 40),
			
 
				+                source="rule",
			
 
				+                weight=weight
			
 
				+            ))
			
 
				+    
			
 
				+    # 1. 标准名称（最高权重）
			
 
				+    for match in re.finditer(STANDARD_NAME_PATTERN, text):
			
 
				+        name = match.group(1).strip()
			
 
				+        if 2 <= len(name) <= 100:
			
 
				+            add_entity(f"《{name}》", "standard_name", match.start(), weight=3.0)
			
 
				+    
			
 
				+    # 2. 标准编号
			
 
				+    for pattern in STANDARD_NUMBER_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text, re.IGNORECASE):
			
 
				+            number = re.sub(r'\s+', '', match.group(0)).upper()
			
 
				+            if len(number) >= 3:
			
 
				+                add_entity(number, "standard_number", match.start(), weight=2.5)
			
 
				+    
			
 
				+    # 3. 条款引用
			
 
				+    for pattern in CLAUSE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            clause = re.sub(r'\s+', '', match.group(0))
			
 
				+            if clause and len(clause) < 50:
			
 
				+                add_entity(clause, "clause", match.start(), weight=1.5)
			
 
				+    
			
 
				+    # 4. 日期信息
			
 
				+    for pattern in DATE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            date_str = match.group(0)
			
 
				+            add_entity(date_str, "date", match.start(), weight=1.2)
			
 
				+    
			
 
				+    return entities
			
 
				+
			
 
				+
			
 
				+def extract_relationships(text: str) -> List[Relationship]:
			
 
				+    """抽取关系"""
			
 
				+    relations = []
			
 
				+    
			
 
				+    # 替代/废止关系
			
 
				+    for pattern, rel_type in ABOLISH_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            target = match.group(1) if match.groups() else "未知"
			
 
				+            relations.append(Relationship("本标准", rel_type, target, match.group(0), 0.95))
			
 
				+    
			
 
				+    return relations
			
 
				+
			
 
				+
			
 
				+def extract_background_info(text: str) -> BackgroundInfo:
			
 
				+    """抽取背景信息（改进版）"""
			
 
				+    bg = BackgroundInfo()
			
 
				+    
			
 
				+    # 1. 管理单位
			
 
				+    bg.manage_orgs = {"主编单位": [], "参编单位": [], "解释单位": [], "批准部门": []}
			
 
				+    for pattern in ORG_PATTERNS:
			
 
				+        matches = re.findall(pattern, text)
			
 
				+        for m in matches:
			
 
				+            org_name = m[0] if isinstance(m, tuple) else m
			
 
				+            if org_name:
			
 
				+                # 判断类型
			
 
				+                if "主编" in pattern:
			
 
				+                    bg.manage_orgs["主编单位"].append(org_name.strip())
			
 
				+                elif "参编" in pattern:
			
 
				+                    bg.manage_orgs["参编单位"].append(org_name.strip())
			
 
				+                elif "解释" in pattern or "归口" in pattern:
			
 
				+                    bg.manage_orgs["解释单位"].append(org_name.strip())
			
 
				+                elif "批准" in pattern:
			
 
				+                    bg.manage_orgs["批准部门"].append(org_name.strip())
			
 
				+    
			
 
				+    # 2. 废止状态
			
 
				+    bg.abolish_status = []
			
 
				+    for pattern, _ in ABOLISH_PATTERNS:
			
 
				+        matches = re.findall(pattern, text)
			
 
				+        for m in matches:
			
 
				+            status = m if isinstance(m, str) else (m[0] if m else "")
			
 
				+            if status:
			
 
				+                bg.abolish_status.append(status)
			
 
				+    
			
 
				+    # 3. 适用范围
			
 
				+    bg.scope = {"适用范围": []}
			
 
				+    for pattern in SCOPE_PATTERNS:
			
 
				+        matches = re.findall(pattern, text)
			
 
				+        for m in matches:
			
 
				+            scope_text = m[0] if isinstance(m, tuple) else m
			
 
				+            if scope_text and len(scope_text) < 50:
			
 
				+                bg.scope["适用范围"].append(scope_text.strip())
			
 
				+    
			
 
				+    # 4. 发布/实施日期
			
 
				+    dates = []
			
 
				+    for pattern in DATE_PATTERNS:
			
 
				+        for match in re.finditer(pattern, text):
			
 
				+            dates.append((match.group(0), match.start()))
			
 
				+    
			
 
				+    # 根据位置推断发布日期和实施日期（通常发布日期在前）
			
 
				+    if dates:
			
 
				+        dates.sort(key=lambda x: x[1])
			
 
				+        if len(dates) >= 2:
			
 
				+            bg.publish_date = dates[0][0]
			
 
				+            bg.effective_date = dates[1][0]
			
 
				+        else:
			
 
				+            bg.publish_date = dates[0][0]
			
 
				+    
			
 
				+    # 5. 关系
			
 
				+    bg.relations = extract_relationships(text)
			
 
				+    
			
 
				+    # 6. 文档类型
			
 
				+    doc_types = re.findall(r'(国家标准|行业标准|地方标准|团体标准|企业标准|规范|规程|指南|办法)', text)
			
 
				+    if doc_types:
			
 
				+        bg.doc_type = doc_types[0]
			
 
				+    
			
 
				+    return bg
			
 
				+
			
 
				+
			
 
				+# ==================== 文档处理工具函数 ====================
			
 
				+
			
 
				+def get_context(text: str, position: int, window: int = 40) -> str:
			
 
				+    """获取实体上下文"""
			
 
				+    start = max(0, position - window)
			
 
				+    end = min(len(text), position + window)
			
 
				+    context = text[start:end]
			
 
				+    context = re.sub(r'\s+', ' ', context).strip()
			
 
				+    return context[:200]
			
 
				+
			
 
				+
			
 
				+def clean_markdown_content(text: str) -> str:
			
 
				+    """清理Markdown内容为纯文本"""
			
 
				+    if not text:
			
 
				+        return ""
			
 
				+    
			
 
				+    # 1. 移除代码块
			
 
				+    text = re.sub(r'```[\s\S]*?```', '', text)
			
 
				+    text = re.sub(r'~~~[\s\S]*?~~~', '', text)
			
 
				+    
			
 
				+    # 2. 移除行内代码
			
 
				+    text = re.sub(r'`[^`]*`', '', text)
			
 
				+    
			
 
				+    # 3. 处理链接：保留文本
			
 
				+    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
			
 
				+    text = re.sub(r'<[^>]+>', '', text)
			
 
				+    
			
 
				+    # 4. 移除图片
			
 
				+    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
			
 
				+    
			
 
				+    # 5. 移除HTML标签
			
 
				+    text = re.sub(r'<[^>]+>', '', text)
			
 
				+    
			
 
				+    # 6. 移除标题符号（保留标题文本）
			
 
				+    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
			
 
				+    
			
 
				+    # 7. 移除强调符号
			
 
				+    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
			
 
				+    text = re.sub(r'__([^_]+)__', r'\1', text)
			
 
				+    text = re.sub(r'\*([^*]+)\*', r'\1', text)
			
 
				+    text = re.sub(r'_([^_]+)_', r'\1', text)
			
 
				+    text = re.sub(r'~~([^~]+)~~', r'\1', text)
			
 
				+    
			
 
				+    # 8. 清理表格和列表
			
 
				+    text = re.sub(r'\|?[\s\-:]+\|', '', text)
			
 
				+    text = re.sub(r'\|', ' ', text)
			
 
				+    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
			
 
				+    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
			
 
				+    
			
 
				+    # 9. 移除引用符号
			
 
				+    text = re.sub(r'^[\s]*>\s*', '', text, flags=re.MULTILINE)
			
 
				+    
			
 
				+    # 10. 清理多余空白
			
 
				+    text = re.sub(r'\n{3,}', '\n\n', text)
			
 
				+    text = re.sub(r'[ \t]+', ' ', text)
			
 
				+    
			
 
				+    return text.strip()
			
 
				+
			
 
				+
			
 
				+def parse_heading(line: str) -> Tuple[int, str]:
			
 
				+    """
			
 
				+    解析标题行，返回(层级, 标题文本)
			
 
				+    层级：0=不是标题, 1=#, 2=##, ...
			
 
				+    """
			
 
				+    match = re.match(r'^(#{1,6})\s+(.+)$', line.strip())
			
 
				+    if match:
			
 
				+        level = len(match.group(1))
			
 
				+        title = match.group(2).strip()
			
 
				+        return level, title
			
 
				+    return 0, ""
			
 
				+
			
 
				+
			
 
				+def split_document_hierarchy(md_content: str) -> List[DocumentChunk]:
			
 
				+    """
			
 
				+    将文档按层级结构拆分为块
			
 
				+    返回层级路径如 "1.总则 > 1.1术语定义"
			
 
				+    """
			
 
				+    chunks = []
			
 
				+    lines = md_content.split('\n')
			
 
				+    
			
 
				+    # 当前层级路径栈
			
 
				+    hierarchy_stack: List[Tuple[int, str]] = []  # [(level, title), ...]
			
 
				+    current_content_lines = []
			
 
				+    current_start_pos = 0
			
 
				+    
			
 
				+    def build_hierarchy_path() -> str:
			
 
				+        """构建层级路径字符串"""
			
 
				+        if not hierarchy_stack:
			
 
				+            return "前言/总则"
			
 
				+        return " > ".join([title for _, title in hierarchy_stack])
			
 
				+    
			
 
				+    def flush_current_chunk(end_pos: int):
			
 
				+        """将当前内容保存为chunk"""
			
 
				+        nonlocal current_content_lines
			
 
				+        if current_content_lines:
			
 
				+            content = '\n'.join(current_content_lines)
			
 
				+            content = clean_markdown_content(content)
			
 
				+            if len(content) >= 10:
			
 
				+                level = hierarchy_stack[-1][0] if hierarchy_stack else 0
			
 
				+                title = hierarchy_stack[-1][1] if hierarchy_stack else "前言/总则"
			
 
				+                chunks.append(DocumentChunk(
			
 
				+                    content=content,
			
 
				+                    level=level,
			
 
				+                    title=title,
			
 
				+                    hierarchy_path=build_hierarchy_path(),
			
 
				+                    position=current_start_pos
			
 
				+                ))
			
 
				+        current_content_lines = []
			
 
				+    
			
 
				+    i = 0
			
 
				+    while i < len(lines):
			
 
				+        line = lines[i]
			
 
				+        level, title = parse_heading(line)
			
 
				+        
			
 
				+        if level > 0:
			
 
				+            # 遇到新标题，先保存当前内容
			
 
				+            flush_current_chunk(i)
			
 
				+            current_start_pos = i
			
 
				+            
			
 
				+            # 更新层级栈
			
 
				+            # 弹出同层级或更深层的标题
			
 
				+            while hierarchy_stack and hierarchy_stack[-1][0] >= level:
			
 
				+                hierarchy_stack.pop()
			
 
				+            hierarchy_stack.append((level, title))
			
 
				+        else:
			
 
				+            # 普通内容行
			
 
				+            if line.strip():
			
 
				+                current_content_lines.append(line)
			
 
				+        
			
 
				+        i += 1
			
 
				+    
			
 
				+    # 处理最后一块内容
			
 
				+    flush_current_chunk(len(lines))
			
 
				+    
			
 
				+    # 如果没有分块，整个文档作为一个块
			
 
				+    if not chunks and md_content.strip():
			
 
				+        cleaned = clean_markdown_content(md_content)
			
 
				+        if cleaned.strip():
			
 
				+            chunks.append(DocumentChunk(
			
 
				+                content=cleaned,
			
 
				+                level=0,
			
 
				+                title="全文",
			
 
				+                hierarchy_path="全文",
			
 
				+                position=0
			
 
				+            ))
			
 
				+    
			
 
				+    return chunks
			
 
				+
			
 
				+
			
 
				+def extract_title_from_filename(file_name: str) -> str:
			
 
				+    """从文件名提取文档标题"""
			
 
				+    title = re.sub(r'^\d+[_\-]?', '', file_name)  # 移除开头数字
			
 
				+    title = re.sub(r'\.md$', '', title, re.IGNORECASE)
			
 
				+    title = re.sub(r'[_\-]', ' ', title)
			
 
				+    return title.strip()
			
 
				+
			
 
				+
			
 
				+def build_backgrounds(bg_info: BackgroundInfo, hierarchy_path: str, file_name: str) -> List[str]:
			
 
				+    """构建backgrounds列表（改进版）"""
			
 
				+    backgrounds = []
			
 
				+    
			
 
				+    # 1. 文档类型
			
 
				+    if bg_info.doc_type:
			
 
				+        backgrounds.append(f"文档类型：{bg_info.doc_type}")
			
 
				+    
			
 
				+    # 2. 层级路径
			
 
				+    if hierarchy_path and hierarchy_path != "前言/总则":
			
 
				+        backgrounds.append(f"章节位置：{hierarchy_path[:80]}")
			
 
				+    
			
 
				+    # 3. 发布/实施日期
			
 
				+    if bg_info.publish_date:
			
 
				+        backgrounds.append(f"发布日期：{bg_info.publish_date}")
			
 
				+    if bg_info.effective_date:
			
 
				+        backgrounds.append(f"实施日期：{bg_info.effective_date}")
			
 
				+    
			
 
				+    # 4. 管理单位（只取第一个）
			
 
				+    for org_type, orgs in bg_info.manage_orgs.items():
			
 
				+        if orgs:
			
 
				+            unique_orgs = list(dict.fromkeys(orgs))[:1]  # 只取第一个
			
 
				+            backgrounds.append(f"{org_type}：{', '.join(unique_orgs)[:50]}")
			
 
				+            break  # 只添加一种管理单位
			
 
				+    
			
 
				+    # 5. 废止状态
			
 
				+    if bg_info.abolish_status:
			
 
				+        for status in bg_info.abolish_status[:1]:
			
 
				+            if isinstance(status, str):
			
 
				+                backgrounds.append(f"废止状态：{status[:50]}")
			
 
				+    
			
 
				+    # 6. 适用范围
			
 
				+    if bg_info.scope.get("适用范围"):
			
 
				+        scopes = list(dict.fromkeys(bg_info.scope["适用范围"]))[:1]
			
 
				+        backgrounds.append(f"适用范围：{', '.join(scopes)[:50]}")
			
 
				+    
			
 
				+    # 7. 兜底
			
 
				+    if not backgrounds:
			
 
				+        backgrounds.append(f"来源文档：{file_name}")
			
 
				+    
			
 
				+    return backgrounds[:5]  # 最多5条
			
 
				+
			
 
				+
			
 
				+# ==================== 实体抽取主函数 ====================
			
 
				+
			
 
				+def merge_entities(entity_lists: List[List[Entity]]) -> List[Entity]:
			
 
				+    """合并多个来源的实体，去重并加权"""
			
 
				+    seen: Dict[str, Entity] = {}
			
 
				+    
			
 
				+    for entities in entity_lists:
			
 
				+        for ent in entities:
			
 
				+            # 使用小写文本+类型作为去重key
			
 
				+            key = f"{ent.text.lower()}:{ent.entity_type}"
			
 
				+            if key in seen:
			
 
				+                # 合并权重和置信度
			
 
				+                existing = seen[key]
			
 
				+                existing.weight = max(existing.weight, ent.weight)
			
 
				+                existing.confidence = max(existing.confidence, ent.confidence)
			
 
				+                existing.source = f"{existing.source}+{ent.source}"
			
 
				+            else:
			
 
				+                seen[key] = ent
			
 
				+    
			
 
				+    # 转换为列表并排序
			
 
				+    merged = list(seen.values())
			
 
				+    merged.sort(key=lambda x: (x.weight * x.confidence), reverse=True)
			
 
				+    return merged
			
 
				+
			
 
				+
			
 
				+def extract_all_entities(text: str, hierarchy_path: str = "", use_llm: bool = False) -> List[Entity]:
			
 
				+    """
			
 
				+    综合抽取实体（多策略融合）
			
 
				+    
			
 
				+    策略优先级：
			
 
				+    1. 规则抽取（标准名称、编号等）- 高置信度
			
 
				+    2. jieba NLP抽取（TF-IDF/TextRank）- 中置信度
			
 
				+    3. LLM抽取（可选）- 高置信度但成本高
			
 
				+    """
			
 
				+    all_entities = []
			
 
				+    
			
 
				+    # 1. 规则抽取
			
 
				+    rule_entities = extract_entities_rule_based(text)
			
 
				+    all_entities.append(rule_entities)
			
 
				+    
			
 
				+    # 2. jieba NLP抽取
			
 
				+    jieba_ext = get_jieba_extractor()
			
 
				+    if jieba_ext and jieba_ext.is_ready:
			
 
				+        jieba_entities = jieba_ext.extract_entities(text, topk=20)
			
 
				+        all_entities.append(jieba_entities)
			
 
				+    
			
 
				+    # 3. LLM抽取（可选，受配置控制）
			
 
				+    if use_llm and LLM_CONFIG.get("enabled"):
			
 
				+        llm_ext = get_llm_extractor()
			
 
				+        if llm_ext.is_available():
			
 
				+            llm_entities = llm_ext.extract_entities(text, context=hierarchy_path)
			
 
				+            all_entities.append(llm_entities)
			
 
				+    
			
 
				+    # 合并去重
			
 
				+    return merge_entities(all_entities)
			
 
				+
			
 
				+
			
 
# ==================== Global instances ====================

# Module-level singletons, created on first use by get_jieba_extractor()
# and get_llm_extractor() respectively.
_jieba_extractor: Optional[JiebaExtractor] = None
_llm_extractor: Optional[LLMEntityExtractor] = None
			
 
				+
			
 
				+
			
 
				+def get_jieba_extractor() -> Optional[JiebaExtractor]:
			
 
				+    """获取jieba抽取器"""
			
 
				+    global _jieba_extractor
			
 
				+    if _jieba_extractor is None and JIEBA_AVAILABLE:
			
 
				+        _jieba_extractor = JiebaExtractor()
			
 
				+    return _jieba_extractor
			
 
				+
			
 
				+
			
 
				+def get_llm_extractor() -> Optional[LLMEntityExtractor]:
			
 
				+    """获取LLM抽取器"""
			
 
				+    global _llm_extractor
			
 
				+    if _llm_extractor is None:
			
 
				+        _llm_extractor = LLMEntityExtractor()
			
 
				+    return _llm_extractor
			
 
				+
			
 
				+
			
 
				+# ==================== 导入主逻辑 ====================
			
 
				+
			
 
				+def import_single_file(md_path: Path, embeddings, use_llm: bool = False) -> List[Dict[str, Any]]:
			
 
				+    """导入单个MD文件"""
			
 
				+    file_name = md_path.name
			
 
				+    doc_title = extract_title_from_filename(file_name)
			
 
				+    
			
 
				+    try:
			
 
				+        with open(md_path, "r", encoding="utf-8") as f:
			
 
				+            md_content = f.read()
			
 
				+    except Exception as e:
			
 
				+        print(f"   读取失败: {e}")
			
 
				+        return []
			
 
				+    
			
 
				+    if not md_content.strip():
			
 
				+        return []
			
 
				+    
			
 
				+    # 抽取文档级背景信息
			
 
				+    doc_bg_info = extract_background_info(md_content)
			
 
				+    
			
 
				+    # 使用层级结构拆分文档
			
 
				+    chunks = split_document_hierarchy(md_content)
			
 
				+    
			
 
				+    all_rows = []
			
 
				+    # 用于同一文件内实体去重
			
 
				+    file_entity_seen: Set[Tuple[str, str]] = set()
			
 
				+    
			
 
				+    for chunk in chunks:
			
 
				+        # 跳过太短的段落
			
 
				+        if len(chunk.content) < 20:
			
 
				+            continue
			
 
				+        
			
 
				+        # 抽取实体
			
 
				+        entities = extract_all_entities(chunk.content, chunk.hierarchy_path, use_llm)
			
 
				+        
			
 
				+        # 构建背景信息（使用层级路径）
			
 
				+        chunk_bg_info = extract_background_info(chunk.content)
			
 
				+        # 合并文档级和段落级背景信息
			
 
				+        merged_bg = BackgroundInfo()
			
 
				+        merged_bg.doc_type = doc_bg_info.doc_type or chunk_bg_info.doc_type
			
 
				+        merged_bg.publish_date = doc_bg_info.publish_date or chunk_bg_info.publish_date
			
 
				+        merged_bg.effective_date = doc_bg_info.effective_date or chunk_bg_info.effective_date
			
 
				+        merged_bg.manage_orgs = doc_bg_info.manage_orgs if doc_bg_info.manage_orgs else chunk_bg_info.manage_orgs
			
 
				+        merged_bg.abolish_status = doc_bg_info.abolish_status if doc_bg_info.abolish_status else chunk_bg_info.abolish_status
			
 
				+        merged_bg.scope = doc_bg_info.scope if doc_bg_info.scope else chunk_bg_info.scope
			
 
				+        
			
 
				+        final_backgrounds = build_backgrounds(merged_bg, chunk.hierarchy_path, file_name)
			
 
				+        
			
 
				+        if entities:
			
 
				+            for entity in entities:
			
 
				+                entity_text = entity.text.strip()
			
 
				+                entity_type = entity.entity_type
			
 
				+                
			
 
				+                # 同一文件内实体去重
			
 
				+                dedup_key = (entity_text.lower(), entity_type)
			
 
				+                if dedup_key in file_entity_seen:
			
 
				+                    continue
			
 
				+                file_entity_seen.add(dedup_key)
			
 
				+                
			
 
				+                # 生成向量
			
 
				+                try:
			
 
				+                    vector = embeddings.embed_query(entity_text)
			
 
				+                except Exception as e:
			
 
				+                    print(f"   向量生成失败: {e}")
			
 
				+                    continue
			
 
				+                
			
 
				+                # 构造metadata - 使用层级路径作为title
			
 
				+                metadata = {
			
 
				+                    "uuid": str(uuid.uuid4()),
			
 
				+                    "file": file_name,
			
 
				+                    "title": chunk.hierarchy_path,  # 层级路径
			
 
				+                    "section_title": chunk.title,   # 当前章节标题
			
 
				+                    "backgrounds": final_backgrounds,
			
 
				+                    "entity_type": entity_type,
			
 
				+                    "source": entity.source,
			
 
				+                }
			
 
				+                
			
 
				+                all_rows.append({
			
 
				+                    "text": entity_text,
			
 
				+                    "dense": vector,
			
 
				+                    "content": entity_text,
			
 
				+                    "metadata": json.dumps(metadata, ensure_ascii=False),
			
 
				+                })
			
 
				+        else:
			
 
				+            # 无实体时，用层级路径作为实体
			
 
				+            hierarchy_clean = chunk.hierarchy_path.strip()
			
 
				+            dedup_key = (hierarchy_clean.lower(), "section")
			
 
				+            if dedup_key not in file_entity_seen:
			
 
				+                file_entity_seen.add(dedup_key)
			
 
				+                
			
 
				+                try:
			
 
				+                    vector = embeddings.embed_query(hierarchy_clean)
			
 
				+                except Exception:
			
 
				+                    continue
			
 
				+                
			
 
				+                metadata = {
			
 
				+                    "uuid": str(uuid.uuid4()),
			
 
				+                    "file": file_name,
			
 
				+                    "title": chunk.hierarchy_path,
			
 
				+                    "section_title": chunk.title,
			
 
				+                    "backgrounds": final_backgrounds,
			
 
				+                    "entity_type": "section",
			
 
				+                    "source": "hierarchy",
			
 
				+                }
			
 
				+                
			
 
				+                all_rows.append({
			
 
				+                    "text": hierarchy_clean[:200],
			
 
				+                    "dense": vector,
			
 
				+                    "content": hierarchy_clean[:200],
			
 
				+                    "metadata": json.dumps(metadata, ensure_ascii=False),
			
 
				+                })
			
 
				+    
			
 
				+    return all_rows
			
 
				+
			
 
				+
			
 
				+def batch_insert(client, rows: List[Dict[str, Any]]) -> Tuple[int, List[Dict[str, Any]]]:
			
 
				+    """批量插入数据"""
			
 
				+    if not rows:
			
 
				+        return 0, []
			
 
				+    
			
 
				+    inserted = 0
			
 
				+    failed_rows = []
			
 
				+    
			
 
				+    for i in range(0, len(rows), BATCH_SIZE):
			
 
				+        batch = rows[i:i + BATCH_SIZE]
			
 
				+        try:
			
 
				+            client.insert(collection_name=COLLECTION_NAME, data=batch)
			
 
				+            inserted += len(batch)
			
 
				+        except Exception as e:
			
 
				+            print(f"   插入失败: {e}")
			
 
				+            failed_rows.extend(batch)
			
 
				+    
			
 
				+    return inserted, failed_rows
			
 
				+
			
 
				+
			
 
				+def import_from_folder(root_folder: str, use_llm: bool = False):
			
 
				+    """从文件夹批量导入"""
			
 
				+    root = Path(root_folder)
			
 
				+    if not root.exists():
			
 
				+        print(f"文件夹不存在: {root}")
			
 
				+        return
			
 
				+    
			
 
				+    print(f"扫描文件夹: {root}（不递归子目录）")
			
 
				+    
			
 
				+    # 只扫描当前目录下的.md文件
			
 
				+    md_files = [f for f in root.glob("*.md") if f.is_file()]
			
 
				+    print(f"发现 {len(md_files)} 个MD文件")
			
 
				+    
			
 
				+    if not md_files:
			
 
				+        return
			
 
				+    
			
 
				+    # 初始化抽取器
			
 
				+    jieba_ext = get_jieba_extractor()
			
 
				+    if jieba_ext and jieba_ext.is_ready:
			
 
				+        print("✅ jieba已启用")
			
 
				+    else:
			
 
				+        print("⚠️ jieba未启用，使用纯规则抽取")
			
 
				+    
			
 
				+    if use_llm and LLM_CONFIG.get("enabled"):
			
 
				+        llm_ext = get_llm_extractor()
			
 
				+        if llm_ext.is_available():
			
 
				+            print("✅ LLM增强已启用")
			
 
				+        else:
			
 
				+            print("⚠️ LLM不可用，请检查配置和openai包")
			
 
				+            use_llm = False
			
 
				+    
			
 
				+    # 初始化Milvus
			
 
				+    client = get_milvus_client()
			
 
				+    embeddings = get_embeddings()
			
 
				+    
			
 
				+    if not client.has_collection(collection_name=COLLECTION_NAME):
			
 
				+        print(f"Collection不存在: {COLLECTION_NAME}")
			
 
				+        print(f"运行: uv run -m src.app.scripts.first_bfp_collection_entity_create")
			
 
				+        return
			
 
				+    
			
 
				+    client.load_collection(collection_name=COLLECTION_NAME)
			
 
				+    
			
 
				+    # 统计
			
 
				+    total_entities = 0
			
 
				+    total_inserted = 0
			
 
				+    entity_type_stats: Dict[str, int] = {}
			
 
				+    failed_files = []
			
 
				+    
			
 
				+    for idx, md_path in enumerate(md_files, 1):
			
 
				+        print(f"\n[{idx}/{len(md_files)}] 处理: {md_path.name}")
			
 
				+        
			
 
				+        try:
			
 
				+            rows = import_single_file(md_path, embeddings, use_llm)
			
 
				+            
			
 
				+            if rows:
			
 
				+                # 统计
			
 
				+                for row in rows:
			
 
				+                    metadata = json.loads(row.get("metadata", "{}"))
			
 
				+                    etype = metadata.get("entity_type", "unknown")
			
 
				+                    entity_type_stats[etype] = entity_type_stats.get(etype, 0) + 1
			
 
				+                
			
 
				+                print(f"   抽取 {len(rows)} 个实体")
			
 
				+                
			
 
				+                # 插入
			
 
				+                inserted, failed = batch_insert(client, rows)
			
 
				+                total_inserted += inserted
			
 
				+                if failed:
			
 
				+                    print(f"   {len(failed)} 条插入失败")
			
 
				+                else:
			
 
				+                    print(f"   插入 {inserted} 条")
			
 
				+            else:
			
 
				+                print(f"   无有效实体")
			
 
				+            
			
 
				+            total_entities += len(rows)
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"   处理失败: {e}")
			
 
				+            import traceback
			
 
				+            traceback.print_exc()
			
 
				+            failed_files.append(md_path.name)
			
 
				+    
			
 
				+    # 汇总
			
 
				+    print("\n" + "=" * 70)
			
 
				+    print("导入完成")
			
 
				+    print("=" * 70)
			
 
				+    print(f"处理文件: {len(md_files)}")
			
 
				+    print(f"抽取实体: {total_entities}")
			
 
				+    print(f"成功插入: {total_inserted}")
			
 
				+    if failed_files:
			
 
				+        print(f"失败文件: {len(failed_files)}")
			
 
				+    print("\n实体类型分布:")
			
 
				+    for etype, count in sorted(entity_type_stats.items(), key=lambda x: -x[1]):
			
 
				+        print(f"   - {etype}: {count}")
			
 
				+    print("=" * 70)
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """主函数"""
			
 
				+    print("=" * 70)
			
 
				+    print("编制依据实体抽取与导入（V2 优化版）")
			
 
				+    print("=" * 70)
			
 
				+    print("主要改进:")
			
 
				+    print("  1. 层级路径标题: '1.总则 > 1.1术语定义'")
			
 
				+    print("  2. 移除硬编码工程术语模式")
			
 
				+    print("  3. 改进background信息抽取")
			
 
				+    print("  4. 可选LLM增强（需配置）")
			
 
				+    print("=" * 70)
			
 
				+    
			
 
				+    # 检查是否需要启用LLM
			
 
				+    use_llm = LLM_CONFIG.get("enabled", False)
			
 
				+    
			
 
				+    try:
			
 
				+        import_from_folder(ROOT_FOLDER, use_llm=use_llm)
			
 
				+    except Exception as e:
			
 
				+        print(f"\n导入失败: {e}")
			
 
				+        import traceback
			
 
				+        traceback.print_exc()
			
 
				+
			
 
				+
			
 
# Script entry point: run the import only when this file is executed directly
# (e.g. via `uv run -m ...`), not when it is imported as a module.
if __name__ == "__main__":
    main()