Procházet zdrojové kódy

feat: first_bfp_collection_entity 导入

ai02 před 4 týdny
rodič
revize
7237fd9378

+ 1198 - 0
src/app/scripts/first_bfp_collection_entity_import.py

@@ -0,0 +1,1198 @@
+"""
+编制依据实体抽取与导入脚本(jieba 版)
+
+功能:
+1. 实体抽取:使用 jieba 分词 + 词性标注 + TF-IDF 关键词提取 + 规则补充
+2. 关系抽取:基于规则模式匹配
+3. 背景信息:废止状态、管理单位、适用范围
+
+字段结构:
+- text: 实体文本(用于 BM25 检索)
+- dense: 实体向量
+- content: 与 text 内容相同
+- metadata: JSON 字符串 {uuid, file, title, backgrounds}
+  - backgrounds 不能为空
+
+依赖:
+    uv add jieba
+
+用法:
+    uv run -m src.app.scripts.first_bfp_collection_entity_import
+"""
+from __future__ import annotations
+
+import json
+import re
+import uuid
+import warnings
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Set
+from dataclasses import dataclass, asdict
+from collections import Counter
+import math
+
+from app.config.embeddings import get_embeddings
+from app.config.milvus_client import get_milvusclient
+
+# Collection name
+COLLECTION_NAME = "first_bfp_collection_entity"
+
+# Source folder path (hard-coded absolute Windows path)
+ROOT_FOLDER = r"F:\第二阶段编制依据及施工方案数据治理-20260206\133"
+
+# Batch insert size
+BATCH_SIZE = 100
+
+# Optional jieba dependency: JIEBA_AVAILABLE records whether the import succeeded
+try:
+    import jieba
+    import jieba.posseg as pseg
+    JIEBA_AVAILABLE = True
+except ImportError:
+    JIEBA_AVAILABLE = False
+    warnings.warn("jieba not installed. Using rule-based extraction only. Run: uv add jieba")
+
+
+@dataclass
+class Entity:
+    """Entity record produced by the jieba and rule-based extractors."""
+    text: str
+    entity_type: str
+    position: int
+    context: str = ""  # text surrounding the entity occurrence
+    source: str = "rule"  # origin: jieba / rule / combined
+    weight: float = 1.0  # weight (TF-IDF score)
+
+
+@dataclass
+class Relationship:
+    """Relationship record: (source, relation_type, target) plus supporting text."""
+    source: str
+    relation_type: str
+    target: str
+    context: str = ""
+    confidence: float = 1.0  # confidence score
+
+
+@dataclass
+class BackgroundInfo:
+    """Background information bundle built by extract_background_info."""
+    abolish_status: List[str]  # strings matched by ABOLISH_PATTERNS
+    manage_orgs: Dict[str, List[str]]  # organisation names keyed by role label
+    scope: Dict[str, List[str]]  # scope values keyed by scope category
+    relations: List[Relationship]
+
+
+# ==================== jieba 工具类 ====================
+
+class JiebaExtractor:
+    """jieba-based extractor, shared as a single instance.
+
+    Combines part-of-speech tagging, a simplified TF-IDF score and a small
+    TextRank implementation to pull candidate entities out of Chinese text.
+    Every public method returns an empty list when jieba is not installed.
+    """
+
+    _instance = None
+    _initialized = False
+
+    # Stop words ignored by every extraction step
+    STOP_WORDS = {
+        '的', '了', '在', '是', '和', '与', '及', '或', '等', '本', '第', '之', '为', '有',
+        '而', '于', '以', '及其', '该', '这', '那', '此', '其', '个', '中', '上', '下',
+        '后', '前', '内', '外', '将', '应', '可', '按', '根据', '按照', '依据', '有关',
+        '相关', '规定', '要求', '所述', '所示', '其中', '如下', '如下所述',
+        '分别', '不得', '必须', '需要', '应当', '可以', '禁止', '允许',
+    }
+
+    # Domain vocabulary registered with jieba (extensible)
+    DOMAIN_WORDS = {
+        '混凝土', '钢筋', '预应力', '桥梁', '隧道', '路基', '路面', '涵洞',
+        '边坡', '基坑', '桩基', '墩柱', '梁体', '支座', '伸缩缝', '挡土墙',
+        '施工', '检测', '监测', '设计', '验收', '养护', '抗震', '承载力',
+        '稳定性', '变形', '沉降', '抗剪', '抗弯', '裂缝', '焊接', '浇筑',
+        '张拉', '压浆', '注浆', '爆破', '开挖', '支护', '地基处理',
+        '安全检查', '脚手架', '模板', '高处作业', '临时用电', '起重机械',
+        '文明施工', '扬尘治理', '绿色施工', '质量管理', '安全生产',
+    }
+
+    # Mapping from jieba part-of-speech flag to entity type
+    POS_MAPPING = {
+        'n': 'noun',           # noun
+        'nr': 'person',        # person name
+        'ns': 'location',      # place name
+        'nt': 'organization',  # organisation name
+        'nz': 'term',          # other proper noun
+        'vn': 'verb_noun',     # verbal noun
+        'an': 'adj_noun',      # adjectival noun
+        's': 'space',          # locative word
+        'f': 'direction',      # direction word
+        't': 'time',           # time word
+    }
+
+    # Flags that are emitted directly as named entities
+    _NER_FLAGS = frozenset(('nr', 'ns', 'nt', 'nz', 's', 'f', 't'))
+    # Flag prefixes that may start / continue a noun phrase
+    _PHRASE_START = ('n', 'vn', 'an')
+    _PHRASE_CONTINUE = ('n', 'vn', 'an', 'v')
+
+    def __new__(cls):
+        # Singleton: every construction returns the same object.
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        # __init__ runs on every JiebaExtractor() call; load the dictionary once.
+        if not self._initialized and JIEBA_AVAILABLE:
+            self._load_models()
+            self._initialized = True
+
+    def _load_models(self):
+        """Register the domain vocabulary with jieba and try to enable paddle."""
+        try:
+            print("🔄 加载 jieba 分词器...")
+
+            for word in self.DOMAIN_WORDS:
+                jieba.add_word(word, freq=1000)
+
+            # NOTE(review): none of the cut() calls in this class pass
+            # use_paddle=True, so this call may have no effect on results -
+            # confirm whether it is still wanted.
+            try:
+                jieba.enable_paddle()
+                print("✅ jieba 加载完成(启用 Paddle 模式)")
+            except Exception:
+                # Narrowed from a bare except so Ctrl-C / SystemExit propagate.
+                print("✅ jieba 加载完成(基础模式)")
+
+        except Exception as e:
+            print(f"⚠️ jieba 加载失败: {e}")
+
+    @property
+    def is_ready(self) -> bool:
+        """True when jieba was imported successfully."""
+        return JIEBA_AVAILABLE
+
+    def extract_entities(self, text: str, topk: int = 50) -> List[Entity]:
+        """Extract entities with POS tagging, noun phrases, TF-IDF and TextRank.
+
+        Numbering prefixes are stripped from each candidate. Positions are
+        the first occurrence of the raw token in ``text``. Any exception is
+        reported on stdout and the entities collected so far are returned.
+
+        Args:
+            text: Text to analyse.
+            topk: Number of TF-IDF keywords; TextRank uses ``topk // 2``.
+
+        Returns:
+            Entities sorted by descending weight.
+        """
+        if not self.is_ready:
+            return []
+
+        entities: List[Entity] = []
+        seen: Set[str] = set()
+
+        try:
+            # 1. POS-tagged tokens. Converted to plain tuples because jieba's
+            # pair objects are iterable but do not support indexing.
+            words_pos = [tuple(token) for token in pseg.cut(text)]
+            total = len(words_pos)
+
+            # 2. Named entities and consecutive noun phrases
+            i = 0
+            while i < total:
+                word, flag = words_pos[i]
+
+                # Skip stop words and single-character tokens
+                if len(word) < 2 or word in self.STOP_WORDS:
+                    i += 1
+                    continue
+
+                if flag in self._NER_FLAGS:
+                    cleaned_word = clean_number_prefix(word) or word
+                    position = text.find(word)
+                    if position >= 0 and cleaned_word not in seen:
+                        seen.add(cleaned_word)
+                        entities.append(Entity(
+                            text=cleaned_word,
+                            entity_type=self.POS_MAPPING.get(flag, 'term'),
+                            position=position,
+                            context=self._get_context(text, position, 30),
+                            source="jieba_ner",
+                            weight=2.0,  # named entities rank above phrases
+                        ))
+
+                if not flag.startswith(self._PHRASE_START):
+                    i += 1
+                    continue
+
+                # Greedily absorb following noun / verb tokens into one phrase
+                phrase = [word]
+                j = i + 1
+                while j < total and words_pos[j][1].startswith(self._PHRASE_CONTINUE):
+                    next_word = words_pos[j][0]
+                    if next_word and next_word not in self.STOP_WORDS:
+                        phrase.append(next_word)
+                    j += 1
+
+                if len(phrase) >= 2:
+                    phrase_text = ''.join(phrase)
+                    cleaned_phrase = clean_number_prefix(phrase_text) or phrase_text
+                    if 4 <= len(cleaned_phrase) <= 30 and cleaned_phrase not in seen:
+                        seen.add(cleaned_phrase)
+                        # Dropped stop words can make the joined phrase absent
+                        # from the text; such phrases are not emitted.
+                        position = text.find(phrase_text)
+                        if position >= 0:
+                            entities.append(Entity(
+                                text=cleaned_phrase,
+                                entity_type="technical_term",
+                                position=position,
+                                context=self._get_context(text, position, 30),
+                                source="jieba_phrase",
+                                weight=1.5,
+                            ))
+                i = j  # j is always at least i + 1
+
+            # 3. TF-IDF keywords
+            self._append_keyword_entities(
+                text, self.extract_keywords(text, topk=topk), "jieba_tfidf", entities, seen)
+
+            # 4. TextRank keywords as a supplement
+            self._append_keyword_entities(
+                text, self.extract_textrank(text, topk=topk // 2), "jieba_textrank", entities, seen)
+
+        except Exception as e:
+            print(f"⚠️ jieba 实体抽取失败: {e}")
+
+        entities.sort(key=lambda x: x.weight, reverse=True)
+        return entities
+
+    def _append_keyword_entities(self, text: str, scored_words: List[Tuple[str, float]],
+                                 source: str, entities: List[Entity], seen: Set[str]) -> None:
+        """Append unseen ``(word, weight)`` pairs to ``entities`` as keywords."""
+        for word, weight in scored_words:
+            cleaned_word = clean_number_prefix(word) or word
+            if cleaned_word in seen or len(cleaned_word) < 2 or cleaned_word in self.STOP_WORDS:
+                continue
+            seen.add(cleaned_word)
+            position = text.find(word)
+            if position >= 0:
+                entities.append(Entity(
+                    text=cleaned_word,
+                    entity_type="keyword",
+                    position=position,
+                    context=self._get_context(text, position, 30),
+                    source=source,
+                    weight=weight,
+                ))
+
+    def _filtered_tokens(self, text: str) -> List[str]:
+        """Tokenise ``text`` and drop short tokens, stop words and pure digits."""
+        return [
+            w for w in jieba.cut(text)
+            if len(w) >= 2 and w not in self.STOP_WORDS and not w.isdigit()
+        ]
+
+    def extract_keywords(self, text: str, topk: int = 20) -> List[Tuple[str, float]]:
+        """Score tokens with TF times the heuristic IDF of ``_calculate_idf``.
+
+        Returns:
+            Up to ``topk`` ``(word, score)`` pairs, highest score first; an
+            empty list when jieba is unavailable or extraction fails.
+        """
+        if not self.is_ready:
+            return []
+
+        try:
+            filtered_words = self._filtered_tokens(text)
+            if not filtered_words:
+                return []
+
+            total_words = len(filtered_words)
+            idf_scores = self._calculate_idf(filtered_words)
+            tfidf_scores = {
+                word: (count / total_words) * idf_scores.get(word, 1.0)
+                for word, count in Counter(filtered_words).items()
+            }
+
+            ranked = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
+            return ranked[:topk]
+
+        except Exception as e:
+            print(f"⚠️ TF-IDF 提取失败: {e}")
+            return []
+
+    def extract_textrank(self, text: str, topk: int = 10) -> List[Tuple[str, float]]:
+        """Rank tokens with TextRank over a co-occurrence graph.
+
+        Tokens within a sliding window of 5 are linked by weighted edges.
+        Ties are ordered by first appearance in the text, so the result does
+        not depend on set iteration order.
+
+        Returns:
+            Up to ``topk`` ``(word, rank)`` pairs, highest rank first; an
+            empty list for fewer than 3 usable tokens or on failure.
+        """
+        if not self.is_ready:
+            return []
+
+        try:
+            filtered_words = self._filtered_tokens(text)
+            if len(filtered_words) < 3:
+                return []
+
+            # Unique tokens in first-occurrence order
+            vocab = list(dict.fromkeys(filtered_words))
+            word_graph: Dict[str, Dict[str, int]] = {word: {} for word in vocab}
+
+            window_size = 5
+            for i, w1 in enumerate(filtered_words):
+                for w2 in filtered_words[i + 1:i + window_size]:
+                    if w1 != w2:
+                        word_graph[w1][w2] = word_graph[w1].get(w2, 0) + 1
+                        word_graph[w2][w1] = word_graph[w2].get(w1, 0) + 1
+
+            # The graph is fixed during iteration, so sum edge weights once.
+            out_weight = {word: sum(edges.values()) for word, edges in word_graph.items()}
+
+            damping = 0.85
+            max_iter = 30
+            min_diff = 0.0001
+
+            ranks = dict.fromkeys(vocab, 1.0)
+            for _ in range(max_iter):
+                new_ranks = {}
+                max_diff = 0.0
+
+                for word in vocab:
+                    rank = 1 - damping
+                    for neighbor, weight in word_graph[word].items():
+                        neighbor_sum = out_weight[neighbor]
+                        if neighbor_sum > 0:
+                            rank += damping * weight * ranks[neighbor] / neighbor_sum
+                    new_ranks[word] = rank
+                    max_diff = max(max_diff, abs(rank - ranks[word]))
+
+                ranks = new_ranks
+                if max_diff < min_diff:
+                    break
+
+            ranked = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
+            return ranked[:topk]
+
+        except Exception as e:
+            print(f"⚠️ TextRank 提取失败: {e}")
+            return []
+
+    def _calculate_idf(self, words: List[str]) -> Dict[str, float]:
+        """Return a heuristic IDF substitute for each distinct word.
+
+        No corpus statistics are used: words of 2-6 characters get x1.2,
+        words longer than 10 characters get x0.8, and words in DOMAIN_WORDS
+        get a further x1.5.
+        """
+        idf_scores = {}
+        for word in set(words):
+            base_idf = 1.0
+
+            if 2 <= len(word) <= 6:
+                base_idf *= 1.2
+            elif len(word) > 10:
+                base_idf *= 0.8
+
+            if word in self.DOMAIN_WORDS:
+                base_idf *= 1.5
+
+            idf_scores[word] = base_idf
+
+        return idf_scores
+
+    def _get_context(self, text: str, position: int, window: int = 30) -> str:
+        """Return ``text`` from ``window`` chars before to ``window`` after ``position``."""
+        start = max(0, position - window)
+        end = min(len(text), position + window)
+        return text[start:end]
+
+
+# Module-level cache for the shared jieba extractor
+_jieba_extractor: Optional[JiebaExtractor] = None
+
+def get_jieba_extractor() -> Optional[JiebaExtractor]:
+    """Return the shared extractor, creating it on first use.
+
+    Returns None when jieba is not installed.
+    """
+    global _jieba_extractor
+    if _jieba_extractor is not None:
+        return _jieba_extractor
+    if JIEBA_AVAILABLE:
+        _jieba_extractor = JiebaExtractor()
+    return _jieba_extractor
+
+
+# ==================== 规则抽取模块 ====================
+
+# Standard title pattern: text enclosed in 《》
+STANDARD_NAME_PATTERN = r'《([^》]{2,100}?)》'
+
+# Standard number patterns
+STANDARD_NUMBER_PATTERNS = [
+    r'GB\s*/?T?\s*\d+[\./\-]?\d*(?:[-–—]\d{4})?',
+    r'JTG\s*[TD]?\s*\d+[\./\-]?\d*[a-zA-Z]?',
+    r'JTJ\s*\d+[\./\-]?\d*',
+    r'JGJ\s*\d+[\./\-]?\d*',
+    r'CJJ\s*\d+[\./\-]?\d*',
+    r'TB\s*\d+[\./\-]?\d*',
+    r'SL\s*\d+[\./\-]?\d*',
+    r'DL\s*/?T?\s*\d+[\./\-]?\d*',
+    r'NB\s*/?T?\s*\d+[\./\-]?\d*',
+    r'HG\s*/?T?\s*\d+[\./\-]?\d*',
+    r'CECS\s*\d+[::]?\d*',
+    r'T/[A-Z]+\s*\d+[\./\-]?\d*',
+    r'DB\d{2,3}[/\-]T?\s*\d+[\./\-]?\d*',
+    r'Q/[A-Z]+\s*\d+[\./\-]?\d*',
+    r'建标\s*\d+[\./\-]?\d*',
+]
+
+# Clause reference patterns
+CLAUSE_PATTERNS = [
+    r'第\s*[一二三四五六七八九十百千]+\s*条',
+    r'第\s*\d+\s*条',
+    r'第\s*\d+\.\d+\s*条',
+    r'第\s*\d+\.\d+\.\d+\s*条',
+    r'[\((]\s*\d+\s*[\))]',
+]
+
+# Engineering-domain technical term patterns
+TECH_TERM_PATTERNS = [
+    # Project / structure types
+    r'(?:公路|桥梁|隧道|路基|路面|涵洞|边坡|基坑|桩基|墩柱|梁体|涵洞|挡土墙|护坡|排水|支挡)[\w\s]{0,10}?(?:工程|结构|设施|系统)',
+    # Design / calculation
+    r'(?:抗震|承载力|稳定性|变形|沉降|承载能力|抗剪|抗弯|抗冲切|局部稳定|疲劳|裂缝)[\w\s]{0,10}?(?:计算|设计|验算|分析|控制|校核)',
+    r'(?:设计|计算|验算)[\w\s]{0,10}?(?:公式|方法|模型|参数|标准|规范|准则|规定)',
+    # Materials
+    r'(?:混凝土|钢筋|钢材|沥青|水泥|砂石|外加剂|掺合料|预应力筋)[\w\s]{0,10}?(?:强度|等级|性能|配比|用量|标号|规格)',
+    # Processes / construction methods
+    r'(?:浇筑|张拉|压浆|焊接|检测|监测|养护|支护|开挖|爆破|注浆|灌浆)[\w\s]{0,10}?(?:工艺|方法|标准|要求|技术|规范)',
+    # Ground / foundations
+    r'(?:地基|基础|支挡|防护|排水|围堰|支护)[\w\s]{0,10}?(?:设计|处理|加固|施工|工程)',
+    # Geological hazards
+    r'(?:液化|沉陷|滑坡|崩塌|泥石流|地震|岩溶|采空区|软土|湿陷性黄土)[\w\s]{0,10}?(?:处理|防治|评价|分析|地段)',
+    # Structural members
+    r'(?:梁|板|柱|墙|拱|索|缆|锚|支座|伸缩缝|护栏|标线|标志)[\w\s]{0,5}?(?:结构|构件|部件|构造)',
+]
+
+# Safety accident types
+SAFETY_TERM_PATTERNS = [
+    r'(?:特大|重大|较大|一般)?(?:交通|火灾|瓦斯爆炸|透水|坍塌|冒顶片帮|放炮|火药爆炸|锅炉爆炸|容器爆炸|其他爆炸|中毒和窒息|高处坠落|物体打击|机械伤害|起重伤害|触电|淹溺|灼烫|其他)?(?:安全)?事故',
+]
+
+# Administrative body / organisation patterns
+ORG_PATTERNS = [
+    r'(?:交通运输部?|住建部?|水利部?|工信部?|发改委|质检总局?|应急管理部?|自然资源部?)',
+    r'(?:中国|中交|中铁|中建|中冶|中水|中港)[\w\s]{2,20}?(?:研究院|设计院|工程局|公司|集团)',
+    r'(?:各省|自治区|直辖市)交通运输厅?',
+    r'[\u4e00-\u9fa5]{2,8}(?:省|市|自治区)\s*(?:交通运输厅|住建厅|水利厅)',
+]
+
+# Publication relations: (regex, relation label). The captured issuer stops
+# at ASCII or full-width commas, the ideographic full stop, or a newline.
+PUBLISH_PATTERNS = [
+    (r'由\s*([^,,。\n]{2,30}?)\s*(?:发布|制定|颁发|出台)', '由发布'),
+    (r'根据\s*《([^》]+)》\s*(?:制定|编制|发布)', '根据制定'),
+]
+
+# Replacement / abolition relations.
+# "GB/?T?" accepts GB, GBT and GB/T. The previous class "[/T]?" consumed only
+# one of "/" or "T", so "GB/T 50010-2010" could never match.
+REPLACE_PATTERNS = [
+    (r'代替\s*《([^》]+)》', '代替标准'),
+    (r'(?:自\s*[\d年月日\-]+\s*起)?\s*废止', '已废止'),
+    (r'已被\s*《([^》]+)》\s*代替', '被标准代替'),
+    (r'被\s*(GB/?T?\s*\d+[\-]\d*)\s*代替', '被编号标准代替'),
+]
+
+# Management relations. The label separator may be an ASCII or a full-width
+# colon; the captured name stops at either comma form.
+MANAGE_PATTERNS = [
+    (r'(?:主编单位|主编部门)[::]\s*([^,,。\n]{2,50})', '主编单位'),
+    (r'(?:参编单位|参编部门)[::]\s*([^,,。\n]{2,50})', '参编单位'),
+    (r'(?:解释单位|解释部门|技术归口)[::]\s*([^,,。\n]{2,50})', '解释单位'),
+    (r'由\s*([^,,。]{2,30})\s*负责解释', '负责解释'),
+    (r'归口单位[::]\s*([^,,。\n]{2,50})', '归口单位'),
+]
+
+# Citation relations. The optional standard number after the title uses the
+# same "/?T?" form so that "GB/T ..." and "JTG/T ..." are consumed.
+REFERENCE_PATTERNS = [
+    (r'应符合\s*《([^》]+)》\s*(?:GB/?T?\s*\d+[\-]?\d*)?\s*的?规定', '应符合'),
+    (r'应遵守\s*《([^》]+)》\s*(?:GB/?T?\s*\d+[\-]?\d*)?', '应遵守'),
+    (r'参照\s*《([^》]+)》\s*(?:JTG/?T?\s*\d+[\-]?\d*)?', '参照'),
+    (r'引用\s*《([^》]+)》', '引用'),
+    (r'依据\s*《([^》]+)》', '依据'),
+]
+
+# Scope of application, keyed by scope category
+SCOPE_PATTERNS = {
+    '工程类型': [
+        r'适用(?:于)?\s*(?:新建|改建|扩建)?\s*([公路桥梁隧道路基路面]{2,8}\s*工程?)',
+        r'([公路桥梁隧道路基路面涵洞]{2,6})\s*(?:的)?\s*(?:设计|施工|验收|检测)',
+    ],
+    '地区': [
+        r'适用(?:于)?\s*(全国|各省|自治区|直辖市)',
+        r'适用(?:于)?\s*([\u4e00-\u9fa5]{2,8}省|[\u4e00-\u9fa5]{2,8}市|[^。,]{2,10}地区)',
+    ],
+    '阶段': [
+        r'(设计|施工|验收|勘察|检测|养护|监理|招投标)\s*阶段',
+        r'适用(?:于)?\s*([^。,]{2,10})\s*(?:的)?\s*(设计|施工|验收|勘察)',
+    ],
+}
+
+# Abolition status
+ABOLISH_PATTERNS = [
+    r'自\s*(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{2}-\d{2})\s*起\s*废止',
+    r'已被?\s*《([^》]+)》\s*代替',
+    r'代替\s*《([^》]+)》',
+    r'已\s*废止',
+    r'自\s*[\d年月日]+\s*起\s*实施[^。]*原[^。]*(?:废止|代替)',
+]
+
+
+def extract_entities_rule_based(text: str) -> List[Entity]:
+    """Extract entities from ``text`` with the module's regex pattern tables.
+
+    Categories are scanned in a fixed order: standard titles, standard
+    numbers, clause references, technical terms, safety terms, organisations.
+    Numbering prefixes are stripped from the last three categories only.
+    Each (type, text) pair is emitted once, at its first match.
+    """
+    found: List[Entity] = []
+    emitted: Set[str] = set()
+    verbatim_types = ('standard_name', 'standard_number', 'clause')
+
+    def add_entity(text_content: str, e_type: str, pos: int, source: str = "rule", weight: float = 1.0):
+        # Titles, numbers and clause references are kept exactly as matched.
+        if e_type not in verbatim_types:
+            text_content = clean_number_prefix(text_content)
+        if not text_content:
+            return
+
+        dedup_key = f"{e_type}:{text_content}"
+        if dedup_key in emitted or len(text_content) < 2:
+            return
+        emitted.add(dedup_key)
+        found.append(Entity(
+            text=text_content,
+            entity_type=e_type,
+            position=pos,
+            context=get_context(text, pos, 40),
+            source=source,
+            weight=weight
+        ))
+
+    # 1. Standard titles, re-wrapped in 《》
+    for hit in re.finditer(STANDARD_NAME_PATTERN, text):
+        title = hit.group(1).strip()
+        if 2 <= len(title) <= 100:
+            add_entity(f"《{title}》", "standard_name", hit.start(), weight=3.0)
+
+    # 2. Standard numbers: whitespace removed, upper-cased
+    for number_pattern in STANDARD_NUMBER_PATTERNS:
+        for hit in re.finditer(number_pattern, text, re.IGNORECASE):
+            code = re.sub(r'\s+', '', hit.group(0)).upper()
+            if len(code) >= 3:
+                add_entity(code, "standard_number", hit.start(), weight=2.5)
+
+    # 3. Clause references: whitespace removed
+    for clause_pattern in CLAUSE_PATTERNS:
+        for hit in re.finditer(clause_pattern, text):
+            ref = re.sub(r'\s+', '', hit.group(0))
+            if ref and len(ref) < 50:
+                add_entity(ref, "clause", hit.start(), weight=1.5)
+
+    # 4-6. Technical terms, safety terms, organisations: same scan loop,
+    # differing only in normalisation, maximum length, type and weight.
+    def squeeze(hit):
+        return re.sub(r'\s+', '', hit.group(0))
+
+    def trim(hit):
+        return hit.group(0).strip()
+
+    prefixed_rules = (
+        (TECH_TERM_PATTERNS, squeeze, 50, "technical_term", 2.0),
+        (SAFETY_TERM_PATTERNS, trim, 30, "safety_term", 1.8),
+        (ORG_PATTERNS, trim, 50, "organization", 2.0),
+    )
+    for patterns, normalise, max_len, e_type, weight in prefixed_rules:
+        for pattern in patterns:
+            for hit in re.finditer(pattern, text):
+                value = normalise(hit)
+                if 4 <= len(value) <= max_len:
+                    add_entity(value, e_type, hit.start(), weight=weight)
+
+    return found
+
+
+def merge_entities(jieba_entities: List[Entity], rule_entities: List[Entity]) -> List[Entity]:
+    """Merge both entity lists, keeping the heaviest entry per (text, type).
+
+    Text is compared case-insensitively. For equal weights, jieba entities
+    win over rule entities and earlier entries win over later ones. The
+    result is ordered by ``position``.
+    """
+    by_weight = sorted(jieba_entities + rule_entities, key=lambda e: e.weight, reverse=True)
+
+    # The first entry stored for a key is the heaviest one.
+    winners = {}
+    for candidate in by_weight:
+        winners.setdefault(f"{candidate.text.lower()}:{candidate.entity_type}", candidate)
+
+    return sorted(winners.values(), key=lambda e: e.position)
+
+
+def extract_relationships(text: str) -> List[Relationship]:
+    """Extract rule-based relationships whose source is always "本标准".
+
+    The four pattern tables are scanned in order (publication, replacement,
+    management, citation). The matched text becomes the relationship context.
+    """
+    subject = "本标准"
+    found: List[Relationship] = []
+
+    def captured(hit, fallback):
+        # Patterns without a capture group fall back to the supplied value.
+        return hit.group(1) if hit.groups() else fallback
+
+    # Publication: the target falls back to the whole match
+    for pattern, label in PUBLISH_PATTERNS:
+        for hit in re.finditer(pattern, text):
+            whole = hit.group(0)
+            found.append(Relationship(subject, label, captured(hit, whole), whole, 0.9))
+
+    # Replacement / abolition
+    for pattern, label in REPLACE_PATTERNS:
+        for hit in re.finditer(pattern, text):
+            found.append(Relationship(subject, label, captured(hit, "未知"), hit.group(0), 0.95))
+
+    # Management: captured name is stripped
+    for pattern, label in MANAGE_PATTERNS:
+        for hit in re.finditer(pattern, text):
+            name = hit.group(1).strip() if hit.groups() else "未知"
+            found.append(Relationship(subject, label, name, hit.group(0), 0.9))
+
+    # Citation: captured title is stripped and re-wrapped in 《》
+    for pattern, label in REFERENCE_PATTERNS:
+        for hit in re.finditer(pattern, text):
+            title = hit.group(1).strip() if hit.groups() else "未知"
+            found.append(Relationship(subject, label, f"《{title}》", hit.group(0), 0.85))
+
+    return found
+
+
+def extract_background_info(text: str) -> BackgroundInfo:
+    """Collect abolition status, managing organisations, scope and relations.
+
+    Organisation matches whose label is not a key of the ``manage_orgs``
+    dictionary (for example '负责解释') are discarded.
+    """
+    def non_empty_hits(pattern):
+        # re.findall yields tuples for multi-group patterns, strings otherwise.
+        hits = []
+        for hit in re.findall(pattern, text):
+            if isinstance(hit, tuple):
+                hits.extend(part for part in hit if part)
+            elif hit:
+                hits.append(hit)
+        return hits
+
+    # Abolition status
+    abolish_status = []
+    for pattern in ABOLISH_PATTERNS:
+        abolish_status.extend(non_empty_hits(pattern))
+
+    # Managing organisations
+    manage_orgs = {"主编单位": [], "参编单位": [], "解释单位": [], "归口单位": []}
+    for pattern, org_type in MANAGE_PATTERNS:
+        bucket = manage_orgs.get(org_type)
+        if bucket is None:
+            continue
+        for hit in re.findall(pattern, text):
+            org_name = hit[0] if isinstance(hit, tuple) else hit
+            if org_name:
+                bucket.append(org_name.strip())
+
+    # Scope of application
+    scope = {"工程类型": [], "地区": [], "阶段": []}
+    for scope_type, patterns in SCOPE_PATTERNS.items():
+        for pattern in patterns:
+            scope[scope_type].extend(non_empty_hits(pattern))
+
+    return BackgroundInfo(abolish_status, manage_orgs, scope, extract_relationships(text))
+
+
+def get_context(text: str, position: int, window: int = 40) -> str:
+    """Return up to 200 chars of whitespace-collapsed text around ``position``.
+
+    The raw slice runs from ``window`` characters before ``position`` to
+    ``window`` characters after it, clamped to the bounds of ``text``.
+    """
+    left = max(0, position - window)
+    right = min(len(text), position + window)
+    collapsed = re.sub(r'\s+', ' ', text[left:right])
+    return collapsed.strip()[:200]
+
+
+# Numbering-prefix patterns, applied once each in this order. Bracket, period
+# and enumerator classes accept both ASCII and full-width punctuation.
+_NUMBER_PREFIX_PATTERNS = tuple(re.compile(p) for p in (
+    r'^\s*\d+(?:\.\d+)+\.?\s*',                      # multi-level: 2.0.1  3.1.2.1
+    r'^\s*\d+\.\s*',                                 # simple: 1.  10.
+    r'^\s*[((]\d+[))]\s*',                         # bracketed digits: (1)  (10)
+    r'^\s*[((][a-zA-Z][))]\s*',                    # bracketed letter: (a)  (B)
+    r'^\s*\d+[))]\s*',                              # digits + closing bracket: 1)
+    r'^\s*[a-zA-Z][))]\s*',                         # letter + closing bracket: a)
+    r'^\s*[((][一二三四五六七八九十百千]+[))]\s*',    # bracketed Chinese numeral: (一)
+    r'^\s*[一二三四五六七八九十百千]+[、..]\s*',      # Chinese numeral + enumerator: 一、
+))
+
+
+def clean_number_prefix(text: str) -> str:
+    """Strip a leading numbering prefix from ``text``.
+
+    Handled forms, with ASCII or full-width brackets:
+    - numeric: ``1.``  ``2.0.1``  ``3.1.2.1``
+    - bracketed: ``(1)``  ``(a)``  ``(一)``
+    - closing bracket only: ``1)``  ``a)``
+    - Chinese enumerators: ``一、``  ``二、``
+
+    Leading whitespace is tolerated before every form.
+
+    Returns:
+        The stripped text. If fewer than 3 characters would remain and the
+        original had more than 3, the stripped original is returned instead,
+        on the assumption that the "prefix" was real content. Falsy input is
+        returned unchanged.
+    """
+    if not text:
+        return text
+
+    original = text.strip()
+
+    for pattern in _NUMBER_PREFIX_PATTERNS:
+        text = pattern.sub('', text)
+
+    cleaned = text.strip()
+    if len(cleaned) < 3 and len(original) > 3:
+        return original
+    return cleaned
+
+
+def clean_markdown_content(text: str) -> str:
+    """Convert Markdown to plain text, keeping the prose content.
+
+    Steps, in order:
+    1. Remove fenced code blocks (``` and ~~~).
+    2. Remove inline code.
+    3. Remove images ``![alt](url)``. This runs before link handling;
+       otherwise the link rule rewrites the image to ``!alt`` and the
+       image is never removed.
+    4. Replace links ``[text](url)`` with their text.
+    5. Remove HTML tags, autolinks ``<scheme:...>`` and HTML comments. Only
+       spans starting with a letter or ``/`` after ``<`` are treated as
+       tags, so prose such as ``<30MPa ... >20`` is kept.
+    6. Remove heading markers.
+    7. Remove emphasis markers.
+    8. Remove table separators; replace remaining pipes with spaces.
+    9. Remove block-quote markers.
+    10. Remove list markers.
+    11. Strip numbering prefixes from every line and drop empty lines.
+    12. Collapse runs of blank lines, spaces and tabs.
+
+    Returns:
+        The cleaned text; an empty string for falsy input.
+    """
+    if not text:
+        return ""
+
+    # 1. Fenced code blocks
+    text = re.sub(r'```[\s\S]*?```', '', text)
+    text = re.sub(r'~~~[\s\S]*?~~~', '', text)
+
+    # 2. Inline code
+    text = re.sub(r'`[^`]*`', '', text)
+
+    # 3. Images - must precede link handling
+    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
+
+    # 4. Links: keep the link text, drop the URL
+    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
+
+    # 5. HTML tags, autolinks and comments
+    text = re.sub(r'<!--[\s\S]*?-->', '', text)
+    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
+
+    # 6. Heading markers
+    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
+
+    # 7. Emphasis markers
+    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # **bold**
+    text = re.sub(r'__([^_]+)__', r'\1', text)       # __bold__
+    text = re.sub(r'\*([^*]+)\*', r'\1', text)       # *italic*
+    text = re.sub(r'_([^_]+)_', r'\1', text)         # _italic_
+    text = re.sub(r'~~([^~]+)~~', r'\1', text)       # ~~strikethrough~~
+
+    # 8. Table separators, then remaining pipes
+    text = re.sub(r'\|?[\s\-:]+\|', '', text)
+    text = re.sub(r'\|', ' ', text)
+
+    # 9. Block-quote markers
+    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
+
+    # 10. List markers
+    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
+    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
+
+    # 11. Per-line numbering prefixes; empty lines are dropped
+    stripped_lines = (clean_number_prefix(line) for line in text.split('\n'))
+    text = '\n'.join(line for line in stripped_lines if line)
+
+    # 12. Whitespace normalisation
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    text = re.sub(r'[ \t]+', ' ', text)
+    return text.strip()
+
+
+def split_document(md_content: str) -> List[Tuple[str, str]]:
+    """Split a Markdown document into ``(title, content)`` chunks.
+
+    The document is cut before every heading of level 2 or deeper. A
+    section's title is its heading with any numbering prefix removed, or
+    "前言/总则" when it does not start with such a heading. Sections with
+    fewer than 10 characters of cleaned content are dropped. If nothing
+    survives, the whole cleaned document is returned as one "全文" chunk.
+    """
+    chunks: List[Tuple[str, str]] = []
+
+    for raw_section in re.split(r'\n(?=##+\s)', md_content):
+        section = raw_section.strip()
+        if not section:
+            continue
+
+        heading = re.match(r'##+\s+(.+)\n', section)
+        if heading is None:
+            title = "前言/总则"
+        else:
+            raw_title = heading.group(1).strip()
+            # Fall back to the raw heading if cleaning leaves nothing.
+            title = clean_number_prefix(raw_title) or raw_title
+
+        # Drop every heading line, then clean Markdown and collapse newlines.
+        body = re.sub(r'^#+\s+.+\n?', '', section, flags=re.MULTILINE)
+        body = clean_markdown_content(body)
+        body = re.sub(r'\n+', '\n', body).strip()
+
+        if len(body) >= 10:
+            chunks.append((title, body))
+
+    if chunks or not md_content.strip():
+        return chunks
+
+    # No usable section: treat the whole document as one chunk.
+    whole = clean_markdown_content(md_content)
+    if whole.strip():
+        chunks.append(("全文", whole))
+    return chunks
+
+
+def extract_title_from_filename(file_name: str) -> str:
+    """Derive the standard title from a file name.
+
+    Leading digits and a trailing ``.md`` extension (any letter case) are
+    removed, then surrounding whitespace is stripped.
+    """
+    title = re.sub(r'^\d+', '', file_name)
+    # flags must be passed by keyword: the fourth positional parameter of
+    # re.sub is ``count``, so a positional re.IGNORECASE was ignored as a flag.
+    title = re.sub(r'\.md$', '', title, flags=re.IGNORECASE)
+    return title.strip()
+
+
+def build_backgrounds(bg_info: BackgroundInfo, doc_title: str, file_name: str) -> List[str]:
+    """Build the non-empty ``backgrounds`` list, capped at 5 entries.
+
+    Entries are added in priority order: abolition status (up to 2),
+    managing organisations, applicable project types, applicable stages,
+    then up to 2 selected relations. If nothing was found, two fallback
+    lines naming the document title and source file are returned.
+    """
+    def first_two_unique(values):
+        # dict.fromkeys de-duplicates while preserving first-seen order.
+        return ', '.join(list(dict.fromkeys(values))[:2])
+
+    lines: List[str] = []
+
+    # 1. Abolition status (highest priority)
+    for status in (bg_info.abolish_status or [])[:2]:
+        if isinstance(status, str):
+            lines.append(f"废止状态:{status}")
+
+    # 2. Managing organisations
+    for org_type, orgs in bg_info.manage_orgs.items():
+        if orgs:
+            lines.append(f"{org_type}:{first_two_unique(orgs)}")
+
+    # 3. Scope of application
+    for scope_key, label in (("工程类型", "适用工程类型"), ("阶段", "适用阶段")):
+        values = bg_info.scope[scope_key]
+        if values:
+            lines.append(f"{label}:{first_two_unique(values)}")
+
+    # 4. Selected relations, with targets truncated to 50 characters
+    wanted = ['代替标准', '被标准代替', '应符合', '根据制定']
+    selected = [rel for rel in bg_info.relations if rel.relation_type in wanted]
+    for rel in selected[:2]:
+        shown = rel.target
+        if len(shown) > 50:
+            shown = shown[:50] + "..."
+        lines.append(f"{rel.relation_type}:{shown}")
+
+    # 5. Fallback so the list is never empty
+    if not lines:
+        lines.append(f"编制依据文件:{doc_title}")
+        lines.append(f"来源文档:{file_name}")
+
+    return lines[:5]
+
+
+def extract_all_entities(text: str) -> List[Entity]:
+    """Run jieba extraction (when available) and rule extraction, then merge."""
+    extractor = get_jieba_extractor()
+    if extractor and extractor.is_ready:
+        from_jieba = extractor.extract_entities(text)
+    else:
+        from_jieba = []
+
+    from_rules = extract_entities_rule_based(text)
+    return merge_entities(from_jieba, from_rules)
+
+
+def import_single_file(md_path: Path, embeddings) -> List[Dict[str, Any]]:
+    """Turn one Markdown file into insertable rows, de-duplicating entities within the file.
+
+    Each chunk of the document contributes one row per new entity; a chunk
+    without entities contributes a single row built from its title.
+
+    Args:
+        md_path: Path of the Markdown file to read (UTF-8).
+        embeddings: Object exposing ``embed_query(text)``.
+
+    Returns:
+        A list of row dicts with keys ``text``, ``dense``, ``content`` and
+        ``metadata`` (a JSON string). Empty when the file cannot be read or
+        contains only whitespace.
+    """
+    file_name = md_path.name
+    doc_title = extract_title_from_filename(file_name)
+
+    try:
+        md_content = md_path.read_text(encoding="utf-8")
+    except (OSError, UnicodeError) as e:
+        print(f"   读取失败: {e}")
+        return []
+
+    if not md_content.strip():
+        return []
+
+    # Document-level background information, used when a chunk has none of its own.
+    doc_bg_info = extract_background_info(md_content)
+    doc_backgrounds = build_backgrounds(doc_bg_info, doc_title, file_name)
+
+    all_rows: List[Dict[str, Any]] = []
+    # De-duplication within this file: key = (lower-cased entity text, entity type).
+    file_entity_seen: Set[Tuple[str, str]] = set()
+
+    def embed(text: str):
+        """Return the vector for ``text``, or None (after logging) when embedding fails."""
+        try:
+            return embeddings.embed_query(text)
+        except Exception as e:
+            print(f"   向量生成失败: {e}")
+            return None
+
+    def make_row(text: str, vector, title: str, backgrounds: List[str]) -> Dict[str, Any]:
+        """Assemble one row; ``content`` mirrors ``text``."""
+        metadata = {
+            "uuid": str(uuid.uuid4()),
+            "file": file_name,
+            "title": title,
+            "backgrounds": backgrounds,
+        }
+        return {
+            "text": text,
+            "dense": vector,
+            "content": text,
+            "metadata": json.dumps(metadata, ensure_ascii=False),
+        }
+
+    for chunk_title, chunk_text in split_document(md_content):
+        entities = extract_all_entities(chunk_text)
+
+        # Chunk-level background information.
+        chunk_bg_info = extract_background_info(chunk_text)
+        chunk_backgrounds = build_backgrounds(chunk_bg_info, chunk_title, file_name)
+
+        # build_backgrounds pads an empty result with two filler entries, so a
+        # plain length check cannot tell "two real entries" from "nothing found".
+        # Recognise the filler explicitly and fall back to the document-level
+        # backgrounds in that case.
+        chunk_is_filler = chunk_backgrounds[:1] == [f"编制依据文件:{chunk_title}"]
+        if len(chunk_backgrounds) > 1 and not chunk_is_filler:
+            final_backgrounds = chunk_backgrounds
+        else:
+            final_backgrounds = doc_backgrounds
+
+        if entities:
+            for entity in entities:
+                entity_text = entity.text.strip()
+                dedup_key = (entity_text.lower(), entity.entity_type)
+                if dedup_key in file_entity_seen:
+                    continue
+                file_entity_seen.add(dedup_key)
+
+                vector = embed(entity_text)
+                if vector is None:
+                    continue
+                all_rows.append(make_row(entity_text, vector, chunk_title, final_backgrounds))
+            continue
+
+        # No entities: use the chunk title itself as the entity.
+        chunk_title_clean = chunk_title.strip()
+        if not chunk_title_clean:
+            # Nothing meaningful to embed or search for.
+            continue
+        dedup_key = (chunk_title_clean.lower(), "chunk_title")
+        if dedup_key in file_entity_seen:
+            continue
+        file_entity_seen.add(dedup_key)
+
+        vector = embed(chunk_title_clean)
+        if vector is None:
+            continue
+        # The stored text is capped at 200 characters; metadata keeps the full title.
+        all_rows.append(
+            make_row(chunk_title_clean[:200], vector, chunk_title_clean, final_backgrounds)
+        )
+
+    return all_rows
+
+
+def batch_insert(client, rows: List[Dict[str, Any]]) -> Tuple[int, List[Dict[str, Any]]]:
+    """Insert ``rows`` in slices of ``BATCH_SIZE``.
+
+    Returns:
+        ``(inserted_count, failed_rows)``; a slice whose insert raises is
+        logged and returned whole in ``failed_rows``.
+    """
+    if not rows:
+        return 0, []
+
+    ok_count = 0
+    rejected: List[Dict[str, Any]] = []
+
+    for offset in range(0, len(rows), BATCH_SIZE):
+        chunk = rows[offset:offset + BATCH_SIZE]
+        try:
+            client.insert(collection_name=COLLECTION_NAME, data=chunk)
+        except Exception as e:
+            print(f"   插入失败: {e}")
+            rejected.extend(chunk)
+        else:
+            ok_count += len(chunk)
+
+    return ok_count, rejected
+
+
+def _infer_entity_type(text: str) -> str:
+    """Guess a coarse entity type from the surface form of ``text`` (used for statistics only)."""
+    if text.startswith("《"):
+        return "standard_name"
+    if re.match(r'^[A-Z]{2,}', text):
+        return "standard_number"
+    if text.startswith("第"):
+        return "clause"
+    return "term"
+
+
+def import_from_folder(root_folder: str):
+    """Import every ``*.md`` file directly inside ``root_folder`` (sub-directories are not scanned).
+
+    For each file the rows produced by ``import_single_file`` are inserted
+    with ``batch_insert``; a per-file failure is logged and does not stop the
+    run. A summary is printed at the end.
+
+    Args:
+        root_folder: Directory to scan.
+
+    Returns:
+        None. Returns early when the folder does not exist, contains no
+        Markdown files, or the target collection is missing.
+    """
+    root = Path(root_folder)
+    if not root.exists():
+        print(f"文件夹不存在: {root}")
+        return
+
+    print(f"扫描文件夹: {root}(不递归子目录)")
+
+    # Non-recursive scan; sorted so that runs process files in a stable order.
+    md_files = sorted(f for f in root.glob("*.md") if f.is_file())
+    print(f"发现 {len(md_files)} 个 MD 文件")
+
+    if not md_files:
+        return
+
+    # Report whether jieba-based extraction is active.
+    jieba_ext = get_jieba_extractor()
+    if jieba_ext and jieba_ext.is_ready:
+        print("jieba 已启用")
+    else:
+        print("jieba 未启用,使用纯规则抽取")
+        print("建议安装: uv add jieba")
+
+    # Milvus client and embedding model.
+    client = get_milvusclient()
+    embeddings = get_embeddings()
+
+    if not client.has_collection(collection_name=COLLECTION_NAME):
+        print(f"Collection 不存在: {COLLECTION_NAME}")
+        print("运行: uv run -m src.app.scripts.first_bfp_collection_entity_create")
+        return
+
+    client.load_collection(collection_name=COLLECTION_NAME)
+
+    # Running totals for the summary.
+    total_entities = 0
+    total_inserted = 0
+    entity_type_stats: Counter = Counter()
+    failed_files = []
+
+    for idx, md_path in enumerate(md_files, 1):
+        print(f"\n[{idx}/{len(md_files)}] 处理: {md_path.name}")
+
+        try:
+            rows = import_single_file(md_path, embeddings)
+
+            if rows:
+                # Rows do not carry the entity type, so it is inferred from the text.
+                entity_type_stats.update(
+                    _infer_entity_type(row.get("text", "")) for row in rows
+                )
+
+                print(f"   抽取 {len(rows)} 个实体")
+
+                inserted, failed = batch_insert(client, rows)
+                total_inserted += inserted
+                if failed:
+                    print(f"   {len(failed)} 条插入失败")
+                else:
+                    print(f"   插入 {inserted} 条")
+            else:
+                print("   无有效实体")
+
+            total_entities += len(rows)
+
+        except Exception as e:
+            # Best effort: record the file and continue with the next one.
+            print(f"   处理失败: {e}")
+            failed_files.append(md_path.name)
+
+    # Summary.
+    print("\n" + "=" * 70)
+    print("导入完成")
+    print("=" * 70)
+    print(f"处理文件: {len(md_files)}")
+    print(f"抽取实体: {total_entities}")
+    print(f"成功插入: {total_inserted}")
+    if failed_files:
+        print(f"失败文件: {len(failed_files)}")
+    print("\n实体类型分布:")
+    # most_common() lists types by descending count.
+    for etype, count in entity_type_stats.most_common():
+        print(f"   - {etype}: {count}")
+    print("=" * 70)
+
+
+def main():
+    """Script entry point: print the banner, then run the folder import and report any failure."""
+    rule = "=" * 70
+    banner = (
+        rule,
+        "编制依据实体抽取与导入(jieba 版)",
+        rule,
+        "实体抽取: jieba 分词 + TF-IDF + TextRank + 规则补充",
+        "关系抽取: 规则模式匹配",
+        "字段结构: text, dense, content(=text), metadata",
+        rule,
+    )
+    for banner_line in banner:
+        print(banner_line)
+
+    try:
+        import_from_folder(ROOT_FOLDER)
+    except Exception as e:
+        # Top-level boundary: show the message and the full traceback.
+        print(f"\n导入失败: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()

+ 1219 - 0
src/app/scripts/first_bfp_collection_entity_import_v2.py

@@ -0,0 +1,1219 @@
+"""
+编制依据实体抽取与导入脚本(V2 优化版)
+
+主要改进:
+1. 层级路径标题:支持 "1.总则 > 1.1术语定义" 格式的层级标题
+2. LLM辅助实体抽取:可配置启用大模型进行专业术语识别
+3. 移除硬编码工程术语模式:使用通用NLP方法 + 可选LLM增强
+4. 改进background信息抽取:更准确识别文档元数据
+
+字段结构:
+- text: 实体文本(用于 BM25 检索)
+- dense: 实体向量
+- content: 与 text 内容相同
+- metadata: JSON 字符串 {uuid, file, title(层级路径), backgrounds}
+
+依赖:
+    uv add jieba
+
+用法:
+    uv run -m src.app.scripts.first_bfp_collection_entity_import_v2
+"""
+from __future__ import annotations
+
+import json
+import re
+import uuid
+import warnings
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Set
+from dataclasses import dataclass, asdict, field
+from collections import Counter
+import math
+
+from app.config.embeddings import get_embeddings
+from app.config.milvus_client import get_milvus_client
+
+# ==================== Configuration ====================
+
+# Name of the target collection
+COLLECTION_NAME = "first_bfp_collection_entity"
+
+# Source folder containing the Markdown files to import
+ROOT_FOLDER = r"F:\第二阶段编制依据及施工方案数据治理-20260206\133"
+
+# Number of rows per insert batch
+BATCH_SIZE = 100
+
+# LLM configuration (optional). api_key / api_base / model are read from the
+# LLM_API_KEY / LLM_API_BASE / LLM_MODEL environment variables, falling back
+# to the literal defaults below.
+LLM_CONFIG = {
+      "enabled": True,  # <- enables LLM-assisted extraction
+      "api_key": os.getenv("LLM_API_KEY", "lm-studio"),
+      "api_base": os.getenv("LLM_API_BASE", "http://localhost:1234/v1"),
+      "model": os.getenv("LLM_MODEL", "Qwen2.5-7B-Instruct-Uncensored.Q4_K_M"),
+      "batch_size": 5,  # <- author's advice: keep small, GGUF models have limited concurrency
+      "max_entities_per_chunk": 8,
+}
+
+# Optional jieba dependency: when the import fails, JIEBA_AVAILABLE is False
+# and a warning is emitted.
+try:
+    import jieba
+    import jieba.posseg as pseg
+    JIEBA_AVAILABLE = True
+except ImportError:
+    JIEBA_AVAILABLE = False
+    warnings.warn("jieba not installed. Using rule-based extraction only. Run: uv add jieba")
+
+
+# ==================== 数据模型 ====================
+
+@dataclass
+class Entity:
+    """An extracted entity together with where and how it was found."""
+    text: str
+    entity_type: str
+    position: int
+    context: str = ""  # text surrounding the entity occurrence
+    source: str = "rule"  # origin: jieba / rule / llm / combined
+    weight: float = 1.0  # weight
+    confidence: float = 1.0  # confidence
+
+
+@dataclass
+class Relationship:
+    """A typed relation from ``source`` to ``target``, with the matched context and a confidence."""
+    source: str
+    relation_type: str
+    target: str
+    context: str = ""
+    confidence: float = 1.0
+
+
+@dataclass
+class BackgroundInfo:
+    """Background information extracted from a document or chunk; every field has an empty default."""
+    abolish_status: List[str] = field(default_factory=list)
+    manage_orgs: Dict[str, List[str]] = field(default_factory=dict)
+    scope: Dict[str, List[str]] = field(default_factory=dict)
+    relations: List[Relationship] = field(default_factory=list)
+    doc_type: str = ""  # document type: standard / code / measures, etc.
+    publish_date: str = ""  # publication date
+    effective_date: str = ""  # effective (implementation) date
+
+
+@dataclass
+class DocumentChunk:
+    """One chunk of a document, with its heading and heading hierarchy."""
+    content: str  # cleaned content
+    level: int  # heading level: 0 = document level, 1 = #, 2 = ##, ...
+    title: str  # current heading (original text)
+    hierarchy_path: str  # hierarchy path, e.g. "1.总则 > 1.1术语"
+    position: int  # position within the document
+
+
+# ==================== LLM 实体抽取模块(可选)====================
+
+class LLMEntityExtractor:
+    """Singleton that asks an OpenAI-compatible chat model for domain terms.
+
+    Results are cached per ``(text, context)`` pair for the lifetime of the
+    process. Extraction is skipped when the feature is disabled in
+    ``LLM_CONFIG`` or the ``openai`` package cannot be imported.
+    """
+
+    _instance = None
+    _initialized = False
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        # __init__ runs on every LLMEntityExtractor() call, even though
+        # __new__ hands back the shared instance. The flag is re-read each
+        # time, but the cache is created only once so it is not wiped.
+        self.enabled = LLM_CONFIG.get("enabled", False)
+        if not self._initialized:
+            self.cache: Dict[Tuple[str, str], List[Entity]] = {}
+            self._initialized = True
+
+    def is_available(self) -> bool:
+        """Return True when the LLM is enabled and ``openai`` is importable."""
+        if not self.enabled:
+            return False
+        try:
+            import openai  # noqa: F401 -- imported only to probe availability
+            return True
+        except ImportError:
+            return False
+
+    def extract_entities(self, text: str, context: str = "") -> List[Entity]:
+        """Extract domain terms from ``text`` using the configured chat model.
+
+        Args:
+            text: Passage to analyse; only the first 800 characters are sent.
+            context: Optional description of the document, placed in the prompt.
+
+        Returns:
+            Entities with ``source="llm"``. Empty when the LLM is unavailable,
+            ``text`` is shorter than 20 characters, or the call fails (the
+            failure is logged, not raised).
+        """
+        if not self.is_available() or len(text) < 20:
+            return []
+
+        # Key on the full inputs: a key built from a text prefix would make
+        # different chunks that start alike share one cached result.
+        cache_key = (text, context)
+        cached = self.cache.get(cache_key)
+        if cached is not None:
+            return cached
+
+        try:
+            import openai
+
+            client = openai.OpenAI(
+                api_key=LLM_CONFIG["api_key"],
+                base_url=LLM_CONFIG["api_base"]
+            )
+
+            prompt = f"""从以下工程/技术文档段落中提取专业术语和重要概念。
+
+要求:
+1. 提取真正的专业术语(如"混凝土强度等级"、"承载力计算"、"抗震设防烈度")
+2. 不要提取通用词汇(如"规定"、"要求"、"方法")
+3. 每个术语标注类型:technical_term(技术术语) / material(材料) / process(工艺) / standard(标准) / parameter(参数)
+4. 最多返回10个最重要的术语
+
+文档上下文:{context if context else "工程技术标准文档"}
+
+待分析文本:
+{text[:800]}
+
+请以JSON格式返回:
+{{"entities": [{{"term": "术语", "type": "类型", "importance": "high/medium/low"}}]}}
+"""
+
+            response = client.chat.completions.create(
+                model=LLM_CONFIG["model"],
+                messages=[
+                    {"role": "system", "content": "你是工程文档分析专家,擅长提取专业术语。只返回JSON格式结果。"},
+                    {"role": "user", "content": prompt}
+                ],
+                temperature=0.1,
+                max_tokens=500
+            )
+
+            result_text = response.choices[0].message.content
+
+            # Parse the JSON payload; a parse failure is logged and leaves
+            # whatever entities were collected so far.
+            entities = []
+            importance_weights = {"high": 3.0, "medium": 2.0, "low": 1.0}
+            try:
+                # The model may wrap the JSON in prose, so grab the outermost braces.
+                json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
+                if json_match:
+                    result = json.loads(json_match.group())
+                    for item in result.get("entities", []):
+                        if not isinstance(item, dict):
+                            # Skip malformed items instead of aborting the rest.
+                            continue
+                        term = str(item.get("term") or "").strip()
+                        etype = item.get("type", "technical_term")
+                        importance = item.get("importance", "medium")
+
+                        if term and len(term) >= 2:
+                            weight = importance_weights.get(importance, 2.0)
+                            entities.append(Entity(
+                                text=term,
+                                entity_type=etype,
+                                # find() gives -1 when the term is not literally in the text.
+                                position=max(0, text.find(term)),
+                                context=text[:100],
+                                source="llm",
+                                weight=weight,
+                                confidence=0.9
+                            ))
+            except Exception as e:
+                print(f"  LLM结果解析失败: {e}")
+
+            self.cache[cache_key] = entities
+            return entities
+
+        except Exception as e:
+            print(f"  LLM调用失败: {e}")
+            return []
+
+
+# ==================== jieba NLP模块 ====================
+
+class JiebaExtractor:
+    """Singleton jieba-based extractor: POS-tagged entities, noun phrases and keywords."""
+
+    _instance = None
+    _initialized = False
+
+    # Stop words that are never reported as entities or keywords.
+    STOP_WORDS = {
+        '的', '了', '在', '是', '和', '与', '及', '或', '等', '本', '第', '之', '为', '有',
+        '而', '于', '以', '及其', '该', '这', '那', '此', '其', '个', '中', '上', '下',
+        '后', '前', '内', '外', '将', '应', '可', '按', '根据', '按照', '依据', '有关',
+        '相关', '规定', '要求', '所述', '所示', '其中', '如下', '分别', '不得', '必须',
+        '需要', '应当', '可以', '禁止', '允许', '分别', '其他', '以及', '或者', '并且',
+        '进行', '予以', '予以', '采用', '使用', '提出', '作出', '超过', '低于', '高于',
+        '符合', '满足', '达到', '完成', '形成', '产生', '引起', '导致', '造成',
+    }
+
+    # Mapping from jieba POS flag to entity type.
+    POS_MAPPING = {
+        'n': 'noun',           # noun
+        'nr': 'person',        # person name
+        'ns': 'location',      # place name
+        'nt': 'organization',  # organisation name
+        'nz': 'term',          # other proper noun
+        'vn': 'verb_noun',     # verbal noun
+        'an': 'adj_noun',      # adjectival noun
+        's': 'space',          # locative word
+        'f': 'direction',      # direction word
+        't': 'time',           # time word
+    }
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self):
+        # Load once, and only when jieba could be imported.
+        if not self._initialized and JIEBA_AVAILABLE:
+            self._load_models()
+            self._initialized = True
+
+    def _load_models(self):
+        """Prepare jieba, trying Paddle mode first and falling back to basic mode."""
+        try:
+            print("🔄 加载jieba分词器...")
+            try:
+                jieba.enable_paddle()
+                print("✅ jieba加载完成(启用Paddle模式)")
+            except Exception:
+                # Narrowed from a bare ``except`` so that KeyboardInterrupt and
+                # SystemExit are no longer swallowed here.
+                print("✅ jieba加载完成(基础模式)")
+        except Exception as e:
+            print(f"⚠️ jieba加载失败: {e}")
+
+    @property
+    def is_ready(self) -> bool:
+        return JIEBA_AVAILABLE
+
+    def extract_entities(self, text: str, topk: int = 30) -> List[Entity]:
+        """Extract up to ``topk`` entities from ``text``, highest weight first.
+
+        Four sources are combined: POS-tagged named entities, runs of
+        noun-like tokens joined into phrases, TF-IDF keywords and TextRank
+        keywords. A surface string is reported once, by the first source
+        that produces it. Any failure is logged and the entities collected
+        so far are returned.
+        """
+        if not self.is_ready:
+            return []
+
+        entities = []
+        seen: Set[str] = set()
+
+        try:
+            # 1. POS-tagged segmentation. Tokens are unpacked into plain
+            #    (word, flag) tuples because the loop below indexes them by
+            #    position, which the objects yielded by pseg.cut may not support.
+            words_pos = [(word, flag) for word, flag in pseg.cut(text)]
+
+            # 2. Named entities and noun phrases.
+            i = 0
+            while i < len(words_pos):
+                word, flag = words_pos[i]
+
+                # Skip stop words and single-character tokens.
+                if len(word) < 2 or word in self.STOP_WORDS:
+                    i += 1
+                    continue
+
+                # Named entities.
+                if flag in ['nr', 'ns', 'nt', 'nz']:
+                    ent_type = self.POS_MAPPING.get(flag, 'term')
+                    if word not in seen:
+                        seen.add(word)
+                        position = text.find(word)
+                        entities.append(Entity(
+                            text=word,
+                            entity_type=ent_type,
+                            position=max(0, position),
+                            context=self._get_context(text, position, 30),
+                            source="jieba_ner",
+                            weight=2.0
+                        ))
+
+                # Runs of noun-like tokens joined into a phrase (4-20 characters).
+                if flag.startswith(('n', 'vn', 'an')):
+                    phrase = [word]
+                    j = i + 1
+                    while j < len(words_pos) and words_pos[j][1].startswith(('n', 'vn', 'an', 'v')):
+                        next_word = words_pos[j][0]
+                        if len(next_word) >= 1 and next_word not in self.STOP_WORDS:
+                            phrase.append(next_word)
+                        j += 1
+
+                    if len(phrase) >= 2:
+                        phrase_text = ''.join(phrase)
+                        if 4 <= len(phrase_text) <= 20 and phrase_text not in seen:
+                            seen.add(phrase_text)
+                            position = text.find(phrase_text)
+                            # Dropped stop words can make the joined phrase absent from the text.
+                            if position >= 0:
+                                entities.append(Entity(
+                                    text=phrase_text,
+                                    entity_type="noun_phrase",
+                                    position=position,
+                                    context=self._get_context(text, position, 30),
+                                    source="jieba_phrase",
+                                    weight=1.5
+                                ))
+                    # j is always at least i + 1, so this resumes after the run.
+                    i = j
+                else:
+                    i += 1
+
+            # 3. TF-IDF keywords.
+            keywords = self.extract_keywords(text, topk=topk)
+            for word, weight in keywords:
+                if word not in seen and len(word) >= 2:
+                    seen.add(word)
+                    position = text.find(word)
+                    entities.append(Entity(
+                        text=word,
+                        entity_type="keyword",
+                        position=max(0, position),
+                        context=self._get_context(text, position, 30),
+                        source="jieba_tfidf",
+                        weight=weight
+                    ))
+
+            # 4. TextRank keywords.
+            textrank_words = self.extract_textrank(text, topk=topk//2)
+            for word, weight in textrank_words:
+                if word not in seen and len(word) >= 2:
+                    seen.add(word)
+                    position = text.find(word)
+                    entities.append(Entity(
+                        text=word,
+                        entity_type="keyword",
+                        position=max(0, position),
+                        context=self._get_context(text, position, 30),
+                        source="jieba_textrank",
+                        weight=weight
+                    ))
+
+        except Exception as e:
+            print(f"⚠️ jieba实体抽取失败: {e}")
+
+        # Highest weight first.
+        entities.sort(key=lambda x: x.weight, reverse=True)
+        return entities[:topk]
+
+    def extract_keywords(self, text: str, topk: int = 20) -> List[Tuple[str, float]]:
+        """Return up to ``topk`` ``(word, score)`` pairs ranked by a simplified TF-IDF."""
+        if not self.is_ready:
+            return []
+
+        try:
+            words = list(jieba.cut(text))
+            filtered_words = [
+                w for w in words
+                if len(w) >= 2 and w not in self.STOP_WORDS and not w.isdigit()
+            ]
+
+            if not filtered_words:
+                return []
+
+            # Term frequency.
+            word_count = Counter(filtered_words)
+            total_words = len(filtered_words)
+            tf_scores = {word: count / total_words for word, count in word_count.items()}
+
+            # Length-based stand-in for IDF (see _calculate_idf).
+            idf_scores = self._calculate_idf(filtered_words)
+
+            tfidf_scores = {word: tf_scores[word] * idf_scores.get(word, 1.0)
+                            for word in tf_scores}
+
+            sorted_words = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
+            return sorted_words[:topk]
+
+        except Exception as e:
+            # Still best effort, but no longer silent.
+            print(f"⚠️ jieba关键词提取失败: {e}")
+            return []
+
+    def extract_textrank(self, text: str, topk: int = 10) -> List[Tuple[str, float]]:
+        """Return up to ``topk`` ``(word, rank)`` pairs from an iterative rank over a co-occurrence graph."""
+        if not self.is_ready:
+            return []
+
+        try:
+            words = list(jieba.cut(text))
+            filtered_words = [
+                w for w in words
+                if len(w) >= 2 and w not in self.STOP_WORDS and not w.isdigit()
+            ]
+
+            if len(filtered_words) < 3:
+                return []
+
+            # Co-occurrence graph: words within the window are linked, weighted by count.
+            window_size = 5
+            word_graph = {word: {} for word in set(filtered_words)}
+
+            for i in range(len(filtered_words)):
+                for j in range(i + 1, min(i + window_size, len(filtered_words))):
+                    w1, w2 = filtered_words[i], filtered_words[j]
+                    if w1 != w2:
+                        word_graph[w1][w2] = word_graph[w1].get(w2, 0) + 1
+                        word_graph[w2][w1] = word_graph[w2].get(w1, 0) + 1
+
+            # Iterate until the largest change drops below min_diff or max_iter is reached.
+            damping = 0.85
+            max_iter = 30
+            min_diff = 0.0001
+            ranks = {word: 1.0 for word in word_graph}
+
+            for _ in range(max_iter):
+                new_ranks = {}
+                max_diff = 0
+
+                for word in word_graph:
+                    rank = (1 - damping)
+                    for neighbor, weight in word_graph[word].items():
+                        neighbor_sum = sum(word_graph[neighbor].values())
+                        if neighbor_sum > 0:
+                            rank += damping * weight * ranks[neighbor] / neighbor_sum
+                    new_ranks[word] = rank
+                    max_diff = max(max_diff, abs(rank - ranks[word]))
+
+                ranks = new_ranks
+                if max_diff < min_diff:
+                    break
+
+            sorted_words = sorted(ranks.items(), key=lambda x: x[1], reverse=True)
+            return sorted_words[:topk]
+
+        except Exception as e:
+            # Still best effort, but no longer silent.
+            print(f"⚠️ TextRank关键词提取失败: {e}")
+            return []
+
+    def _calculate_idf(self, words: List[str]) -> Dict[str, float]:
+        """Return a length-based weight per distinct word: 1.2 for 2-6 chars, 0.8 above 10, else 1.0."""
+        idf_scores = {}
+        for word in set(words):
+            base_idf = 1.0
+            if 2 <= len(word) <= 6:
+                base_idf *= 1.2
+            elif len(word) > 10:
+                base_idf *= 0.8
+            idf_scores[word] = base_idf
+        return idf_scores
+
+    def _get_context(self, text: str, position: int, window: int = 30) -> str:
+        """Return the slice of ``text`` within ``window`` characters of ``position``."""
+        # Callers pass str.find() results straight through; treat "not found"
+        # (-1) as the start of the text so the window is not shortened.
+        position = max(0, position)
+        start = max(0, position - window)
+        end = min(len(text), position + window)
+        return text[start:end]
+
+
+# ==================== 规则抽取模块(通用模式)====================
+
+# Standard titles: 2-100 characters enclosed in 《 》 (the title is group 1).
+STANDARD_NAME_PATTERN = r'《([^》]{2,100}?)》'
+
+# Standard numbers, one pattern per issuing-body prefix.
+STANDARD_NUMBER_PATTERNS = [
+    r'GB\s*/?T?\s*\d+[\./\-]?\d*(?:[-–—]\d{4})?',
+    r'JTG\s*[TD]?\s*\d+[\./\-]?\d*[a-zA-Z]?',
+    r'JTJ\s*\d+[\./\-]?\d*',
+    r'JGJ\s*\d+[\./\-]?\d*',
+    r'CJJ\s*\d+[\./\-]?\d*',
+    r'TB\s*\d+[\./\-]?\d*',
+    r'SL\s*\d+[\./\-]?\d*',
+    r'DL\s*/?T?\s*\d+[\./\-]?\d*',
+    r'NB\s*/?T?\s*\d+[\./\-]?\d*',
+    r'HG\s*/?T?\s*\d+[\./\-]?\d*',
+    r'CECS\s*\d+[::]?\d*',
+    r'T/[A-Z]+\s*\d+[\./\-]?\d*',
+    r'DB\d{2,3}[/\-]T?\s*\d+[\./\-]?\d*',
+    r'Q/[A-Z]+\s*\d+[\./\-]?\d*',
+    r'建标\s*\d+[\./\-]?\d*',
+]
+
+# Clause references ("第...条" with Chinese or Arabic numbering) and
+# parenthesised numbers.
+# NOTE(review): several character classes in this section list the same
+# character twice (e.g. ':' or '('); presumably one of each pair was meant to
+# be the full-width variant -- confirm against the original file.
+CLAUSE_PATTERNS = [
+    r'第\s*[一二三四五六七八九十百千]+\s*条',
+    r'第\s*\d+\s*条',
+    r'第\s*\d+\.\d+\s*条',
+    r'第\s*\d+\.\d+\.\d+\s*条',
+    r'[\((]\s*\d+\s*[\))]',
+]
+
+# Dates: "YYYY年M月D日" and "YYYY-MM-DD".
+DATE_PATTERNS = [
+    r'(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日',
+    r'(\d{4})-(\d{2})-(\d{2})',
+]
+
+# Managing organisations (generic); the organisation name is group 1.
+ORG_PATTERNS = [
+    r'(?:主编单位|主编部门)[::]\s*([^,。\n]{2,50})',
+    r'(?:参编单位|参编部门)[::]\s*([^,。\n]{2,50})',
+    r'(?:解释单位|技术归口)[::]\s*([^,。\n]{2,50})',
+    r'由\s*([^,。]{2,30})\s*负责解释',
+    r'批准部门[::]\s*([^,。\n]{2,50})',
+]
+
+# Abolition / replacement relations as (pattern, relation label) pairs.
+# The third pattern has no capture group; the other three capture the target.
+ABOLISH_PATTERNS = [
+    (r'代替\s*《([^》]+)》', '代替标准'),
+    (r'被\s*《([^》]+)》\s*代替', '被标准代替'),
+    (r'(?:自\s*[\d年月日\-]+\s*起)?\s*废止', '已废止'),
+    (r'原\s*([^\s]+)\s*同时废止', '原标准废止'),
+]
+
+# Applicable scope (generic); the scope text is group 1.
+SCOPE_PATTERNS = [
+    r'适用(?:于)?\s*([^。,]{3,30}?)(?:的)?\s*(?:设计|施工|验收|检测|勘察|监理)',
+    r'适用(?:于)?\s*([^。,]{3,30}?)\s*工程',
+    r'适用(?:于)?\s*([^。,]{3,30}?)(?:建设|管理)',
+]
+
+
+def extract_entities_rule_based(text: str) -> List[Entity]:
+    """Extract standard titles, standard numbers, clause references and dates with regex rules.
+
+    Entities are returned in discovery order; a given (type, lower-cased
+    text) pair is reported only once, and texts shorter than two characters
+    are dropped.
+    """
+    found: List[Entity] = []
+    dedup_keys: Set[str] = set()
+
+    def record(surface: str, kind: str, offset: int, score: float = 1.0):
+        """Append a rule-sourced entity unless it is too short or already recorded."""
+        dedup_key = f"{kind}:{surface.lower()}"
+        if len(surface) < 2 or dedup_key in dedup_keys:
+            return
+        dedup_keys.add(dedup_key)
+        found.append(Entity(
+            text=surface,
+            entity_type=kind,
+            position=offset,
+            context=get_context(text, offset, 40),
+            source="rule",
+            weight=score
+        ))
+
+    # 1. Standard titles (highest weight); the brackets are kept in the entity text.
+    for hit in re.finditer(STANDARD_NAME_PATTERN, text):
+        title = hit.group(1).strip()
+        if 2 <= len(title) <= 100:
+            record(f"《{title}》", "standard_name", hit.start(), score=3.0)
+
+    # 2. Standard numbers, normalised by removing whitespace and upper-casing.
+    for number_re in STANDARD_NUMBER_PATTERNS:
+        for hit in re.finditer(number_re, text, re.IGNORECASE):
+            code = re.sub(r'\s+', '', hit.group(0)).upper()
+            if len(code) >= 3:
+                record(code, "standard_number", hit.start(), score=2.5)
+
+    # 3. Clause references, with inner whitespace removed.
+    for clause_re in CLAUSE_PATTERNS:
+        for hit in re.finditer(clause_re, text):
+            reference = re.sub(r'\s+', '', hit.group(0))
+            if reference and len(reference) < 50:
+                record(reference, "clause", hit.start(), score=1.5)
+
+    # 4. Dates, kept exactly as matched.
+    for date_re in DATE_PATTERNS:
+        for hit in re.finditer(date_re, text):
+            record(hit.group(0), "date", hit.start(), score=1.2)
+
+    return found
+
+
+def extract_relationships(text: str) -> List[Relationship]:
+    """Return one replacement/abolition relation per match of each ``ABOLISH_PATTERNS`` entry.
+
+    The source is always "本标准"; the target is the first capture group, or
+    "未知" for a pattern without groups. The matched text becomes the context
+    and the confidence is fixed at 0.95.
+    """
+    return [
+        Relationship(
+            "本标准",
+            label,
+            hit.group(1) if hit.groups() else "未知",
+            hit.group(0),
+            0.95,
+        )
+        for regex, label in ABOLISH_PATTERNS
+        for hit in re.finditer(regex, text)
+    ]
+
+
+def extract_background_info(text: str) -> BackgroundInfo:
+    """Extract background information from ``text``.
+
+    Fills managing organisations, abolition status, applicable scope,
+    publication/effective dates, relations and document type. Extracted
+    strings are stripped, blank results are dropped and duplicates are
+    removed while keeping first-seen order.
+
+    Returns:
+        A populated ``BackgroundInfo``; fields with no match stay empty.
+    """
+    bg = BackgroundInfo()
+
+    def add_unique(bucket: List[str], value: str) -> None:
+        """Append the stripped ``value`` unless it is blank or already present."""
+        value = value.strip()
+        if value and value not in bucket:
+            bucket.append(value)
+
+    # 1. Managing organisations. The bucket is chosen from keywords that
+    #    appear in the pattern text itself.
+    bg.manage_orgs = {"主编单位": [], "参编单位": [], "解释单位": [], "批准部门": []}
+    for pattern in ORG_PATTERNS:
+        if "主编" in pattern:
+            org_type = "主编单位"
+        elif "参编" in pattern:
+            org_type = "参编单位"
+        elif "解释" in pattern or "归口" in pattern:
+            org_type = "解释单位"
+        elif "批准" in pattern:
+            org_type = "批准部门"
+        else:
+            continue
+        for m in re.findall(pattern, text):
+            org_name = m[0] if isinstance(m, tuple) else m
+            add_unique(bg.manage_orgs[org_type], org_name)
+
+    # 2. Abolition status. findall yields the captured target, or the whole
+    #    match for the pattern without a group; that match can begin with
+    #    whitespace, hence the strip in add_unique.
+    # NOTE(review): the relation label paired with each pattern is not
+    # recorded here, so a replaced standard's title and a plain "废止" both
+    # end up as bare strings -- confirm this is what consumers expect.
+    bg.abolish_status = []
+    for pattern, _ in ABOLISH_PATTERNS:
+        for m in re.findall(pattern, text):
+            status = m if isinstance(m, str) else (m[0] if m else "")
+            add_unique(bg.abolish_status, status)
+
+    # 3. Applicable scope (matches of 50 characters or more are ignored).
+    bg.scope = {"适用范围": []}
+    for pattern in SCOPE_PATTERNS:
+        for m in re.findall(pattern, text):
+            scope_text = m[0] if isinstance(m, tuple) else m
+            if scope_text and len(scope_text) < 50:
+                add_unique(bg.scope["适用范围"], scope_text)
+
+    # 4. Publication / effective dates, inferred from order of appearance:
+    #    the earliest date is taken as publication, the next as effective.
+    dates = [
+        (match.group(0), match.start())
+        for pattern in DATE_PATTERNS
+        for match in re.finditer(pattern, text)
+    ]
+    if dates:
+        dates.sort(key=lambda x: x[1])
+        bg.publish_date = dates[0][0]
+        if len(dates) >= 2:
+            bg.effective_date = dates[1][0]
+
+    # 5. Relations.
+    bg.relations = extract_relationships(text)
+
+    # 6. Document type: the first keyword occurring in the text.
+    doc_type_match = re.search(r'(国家标准|行业标准|地方标准|团体标准|企业标准|规范|规程|指南|办法)', text)
+    if doc_type_match:
+        bg.doc_type = doc_type_match.group(1)
+
+    return bg
+
+
+# ==================== 文档处理工具函数 ====================
+
+def get_context(text: str, position: int, window: int = 40) -> str:
+    """Return the text within ``window`` characters of ``position``, whitespace-collapsed, capped at 200 chars."""
+    lower = max(0, position - window)
+    upper = min(len(text), position + window)
+    # Collapse every whitespace run to a single space before trimming.
+    collapsed = re.sub(r'\s+', ' ', text[lower:upper])
+    return collapsed.strip()[:200]
+
+
+def clean_markdown_content(text: str) -> str:
+    """Reduce Markdown to plain text.
+
+    Code blocks, inline code, images and HTML tags are removed; links,
+    headings, emphasis, list items and block quotes keep their text but lose
+    their markup. Returns an empty string for empty input.
+    """
+    if not text:
+        return ""
+
+    # 1. Fenced code blocks.
+    text = re.sub(r'```[\s\S]*?```', '', text)
+    text = re.sub(r'~~~[\s\S]*?~~~', '', text)
+
+    # 2. Inline code.
+    text = re.sub(r'`[^`]*`', '', text)
+
+    # 3. Images. This must run before link handling: the link pattern also
+    #    matches the "[alt](url)" part of an image and would leave "!alt".
+    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
+
+    # 4. Links: keep the link text. Empty text is allowed so that a link that
+    #    wrapped only an image disappears together with it.
+    text = re.sub(r'\[([^\]]*)\]\([^\)]+\)', r'\1', text)
+
+    # 5. HTML tags.
+    text = re.sub(r'<[^>]+>', '', text)
+
+    # 6. Heading markers (the heading text is kept).
+    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
+
+    # 7. Emphasis markers.
+    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
+    text = re.sub(r'__([^_]+)__', r'\1', text)
+    text = re.sub(r'\*([^*]+)\*', r'\1', text)
+    text = re.sub(r'_([^_]+)_', r'\1', text)
+    text = re.sub(r'~~([^~]+)~~', r'\1', text)
+
+    # 8. Tables and list markers.
+    text = re.sub(r'\|?[\s\-:]+\|', '', text)
+    text = re.sub(r'\|', ' ', text)
+    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
+    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)
+
+    # 9. Block-quote markers.
+    text = re.sub(r'^[\s]*>\s*', '', text, flags=re.MULTILINE)
+
+    # 10. Excess whitespace.
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    text = re.sub(r'[ \t]+', ' ', text)
+
+    return text.strip()
+
+
+def parse_heading(line: str) -> Tuple[int, str]:
+    """Parse a Markdown ATX heading line.
+
+    Returns:
+        ``(level, title)`` where level is the number of leading ``#`` (1-6),
+        or ``(0, "")`` when the line is not a heading.
+    """
+    heading = re.match(r'^(#{1,6})\s+(.+)$', line.strip())
+    if heading is None:
+        return 0, ""
+    hashes, title_text = heading.groups()
+    return len(hashes), title_text.strip()
+
+
+def split_document_hierarchy(md_content: str) -> List[DocumentChunk]:
+    """Split a Markdown document into chunks, one per heading section.
+
+    Lines are scanned in order. Each heading closes the section collected so
+    far and updates a stack of open headings; non-blank body lines are
+    accumulated for the current section. A section is emitted only when its
+    cleaned text is at least 10 characters long. Every chunk carries the
+    joined heading path, e.g. "1.总则 > 1.1术语定义"; text before the first
+    heading is filed under "前言/总则". If nothing was emitted, the whole
+    cleaned document becomes a single chunk titled "全文".
+
+    Returns:
+        The chunks in document order. ``position`` on each chunk is the index
+        of the heading line that opened the section (0 before any heading).
+    """
+    chunks = []
+    heading_stack: List[Tuple[int, str]] = []  # open headings as (level, title)
+    pending_lines = []
+    section_start = 0
+
+    def emit_pending() -> None:
+        """Turn the accumulated body lines into a chunk and reset the buffer."""
+        nonlocal pending_lines
+        body_lines, pending_lines = pending_lines, []
+        if not body_lines:
+            return
+        body = clean_markdown_content('\n'.join(body_lines))
+        if len(body) < 10:
+            return
+        if heading_stack:
+            level, title = heading_stack[-1]
+            path = " > ".join([name for _, name in heading_stack])
+        else:
+            level, title, path = 0, "前言/总则", "前言/总则"
+        chunks.append(DocumentChunk(
+            content=body,
+            level=level,
+            title=title,
+            hierarchy_path=path,
+            position=section_start
+        ))
+
+    for line_no, line in enumerate(md_content.split('\n')):
+        level, title = parse_heading(line)
+
+        if level <= 0:
+            # Body line: keep it unless it is blank.
+            if line.strip():
+                pending_lines.append(line)
+            continue
+
+        # A heading closes the previous section before the stack changes, so
+        # the pending text is still attributed to the old heading path.
+        emit_pending()
+        section_start = line_no
+
+        # Drop headings at the same or a deeper level, then open the new one.
+        while heading_stack and heading_stack[-1][0] >= level:
+            heading_stack.pop()
+        heading_stack.append((level, title))
+
+    # Trailing section after the last heading.
+    emit_pending()
+
+    # Fallback: nothing qualified, so use the whole document as one chunk.
+    if not chunks and md_content.strip():
+        whole = clean_markdown_content(md_content)
+        if whole.strip():
+            chunks.append(DocumentChunk(
+                content=whole,
+                level=0,
+                title="全文",
+                hierarchy_path="全文",
+                position=0
+            ))
+
+    return chunks
+
+
+def extract_title_from_filename(file_name: str) -> str:
+    """Derive a document title from a Markdown file name.
+
+    Removes a leading run of digits (plus one optional ``_`` or ``-``),
+    removes a trailing ``.md`` extension in any letter case, and replaces the
+    remaining ``_`` / ``-`` characters with spaces.
+
+    Args:
+        file_name: File name without directory components.
+
+    Returns:
+        The title with surrounding whitespace stripped.
+    """
+    title = re.sub(r'^\d+[_\-]?', '', file_name)
+    # ``flags`` must be passed by keyword: the fourth positional argument of
+    # re.sub is ``count``, so a positional re.IGNORECASE is silently treated
+    # as count=2 and the match stays case-sensitive.
+    title = re.sub(r'\.md$', '', title, flags=re.IGNORECASE)
+    title = re.sub(r'[_\-]', ' ', title)
+    return title.strip()
+
+
+def build_backgrounds(bg_info: BackgroundInfo, hierarchy_path: str, file_name: str) -> List[str]:
+    """Render background facts about a chunk as short labelled strings.
+
+    Entries are added in a fixed order: document type, section path (skipped
+    for the "前言/总则" placeholder), publish date, effective date, the first
+    organisation of the first non-empty organisation category, the first
+    abolish-status string, and the first "适用范围" value. When none of these
+    is available a single entry naming the source file is used instead.
+
+    Returns:
+        At most five entries; anything beyond the fifth is dropped.
+    """
+    entries: List[str] = []
+
+    # 1. Document type.
+    if bg_info.doc_type:
+        entries.append(f"文档类型:{bg_info.doc_type}")
+
+    # 2. Section path, capped at 80 characters.
+    in_named_section = bool(hierarchy_path) and hierarchy_path != "前言/总则"
+    if in_named_section:
+        entries.append(f"章节位置:{hierarchy_path[:80]}")
+
+    # 3. Publish / effective dates.
+    dated = (("发布日期", bg_info.publish_date), ("实施日期", bg_info.effective_date))
+    for label, value in dated:
+        if value:
+            entries.append(f"{label}:{value}")
+
+    # 4. Only one organisation category is reported, and only its first
+    #    distinct member.
+    first_org = next(
+        ((kind, names) for kind, names in bg_info.manage_orgs.items() if names),
+        None,
+    )
+    if first_org is not None:
+        kind, names = first_org
+        joined_org = ', '.join(list(dict.fromkeys(names))[:1])[:50]
+        entries.append(f"{kind}:{joined_org}")
+
+    # 5. Abolish status: first item only, and only if it is a string.
+    if bg_info.abolish_status:
+        entries.extend(
+            f"废止状态:{status[:50]}"
+            for status in bg_info.abolish_status[:1]
+            if isinstance(status, str)
+        )
+
+    # 6. Scope of application: first distinct value only.
+    scope_values = bg_info.scope.get("适用范围")
+    if scope_values:
+        joined_scope = ', '.join(list(dict.fromkeys(scope_values))[:1])[:50]
+        entries.append(f"适用范围:{joined_scope}")
+
+    # 7. Never return an empty list: fall back to the source file name.
+    if not entries:
+        entries.append(f"来源文档:{file_name}")
+
+    return entries[:5]
+
+
+# ==================== 实体抽取主函数 ====================
+
+def merge_entities(entity_lists: List[List[Entity]]) -> List[Entity]:
+    """Merge entity lists from several extractors into one de-duplicated list.
+
+    Two entities are duplicates when their lower-cased text and entity type
+    match. The first occurrence is kept and updated in place: it takes the
+    larger weight and confidence of the pair, and the duplicate's source is
+    appended to its own with a "+" separator.
+
+    NOTE(review): reads ``weight``, ``confidence`` and ``source`` from every
+    entity; the Entity definition is outside this view, so confirm it exposes
+    all three.
+
+    Returns:
+        The surviving entities sorted by ``weight * confidence``, highest
+        first.
+    """
+    by_key: Dict[str, Entity] = {}
+
+    for group in entity_lists:
+        for candidate in group:
+            dedup_key = f"{candidate.text.lower()}:{candidate.entity_type}"
+            kept = by_key.get(dedup_key)
+            if kept is None:
+                by_key[dedup_key] = candidate
+                continue
+            # Duplicate: fold its scores and provenance into the kept entity.
+            kept.weight = max(kept.weight, candidate.weight)
+            kept.confidence = max(kept.confidence, candidate.confidence)
+            kept.source = f"{kept.source}+{candidate.source}"
+
+    return sorted(
+        by_key.values(),
+        key=lambda item: (item.weight * item.confidence),
+        reverse=True,
+    )
+
+
+def extract_all_entities(text: str, hierarchy_path: str = "", use_llm: bool = False) -> List[Entity]:
+    """Run every enabled extraction strategy on ``text`` and merge the results.
+
+    Strategies, in the order they are applied:
+    1. Rule-based extraction; always runs.
+    2. jieba extraction (top 20); runs only when an extractor exists and
+       reports ``is_ready``.
+    3. LLM extraction; runs only when ``use_llm`` is true, ``LLM_CONFIG`` has
+       a truthy "enabled" entry and the extractor reports it is available.
+       ``hierarchy_path`` is passed to it as context.
+
+    Returns:
+        The combined entities after ``merge_entities`` has de-duplicated them.
+    """
+    # 1. Rules.
+    collected = [extract_entities_rule_based(text)]
+
+    # 2. jieba.
+    jieba_ext = get_jieba_extractor()
+    if jieba_ext and jieba_ext.is_ready:
+        collected.append(jieba_ext.extract_entities(text, topk=20))
+
+    # 3. LLM (optional, gated by both the argument and the config).
+    if use_llm and LLM_CONFIG.get("enabled"):
+        llm_ext = get_llm_extractor()
+        if llm_ext.is_available():
+            collected.append(llm_ext.extract_entities(text, context=hierarchy_path))
+
+    return merge_entities(collected)
+
+
+# ==================== 全局实例 ====================
+
+# Module-level cache for the jieba extractor; stays None until
+# get_jieba_extractor() creates the instance.
+_jieba_extractor: Optional[JiebaExtractor] = None
+# Module-level cache for the LLM extractor; stays None until
+# get_llm_extractor() creates the instance.
+_llm_extractor: Optional[LLMEntityExtractor] = None
+
+
+def get_jieba_extractor() -> Optional[JiebaExtractor]:
+    """Return the shared jieba extractor, creating it on first use.
+
+    Returns:
+        The cached instance, or ``None`` when no instance exists yet and
+        ``JIEBA_AVAILABLE`` is false.
+    """
+    global _jieba_extractor
+    if _jieba_extractor is not None or not JIEBA_AVAILABLE:
+        return _jieba_extractor
+    _jieba_extractor = JiebaExtractor()
+    return _jieba_extractor
+
+
+def get_llm_extractor() -> Optional[LLMEntityExtractor]:
+    """Return the shared LLM extractor, creating it on first use."""
+    global _llm_extractor
+    if _llm_extractor is not None:
+        return _llm_extractor
+    _llm_extractor = LLMEntityExtractor()
+    return _llm_extractor
+
+
+# ==================== 导入主逻辑 ====================
+
+def _make_entity_row(
+    text: str,
+    vector: Any,
+    *,
+    file_name: str,
+    chunk: DocumentChunk,
+    backgrounds: List[str],
+    entity_type: str,
+    source: str,
+) -> Dict[str, Any]:
+    """Build one insertable row (text, dense, content, metadata) for an entity."""
+    metadata = {
+        "uuid": str(uuid.uuid4()),
+        "file": file_name,
+        "title": chunk.hierarchy_path,  # full heading path
+        "section_title": chunk.title,   # innermost heading only
+        "backgrounds": backgrounds,
+        "entity_type": entity_type,
+        "source": source,
+    }
+    return {
+        "text": text,
+        "dense": vector,
+        "content": text,
+        "metadata": json.dumps(metadata, ensure_ascii=False),
+    }
+
+
+def import_single_file(md_path: Path, embeddings, use_llm: bool = False) -> List[Dict[str, Any]]:
+    """Extract entities from one Markdown file and build the rows to insert.
+
+    The file is split into heading sections. Sections shorter than 20
+    characters are skipped. For every other section the extracted entities
+    are embedded and turned into rows; a section that yields no entities
+    contributes its heading path as a single "section" row instead (stored
+    text capped at 200 characters). Entities are de-duplicated per file on
+    (lower-cased text, entity type).
+
+    Background facts are taken from the whole document first and from the
+    section only where the document-level value is empty.
+
+    Args:
+        md_path: Path of the Markdown file.
+        embeddings: Object exposing ``embed_query(text)``.
+        use_llm: Forwarded to ``extract_all_entities``.
+
+    Returns:
+        Rows with the keys ``text``, ``dense``, ``content`` and ``metadata``
+        (a JSON string). Empty when the file cannot be read or is blank.
+    """
+    file_name = md_path.name
+
+    try:
+        md_content = md_path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as e:
+        print(f"   读取失败: {e}")
+        return []
+
+    if not md_content.strip():
+        return []
+
+    # Document-level background facts, shared by every section.
+    doc_bg_info = extract_background_info(md_content)
+    chunks = split_document_hierarchy(md_content)
+
+    all_rows = []
+    # (lower-cased text, entity type) pairs already emitted for this file.
+    file_entity_seen: Set[Tuple[str, str]] = set()
+
+    for chunk in chunks:
+        # Skip sections that are too short to be worth extracting from.
+        if len(chunk.content) < 20:
+            continue
+
+        entities = extract_all_entities(chunk.content, chunk.hierarchy_path, use_llm)
+
+        # Document-level values win; section-level values only fill gaps.
+        chunk_bg_info = extract_background_info(chunk.content)
+        merged_bg = BackgroundInfo()
+        merged_bg.doc_type = doc_bg_info.doc_type or chunk_bg_info.doc_type
+        merged_bg.publish_date = doc_bg_info.publish_date or chunk_bg_info.publish_date
+        merged_bg.effective_date = doc_bg_info.effective_date or chunk_bg_info.effective_date
+        merged_bg.manage_orgs = doc_bg_info.manage_orgs or chunk_bg_info.manage_orgs
+        merged_bg.abolish_status = doc_bg_info.abolish_status or chunk_bg_info.abolish_status
+        merged_bg.scope = doc_bg_info.scope or chunk_bg_info.scope
+
+        final_backgrounds = build_backgrounds(merged_bg, chunk.hierarchy_path, file_name)
+
+        # Each candidate is (text to embed and de-duplicate on, text to
+        # store, entity type, source).
+        if entities:
+            candidates = [
+                (ent.text.strip(), ent.text.strip(), ent.entity_type, ent.source)
+                for ent in entities
+            ]
+        else:
+            # No entities: fall back to the heading path itself.
+            path_text = chunk.hierarchy_path.strip()
+            candidates = [(path_text, path_text[:200], "section", "hierarchy")]
+
+        for embed_text, stored_text, entity_type, source in candidates:
+            # A blank entity carries no information and only pollutes the index.
+            if not embed_text:
+                continue
+
+            dedup_key = (embed_text.lower(), entity_type)
+            if dedup_key in file_entity_seen:
+                continue
+            file_entity_seen.add(dedup_key)
+
+            # Embedding is best-effort: report the failure and move on so a
+            # single bad item does not abort the file.
+            try:
+                vector = embeddings.embed_query(embed_text)
+            except Exception as e:
+                print(f"   向量生成失败: {e}")
+                continue
+
+            all_rows.append(_make_entity_row(
+                stored_text,
+                vector,
+                file_name=file_name,
+                chunk=chunk,
+                backgrounds=final_backgrounds,
+                entity_type=entity_type,
+                source=source,
+            ))
+
+    return all_rows
+
+
+def batch_insert(client, rows: List[Dict[str, Any]]) -> Tuple[int, List[Dict[str, Any]]]:
+    """Insert ``rows`` into the collection in slices of ``BATCH_SIZE``.
+
+    A slice whose insert raises is reported on stdout and collected as
+    failed in its entirety; the remaining slices are still attempted.
+
+    Returns:
+        ``(inserted_count, failed_rows)``. ``(0, [])`` for empty input.
+    """
+    if not rows:
+        return 0, []
+
+    ok_count = 0
+    rejected = []
+
+    for start in range(0, len(rows), BATCH_SIZE):
+        piece = rows[start:start + BATCH_SIZE]
+        try:
+            client.insert(collection_name=COLLECTION_NAME, data=piece)
+        except Exception as e:
+            print(f"   插入失败: {e}")
+            rejected.extend(piece)
+        else:
+            ok_count += len(piece)
+
+    return ok_count, rejected
+
+
+def import_from_folder(root_folder: str, use_llm: bool = False):
+    """Import every ``*.md`` file directly under ``root_folder``.
+
+    Sub-directories are not scanned. For each file the rows produced by
+    ``import_single_file`` are counted by entity type and inserted through
+    ``batch_insert``. A file that raises is reported with a traceback and
+    recorded as failed, and processing continues with the next file. A
+    summary is printed at the end.
+
+    The function returns early, after printing a message, when the folder is
+    missing, contains no Markdown files, or the target collection does not
+    exist.
+
+    NOTE(review): the client factory is called as ``get_milvus_client()``;
+    confirm this matches the name imported at the top of the file.
+
+    Args:
+        root_folder: Folder holding the Markdown files.
+        use_llm: Request LLM extraction; switched off again when the LLM
+            extractor reports it is unavailable.
+    """
+    root = Path(root_folder)
+    if not root.exists():
+        print(f"文件夹不存在: {root}")
+        return
+
+    print(f"扫描文件夹: {root}(不递归子目录)")
+
+    # Top-level Markdown files only.
+    md_files = [candidate for candidate in root.glob("*.md") if candidate.is_file()]
+    print(f"发现 {len(md_files)} 个MD文件")
+
+    if not md_files:
+        return
+
+    # Report which extractors are active.
+    jieba_ext = get_jieba_extractor()
+    if not (jieba_ext and jieba_ext.is_ready):
+        print("⚠️ jieba未启用,使用纯规则抽取")
+    else:
+        print("✅ jieba已启用")
+
+    if use_llm and LLM_CONFIG.get("enabled"):
+        llm_ext = get_llm_extractor()
+        if not llm_ext.is_available():
+            print("⚠️ LLM不可用,请检查配置和openai包")
+            use_llm = False
+        else:
+            print("✅ LLM增强已启用")
+
+    # Vector store client and embedding model.
+    client = get_milvus_client()
+    embeddings = get_embeddings()
+
+    if not client.has_collection(collection_name=COLLECTION_NAME):
+        print(f"Collection不存在: {COLLECTION_NAME}")
+        print("运行: uv run -m src.app.scripts.first_bfp_collection_entity_create")
+        return
+
+    client.load_collection(collection_name=COLLECTION_NAME)
+
+    # Running totals for the final summary.
+    total_entities = 0
+    total_inserted = 0
+    type_counts: Dict[str, int] = {}
+    failed_files = []
+    file_total = len(md_files)
+
+    for idx, md_path in enumerate(md_files, 1):
+        print(f"\n[{idx}/{file_total}] 处理: {md_path.name}")
+
+        try:
+            rows = import_single_file(md_path, embeddings, use_llm)
+
+            if not rows:
+                print("   无有效实体")
+            else:
+                # The entity type lives inside the JSON metadata string.
+                for row in rows:
+                    row_meta = json.loads(row.get("metadata", "{}"))
+                    etype = row_meta.get("entity_type", "unknown")
+                    type_counts[etype] = type_counts.get(etype, 0) + 1
+
+                print(f"   抽取 {len(rows)} 个实体")
+
+                inserted, failed = batch_insert(client, rows)
+                total_inserted += inserted
+                if not failed:
+                    print(f"   插入 {inserted} 条")
+                else:
+                    print(f"   {len(failed)} 条插入失败")
+
+            total_entities += len(rows)
+
+        except Exception as e:
+            print(f"   处理失败: {e}")
+            import traceback
+            traceback.print_exc()
+            failed_files.append(md_path.name)
+
+    # Summary.
+    rule = "=" * 70
+    print("\n" + rule)
+    print("导入完成")
+    print(rule)
+    print(f"处理文件: {file_total}")
+    print(f"抽取实体: {total_entities}")
+    print(f"成功插入: {total_inserted}")
+    if failed_files:
+        print(f"失败文件: {len(failed_files)}")
+    print("\n实体类型分布:")
+    for etype, count in sorted(type_counts.items(), key=lambda pair: -pair[1]):
+        print(f"   - {etype}: {count}")
+    print(rule)
+
+
+def main():
+    """Script entry point: print the banner, then import ``ROOT_FOLDER``.
+
+    LLM extraction is requested when ``LLM_CONFIG`` has a truthy "enabled"
+    entry. Any exception raised by the import is printed with its traceback
+    instead of propagating.
+    """
+    rule = "=" * 70
+    banner = (
+        rule,
+        "编制依据实体抽取与导入(V2 优化版)",
+        rule,
+        "主要改进:",
+        "  1. 层级路径标题: '1.总则 > 1.1术语定义'",
+        "  2. 移除硬编码工程术语模式",
+        "  3. 改进background信息抽取",
+        "  4. 可选LLM增强(需配置)",
+        rule,
+    )
+    for banner_line in banner:
+        print(banner_line)
+
+    # LLM usage is driven purely by configuration.
+    use_llm = LLM_CONFIG.get("enabled", False)
+
+    try:
+        import_from_folder(ROOT_FOLDER, use_llm=use_llm)
+    except Exception as e:
+        print(f"\n导入失败: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    main()