|
@@ -0,0 +1,1198 @@
|
|
|
|
|
+"""
|
|
|
|
|
+编制依据实体抽取与导入脚本(jieba 版)
|
|
|
|
|
+
|
|
|
|
|
+功能:
|
|
|
|
|
+1. 实体抽取:使用 jieba 分词 + 词性标注 + TF-IDF 关键词提取 + 规则补充
|
|
|
|
|
+2. 关系抽取:基于规则模式匹配
|
|
|
|
|
+3. 背景信息:废止状态、管理单位、适用范围
|
|
|
|
|
+
|
|
|
|
|
+字段结构:
|
|
|
|
|
+- text: 实体文本(用于 BM25 检索)
|
|
|
|
|
+- dense: 实体向量
|
|
|
|
|
+- content: 与 text 内容相同
|
|
|
|
|
+- metadata: JSON 字符串 {uuid, file, title, backgrounds}
|
|
|
|
|
+ - backgrounds 不能为空
|
|
|
|
|
+
|
|
|
|
|
+依赖:
|
|
|
|
|
+ uv add jieba
|
|
|
|
|
+
|
|
|
|
|
+用法:
|
|
|
|
|
+ uv run -m src.app.scripts.first_bfp_collection_entity_import
|
|
|
|
|
+"""
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+import json
|
|
|
|
|
+import re
|
|
|
|
|
+import uuid
|
|
|
|
|
+import warnings
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from typing import Any, Dict, List, Optional, Tuple, Set
|
|
|
|
|
+from dataclasses import dataclass, asdict
|
|
|
|
|
+from collections import Counter
|
|
|
|
|
+import math
|
|
|
|
|
+
|
|
|
|
|
+from app.config.embeddings import get_embeddings
|
|
|
|
|
+from app.config.milvus_client import get_milvusclient
|
|
|
|
|
+
|
|
|
|
|
# Target collection name (presumably the Milvus collection the script
# writes to -- confirm against the import code).
COLLECTION_NAME = "first_bfp_collection_entity"

# Path of the source folder.
ROOT_FOLDER = r"F:\第二阶段编制依据及施工方案数据治理-20260206\133"

# Number of rows per batch insert.
BATCH_SIZE = 100
|
|
|
|
|
+
|
|
|
|
|
+# jieba 依赖
|
|
|
|
|
+try:
|
|
|
|
|
+ import jieba
|
|
|
|
|
+ import jieba.posseg as pseg
|
|
|
|
|
+ JIEBA_AVAILABLE = True
|
|
|
|
|
+except ImportError:
|
|
|
|
|
+ JIEBA_AVAILABLE = False
|
|
|
|
|
+ warnings.warn("jieba not installed. Using rule-based extraction only. Run: uv add jieba")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class Entity:
    """One extracted entity plus where and how it was found."""

    # Entity surface text.
    text: str
    # Category label such as "keyword", "technical_term" or "standard_name".
    entity_type: str
    # Character offset of the entity in the text it was extracted from.
    position: int
    # Snippet of text surrounding the entity occurrence.
    context: str = ""
    # Which extractor produced the entity: jieba / rule / combined.
    source: str = "rule"
    # Ranking weight; extractors assign fixed values or a TF-IDF score.
    weight: float = 1.0
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class Relationship:
    """A directed relation between two named things found in a document."""

    # Subject of the relation.
    source: str
    # Label describing the kind of relation.
    relation_type: str
    # Object of the relation.
    target: str
    # Text fragment the relation was read from.
    context: str = ""
    # Confidence score attached by the extractor.
    confidence: float = 1.0
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass
class BackgroundInfo:
    """Document-level background facts gathered by the rule extractors."""

    # Abolition / replacement markers found in the text.
    abolish_status: List[str]
    # Organisation names grouped by role label.
    manage_orgs: Dict[str, List[str]]
    # Applicability hints grouped by scope category.
    scope: Dict[str, List[str]]
    # Relations extracted from the same text.
    relations: List[Relationship]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ==================== jieba 工具类 ====================
|
|
|
|
|
+
|
|
|
|
|
class JiebaExtractor:
    """Singleton entity extractor built on jieba.

    Candidates come from four passes over the text: POS-tagged named
    entities, runs of adjacent noun-like tokens, a heuristic TF-IDF score and
    a TextRank score. Every public extraction method returns an empty list
    when jieba is not installed (``JIEBA_AVAILABLE`` is false).
    """

    _instance = None
    _initialized = False

    # Function words and boilerplate terms that never count as entities.
    STOP_WORDS = {
        '的', '了', '在', '是', '和', '与', '及', '或', '等', '本', '第', '之', '为', '有',
        '而', '于', '以', '及其', '该', '这', '那', '此', '其', '个', '中', '上', '下',
        '后', '前', '内', '外', '将', '应', '可', '按', '根据', '按照', '依据', '有关',
        '相关', '规定', '要求', '所述', '所示', '其中', '如下', '如下所述',
        '分别', '不得', '必须', '需要', '应当', '可以', '禁止', '允许',
    }

    # Domain vocabulary registered with jieba and boosted in scoring.
    DOMAIN_WORDS = {
        '混凝土', '钢筋', '预应力', '桥梁', '隧道', '路基', '路面', '涵洞',
        '边坡', '基坑', '桩基', '墩柱', '梁体', '支座', '伸缩缝', '挡土墙',
        '施工', '检测', '监测', '设计', '验收', '养护', '抗震', '承载力',
        '稳定性', '变形', '沉降', '抗剪', '抗弯', '裂缝', '焊接', '浇筑',
        '张拉', '压浆', '注浆', '爆破', '开挖', '支护', '地基处理',
        '安全检查', '脚手架', '模板', '高处作业', '临时用电', '起重机械',
        '文明施工', '扬尘治理', '绿色施工', '质量管理', '安全生产',
    }

    # jieba POS flag -> entity type label.
    POS_MAPPING = {
        'n': 'noun',            # noun
        'nr': 'person',         # person name
        'ns': 'location',       # place name
        'nt': 'organization',   # organisation name
        'nz': 'term',           # other proper noun
        'vn': 'verb_noun',      # verbal noun
        'an': 'adj_noun',       # adjectival noun
        's': 'space',           # locative word
        'f': 'direction',       # direction word
        't': 'time',            # time word
    }

    # POS flags emitted directly as named entities.
    _NER_FLAGS = ('nr', 'ns', 'nt', 'nz', 's', 'f', 't')
    # Flags that may start a phrase / continue one.
    _PHRASE_HEAD_FLAGS = ('n', 'vn', 'an')
    _PHRASE_TAIL_FLAGS = ('n', 'vn', 'an', 'v')

    def __new__(cls):
        """Return the single shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self):
        """Load the jieba configuration once, the first time jieba is usable."""
        if not self._initialized and JIEBA_AVAILABLE:
            self._load_models()
            self._initialized = True

    def _load_models(self):
        """Register the domain vocabulary with jieba and try to enable Paddle.

        Failures are reported on stdout and never raised: extraction is
        best-effort and must not abort the import run.
        """
        try:
            print("🔄 加载 jieba 分词器...")

            for word in self.DOMAIN_WORDS:
                jieba.add_word(word, freq=1000)

            try:
                jieba.enable_paddle()
                print("✅ jieba 加载完成(启用 Paddle 模式)")
            except Exception:
                # Was a bare ``except:``; narrowed so that KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                print("✅ jieba 加载完成(基础模式)")

        except Exception as e:
            print(f"⚠️ jieba 加载失败: {e}")

    @property
    def is_ready(self) -> bool:
        """Whether jieba could be imported."""
        return JIEBA_AVAILABLE

    def extract_entities(self, text: str, topk: int = 50) -> List[Entity]:
        """Extract entities from *text*, sorted by descending weight.

        Args:
            text: Text to analyse.
            topk: Number of TF-IDF keywords to consider; TextRank uses
                ``topk // 2``.

        Returns:
            Entities deduplicated on their cleaned text. An empty list when
            jieba is unavailable. If an extraction step raises, the error is
            printed and the entities collected so far are returned.
        """
        if not self.is_ready:
            return []

        entities: List[Entity] = []
        seen: Set[str] = set()

        try:
            # Normalise to plain tuples: jieba ``pair`` objects can be
            # unpacked but do not define ``__getitem__``, so indexing them
            # (as the phrase scan needs to) would raise.
            tokens = [tuple(token) for token in pseg.cut(text)]

            index = 0
            while index < len(tokens):
                word, flag = tokens[index]

                if len(word) < 2 or word in self.STOP_WORDS:
                    index += 1
                    continue

                if flag in self._NER_FLAGS:
                    self._record(
                        entities, seen, text, word,
                        self.POS_MAPPING.get(flag, 'term'), "jieba_ner", 2.0,
                    )

                if not flag.startswith(self._PHRASE_HEAD_FLAGS):
                    index += 1
                    continue

                # Always advances: the returned index is at least index + 1.
                phrase, index = self._collect_phrase(tokens, index)
                if len(phrase) >= 2:
                    phrase_text = ''.join(phrase)
                    if 4 <= len(self._strip_numbering(phrase_text)) <= 30:
                        self._record(
                            entities, seen, text, phrase_text,
                            "technical_term", "jieba_phrase", 1.5,
                        )

            self._record_keywords(
                entities, seen, text,
                self.extract_keywords(text, topk=topk), "jieba_tfidf",
            )
            self._record_keywords(
                entities, seen, text,
                self.extract_textrank(text, topk=topk // 2), "jieba_textrank",
            )

        except Exception as e:
            print(f"⚠️ jieba 实体抽取失败: {e}")

        entities.sort(key=lambda x: x.weight, reverse=True)
        return entities

    def _collect_phrase(self, tokens, start):
        """Return ``(words, next_index)`` for the phrase starting at *start*.

        The phrase is the run of adjacent noun/verb-like tokens. It stops at
        the first stop word instead of skipping it, so the joined phrase is
        always a contiguous substring of the source text.
        """
        words = [tokens[start][0]]
        cursor = start + 1
        while cursor < len(tokens):
            next_word, next_flag = tokens[cursor]
            if not next_flag.startswith(self._PHRASE_TAIL_FLAGS):
                break
            if next_word in self.STOP_WORDS:
                break
            words.append(next_word)
            cursor += 1
        return words, cursor

    @staticmethod
    def _strip_numbering(surface: str) -> str:
        """Strip a numbering prefix, falling back to the input if nothing is left."""
        return clean_number_prefix(surface) or surface

    def _record(self, entities, seen, text, surface, entity_type, source, weight):
        """Append an entity for *surface* unless it is a duplicate or cannot be located.

        ``seen`` is only updated once the entity is actually stored, so a
        candidate that cannot be located does not block later ones.
        """
        cleaned = self._strip_numbering(surface)
        if cleaned in seen:
            return
        position = text.find(surface)
        if position < 0:
            return
        seen.add(cleaned)
        entities.append(Entity(
            text=cleaned,
            entity_type=entity_type,
            position=position,
            context=self._get_context(text, position, 30),
            source=source,
            weight=weight,
        ))

    def _record_keywords(self, entities, seen, text, scored_words, source):
        """Record ``(word, weight)`` pairs as keyword entities."""
        for word, weight in scored_words:
            cleaned = self._strip_numbering(word)
            if len(cleaned) >= 2 and cleaned not in self.STOP_WORDS:
                self._record(entities, seen, text, word, "keyword", source, weight)

    def _content_words(self, text: str) -> List[str]:
        """Tokenise *text* and drop short tokens, stop words and pure digits."""
        return [
            w for w in jieba.cut(text)
            if len(w) >= 2 and w not in self.STOP_WORDS and not w.isdigit()
        ]

    def extract_keywords(self, text: str, topk: int = 20) -> List[Tuple[str, float]]:
        """Return the *topk* ``(word, score)`` pairs ranked by TF x heuristic IDF.

        Returns an empty list when jieba is unavailable, when no content word
        remains after filtering, or when scoring raises (the error is printed).
        """
        if not self.is_ready:
            return []

        try:
            words = self._content_words(text)
            if not words:
                return []

            total_words = len(words)
            idf_scores = self._calculate_idf(words)
            tfidf_scores = {
                word: (count / total_words) * idf_scores.get(word, 1.0)
                for word, count in Counter(words).items()
            }
            ranked = sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True)
            return ranked[:topk]

        except Exception as e:
            print(f"⚠️ TF-IDF 提取失败: {e}")
            return []

    def extract_textrank(self, text: str, topk: int = 10) -> List[Tuple[str, float]]:
        """Return the *topk* ``(word, rank)`` pairs from a TextRank pass.

        Returns an empty list when jieba is unavailable, when fewer than three
        content words remain, or when ranking raises (the error is printed).
        """
        if not self.is_ready:
            return []

        try:
            return self._rank_words(self._content_words(text), topk)
        except Exception as e:
            print(f"⚠️ TextRank 提取失败: {e}")
            return []

    @staticmethod
    def _rank_words(words, topk, window_size=5, damping=0.85, max_iter=30, min_diff=0.0001):
        """Run TextRank over a co-occurrence graph built from *words*.

        Two different words are linked when one appears within
        ``window_size - 1`` positions after the other. The vocabulary is kept
        in first-appearance order so that ties are ordered deterministically
        rather than by set iteration order.
        """
        if len(words) < 3:
            return []

        vocab = list(dict.fromkeys(words))
        graph = {word: {} for word in vocab}
        for i, left in enumerate(words):
            for right in words[i + 1:i + window_size]:
                if left != right:
                    graph[left][right] = graph[left].get(right, 0) + 1
                    graph[right][left] = graph[right].get(left, 0) + 1

        # Total edge weight per node is loop-invariant; compute it once.
        out_weight = {word: sum(edges.values()) for word, edges in graph.items()}

        ranks = dict.fromkeys(vocab, 1.0)
        for _ in range(max_iter):
            new_ranks = {}
            max_diff = 0
            for word in vocab:
                rank = (1 - damping)
                for neighbor, weight in graph[word].items():
                    neighbor_sum = out_weight[neighbor]
                    if neighbor_sum > 0:
                        rank += damping * weight * ranks[neighbor] / neighbor_sum
                new_ranks[word] = rank
                max_diff = max(max_diff, abs(rank - ranks[word]))

            ranks = new_ranks
            if max_diff < min_diff:
                break

        return sorted(ranks.items(), key=lambda x: x[1], reverse=True)[:topk]

    def _calculate_idf(self, words: List[str]) -> Dict[str, float]:
        """Return a heuristic stand-in for IDF for every distinct word.

        No corpus statistics are used. The score starts at 1.0, is multiplied
        by 1.2 for words of 2-6 characters or by 0.8 for words longer than 10,
        and by 1.5 for words listed in ``DOMAIN_WORDS``.
        """
        idf_scores = {}
        for word in set(words):
            base_idf = 1.0

            if 2 <= len(word) <= 6:
                base_idf *= 1.2
            elif len(word) > 10:
                base_idf *= 0.8

            if word in self.DOMAIN_WORDS:
                base_idf *= 1.5

            idf_scores[word] = base_idf

        return idf_scores

    def _get_context(self, text: str, position: int, window: int = 30) -> str:
        """Return the slice of *text* within *window* characters of *position*."""
        start = max(0, position - window)
        end = min(len(text), position + window)
        return text[start:end]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Module-wide extractor instance, filled in on first use.
_jieba_extractor: Optional[JiebaExtractor] = None


def get_jieba_extractor() -> Optional[JiebaExtractor]:
    """Return the shared JiebaExtractor, creating it lazily.

    Returns None for as long as jieba is unavailable and no extractor has
    been created.
    """
    global _jieba_extractor
    if _jieba_extractor is not None or not JIEBA_AVAILABLE:
        return _jieba_extractor
    _jieba_extractor = JiebaExtractor()
    return _jieba_extractor
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ==================== 规则抽取模块 ====================
|
|
|
|
|
+
|
|
|
|
|
# Standard titles: text wrapped in Chinese book-title marks.
STANDARD_NAME_PATTERN = r'《([^》]{2,100}?)》'

# Standard numbers, one pattern per issuing-body prefix.
STANDARD_NUMBER_PATTERNS = [
    r'GB\s*/?T?\s*\d+[\./\-]?\d*(?:[-–—]\d{4})?',
    # Optional slash so that the "JTG/T 3650-2020" form matches as well.
    r'JTG\s*/?\s*[TD]?\s*\d+[\./\-]?\d*[a-zA-Z]?',
    r'JTJ\s*\d+[\./\-]?\d*',
    r'JGJ\s*\d+[\./\-]?\d*',
    r'CJJ\s*\d+[\./\-]?\d*',
    r'TB\s*\d+[\./\-]?\d*',
    r'SL\s*\d+[\./\-]?\d*',
    r'DL\s*/?T?\s*\d+[\./\-]?\d*',
    r'NB\s*/?T?\s*\d+[\./\-]?\d*',
    r'HG\s*/?T?\s*\d+[\./\-]?\d*',
    # ASCII or full-width colon (written as an escape so the class survives
    # character normalisation of this file).
    r'CECS\s*\d+[:\uff1a]?\d*',
    r'T/[A-Z]+\s*\d+[\./\-]?\d*',
    r'DB\d{2,3}[/\-]T?\s*\d+[\./\-]?\d*',
    r'Q/[A-Z]+\s*\d+[\./\-]?\d*',
    r'建标\s*\d+[\./\-]?\d*',
]

# Clause references.
CLAUSE_PATTERNS = [
    r'第\s*[一二三四五六七八九十百千]+\s*条',
    r'第\s*\d+\s*条',
    r'第\s*\d+\.\d+\s*条',
    r'第\s*\d+\.\d+\.\d+\s*条',
    # Parenthesised item number, ASCII or full-width brackets.
    r'[\(\uff08]\s*\d+\s*[\)\uff09]',
]
|
|
|
|
|
+
|
|
|
|
|
# Engineering terminology: a domain head word, a short lazy filler, and a
# category suffix.
TECH_TERM_PATTERNS = [
    # engineering object types
    r'(?:公路|桥梁|隧道|路基|路面|涵洞|边坡|基坑|桩基|墩柱|梁体|涵洞|挡土墙|护坡|排水|支挡)[\w\s]{0,10}?(?:工程|结构|设施|系统)',
    # design / calculation topics
    r'(?:抗震|承载力|稳定性|变形|沉降|承载能力|抗剪|抗弯|抗冲切|局部稳定|疲劳|裂缝)[\w\s]{0,10}?(?:计算|设计|验算|分析|控制|校核)',
    r'(?:设计|计算|验算)[\w\s]{0,10}?(?:公式|方法|模型|参数|标准|规范|准则|规定)',
    # materials
    r'(?:混凝土|钢筋|钢材|沥青|水泥|砂石|外加剂|掺合料|预应力筋)[\w\s]{0,10}?(?:强度|等级|性能|配比|用量|标号|规格)',
    # processes / construction methods
    r'(?:浇筑|张拉|压浆|焊接|检测|监测|养护|支护|开挖|爆破|注浆|灌浆)[\w\s]{0,10}?(?:工艺|方法|标准|要求|技术|规范)',
    # ground / foundations
    r'(?:地基|基础|支挡|防护|排水|围堰|支护)[\w\s]{0,10}?(?:设计|处理|加固|施工|工程)',
    # geological hazards
    r'(?:液化|沉陷|滑坡|崩塌|泥石流|地震|岩溶|采空区|软土|湿陷性黄土)[\w\s]{0,10}?(?:处理|防治|评价|分析|地段)',
    # structural members
    r'(?:梁|板|柱|墙|拱|索|缆|锚|支座|伸缩缝|护栏|标线|标志)[\w\s]{0,5}?(?:结构|构件|部件|构造)',
]

# Safety accident types: optional severity, optional accident kind, "事故".
SAFETY_TERM_PATTERNS = [
    r'(?:特大|重大|较大|一般)?(?:交通|火灾|瓦斯爆炸|透水|坍塌|冒顶片帮|放炮|火药爆炸|锅炉爆炸|容器爆炸|其他爆炸|中毒和窒息|高处坠落|物体打击|机械伤害|起重伤害|触电|淹溺|灼烫|其他)?(?:安全)?事故',
]

# Administrative bodies and organisations.
ORG_PATTERNS = [
    r'(?:交通运输部?|住建部?|水利部?|工信部?|发改委|质检总局?|应急管理部?|自然资源部?)',
    r'(?:中国|中交|中铁|中建|中冶|中水|中港)[\w\s]{2,20}?(?:研究院|设计院|工程局|公司|集团)',
    r'(?:各省|自治区|直辖市)交通运输厅?',
    r'[\u4e00-\u9fa5]{2,8}(?:省|市|自治区)\s*(?:交通运输厅|住建厅|水利厅)',
]
|
|
|
|
|
+
|
|
|
|
|
# Each entry is (regex, relation label). Labels are emitted at runtime and
# must stay unchanged. Full-width punctuation is written as \uXXXX escapes
# (U+FF1A colon, U+FF0C comma) so the classes survive normalisation.

# Publication relations.
PUBLISH_PATTERNS = [
    (r'由\s*([^,\uff0c。\n]{2,30}?)\s*(?:发布|制定|颁发|出台)', '由发布'),
    (r'根据\s*《([^》]+)》\s*(?:制定|编制|发布)', '根据制定'),
]

# Replacement / abolition relations.
REPLACE_PATTERNS = [
    (r'代替\s*《([^》]+)》', '代替标准'),
    (r'(?:自\s*[\d年月日\-]+\s*起)?\s*废止', '已废止'),
    (r'已被\s*《([^》]+)》\s*代替', '被标准代替'),
    # "GB", "GB/T" or "GBT", with an optional "-year" suffix. The previous
    # character class "[/T]?" accepted only one of "/" or "T", so the common
    # "GB/T 50010-2010" form never matched.
    (r'被\s*(GB\s*/?T?\s*\d+[\-]?\d*)\s*代替', '被编号标准代替'),
]

# Management relations.
MANAGE_PATTERNS = [
    (r'(?:主编单位|主编部门)[:\uff1a]\s*([^,\uff0c。\n]{2,50})', '主编单位'),
    (r'(?:参编单位|参编部门)[:\uff1a]\s*([^,\uff0c。\n]{2,50})', '参编单位'),
    (r'(?:解释单位|解释部门|技术归口)[:\uff1a]\s*([^,\uff0c。\n]{2,50})', '解释单位'),
    (r'由\s*([^,\uff0c。]{2,30})\s*负责解释', '负责解释'),
    (r'归口单位[:\uff1a]\s*([^,\uff0c。\n]{2,50})', '归口单位'),
]

# Citation relations; the optional trailing group swallows a standard number
# that follows the title.
REFERENCE_PATTERNS = [
    (r'应符合\s*《([^》]+)》\s*(?:GB\s*/?T?\s*\d+[\-]?\d*)?\s*的?规定', '应符合'),
    (r'应遵守\s*《([^》]+)》\s*(?:GB\s*/?T?\s*\d+[\-]?\d*)?', '应遵守'),
    (r'参照\s*《([^》]+)》\s*(?:JTG\s*/?T?\s*\d+[\-]?\d*)?', '参照'),
    (r'引用\s*《([^》]+)》', '引用'),
    (r'依据\s*《([^》]+)》', '依据'),
]
|
|
|
|
|
+
|
|
|
|
|
# Applicability hints, keyed by scope category. Every capture group of a
# pattern is collected by the background extractor.
SCOPE_PATTERNS = {
    # project type
    '工程类型': [
        r'适用(?:于)?\s*(?:新建|改建|扩建)?\s*([公路桥梁隧道路基路面]{2,8}\s*工程?)',
        r'([公路桥梁隧道路基路面涵洞]{2,6})\s*(?:的)?\s*(?:设计|施工|验收|检测)',
    ],
    # region
    '地区': [
        r'适用(?:于)?\s*(全国|各省|自治区|直辖市)',
        r'适用(?:于)?\s*([\u4e00-\u9fa5]{2,8}省|[\u4e00-\u9fa5]{2,8}市|[^。,]{2,10}地区)',
    ],
    # project stage
    '阶段': [
        r'(设计|施工|验收|勘察|检测|养护|监理|招投标)\s*阶段',
        r'适用(?:于)?\s*([^。,]{2,10})\s*(?:的)?\s*(设计|施工|验收|勘察)',
    ],
}

# Abolition / replacement markers.
ABOLISH_PATTERNS = [
    r'自\s*(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{2}-\d{2})\s*起\s*废止',
    r'已被?\s*《([^》]+)》\s*代替',
    r'代替\s*《([^》]+)》',
    r'已\s*废止',
    r'自\s*[\d年月日]+\s*起\s*实施[^。]*原[^。]*(?:废止|代替)',
]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_entities_rule_based(text: str) -> List[Entity]:
    """Extract entities from *text* with the module's regex rule sets.

    Six rule families are applied in a fixed order: standard titles, standard
    numbers, clause references, technical terms, safety terms and
    organisations. Numbering prefixes are stripped from every entity except
    titles, numbers and clauses. Entities are deduplicated per
    ``(type, text)`` and returned in discovery order.
    """
    collected: List[Entity] = []
    registered: Set[str] = set()
    keeps_numbering = ('standard_name', 'standard_number', 'clause')

    def register(value: str, kind: str, offset: int, weight: float) -> None:
        # Titles, numbers and clauses are stored as matched; everything else
        # loses a leading numbering prefix first.
        if kind not in keeps_numbering:
            value = clean_number_prefix(value)
        if not value or len(value) < 2:
            return
        dedupe_key = f"{kind}:{value}"
        if dedupe_key in registered:
            return
        registered.add(dedupe_key)
        collected.append(Entity(
            text=value,
            entity_type=kind,
            position=offset,
            context=get_context(text, offset, 40),
            source="rule",
            weight=weight,
        ))

    def hits(patterns, flags=0):
        # All matches of the first pattern, then of the second, and so on.
        for pattern in patterns:
            yield from re.finditer(pattern, text, flags)

    def squeeze(fragment: str) -> str:
        return re.sub(r'\s+', '', fragment)

    # 1. standard titles
    for hit in re.finditer(STANDARD_NAME_PATTERN, text):
        name = hit.group(1).strip()
        if 2 <= len(name) <= 100:
            register(f"《{name}》", "standard_name", hit.start(), 3.0)

    # 2. standard numbers (case-insensitive, stored upper-cased without spaces)
    for hit in hits(STANDARD_NUMBER_PATTERNS, re.IGNORECASE):
        number = squeeze(hit.group(0)).upper()
        if len(number) >= 3:
            register(number, "standard_number", hit.start(), 2.5)

    # 3. clause references
    for hit in hits(CLAUSE_PATTERNS):
        clause = squeeze(hit.group(0))
        if clause and len(clause) < 50:
            register(clause, "clause", hit.start(), 1.5)

    # 4. technical terms
    for hit in hits(TECH_TERM_PATTERNS):
        term = squeeze(hit.group(0))
        if 4 <= len(term) <= 50:
            register(term, "technical_term", hit.start(), 2.0)

    # 5. safety accident types
    for hit in hits(SAFETY_TERM_PATTERNS):
        term = hit.group(0).strip()
        if 4 <= len(term) <= 30:
            register(term, "safety_term", hit.start(), 1.8)

    # 6. organisations
    for hit in hits(ORG_PATTERNS):
        org = hit.group(0).strip()
        if 4 <= len(org) <= 50:
            register(org, "organization", hit.start(), 2.0)

    return collected
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def merge_entities(jieba_entities: List[Entity], rule_entities: List[Entity]) -> List[Entity]:
    """Merge both entity lists, dropping duplicates.

    Entities are duplicates when their lower-cased text and their type
    coincide; the one with the highest weight is kept (ties: jieba before
    rule, then input order). The result is ordered by position. Neither
    input list is modified.
    """
    by_weight = sorted(
        jieba_entities + rule_entities,
        key=lambda ent: ent.weight,
        reverse=True,
    )

    unique = {}
    for ent in by_weight:
        # setdefault keeps the first (heaviest) entity seen for each key.
        unique.setdefault(f"{ent.text.lower()}:{ent.entity_type}", ent)

    return sorted(unique.values(), key=lambda ent: ent.position)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_relationships(text: str) -> List[Relationship]:
    """Extract rule-based relations whose subject is always "本标准".

    The four rule families are applied in order: publication, replacement,
    management, citation. Each match becomes one Relationship whose context
    is the matched text. The relation target is the pattern's first capture
    group; patterns without a group fall back as described per family below.
    """
    relations: List[Relationship] = []

    def scan(rules, confidence, resolve_target):
        for pattern, rel_type in rules:
            for hit in re.finditer(pattern, text):
                relations.append(Relationship(
                    source="本标准",
                    relation_type=rel_type,
                    target=resolve_target(hit),
                    context=hit.group(0),
                    confidence=confidence,
                ))

    # Publication: fall back to the whole match.
    scan(PUBLISH_PATTERNS, 0.9,
         lambda hit: hit.group(1) if hit.groups() else hit.group(0))

    # Replacement / abolition: fall back to the "unknown" placeholder.
    scan(REPLACE_PATTERNS, 0.95,
         lambda hit: hit.group(1) if hit.groups() else "未知")

    # Management: captured organisation, stripped.
    scan(MANAGE_PATTERNS, 0.9,
         lambda hit: hit.group(1).strip() if hit.groups() else "未知")

    # Citation: captured title, re-wrapped in book-title marks.
    scan(REFERENCE_PATTERNS, 0.85,
         lambda hit: "《" + (hit.group(1).strip() if hit.groups() else "未知") + "》")

    return relations
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_background_info(text: str) -> BackgroundInfo:
    """Collect abolition markers, managing organisations, scope hints and relations.

    Every non-empty capture of the abolition and scope patterns is kept, in
    pattern order. Organisations are grouped under the four fixed role
    labels below.
    """

    def captures(found):
        # re.findall yields strings (zero or one group) or tuples (several
        # groups); flatten both shapes and drop empty captures.
        for hit in found:
            if isinstance(hit, tuple):
                yield from (part for part in hit if part)
            elif hit:
                yield hit

    # Abolition status
    abolish_status = [
        item
        for pattern in ABOLISH_PATTERNS
        for item in captures(re.findall(pattern, text))
    ]

    # Managing organisations
    # NOTE(review): MANAGE_PATTERNS labels with no bucket here (the visible
    # list contains '负责解释') are silently dropped -- confirm this is intended.
    manage_orgs = {"主编单位": [], "参编单位": [], "解释单位": [], "归口单位": []}
    for pattern, org_type in MANAGE_PATTERNS:
        for hit in re.findall(pattern, text):
            org_name = hit[0] if isinstance(hit, tuple) else hit
            if org_name and org_type in manage_orgs:
                manage_orgs[org_type].append(org_name.strip())

    # Applicability scope
    scope = {"工程类型": [], "地区": [], "阶段": []}
    for scope_type, patterns in SCOPE_PATTERNS.items():
        for pattern in patterns:
            scope[scope_type].extend(captures(re.findall(pattern, text)))

    return BackgroundInfo(abolish_status, manage_orgs, scope, extract_relationships(text))
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def get_context(text: str, position: int, window: int = 40) -> str:
    """Return a whitespace-normalised snippet of *text* around *position*.

    The snippet covers *window* characters on each side, clipped to the text
    bounds. Runs of whitespace collapse to one space, the ends are trimmed
    and the result is capped at 200 characters.
    """
    lower = max(0, position - window)
    upper = min(len(text), position + window)
    snippet = re.sub(r'\s+', ' ', text[lower:upper])
    return snippet.strip()[:200]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Numbering prefixes, applied in this order. Full-width brackets and dot are
# written as \uXXXX escapes (U+FF08, U+FF09, U+FF0E) so the character classes
# cover both the ASCII and the full-width forms used in Chinese documents.
_NUMBER_PREFIX_RULES = tuple(re.compile(rule) for rule in (
    r'^\s*\d+(?:[.\uff0e]\d+)+[.\uff0e]?\s*',                            # 2.0.1  3.1.2.
    r'^\s*\d+[.\uff0e]\s*',                                               # 1.  10.
    r'^\s*[(\uff08]\d+[)\uff09]\s*',                                      # (1)
    r'^\s*[(\uff08][a-zA-Z][)\uff09]\s*',                                 # (a) (A)
    r'^\s*\d+[)\uff09]\s*',                                               # 1)
    r'^\s*[a-zA-Z][)\uff09]\s*',                                          # a) A)
    r'^\s*[(\uff08][一二三四五六七八九十百千]+[)\uff09]\s*',                # (一)
    r'^\s*[一二三四五六七八九十百千]+[、.\uff0e]\s*',                       # 一、
))


def clean_number_prefix(text: str) -> str:
    """Strip leading list/section numbering from *text*.

    Handled forms, each with ASCII or full-width punctuation: multi-level
    numbers (``2.0.1``), simple numbers (``1.``), bracketed digits or letters
    (``(1)``, ``(a)``), right-bracket items (``1)``, ``a)``) and Chinese
    ordinals (``(一)``, ``一、``). The rules run one after another, so stacked
    prefixes are all removed.

    Returns:
        The stripped text without surrounding whitespace. Falsy input is
        returned unchanged. When stripping leaves fewer than 3 characters
        although the trimmed input had more than 3, the trimmed input is
        returned instead, on the assumption that the "prefix" was content.
    """
    if not text:
        return text

    original = text.strip()

    for rule in _NUMBER_PREFIX_RULES:
        text = rule.sub('', text)

    stripped = text.strip()
    if len(stripped) < 3 and len(original) > 3:
        return original
    return stripped
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def clean_markdown_content(text: str) -> str:
    """Reduce Markdown to plain text, keeping the readable content.

    Steps, in order:
      1. drop fenced code blocks (backtick and tilde fences)
      2. drop inline code spans
      3. drop images
      4. replace links by their link text
      5. drop autolinks and HTML tags
      6. drop heading markers
      7. unwrap bold / italic / strikethrough
      8. drop table delimiter rows, turn remaining pipes into spaces
      9. drop block-quote markers
      10. drop bullet and ordered-list markers
      11. strip numbering prefixes line by line, dropping empty lines
      12. collapse surplus blank lines and spaces

    Returns an empty string for falsy input.
    """
    if not text:
        return ""

    # 1. fenced code blocks
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'~~~[\s\S]*?~~~', '', text)

    # 2. inline code
    text = re.sub(r'`[^`]*`', '', text)

    # 3. images. These must go before links: the link rule also matches the
    #    "[alt](url)" tail of an image and would leave "!alt" behind.
    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)

    # 4. links -> link text. Empty link text is accepted so that a linked
    #    image, reduced to "[](href)" by step 3, disappears as well.
    text = re.sub(r'\[([^\]]*)\]\([^\)]+\)', r'\1', text)

    # 5. autolinks <url> and HTML tags share one rule
    text = re.sub(r'<[^>]+>', '', text)

    # 6. heading markers
    text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)

    # 7. emphasis
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)  # **bold**
    text = re.sub(r'__([^_]+)__', r'\1', text)      # __bold__
    text = re.sub(r'\*([^*]+)\*', r'\1', text)      # *italic*
    text = re.sub(r'_([^_]+)_', r'\1', text)        # _italic_
    text = re.sub(r'~~([^~]+)~~', r'\1', text)      # ~~strikethrough~~

    # 8. table delimiter rows such as "| --- | :---: |". The rule is anchored
    #    to whole lines; the earlier unanchored form also swallowed " |" cell
    #    separators and the newline in front of a row, merging table rows.
    text = re.sub(
        r'^[ \t]*\|?(?:[ \t]*:?-+:?[ \t]*\|)+[ \t]*(?::?-+:?)?[ \t]*$',
        '',
        text,
        flags=re.MULTILINE,
    )
    text = re.sub(r'\|', ' ', text)  # remaining cell separators

    # 9. block-quote markers
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)

    # 10. list markers
    text = re.sub(r'^[\s]*[-*+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)

    # 11. per-line numbering prefixes; lines that end up empty are dropped
    cleaned_lines = []
    for line in text.split('\n'):
        cleaned_line = clean_number_prefix(line)
        if cleaned_line:
            cleaned_lines.append(cleaned_line)
    text = '\n'.join(cleaned_lines)

    # 12. surplus blank lines and runs of spaces/tabs
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def split_document(md_content: str) -> List[Tuple[str, str]]:
    """Split a Markdown document into ``(title, content)`` chunks.

    The document is cut in front of every heading of level two or deeper.
    A chunk's title is its heading with the numbering prefix removed, or
    "前言/总则" when the chunk does not start with such a heading. Chunks with
    fewer than 10 characters of cleaned content are dropped. If nothing is
    left, the whole cleaned document is returned as a single "全文" chunk.
    """
    chunks: List[Tuple[str, str]] = []

    for piece in re.split(r'\n(?=##+\s)', md_content):
        section = piece.strip()
        if not section:
            continue

        heading = re.match(r'##+\s+(.+)\n', section)
        if heading is None:
            title = "前言/总则"
        else:
            raw_title = heading.group(1).strip()
            # Keep the raw heading if cleaning strips everything.
            title = clean_number_prefix(raw_title) or raw_title

        # Remove heading lines, then Markdown syntax, then blank lines.
        body = re.sub(r'^#+\s+.+\n?', '', section, flags=re.MULTILINE)
        body = clean_markdown_content(body)
        body = re.sub(r'\n+', '\n', body).strip()

        if len(body) >= 10:
            chunks.append((title, body))

    # Nothing survived the split: fall back to the whole document.
    if not chunks and md_content.strip():
        whole = clean_markdown_content(md_content)
        if whole.strip():
            chunks.append(("全文", whole))

    return chunks
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_title_from_filename(file_name: str) -> str:
    """Derive a title from a file name.

    Leading digits and a trailing ``.md`` extension (any letter case) are
    removed and the remainder is stripped of surrounding whitespace.
    """
    title = re.sub(r'^\d+', '', file_name)
    # ``flags`` must be passed by keyword: the fourth positional argument of
    # re.sub is ``count``, so a positional re.IGNORECASE was read as count=2
    # and the match stayed case-sensitive.
    title = re.sub(r'\.md$', '', title, flags=re.IGNORECASE)
    return title.strip()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def build_backgrounds(bg_info: BackgroundInfo, doc_title: str, file_name: str) -> List[str]:
    """Render extracted background info as a short, never-empty list of strings.

    Entries are appended in priority order: abolition status, managing
    organisations, applicable scope (project type, then phase) and selected
    relations. When none of those produce anything, two generic entries built
    from ``doc_title`` and ``file_name`` are used instead.

    Args:
        bg_info: Background info; the attributes ``abolish_status``,
            ``manage_orgs``, ``scope`` and ``relations`` are read.
        doc_title: Title used by the fallback entry.
        file_name: Source file name used by the fallback entry.

    Returns:
        At most five background strings.
    """

    def first_two_unique(values):
        # De-duplicate while keeping first-seen order, then keep two items.
        return list(dict.fromkeys(values))[:2]

    items: List[str] = []

    # 1. Abolition status (highest priority); non-string entries are ignored.
    items.extend(
        f"废止状态:{state}"
        for state in (bg_info.abolish_status or [])[:2]
        if isinstance(state, str)
    )

    # 2. Managing organisations, one entry per organisation kind.
    for org_kind, org_names in bg_info.manage_orgs.items():
        if not org_names:
            continue
        items.append(f"{org_kind}:{', '.join(first_two_unique(org_names))}")

    # 3. Applicable scope.
    project_types = bg_info.scope["工程类型"]
    if project_types:
        items.append(f"适用工程类型:{', '.join(first_two_unique(project_types))}")

    phases = bg_info.scope["阶段"]
    if phases:
        items.append(f"适用阶段:{', '.join(first_two_unique(phases))}")

    # 4. Relations of the selected types, with long targets shortened.
    key_relation_types = ('代替标准', '被标准代替', '应符合', '根据制定')
    key_relations = [
        rel for rel in bg_info.relations if rel.relation_type in key_relation_types
    ]
    for rel in key_relations[:2]:
        if len(rel.target) > 50:
            shown = rel.target[:50] + "..."
        else:
            shown = rel.target
        items.append(f"{rel.relation_type}:{shown}")

    # 5. Fallback so the result is never empty.
    if not items:
        items += [f"编制依据文件:{doc_title}", f"来源文档:{file_name}"]

    return items[:5]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def extract_all_entities(text: str) -> List[Entity]:
    """Extract entities from ``text`` with jieba (when ready) plus the rules.

    Args:
        text: Text of one chunk.

    Returns:
        The result of ``merge_entities`` applied to the jieba-based entities
        (an empty list when the extractor is missing or not ready) and the
        rule-based entities.
    """
    extractor = get_jieba_extractor()
    use_jieba = bool(extractor and extractor.is_ready)

    from_jieba = extractor.extract_entities(text) if use_jieba else []
    from_rules = extract_entities_rule_based(text)

    # Merge both sources and drop duplicates.
    return merge_entities(from_jieba, from_rules)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _build_entity_row(
    text: str,
    vector: Any,
    file_name: str,
    title: str,
    backgrounds: List[str],
) -> Dict[str, Any]:
    """Assemble one insertable row for a single entity.

    ``content`` duplicates ``text``; ``metadata`` is serialised to a JSON
    string and carries a freshly generated random UUID.
    """
    metadata = {
        "uuid": str(uuid.uuid4()),
        "file": file_name,
        "title": title,
        "backgrounds": backgrounds,
    }
    return {
        "text": text,
        "dense": vector,
        "content": text,
        "metadata": json.dumps(metadata, ensure_ascii=False),
    }


def import_single_file(md_path: Path, embeddings) -> List[Dict[str, Any]]:
    """Extract entities from one Markdown file and build the rows to insert.

    The file is split into chunks; every chunk contributes its extracted
    entities, or its own title when no entity was found. Entities are
    de-duplicated within the file on ``(lower-cased text, entity type)``.

    Args:
        md_path: Path of the Markdown file.
        embeddings: Object exposing ``embed_query(text)``, used to compute the
            ``dense`` vector of every row.

    Returns:
        A list of row dicts with the keys ``text``, ``dense``, ``content`` and
        ``metadata``. The list is empty when the file cannot be read or is
        blank. Entities whose embedding fails are reported and skipped.
    """
    file_name = md_path.name
    doc_title = extract_title_from_filename(file_name)

    try:
        # utf-8-sig reads plain UTF-8 as usual but drops a leading BOM, which
        # would otherwise stay glued to the first line of the document.
        md_content = md_path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeError) as e:
        print(f" 读取失败: {e}")
        return []

    if not md_content.strip():
        return []

    # Document-level background info, used when a chunk has too little.
    doc_backgrounds = build_backgrounds(
        extract_background_info(md_content), doc_title, file_name
    )

    all_rows: List[Dict[str, Any]] = []
    # Per-file de-duplication: key = (lower-cased entity text, entity type).
    file_entity_seen: Set[Tuple[str, str]] = set()

    for chunk_title, chunk_text in split_document(md_content):
        entities = extract_all_entities(chunk_text)

        chunk_backgrounds = build_backgrounds(
            extract_background_info(chunk_text), chunk_title, file_name
        )
        # NOTE(review): build_backgrounds' fallback always yields two entries,
        # so this only picks doc_backgrounds when the chunk produced exactly
        # one entry. Confirm whether "chunk has no real background info" was
        # the intended trigger for the document-level fallback.
        final_backgrounds = (
            chunk_backgrounds if len(chunk_backgrounds) > 1 else doc_backgrounds
        )

        # Candidates are (stored text, entity type, metadata title).
        if entities:
            candidates = [
                (entity.text.strip(), entity.entity_type, chunk_title)
                for entity in entities
            ]
        else:
            # No entity found: the chunk title stands in as the entity. It is
            # truncated first so the vector is computed from the stored text.
            title_text = chunk_title.strip()
            candidates = [(title_text[:200], "chunk_title", title_text)]

        for entity_text, entity_type, meta_title in candidates:
            if not entity_text:
                # Nothing to embed or search for.
                continue

            dedup_key = (entity_text.lower(), entity_type)
            if dedup_key in file_entity_seen:
                continue
            file_entity_seen.add(dedup_key)

            try:
                vector = embeddings.embed_query(entity_text)
            except Exception as e:
                # The embedding backend is external and its exception types
                # are not known here; report the failure and keep going.
                print(f" 向量生成失败: {e}")
                continue

            all_rows.append(
                _build_entity_row(
                    entity_text, vector, file_name, meta_title, final_backgrounds
                )
            )

    return all_rows
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def batch_insert(client, rows: List[Dict[str, Any]]) -> Tuple[int, List[Dict[str, Any]]]:
    """Insert ``rows`` into the collection in batches of ``BATCH_SIZE``.

    A failing batch is reported and collected; the remaining batches are
    still attempted.

    Args:
        client: Object exposing ``insert(collection_name=..., data=...)``.
        rows: Rows to insert.

    Returns:
        ``(inserted_count, failed_rows)`` where ``failed_rows`` holds every
        row of each batch whose insert raised.
    """
    if not rows:
        return 0, []

    ok_count = 0
    rejected: List[Dict[str, Any]] = []

    batches = (
        rows[start:start + BATCH_SIZE]
        for start in range(0, len(rows), BATCH_SIZE)
    )
    for chunk in batches:
        try:
            client.insert(collection_name=COLLECTION_NAME, data=chunk)
        except Exception as e:
            print(f" 插入失败: {e}")
            rejected.extend(chunk)
        else:
            ok_count += len(chunk)

    return ok_count, rejected
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _infer_entity_type(text: str) -> str:
    """Guess an entity type from the surface form of ``text`` (statistics only)."""
    if text.startswith("《"):
        return "standard_name"
    if re.match(r'^[A-Z]{2,}', text):
        return "standard_number"
    if text.startswith("第"):
        return "clause"
    return "term"


def import_from_folder(root_folder: str):
    """Import every ``*.md`` file directly inside ``root_folder``.

    Sub-directories are not scanned. For each file the entity rows are built
    with ``import_single_file`` and inserted with ``batch_insert``; a summary
    is printed at the end. The function returns early (after printing a
    message) when the folder is missing, contains no Markdown file, or the
    target collection does not exist.

    Args:
        root_folder: Directory that holds the Markdown files.

    Returns:
        None.
    """
    root = Path(root_folder)
    if not root.is_dir():
        print(f"文件夹不存在: {root}")
        return

    print(f"扫描文件夹: {root}(不递归子目录)")

    # Only the folder itself is scanned; sorting makes the processing order
    # (and therefore the log) reproducible, since glob order is arbitrary.
    md_files = sorted(p for p in root.glob("*.md") if p.is_file())
    print(f"发现 {len(md_files)} 个 MD 文件")

    if not md_files:
        return

    # Report whether jieba-based extraction is available.
    jieba_ext = get_jieba_extractor()
    if jieba_ext and jieba_ext.is_ready:
        print("jieba 已启用")
    else:
        print("jieba 未启用,使用纯规则抽取")
        print("建议安装: uv add jieba")

    # Set up the Milvus client and the embedding model.
    client = get_milvusclient()
    embeddings = get_embeddings()

    if not client.has_collection(collection_name=COLLECTION_NAME):
        print(f"Collection 不存在: {COLLECTION_NAME}")
        print("运行: uv run -m src.app.scripts.first_bfp_collection_entity_create")
        return

    client.load_collection(collection_name=COLLECTION_NAME)

    # Running totals for the final summary.
    total_entities = 0
    total_inserted = 0
    entity_type_stats: Counter = Counter()
    failed_files: List[str] = []

    for idx, md_path in enumerate(md_files, 1):
        print(f"\n[{idx}/{len(md_files)}] 处理: {md_path.name}")

        try:
            rows = import_single_file(md_path, embeddings)

            if rows:
                # The rows do not carry the entity type, so it is inferred
                # from the text for the statistics.
                entity_type_stats.update(
                    _infer_entity_type(row.get("text", "")) for row in rows
                )

                print(f" 抽取 {len(rows)} 个实体")

                inserted, failed = batch_insert(client, rows)
                total_inserted += inserted
                if failed:
                    print(f" {len(failed)} 条插入失败")
                else:
                    print(f" 插入 {inserted} 条")
            else:
                print(" 无有效实体")

            total_entities += len(rows)

        except Exception as e:
            # Top-level boundary: one broken file must not stop the run.
            print(f" 处理失败: {e}")
            failed_files.append(md_path.name)

    # Summary.
    print("\n" + "=" * 70)
    print("导入完成")
    print("=" * 70)
    print(f"处理文件: {len(md_files)}")
    print(f"抽取实体: {total_entities}")
    print(f"成功插入: {total_inserted}")
    if failed_files:
        print(f"失败文件: {len(failed_files)}")
    print("\n实体类型分布:")
    # most_common() orders by descending count and keeps first-seen order
    # for ties.
    for etype, count in entity_type_stats.most_common():
        print(f" - {etype}: {count}")
    print("=" * 70)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def main():
    """Print the banner and run the import for ``ROOT_FOLDER``.

    Any exception raised by the import is reported together with its
    traceback instead of being propagated.
    """
    rule = "=" * 70
    banner = (
        rule,
        "编制依据实体抽取与导入(jieba 版)",
        rule,
        "实体抽取: jieba 分词 + TF-IDF + TextRank + 规则补充",
        "关系抽取: 规则模式匹配",
        "字段结构: text, dense, content(=text), metadata",
        rule,
    )
    for line in banner:
        print(line)

    try:
        import_from_folder(ROOT_FOLDER)
    except Exception as exc:
        print(f"\n导入失败: {exc}")
        # Imported lazily: only needed on the failure path.
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
|