|
@@ -0,0 +1,388 @@
|
|
|
|
|
+#!/usr/bin/env python
|
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
|
+"""
|
|
|
|
|
+目录缺失检查 - 模糊匹配模块
|
|
|
|
|
+
|
|
|
|
|
+独立模块,用于 AIReviewEngine.check_outline_catalogue 方法
|
|
|
|
|
+提供基于模糊匹配的目录缺失统计功能
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import difflib
|
|
|
|
|
+import re
|
|
|
|
|
+from typing import Dict, List, Optional, Set, Tuple, Any
|
|
|
|
|
+from collections import defaultdict
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+
|
|
|
|
|
+import pandas as pd
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class OutlineCatalogueMatcher:
    """
    Fuzzy matcher for outline (table-of-contents) catalogue entries.

    Loads a standard category table (first-/second-level codes, names and
    sequence numbers) from CSV, optionally loads a table of detailed
    second-level definitions, and reports which required entries are
    present in or missing from an extracted outline. Second-level entries
    are matched first by exact ``(first_code, second_code)`` key and then
    by name similarity.
    """

    # Encodings tried, in order, when reading either CSV table.
    _CSV_ENCODINGS = ('utf-8-sig', 'utf-16', 'gbk', 'utf-8')

    # Whitespace and punctuation (ASCII, full-width and curly forms) removed
    # before titles are compared.
    _PUNCTUATION_RE = re.compile(
        r"[\s.,;:!?,。;:!?、\"'“”‘’()()【】\[\]《》<><>]"
    )

    # Single-character stopwords dropped from keyword lists.
    _STOP_CHARS = frozenset('的及与或和等之第章节条')

    # Multi-character stopwords; removed as whole phrases in one pass.
    _STOP_PHRASE_RE = re.compile('|'.join((
        '编制', '施工', '措施', '要求', '管理', '保证', '质量', '安全',
        '技术', '计划', '人员', '组织', '体系', '条件', '概述', '概况',
    )))

    def __init__(self, standard_csv_path: str, raw_content_csv_path: Optional[str] = None):
        """
        Initialise the matcher and load its reference tables.

        Args:
            standard_csv_path: Path to the standard category table CSV
                (StandardCategoryTable.csv).
            raw_content_csv_path: Optional path to the detailed definition
                CSV (construction_plan_standards.csv).

        Raises:
            ValueError: If the standard table cannot be decoded with any of
                the supported encodings.
        """
        self.standard_csv_path = standard_csv_path
        self.raw_content_csv_path = raw_content_csv_path

        # Standard data.
        self.first_names: Dict[str, str] = {}  # code -> name
        self.second_names: Dict[Tuple[str, str], str] = {}  # (first_code, second_code) -> name
        self.first_seq: Dict[str, int] = {}  # code -> seq
        self.second_seq: Dict[Tuple[str, str], int] = {}  # (first_code, second_code) -> seq

        # Detailed definition text.
        self.second_raw_content: Dict[Tuple[str, str], str] = {}  # (first_name, second_name) -> content

        self._load_standard_csv()
        if raw_content_csv_path:
            self._load_raw_content_csv()

    def _read_csv(self, path: str) -> Optional["pd.DataFrame"]:
        """Read *path* trying each supported encoding; return None if none decodes it."""
        for encoding in self._CSV_ENCODINGS:
            try:
                return pd.read_csv(path, encoding=encoding, sep=None, engine='python')
            except UnicodeError:
                # UnicodeError (not just UnicodeDecodeError): the utf-16 codec
                # raises the parent class when the file has no BOM, and that
                # must not abort the fallback chain before 'gbk' is tried.
                continue
        return None

    @staticmethod
    def _clean_cell(value: Any) -> str:
        """Return a stripped string for a CSV cell, mapping None/NaN to ''."""
        if value is None:
            return ''
        try:
            if pd.isna(value):
                return ''
        except (TypeError, ValueError):
            pass
        return str(value).strip()

    @staticmethod
    def _to_int(value: Any, default: int = 0) -> int:
        """Convert a CSV cell to int, returning *default* for missing/invalid values."""
        try:
            if value is None or pd.isna(value):
                return default
        except (TypeError, ValueError):
            return default
        try:
            return int(value)
        except (TypeError, ValueError):
            pass
        try:
            # Accept values such as '3.0'.
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return default

    def _load_standard_csv(self) -> None:
        """
        Load the standard category table into the name/sequence dictionaries.

        Rows lacking any of first_code, second_code, first_name or
        second_name are skipped. The first occurrence of a code wins.

        Raises:
            ValueError: If the file cannot be decoded with any supported
                encoding.
        """
        df = self._read_csv(self.standard_csv_path)
        if df is None:
            raise ValueError(f"无法读取CSV文件: {self.standard_csv_path}")

        df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]

        for _, row in df.iterrows():
            first_code = self._clean_cell(row.get('first_code', ''))
            second_code = self._clean_cell(row.get('second_code', ''))
            first_name = self._clean_cell(row.get('first_name', ''))
            second_name = self._clean_cell(row.get('second_name', ''))

            if not all([first_code, second_code, first_name, second_name]):
                continue

            # Parsed independently so one bad value does not reset the other.
            first_seq = self._to_int(row.get('first_seq', 0))
            second_seq = self._to_int(row.get('second_seq', 0))

            # First-level info (deduplicated).
            if first_code not in self.first_names:
                self.first_names[first_code] = first_name
                self.first_seq[first_code] = first_seq

            # Second-level info (deduplicated).
            sec_key = (first_code, second_code)
            if sec_key not in self.second_names:
                self.second_names[sec_key] = second_name
                self.second_seq[sec_key] = second_seq

    def _load_raw_content_csv(self) -> None:
        """
        Load the detailed definition table into ``second_raw_content``.

        This is best-effort: any failure is logged as a warning and leaves
        the matcher usable without detailed definitions.
        """
        import logging

        try:
            df = self._read_csv(self.raw_content_csv_path)
            if df is None:
                return

            df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]
            if 'second_raw_content' not in df.columns:
                return

            for _, row in df.iterrows():
                first_name = self._clean_cell(row.get('first_name', ''))
                second_name = self._clean_cell(row.get('second_name', ''))
                raw_content = self._clean_cell(row.get('second_raw_content', ''))

                if first_name and second_name and raw_content:
                    self.second_raw_content[(first_name, second_name)] = raw_content
        except Exception as exc:
            # A load failure must not break the main matching feature.
            logging.getLogger(__name__).warning(
                "Failed to load raw content CSV %s: %s",
                self.raw_content_csv_path, exc,
            )

    def _normalize_text(self, text: str) -> str:
        """Return *text* lower-cased with whitespace and punctuation removed."""
        if not text:
            return ""
        return self._PUNCTUATION_RE.sub('', text).lower()

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Return the SequenceMatcher ratio of the two normalised texts (0.0 if either is empty)."""
        if not text1 or not text2:
            return 0.0

        norm1 = self._normalize_text(text1)
        norm2 = self._normalize_text(text2)
        if not norm1 or not norm2:
            return 0.0

        return difflib.SequenceMatcher(None, norm1, norm2).ratio()

    def _extract_keywords(self, text: str) -> List[str]:
        """
        Return the characters of *text* that are not stopwords.

        Multi-character stopwords are removed as whole phrases first, then
        single-character stopwords and whitespace are dropped. If nothing
        is left, every character of *text* is returned so that a title made
        up only of stopwords can still be compared.
        """
        if not text:
            return []

        remaining = self._STOP_PHRASE_RE.sub('', text)
        words = [
            ch for ch in remaining
            if ch not in self._STOP_CHARS and ch.strip()
        ]
        return words if words else list(text)

    def _calculate_enhanced_similarity(
        self,
        standard_name: str,
        actual_title: str,
        standard_raw_content: str = None
    ) -> float:
        """
        Compute a similarity score dominated by the base string similarity.

        Strategy:
        1. Base similarity (SequenceMatcher) is the core; it must be >= 0.3
           before any bonus is considered.
        2. Keyword overlap (Jaccard over keyword sets) adds up to +0.2.
        3. Containment of one normalised text in the other adds +0.1.
        4. Similarity to the detailed definition, when above 0.6, adds up
           to +0.2.

        Args:
            standard_name: Required entry name from the standard table.
            actual_title: Title found in the outline.
            standard_raw_content: Optional detailed definition text.

        Returns:
            A score capped at 1.0. When the base similarity is below 0.3
            the base similarity itself is returned, so unrelated titles
            cannot be lifted over the threshold by bonuses.
        """
        if not standard_name or not actual_title:
            return 0.0

        # 1. Base similarity (core).
        base_similarity = self._calculate_similarity(standard_name, actual_title)
        if base_similarity < 0.3:
            return base_similarity

        norm_standard = self._normalize_text(standard_name)
        norm_actual = self._normalize_text(actual_title)
        total_score = base_similarity

        # 2. Keyword overlap bonus (weight 0.2).
        standard_keywords = set(self._extract_keywords(norm_standard))
        actual_keywords = set(self._extract_keywords(norm_actual))
        if standard_keywords and actual_keywords:
            union_size = len(standard_keywords | actual_keywords)
            if union_size > 0:
                overlap = len(standard_keywords & actual_keywords)
                total_score += (overlap / union_size) * 0.2

        # 3. Containment bonus (weight 0.1).
        if norm_standard in norm_actual or norm_actual in norm_standard:
            total_score += 0.1

        # 4. Detailed definition bonus (weight 0.2); only counted when the
        #    definition is clearly related to the title (> 0.6).
        if standard_raw_content and standard_raw_content != 'nan':
            raw_content_score = self._calculate_similarity(
                standard_raw_content, actual_title
            )
            if raw_content_score > 0.6:
                total_score += raw_content_score * 0.2

        return min(total_score, 1.0)

    def _second_order(self, key: Tuple[str, str]) -> Tuple[int, int, Tuple[str, str]]:
        """Sort key for second-level entries: first seq, second seq, then the key itself."""
        return (self.first_seq.get(key[0], 0), self.second_seq.get(key, 0), key)

    def match_catalogue(
        self,
        outline_first: Set[str],
        outline_secondary: Dict[Tuple[str, str], str],
        threshold: float = 0.6
    ) -> Dict[str, Any]:
        """
        Match an extracted outline against the standard catalogue.

        Args:
            outline_first: First-level codes extracted from the outline
                (any iterable of codes is accepted).
            outline_secondary: Second-level entries extracted from the
                outline, as ``{(first_code, second_code): title}``.
            threshold: Minimum similarity for a fuzzy match (default 0.6).

        Returns:
            A dict with:
            - matched_first: set of matched first-level codes
            - matched_second: set of matched second-level keys
            - missing_first: list of details for missing first-level entries
            - missing_second: list of details for missing second-level entries
            - missing_first_count / missing_second_count: their sizes
            - match_details: one record per required second-level entry,
              in standard-table order, with match_type 'exact', 'fuzzy'
              or 'none'
        """
        required_first = set(self.first_names)
        required_second = set(self.second_names)
        outline_secondary = outline_secondary or {}

        # First level: exact code membership.
        matched_first = set(outline_first or ()) & required_first
        missing_first = required_first - matched_first

        # Second level, step 1: exact key matches.
        outline_second_keys = set(outline_secondary)
        exact_matches = outline_second_keys & required_second
        matched_second = set(exact_matches)
        missing_second = set()
        match_details = []

        # Sorted iteration keeps the result order reproducible across runs.
        for key in sorted(exact_matches, key=self._second_order):
            first_code, second_code = key
            match_details.append({
                'level': 'second',
                'required_first_code': first_code,
                'required_second_code': second_code,
                'required_second_name': self.second_names.get(key, ''),
                'matched': True,
                'match_type': 'exact',
                'similarity': 1.0
            })

        # Second level, step 2: fuzzy-match whatever is left on both sides.
        candidate_titles = [
            outline_secondary.get(key) or ""
            for key in sorted(
                outline_second_keys - exact_matches,
                key=lambda k: tuple(str(part) for part in k),
            )
        ]

        for req_key in sorted(required_second - exact_matches, key=self._second_order):
            first_code, second_code = req_key
            second_name = self.second_names.get(req_key, '')
            first_name = self.first_names.get(first_code, '')
            full_name = f"{first_name}{second_name}"

            # Detailed definition, if one was loaded for this entry.
            raw_content = self.second_raw_content.get((first_name, second_name))

            best_title = None
            best_score = 0.0
            for title in candidate_titles:
                # The raw-content score is never lower than the name-only
                # score, so a separate name-only call is unnecessary.
                score = max(
                    self._calculate_enhanced_similarity(second_name, title, raw_content),
                    self._calculate_enhanced_similarity(full_name, title),
                )
                if score > best_score:
                    best_score = score
                    best_title = title

            detail = {
                'level': 'second',
                'required_first_code': first_code,
                'required_second_code': second_code,
                'required_second_name': second_name,
            }
            # Require an actual candidate so a non-positive threshold cannot
            # report a match that has no matched title.
            if best_title is not None and best_score >= threshold:
                matched_second.add(req_key)
                detail.update({
                    'matched': True,
                    'match_type': 'fuzzy',
                    'similarity': best_score,
                    'matched_title': best_title,
                    'used_raw_content': raw_content is not None
                })
            else:
                missing_second.add(req_key)
                detail.update({
                    'matched': False,
                    'match_type': 'none',
                    'similarity': best_score
                })
            match_details.append(detail)

        # Build the missing-entry details in standard-table order.
        missing_first_details = [
            {
                'first_code': code,
                'first_name': self.first_names.get(code, code),
                'first_seq': self.first_seq.get(code, 0)
            }
            for code in sorted(
                missing_first,
                key=lambda c: (self.first_seq.get(c, 0), c),
            )
        ]

        missing_second_details = []
        for key in sorted(missing_second, key=self._second_order):
            first_code, second_code = key
            missing_second_details.append({
                'first_code': first_code,
                'first_name': self.first_names.get(first_code, first_code),
                'first_seq': self.first_seq.get(first_code, 0),
                'secondary_code': second_code,
                'secondary_name': self.second_names.get(key, ''),
                'second_seq': self.second_seq.get(key, 0)
            })

        return {
            'matched_first': matched_first,
            'matched_second': matched_second,
            'missing_first': missing_first_details,
            'missing_second': missing_second_details,
            'missing_first_count': len(missing_first),
            'missing_second_count': len(missing_second),
            'match_details': match_details
        }
|