|
|
@@ -0,0 +1,662 @@
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import pandas as pd
|
|
|
+import json
|
|
|
+import re
|
|
|
+from pathlib import Path
|
|
|
+from typing import List, Dict, Tuple, Optional
|
|
|
+import numpy as np
|
|
|
+from openai import OpenAI
|
|
|
+from datetime import datetime
|
|
|
+import warnings
|
|
|
+
|
|
|
+warnings.filterwarnings('ignore')
|
|
|
+
|
|
|
+
|
|
|
+class FileMatcher:
|
|
|
+ def __init__(self, search_dirs: List[str], output_dir: str = "matched_files"):
|
|
|
+ """
|
|
|
+ 初始化文件匹配器
|
|
|
+
|
|
|
+ Args:
|
|
|
+ search_dirs: 要搜索的目录列表
|
|
|
+ output_dir: 匹配文件输出目录
|
|
|
+ """
|
|
|
+ self.search_dirs = [Path(dir_path) for dir_path in search_dirs]
|
|
|
+ self.output_dir = Path(output_dir)
|
|
|
+ self.output_dir.mkdir(exist_ok=True, parents=True)
|
|
|
+
|
|
|
+ # 存储所有找到的文件路径
|
|
|
+ self.all_files = []
|
|
|
+ self.file_info = {} # 文件名 -> [文件路径列表]
|
|
|
+
|
|
|
+ # 初始化LLM客户端
|
|
|
+ self.llm_client = None
|
|
|
+ self.use_llm = False
|
|
|
+
|
|
|
+ # 结果存储
|
|
|
+ self.results = []
|
|
|
+
|
|
|
+ def scan_directories(self):
|
|
|
+ """扫描所有目录,收集文件信息"""
|
|
|
+ print("正在扫描目录...")
|
|
|
+ for search_dir in self.search_dirs:
|
|
|
+ if not search_dir.exists():
|
|
|
+ print(f"警告: 目录不存在: {search_dir}")
|
|
|
+ continue
|
|
|
+
|
|
|
+ for root, dirs, files in os.walk(search_dir):
|
|
|
+ for file in files:
|
|
|
+ file_path = Path(root) / file
|
|
|
+ self.all_files.append(file_path)
|
|
|
+
|
|
|
+ # 存储文件名到路径的映射(支持多个同名文件)
|
|
|
+ file_name = file_path.name
|
|
|
+ file_stem = file_path.stem
|
|
|
+
|
|
|
+ # 同时存储完整文件名和主文件名
|
|
|
+ if file_name not in self.file_info:
|
|
|
+ self.file_info[file_name] = []
|
|
|
+ self.file_info[file_name].append(file_path)
|
|
|
+
|
|
|
+ if file_stem not in self.file_info:
|
|
|
+ self.file_info[file_stem] = []
|
|
|
+ self.file_info[file_stem].append(file_path)
|
|
|
+
|
|
|
+ print(f"共扫描到 {len(self.all_files)} 个文件")
|
|
|
+ print(f"索引了 {len(self.file_info)} 个不同的文件名")
|
|
|
+
|
|
|
+ def extract_main_name(self, reference: str) -> str:
|
|
|
+ """
|
|
|
+ 从编制依据名称中提取主要文件名
|
|
|
+
|
|
|
+ Args:
|
|
|
+ reference: 原始编制依据名称
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 提取后的主要文件名
|
|
|
+ """
|
|
|
+ # 去除各种括号和其中的内容
|
|
|
+ patterns = [
|
|
|
+ r'(.*?)', r'\(.*?\)', # 中文和英文括号
|
|
|
+ r'【.*?】', r'\[.*?\]', # 方括号
|
|
|
+ r'978[-–]\d[-–]\d{4}[-–]\d{4}[-–]\d', # ISBN号
|
|
|
+ r'978\d{10,13}', # ISBN号(无分隔符)
|
|
|
+ r'主席令.*?\d+号', # 主席令
|
|
|
+ r'国务院令.*?\d+号', # 国务院令
|
|
|
+ r'交通部令.*?\d+号', # 部委令
|
|
|
+ r'国家安监总局令.*?\d+号', # 安监总局令
|
|
|
+ r'GB[/\s]*T?\s*\d+[-–]\d+', # 国家标准号
|
|
|
+ r'JGJ[/\s]*\d+[-–]\d+', # 行业标准号
|
|
|
+ r'JTG[/\s]*T?\s*\d+[-–]\d+', # 交通行业标准号
|
|
|
+ r'JT[/\s]*T?\s*\d+[-–]\d+', # 交通标准号
|
|
|
+ ]
|
|
|
+
|
|
|
+ cleaned = reference
|
|
|
+ for pattern in patterns:
|
|
|
+ cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
|
|
|
+
|
|
|
+ # 提取《》中的内容
|
|
|
+ book_match = re.search(r'《([^》]+)》', cleaned)
|
|
|
+ if book_match:
|
|
|
+ cleaned = book_match.group(1)
|
|
|
+
|
|
|
+ # 清理多余字符
|
|
|
+ cleaned = re.sub(r'[《》\"\'\[\]()()【】]', '', cleaned)
|
|
|
+ cleaned = cleaned.strip(' -–\t\n')
|
|
|
+
|
|
|
+ # 如果名称太长,取前部分
|
|
|
+ if len(cleaned) > 50:
|
|
|
+ cleaned = cleaned[:50]
|
|
|
+
|
|
|
+ return cleaned if cleaned else reference
|
|
|
+
|
|
|
+ def exact_match(self, target_name: str) -> List[Tuple[Path, float]]:
|
|
|
+ """
|
|
|
+ 精确匹配文件名
|
|
|
+
|
|
|
+ Args:
|
|
|
+ target_name: 目标文件名
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 匹配到的文件路径列表和相似度(1.0表示完全匹配)
|
|
|
+ """
|
|
|
+ matched_files = []
|
|
|
+
|
|
|
+ # 尝试匹配完整文件名
|
|
|
+ for file_name, file_paths in self.file_info.items():
|
|
|
+ if target_name in file_name or file_name in target_name:
|
|
|
+ similarity = self.calculate_similarity(target_name, file_name)
|
|
|
+ for file_path in file_paths:
|
|
|
+ matched_files.append((file_path, similarity))
|
|
|
+
|
|
|
+ # 如果找到匹配,按相似度排序
|
|
|
+ if matched_files:
|
|
|
+ matched_files.sort(key=lambda x: x[1], reverse=True)
|
|
|
+ return matched_files
|
|
|
+
|
|
|
+ return []
|
|
|
+
|
|
|
+ def calculate_similarity(self, str1: str, str2: str) -> float:
|
|
|
+ """
|
|
|
+ 计算两个字符串的相似度(简单版)
|
|
|
+
|
|
|
+ Args:
|
|
|
+ str1: 字符串1
|
|
|
+ str2: 字符串2
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 相似度分数 0-1
|
|
|
+ """
|
|
|
+ # 转换为小写进行比较
|
|
|
+ s1 = str1.lower()
|
|
|
+ s2 = str2.lower()
|
|
|
+
|
|
|
+ # 如果完全相等
|
|
|
+ if s1 == s2:
|
|
|
+ return 1.0
|
|
|
+
|
|
|
+ # 如果一个是另一个的子串
|
|
|
+ if s1 in s2 or s2 in s1:
|
|
|
+ return 0.9
|
|
|
+
|
|
|
+ # 计算共同字符的比例
|
|
|
+ set1 = set(s1)
|
|
|
+ set2 = set(s2)
|
|
|
+ intersection = set1.intersection(set2)
|
|
|
+ union = set1.union(set2)
|
|
|
+
|
|
|
+ if not union:
|
|
|
+ return 0.0
|
|
|
+
|
|
|
+ return len(intersection) / len(union)
|
|
|
+
|
|
|
    def llm_semantic_match(self, target_name: str, candidate_files: List[Path]) -> List[Tuple[Path, float]]:
        """
        Score candidate files against the target name with an LLM.

        Pre-filters the candidates with the cheap character-set similarity,
        sends the surviving file names to a chat-completion model, and asks
        for per-candidate similarity scores as a JSON object.

        Args:
            target_name: Name to match against.
            candidate_files: Candidate file paths to rank.

        Returns:
            ``(path, score)`` pairs sorted by score descending; empty list
            when the LLM is disabled, there are no candidates, the API call
            fails, or the reply cannot be parsed as JSON.
        """
        if not self.use_llm or not candidate_files:
            return []

        try:
            # Cap the candidate count to keep the prompt small and cheap.
            max_candidates = 20
            if len(candidate_files) > max_candidates:
                # Pre-filter with the cheap character-set similarity.
                candidates_with_scores = []
                for file_path in candidate_files:
                    file_name = file_path.name
                    score = self.calculate_similarity(target_name, file_name)
                    candidates_with_scores.append((file_path, score))

                # Keep the top max_candidates by pre-filter score.
                candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
                candidate_files = [c[0] for c in candidates_with_scores[:max_candidates]]

            # Candidate names shown to the model; parent dir added for context.
            candidate_names = [f"{file_path.name} (来自: {file_path.parent.name})"
                               for file_path in candidate_files]

            # Build the prompt (model-facing text, deliberately in Chinese).
            prompt = f"""请分析以下目标文件名与候选文件名之间的语义相似度。

目标文件名: "{target_name}"

候选文件名列表:
{chr(10).join([f"{i + 1}. {name}" for i, name in enumerate(candidate_names)])}

请根据文件名的语义内容(不考虑扩展名、路径等),评估每个候选文件与目标文件的相似程度。
返回格式要求: JSON对象,包含"matches"数组,每个元素包含"index"(候选文件序号,从1开始)、"similarity_score"(0-1之间的相似度分数)、"reason"(简要理由)。
只返回JSON,不要有其他内容。"""

            response = self.llm_client.chat.completions.create(
                model="Qwen/Qwen3-30B-A3B",  # or e.g. "gpt-4" / "gpt-3.5-turbo"
                messages=[
                    {"role": "system",
                     "content": "你是一个专业的文档管理助手,擅长分析文件名的语义相似度。请准确评估相似度并返回JSON格式结果。"},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=1000,
                extra_body={"enable_thinking": False}  # provider-specific option passed via extra_body
            )

            result_text = response.choices[0].message.content

            # Parse the JSON reply.
            try:
                # Strip a possible markdown code fence around the JSON.
                result_text = result_text.strip()
                if result_text.startswith('```json'):
                    result_text = result_text[7:]
                if result_text.endswith('```'):
                    result_text = result_text[:-3]
                result_text = result_text.strip()

                result_json = json.loads(result_text)
                matches = result_json.get("matches", [])

                # Map the model's 1-based indices back onto candidate paths,
                # silently dropping out-of-range indices.
                results = []
                for match in matches:
                    idx = match.get("index", 0) - 1  # convert to 0-based index
                    score = match.get("similarity_score", 0)

                    if 0 <= idx < len(candidate_files):
                        results.append((candidate_files[idx], float(score)))

                # Best candidates first.
                results.sort(key=lambda x: x[1], reverse=True)
                return results

            except json.JSONDecodeError as e:
                print(f"LLM返回的JSON解析失败: {e}")
                print(f"返回内容: {result_text}")
                return []

        except Exception as e:
            # Best-effort: any API/transport failure degrades to "no matches".
            print(f"LLM匹配出错: {e}")
            return []
|
|
|
+
|
|
|
+ def find_best_matches(self, target_name: str, use_semantic: bool = True) -> List[Tuple[Path, float]]:
|
|
|
+ """
|
|
|
+ 查找最佳匹配文件
|
|
|
+
|
|
|
+ Args:
|
|
|
+ target_name: 目标文件名
|
|
|
+ use_semantic: 是否使用语义匹配
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 匹配的文件路径和相似度列表(按相似度降序排列)
|
|
|
+ """
|
|
|
+ # 1. 首先尝试精确匹配
|
|
|
+ exact_matches = self.exact_match(target_name)
|
|
|
+
|
|
|
+ if exact_matches:
|
|
|
+ print(f" ✓ 找到 {len(exact_matches)} 个精确/部分匹配")
|
|
|
+ return exact_matches
|
|
|
+
|
|
|
+ # 2. 如果没有精确匹配,使用语义匹配
|
|
|
+ if use_semantic and self.use_llm:
|
|
|
+ print(f" ⚡ 使用LLM进行语义匹配...")
|
|
|
+ semantic_matches = self.llm_semantic_match(target_name, self.all_files)
|
|
|
+
|
|
|
+ if semantic_matches:
|
|
|
+ print(f" ⚡ 找到 {len(semantic_matches)} 个语义匹配")
|
|
|
+ return semantic_matches
|
|
|
+
|
|
|
+ return []
|
|
|
+
|
|
|
+ def copy_matched_files(self, matches: List[Tuple[Path, float]], reference: str, seq_num: int):
|
|
|
+ """
|
|
|
+ 复制匹配的文件到输出目录
|
|
|
+
|
|
|
+ Args:
|
|
|
+ matches: 匹配的文件列表
|
|
|
+ reference: 原始编制依据
|
|
|
+ seq_num: 序号
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 复制的文件路径列表
|
|
|
+ """
|
|
|
+ copied_files = []
|
|
|
+
|
|
|
+ # 为每个编制依据创建子目录
|
|
|
+ safe_ref = re.sub(r'[<>:"/\\|?*]', '_', reference[:50])
|
|
|
+ ref_dir = self.output_dir / f"{seq_num:02d}_{safe_ref}"
|
|
|
+ ref_dir.mkdir(exist_ok=True)
|
|
|
+
|
|
|
+ # 复制文件(限制最多复制5个)
|
|
|
+ max_copy = min(5, len(matches))
|
|
|
+ for i, (file_path, similarity) in enumerate(matches[:max_copy]):
|
|
|
+ try:
|
|
|
+ # 创建带相似度信息的文件名
|
|
|
+ dest_name = f"{similarity:.3f}_{file_path.name}"
|
|
|
+ dest_path = ref_dir / dest_name
|
|
|
+ shutil.copy2(file_path, dest_path)
|
|
|
+ copied_files.append((dest_path, similarity))
|
|
|
+
|
|
|
+ # 同时创建不带相似度信息的副本
|
|
|
+ simple_dest = ref_dir / file_path.name
|
|
|
+ if not simple_dest.exists():
|
|
|
+ shutil.copy2(file_path, simple_dest)
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ print(f" 复制文件失败 {file_path}: {e}")
|
|
|
+
|
|
|
+ return copied_files
|
|
|
+
|
|
|
+ def setup_llm(self, api_key: str, base_url: str = None):
|
|
|
+ """
|
|
|
+ 设置LLM客户端
|
|
|
+
|
|
|
+ Args:
|
|
|
+ api_key: API密钥
|
|
|
+ base_url: API基础URL(可选)
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ if base_url:
|
|
|
+ self.llm_client = OpenAI(api_key=api_key, base_url=base_url)
|
|
|
+ else:
|
|
|
+ self.llm_client = OpenAI(api_key=api_key)
|
|
|
+ self.use_llm = True
|
|
|
+ print("LLM客户端已初始化")
|
|
|
+ except Exception as e:
|
|
|
+ print(f"LLM初始化失败: {e}")
|
|
|
+ self.use_llm = False
|
|
|
+
|
|
|
+ def process_references(self, references: List[str], use_semantic: bool = True):
|
|
|
+ """
|
|
|
+ 处理所有编制依据
|
|
|
+
|
|
|
+ Args:
|
|
|
+ references: 编制依据列表
|
|
|
+ use_semantic: 是否使用语义匹配
|
|
|
+ """
|
|
|
+ print(f"开始处理 {len(references)} 个编制依据...")
|
|
|
+
|
|
|
+ for idx, reference in enumerate(references, 1):
|
|
|
+ print(f"[{idx:02d}/{len(references)}] 处理: {reference[:60]}...")
|
|
|
+
|
|
|
+ # 提取主要文件名
|
|
|
+ main_name = self.extract_main_name(reference)
|
|
|
+
|
|
|
+ # 查找匹配文件
|
|
|
+ matches = self.find_best_matches(main_name, use_semantic)
|
|
|
+
|
|
|
+ # 复制匹配的文件
|
|
|
+ copied_files = []
|
|
|
+ if matches:
|
|
|
+ copied_files = self.copy_matched_files(matches, reference, idx)
|
|
|
+
|
|
|
+ # 存储结果
|
|
|
+ result = {
|
|
|
+ "序号": idx,
|
|
|
+ "编制依据名称": reference,
|
|
|
+ "提取的主要名称": main_name,
|
|
|
+ "匹配结果": [],
|
|
|
+ "最终匹配文件": None,
|
|
|
+ "匹配目录": None
|
|
|
+ }
|
|
|
+
|
|
|
+ if matches:
|
|
|
+ # 记录所有匹配结果
|
|
|
+ for i, (file_path, similarity) in enumerate(matches):
|
|
|
+ result["匹配结果"].append({
|
|
|
+ "排名": i + 1,
|
|
|
+ "文件路径": str(file_path),
|
|
|
+ "相似度": float(similarity),
|
|
|
+ "文件名": file_path.name,
|
|
|
+ "目录": str(file_path.parent)
|
|
|
+ })
|
|
|
+
|
|
|
+ # 最佳匹配
|
|
|
+ best_match = matches[0]
|
|
|
+ result["最终匹配文件"] = best_match[0].name
|
|
|
+ result["匹配目录"] = str(best_match[0].parent)
|
|
|
+ result["最佳匹配相似度"] = float(best_match[1])
|
|
|
+
|
|
|
+ if copied_files:
|
|
|
+ result["已复制文件"] = [str(path) for path, _ in copied_files]
|
|
|
+
|
|
|
+ self.results.append(result)
|
|
|
+
|
|
|
+ # 打印简要结果
|
|
|
+ if matches:
|
|
|
+ print(f" 找到 {len(matches)} 个匹配,最佳相似度: {matches[0][1]:.3f}")
|
|
|
+ else:
|
|
|
+ print(f" ✗ 未找到匹配文件")
|
|
|
+
|
|
|
    def save_results(self, output_file: str = "匹配结果.txt"):
        """
        Write the accumulated match results to disk.

        Produces three artifacts with the same base name: a human-readable
        text report at ``output_file``, a ``.json`` dump of ``self.results``
        for downstream processing, and a ``.csv`` summary.

        Args:
            output_file: Path of the text report.
        """
        output_path = Path(output_file)

        with open(output_path, 'w', encoding='utf-8') as f:
            # Report header: title, timestamp, scanned directories.
            f.write("=" * 100 + "\n")
            f.write("编制依据文件匹配结果报告\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"搜索目录: {[str(d) for d in self.search_dirs]}\n")
            f.write("=" * 100 + "\n\n")

            # Aggregate statistics across all processed references.
            total_refs = len(self.results)
            total_matches = sum(1 for r in self.results if r["匹配结果"])
            total_files = sum(len(r["匹配结果"]) for r in self.results)

            f.write(f"统计摘要:\n")
            f.write(f"- 总编制依据数: {total_refs}\n")
            f.write(f"- 成功匹配数: {total_matches}\n")
            f.write(f"- 总匹配文件数: {total_files}\n")
            f.write(f"- 输出目录: {self.output_dir}\n")
            f.write("\n" + "=" * 100 + "\n\n")

            # One section per reference, matches listed best-first.
            for result in self.results:
                f.write(f"序号: {result['序号']}\n")
                f.write(f"编制依据名称: {result['编制依据名称']}\n")
                f.write(f"提取的主要名称: {result['提取的主要名称']}\n")
                f.write("-" * 80 + "\n")

                if result["匹配结果"]:
                    f.write(f"匹配结果 (共 {len(result['匹配结果'])} 个,按相似度降序排列):\n\n")

                    for match in result["匹配结果"]:
                        f.write(f" 排名 #{match['排名']} | 相似度: {match['相似度']:.3f}\n")
                        f.write(f" 文件名: {match['文件名']}\n")
                        f.write(f" 目录: {match['目录']}\n")
                        f.write(f" 完整路径: {match['文件路径']}\n")
                        f.write(" " + "-" * 70 + "\n")

                    f.write(f"\n最佳匹配文件: {result['最终匹配文件']}\n")
                    f.write(f"最佳匹配目录: {result['匹配目录']}\n")
                    f.write(f"最佳匹配相似度: {result.get('最佳匹配相似度', 0):.3f}\n")

                    if "已复制文件" in result:
                        f.write(f"已复制的文件: {len(result['已复制文件'])} 个\n")
                        for copied in result["已复制文件"][:3]:  # show only the first three
                            f.write(f" - {copied}\n")
                        if len(result["已复制文件"]) > 3:
                            f.write(f" ... 还有 {len(result['已复制文件']) - 3} 个文件\n")
                else:
                    f.write("匹配结果: 未找到匹配文件\n")

                f.write("\n" + "=" * 100 + "\n\n")

        print(f"详细结果已保存到: {output_path}")

        # JSON dump for downstream processing (default=str handles Path etc.).
        json_path = output_path.with_suffix('.json')
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, ensure_ascii=False, indent=2, default=str)

        print(f"JSON格式结果已保存到: {json_path}")

        # CSV summary, one row per reference.
        csv_path = output_path.with_suffix('.csv')
        self.save_summary_csv(csv_path)
        print(f"CSV格式摘要已保存到: {csv_path}")
|
|
|
+
|
|
|
+ def save_summary_csv(self, csv_path: Path):
|
|
|
+ """保存CSV格式的摘要"""
|
|
|
+ summary_data = []
|
|
|
+
|
|
|
+ for result in self.results:
|
|
|
+ row = {
|
|
|
+ "序号": result["序号"],
|
|
|
+ "编制依据名称": result["编制依据名称"],
|
|
|
+ "提取的主要名称": result["提取的主要名称"],
|
|
|
+ "是否匹配成功": "是" if result["匹配结果"] else "否",
|
|
|
+ "匹配文件数量": len(result["匹配结果"]),
|
|
|
+ "最佳匹配文件名": result.get("最终匹配文件", ""),
|
|
|
+ "最佳匹配相似度": result.get("最佳匹配相似度", 0),
|
|
|
+ "最佳匹配目录": result.get("匹配目录", "")
|
|
|
+ }
|
|
|
+ summary_data.append(row)
|
|
|
+
|
|
|
+ df = pd.DataFrame(summary_data)
|
|
|
+ df.to_csv(csv_path, index=False, encoding='utf-8-sig')
|
|
|
+
|
|
|
+
|
|
|
def read_references_from_excel(file_path: str, sheet_name: str = 0,
                               name_col: str = "编制依据名称") -> List[str]:
    """
    Load the list of reference names from an Excel sheet.

    Args:
        file_path: Path to the Excel workbook.
        sheet_name: Sheet name or index to read.
        name_col: Column that holds the reference names.

    Returns:
        Non-empty reference strings from the column; empty list on any read
        error or when the column is missing.
    """
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)

        # Bail out early when the expected column is absent.
        if name_col not in df.columns:
            print(f"错误: Excel文件中没有'{name_col}'列")
            print(f"可用列: {list(df.columns)}")
            return []

        references = df[name_col].dropna().astype(str).tolist()
        print(f"从Excel读取到 {len(references)} 个编制依据")
        return references
    except Exception as e:
        print(f"读取Excel文件失败: {e}")
        return []
|
|
|
+
|
|
|
+
|
|
|
def main():
    """Entry point: scan the configured directories, match every reference,
    copy the matched files, and write the text/JSON/CSV reports.
    """
    # ========== Configuration ==========

    # Directories to search (adjust to your environment).
    SEARCH_DIRECTORIES = [
        r"D:\download\规范标准",
        r"F:\网讯创智\项目集\路桥项目资料\数据标准案例文档\一份完整的编制依据"
    ]

    # Where matched files are copied.
    OUTPUT_DIR = "F:/网讯创智/项目集/路桥项目资料/施工方案及编制依据案例库(第一阶段)/编制依据列表/44-镇巴(川陕界)至广安高速公_pdf/no"

    # Text report path (.json / .csv siblings are derived from it).
    RESULTS_FILE = "F:/网讯创智/项目集/路桥项目资料/施工方案及编制依据案例库(第一阶段)/编制依据列表/44-镇巴(川陕界)至广安高速公_pdf/编制依据匹配报告.txt"

    # LLM configuration for semantic matching.
    # Security fix: the API key used to be hard-coded in this file (a leaked
    # credential). Read it from the environment instead, and only enable LLM
    # matching when a key is actually provided.
    LLM_API_KEY = os.environ.get("LLM_API_KEY", "")
    LLM_BASE_URL = os.environ.get("LLM_BASE_URL", "https://api-inference.modelscope.cn/v1/")
    USE_LLM = bool(LLM_API_KEY)

    # ========== Reference list ==========

    # Option 1: inline list (copied from the source spreadsheet).
    references = [
        "《中华人民共和国安全生产法》(978-7-5216-1908-9)",
        "《中华人民共和国特种设备安全法》(9787511850782)",
        "《中华人民共和国环境保护法》(978-7-5093-5354-7)",
        "《中华人民共和国突发事件应对法》【主席令〔2007〕第69 号】",
        "《生产安全事故报告和调查处理条例》【国务院令〔2007〕第493 号】",
        "《建设工程质量管理条例》【国务院令〔2000〕第279 号】",
        "《公路水运工程安全生产监督管理办法》【交通部令〔2017〕第25 号】",
        "《劳动防护用品监督管理规定》【国家安监总局令〔2015〕第124 号】",
        "《企业安全生产应急管理九条规定》【国家安监总局令〔2015〕第74 号】",
        "《特种设备安全监察条例》",
        "《国务院关于进一步加强企业安全生产工作的通知》",
        "《大型起重机械安装安全监控管理系统实施方案》",
        "《架桥机通用技术条件》(GB/T 26470-2011)",
        "《起重机设计规范》(GB/T 3811-2008)",
        "《起重机械安全规程第5 部分:桥式和门式起重机》(GB 6067.5-2014)",
        "《电气装置安装工程起重机电气装置施工及验收规范》(GB 50256-2014)",
        "《起重设备安装工程施工及验收规范》(GB50278-2010)",
        "《施工现场机械设备检查技术规范》(JGJ 160-2016)",
        "《公路桥涵施工技术规范》(JTG/T 3650-2020)",
        "《建设工程安全生产管理条例》",
        "《四川省安全生产条例》(2023)",
        "《重要用途钢丝绳》(GB 8918-2006)",
        "《起重机用钢丝绳》(GB T 34198-2017)",
        "《起重机钢丝绳保养、维护、检验和报废》(GBT5972-2023)",
        "《建筑施工起重吊装工程安全技术规范》(JGJ 276-2012)",
        "《建筑施工高处作业安全技术规范》(JGJ 80-2016)",
        "《公路工程施工安全标志设置规范》(JTT1507-2024)",
        "《公路工程施工现场安全防护技术要求》(JTT1508-2024)",
        "《公路水运工程临时用电技术规程》(JTT1499-2024)",
        "《坠落防护水平生命线装置》(GB 38454-2019)",
        "《坠落防护挂点装置》(GB30862-2014)",
        "《钢丝绳通用技术规范》(GB/T 20118-2017)",
        "《坠落防护安全带》(GB 6095-2021)",
        "《一般起重用D 形和弓形锻造卸扣》(GB/T 25854-2010)",
        "《钢丝绳夹》(GB/T 5976-2006)",
        "《不锈钢丝绳》(GB/T 9944-2015)",
        "《四川路桥集团超危大工程专项施工方案分级管理实施细则》",
        "《四川路桥企业标准(2023 版)》",
        "《质量、环境、职业健康安全管理手册》",
        "《质量、环境、职业健康安全管理程序文件》",
        "《桥梁上部结构施工主要工序安全作业指导书(试行)》",
        "《桥梁公司专项施工方案管理实施细则》",
        "《标准化施工工法》",
        "《公路工程预制梁架设施工标准化作业手册》",
        "《高处作业主要安全设施和防护用品配备及使用要点》",
        "《公路建设项目高处作业安全管理要求》",
        "《四川路桥企业标准》",
        "《工程质量管理办法》",
        "《桥梁隧道工程施工安全防护标准化图册》",
        "《桥梁施工标准化设计图集》",
        "《危险性较大工程管理实施细则》(2025)"
    ]

    # Option 2: read from Excel (comment out the list above to use this).
    # EXCEL_FILE = "编制依据列表.xlsx"
    # references = read_references_from_excel(EXCEL_FILE)

    # ========== Run the match ==========

    print("初始化文件匹配器...")
    matcher = FileMatcher(SEARCH_DIRECTORIES, OUTPUT_DIR)

    # Build the file index.
    matcher.scan_directories()

    # Configure the LLM only when a key is available.
    if USE_LLM:
        matcher.setup_llm(LLM_API_KEY, LLM_BASE_URL)

    # Match every reference and copy the hits.
    print("\n开始匹配文件...")
    matcher.process_references(references, use_semantic=USE_LLM)

    # Write the reports.
    print("\n保存匹配结果...")
    matcher.save_results(RESULTS_FILE)

    # Final statistics.
    print("\n" + "=" * 60)
    print("匹配完成!统计信息:")
    print("=" * 60)

    total_refs = len(matcher.results)
    successful_matches = sum(1 for r in matcher.results if r["匹配结果"])
    total_matches = sum(len(r["匹配结果"]) for r in matcher.results)

    print(f"编制依据总数: {total_refs}")
    print(f"成功匹配的编制依据: {successful_matches}")
    print(f"总匹配文件数: {total_matches}")
    # Bug fix: guard the success-rate print against an empty reference list
    # (the original divided by zero when no references were processed).
    if total_refs:
        print(f"匹配成功率: {successful_matches / total_refs * 100:.1f}%")
    print(f"\n匹配文件已复制到: {OUTPUT_DIR}")
    print(f"详细报告: {RESULTS_FILE}")
    print(f"JSON报告: {RESULTS_FILE.replace('.txt', '.json')}")
    print(f"CSV摘要: {RESULTS_FILE.replace('.txt', '.csv')}")
|
|
|
+
|
|
|
+
|
|
|
# Script entry point: run the full match pipeline with the configuration
# defined in main().
if __name__ == "__main__":
    main()
|