|
@@ -0,0 +1,316 @@
|
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
|
+import pandas as pd
|
|
|
|
|
+import time
|
|
|
|
|
+import random
|
|
|
|
|
+import signal
|
|
|
|
|
+import sys
|
|
|
|
|
+from urllib.parse import quote, urljoin
|
|
|
|
|
+import requests
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
|
+from selenium import webdriver
|
|
|
|
|
+from selenium.webdriver.chrome.service import Service
|
|
|
|
|
+from webdriver_manager.chrome import ChromeDriverManager
|
|
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
|
|
+from selenium.common.exceptions import SessionNotCreatedException, WebDriverException
|
|
|
|
|
+
|
|
|
|
|
# ====================== Edit only this section ======================
# CSV that lists the standards to look up.
INPUT_FILE = r"F:\时效性相关文档\新增标准原材料\铁路规范_拆分结果.csv"
# CSV that receives the input rows plus the looked-up columns.
OUTPUT_FILE = r"F:\时效性相关文档\新增标准原材料\1_chaxun.csv"
# Lower/upper bound, in seconds, of the random pause between two queries.
DELAY_MIN, DELAY_MAX = 4, 7
# ====================================================================
|
|
|
|
|
+
|
|
|
|
|
# Run state shared by main(), save_progress() and the SIGINT handler.
# Both stay None until main() assigns them.
df_result = driver = None

# One entry is appended to each list per processed input row.
status_list = list()
publish_date_list = list()
date_list = list()
dept_list = list()
query_name_list = list()  # standard name as returned by the site
query_code_list = list()  # standard code as returned by the site
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def empty_result(status="未查到"):
    """Build a result record whose fields are all blank except the status.

    Args:
        status: Text stored under the "状态" key.

    Returns:
        A new dict with the six result keys; every value is "" except
        "状态", which holds ``status``.
    """
    field_names = ("标准编号", "标准名称", "状态", "发布日期", "实施日期", "发布部门")
    record = dict.fromkeys(field_names, "")
    record["状态"] = status
    return record
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def is_access_denied_page(page_text):
    """Tell whether page text contains one of the known access-denied markers.

    Args:
        page_text: Page content to inspect; a falsy value yields False.

    Returns:
        True when any marker substring occurs in ``page_text``.
    """
    if not page_text:
        return False
    for marker in ("您无权访问", "访问已经超出我们允许的范围", "noright.html"):
        if marker in page_text:
            return True
    return False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def normalize_std_code(code):
    """Reduce a standard code to a comparable form.

    The value is converted with ``str``, stripped, upper-cased, and then
    every space, tab, CR, LF and ideographic space (U+3000) is removed.

    Args:
        code: The code to normalise; ``None`` yields "".

    Returns:
        The normalised code string.
    """
    if code is None:
        return ""
    upper_cased = str(code).strip().upper()
    dropped = " \t\r\n\u3000"
    return "".join(ch for ch in upper_cased if ch not in dropped)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def parse_detail_page(detail_url):
    """Fetch a standard's detail page and extract three fields from it.

    The page is downloaded with ``requests`` and every ``<tr>`` with at
    least two ``<td>`` cells is treated as a "label / value" pair. Only the
    labels "发布日期", "实施日期" and "发布部门" are kept; when a label occurs
    more than once the last non-empty value wins.

    Args:
        detail_url: URL of the detail page. A falsy value skips the request.

    Returns:
        A dict with the keys "发布日期", "实施日期" and "发布部门". Every value
        is "" when the URL is empty or when fetching/parsing fails.
    """
    wanted_labels = ("发布日期", "实施日期", "发布部门")
    if not detail_url:
        return dict.fromkeys(wanted_labels, "")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Referer": "http://www.csres.com/",
    }
    details = dict.fromkeys(wanted_labels, "")
    try:
        resp = requests.get(detail_url, headers=headers, timeout=20)
        resp.raise_for_status()
        # Prefer the encoding detected from the body over the header value.
        resp.encoding = resp.apparent_encoding or resp.encoding
        soup = BeautifulSoup(resp.text, "html.parser")

        for tr in soup.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) < 2:
                continue
            label = tds[0].get_text(" ", strip=True)
            # Labels may end with an ASCII colon or a full-width colon
            # (U+FF1A); strip both so the label matches the wanted keys.
            label = label.replace(":", "").replace("\uff1a", "").strip()
            value = tds[1].get_text(" ", strip=True)
            if label in details and value:
                details[label] = value
    except Exception as err:
        # Best effort: the caller falls back to the search-result row, so
        # report the problem and return blanks instead of raising.
        print(f"详情页抓取失败: {detail_url} | {err}")
        return dict.fromkeys(wanted_labels, "")

    return details
|
|
|
|
|
+
|
|
|
|
|
def save_progress():
    """Write the rows processed so far, with their result columns, to OUTPUT_FILE.

    Does nothing while ``df_result`` is still ``None``. Otherwise the first
    *n* rows of ``df_result`` are copied, the six result columns are
    attached, and the frame is written as UTF-8-with-BOM CSV.

    *n* is the length of the shortest result list (capped at the number of
    input rows). This function can be called from the SIGINT handler while
    ``main`` is between two ``append`` calls, in which case the lists differ
    in length; truncating to the shortest keeps every saved row complete
    instead of failing with a length-mismatch error.
    """
    if df_result is None:
        return

    columns = {
        "状态": status_list,
        "发布日期": publish_date_list,
        "实施日期": date_list,
        "发布部门": dept_list,
        "查询标准名称": query_name_list,
        "查询到的标准编号": query_code_list,
    }
    result_len = min(len(df_result), *(len(values) for values in columns.values()))

    save_df = df_result.iloc[:result_len].copy()
    for column_name, values in columns.items():
        save_df[column_name] = values[:result_len]

    save_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
    print(f"✅ 已保存 {result_len} 条到 {OUTPUT_FILE}")
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def cleanup_driver():
    """Quit the module-level browser driver, if any, and reset it to None.

    Errors raised by ``driver.quit()`` are ignored.
    """
    global driver
    if not driver:
        return
    try:
        driver.quit()
    except Exception:
        pass
    finally:
        driver = None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def save_on_exit(signum, frame):
    """Signal handler: save progress, close the browser, exit with status 0.

    Args:
        signum: Signal number passed by the ``signal`` module (unused).
        frame: Interrupted stack frame passed by the ``signal`` module (unused).
    """
    print("\n⚠️ 检测到中断,正在保存...")
    save_progress()
    cleanup_driver()
    raise SystemExit(0)


# Installed at import time so Ctrl+C goes through save_on_exit.
signal.signal(signal.SIGINT, save_on_exit)
|
|
|
|
|
+
|
|
|
|
|
def init_driver():
    """Start a Chrome WebDriver, trying several launch strategies in turn.

    The modes are tried in the order ``--headless=new``, ``--headless`` and
    finally a visible window. For each mode the driver is first created with
    Selenium's own driver resolution and, if that raises, with a driver
    binary installed through ``webdriver_manager``.

    Returns:
        A WebDriver with a 25-second page-load timeout, or ``None`` when
        every strategy failed (the caller then scrapes with ``requests``).
    """
    # Chrome behaves differently across versions on Windows, hence the fallbacks.
    common_args = (
        "--disable-blink-features=AutomationControlled",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    )

    def _build_options(headless_mode):
        """Return ChromeOptions for "new", "old" or no (None) headless mode."""
        options = webdriver.ChromeOptions()
        for arg in common_args:
            options.add_argument(arg)
        if headless_mode == "new":
            options.add_argument("--headless=new")
        elif headless_mode == "old":
            options.add_argument("--headless")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        return options

    def _try_create_driver(options):
        """Create a driver; fall back to webdriver_manager if the first try raises."""
        try:
            return webdriver.Chrome(options=options)
        except Exception:
            return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    last_err = None
    for mode in ("new", "old", None):
        candidate = None
        try:
            candidate = _try_create_driver(_build_options(mode))
            candidate.set_page_load_timeout(25)
            return candidate
        except Exception as err:
            last_err = err
            print(f"Chrome 启动失败(mode={mode}): {err}")
            # A driver that started but could not be configured must be
            # shut down, otherwise its browser process is left running.
            if candidate is not None:
                try:
                    candidate.quit()
                except Exception:
                    pass

    print(f"Chrome WebDriver 不可用,回退 requests 抓取: {last_err}")
    return None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def query_by_requests(std_code):
    """Look up a standard on csres.com using plain HTTP requests.

    The search page is fetched for the stripped code and the rows of
    ``table.heng`` are scanned for the first one whose first cell equals the
    query after ``normalize_std_code``. For a match the detail page linked
    from the row is parsed too; its department and implementation date take
    precedence over the values in the row.

    Args:
        std_code: Standard code to search for.

    Returns:
        A result dict with the same keys as ``empty_result``. The status is
        "访问受限" when the site answers with an access-denied page, and the
        default empty result is returned for a blank code, no matching row,
        or any exception (which is printed).
    """
    kw = std_code.strip()
    if not kw:
        return empty_result()
    normalized_kw = normalize_std_code(kw)

    try:
        search_url = f"http://www.csres.com/s.jsp?keyword={quote(kw)}"
        request_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }
        resp = requests.get(search_url, headers=request_headers, timeout=20)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or resp.encoding

        denied = is_access_denied_page(resp.text) or "noright.html" in resp.url
        if denied:
            return empty_result("访问受限")

        page = BeautifulSoup(resp.text, "html.parser")
        for row in page.select("table.heng tr"):
            cells = row.find_all("td")
            if len(cells) < 5:
                continue
            code_text = cells[0].get_text(strip=True)
            if not normalized_kw or normalized_kw != normalize_std_code(code_text):
                continue

            # First link in the matching row points at the detail page.
            anchor = row.select_one("a[href]")
            detail_url = ""
            if anchor:
                detail_url = urljoin(resp.url, anchor.get("href"))
            detail = parse_detail_page(detail_url)

            found = {"标准编号": code_text}
            found["标准名称"] = cells[1].get_text(strip=True)
            found["发布部门"] = detail["发布部门"] or cells[2].get_text(strip=True)
            found["发布日期"] = detail["发布日期"]
            found["实施日期"] = detail["实施日期"] or cells[3].get_text(strip=True)
            found["状态"] = cells[4].get_text(strip=True)
            return found
    except Exception as err:
        print(f"requests 抓取失败: {kw} | {err}")

    return empty_result()
|
|
|
|
|
+
|
|
|
|
|
+# ====================== 万能查询:支持所有铁路标准 ======================
|
|
|
|
|
def query(std_code, driver):
    """Look up a standard on csres.com, preferring Selenium over requests.

    With a driver, the search page is loaded in the browser and the rows of
    ``table.heng`` are scanned for the first one whose first cell equals the
    query after ``normalize_std_code``. The detail page linked from that row
    is parsed with ``parse_detail_page``; its department and implementation
    date take precedence over the values in the row.

    Args:
        std_code: Standard code to search for.
        driver: Selenium WebDriver, or ``None`` to use
            ``query_by_requests`` directly.

    Returns:
        A result dict with the same keys as ``empty_result``. The status is
        "访问受限" when the browser lands on an access-denied page. If any
        Selenium step raises, the result of ``query_by_requests`` is
        returned instead.
    """
    kw = std_code.strip()
    if not kw:
        return empty_result()
    normalized_kw = normalize_std_code(kw)

    if driver is None:
        return query_by_requests(kw)

    try:
        driver.get(f"http://www.csres.com/s.jsp?keyword={quote(kw)}")
        # Fixed pause before reading the page source.
        time.sleep(1.5)
        if is_access_denied_page(driver.page_source) or "noright.html" in driver.current_url:
            return empty_result("访问受限")

        result_table = driver.find_element(By.CSS_SELECTOR, "table.heng")
        for tr in result_table.find_elements(By.TAG_NAME, "tr"):
            tds = tr.find_elements(By.TAG_NAME, "td")
            if len(tds) < 5:
                continue
            code_text = tds[0].text.strip()
            if not normalized_kw or normalized_kw != normalize_std_code(code_text):
                continue

            # First link in the matching row points at the detail page.
            links = tr.find_elements(By.TAG_NAME, "a")
            detail_url = (links[0].get_attribute("href") or "") if links else ""
            detail_data = parse_detail_page(detail_url)

            return {
                "标准编号": code_text,
                "标准名称": tds[1].text.strip(),
                "发布部门": detail_data["发布部门"] or tds[2].text.strip(),
                "发布日期": detail_data["发布日期"],
                "实施日期": detail_data["实施日期"] or tds[3].text.strip(),
                "状态": tds[4].text.strip(),
            }
    except Exception as err:
        # Selenium failed: report why, then retry with plain requests so the
        # row still has a chance of being resolved.
        print(f"Selenium 抓取失败,回退 requests: {kw} | {err}")
        return query_by_requests(kw)

    return empty_result()
|
|
|
|
|
+
|
|
|
|
|
+# ====================== 主程序 ======================
|
|
|
|
|
def _read_input_csv(path):
    """Read the input CSV, trying the encodings the file is likely to use.

    ``utf-8-sig`` is tried first because it reads UTF-8 with or without a
    BOM; plain ``utf-8`` would leave the BOM glued to the first column name.
    Only decoding errors trigger the next attempt, so a missing file or a
    malformed CSV is reported immediately.
    """
    for encoding in ("utf-8-sig", "gbk", "gb18030"):
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeError:
            continue
    # Last resort: drop undecodable bytes rather than fail.
    return pd.read_csv(path, encoding="utf-8-sig", encoding_errors="ignore")


def _cell_text(value):
    """Return a cell as stripped text, mapping missing values (NaN) to ""."""
    if pd.isna(value):
        return ""
    return str(value).strip()


def main():
    """Query every row of INPUT_FILE and save the results incrementally.

    For each input row the "标准编号" column is looked up with ``query``, the
    result is appended to the module-level result lists, and
    ``save_progress`` rewrites OUTPUT_FILE. A random pause between
    DELAY_MIN and DELAY_MAX seconds follows every row.

    Progress is saved on KeyboardInterrupt and on any other exception (which
    is then re-raised); the browser driver is always cleaned up.
    """
    global df_result, driver

    df = _read_input_csv(INPUT_FILE)
    df_result = df.copy()

    try:
        driver = init_driver()
        print("✅ 开始查询...\n")

        total = len(df)
        # Count positions explicitly; the DataFrame index is only a label.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            code = _cell_text(row["标准编号"])
            input_name = _cell_text(row["标准名称"])
            name_show = input_name[:30]
            print(f"[{position}/{total}] {code} | {name_show}")

            res = query(code, driver)

            query_code = res["标准编号"].strip()
            query_name = res["标准名称"].strip()

            status_list.append(res["状态"])
            publish_date_list.append(res["发布日期"])
            date_list.append(res["实施日期"])
            dept_list.append(res["发布部门"])
            query_name_list.append(query_name)
            query_code_list.append(query_code)

            print(
                f" → 状态: {res['状态']} | 查到编号: {query_code} | 发布部门: {res['发布部门']} "
                f"| 发布日期: {res['发布日期']} | 日期: {res['实施日期']}"
            )
            save_progress()
            time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))

        print("\n🎉 全部完成!")
    except KeyboardInterrupt:
        print("\n⚠️ 用户中断,正在保存...")
        save_progress()
    except Exception:
        print("\n⚠️ 运行异常,正在保存当前结果...")
        save_progress()
        raise
    finally:
        cleanup_driver()


if __name__ == "__main__":
    main()
|