baidusearch.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. import os
  2. import re
  3. import sys
  4. import urllib.parse
  5. import requests
  6. from bs4 import BeautifulSoup
  7. import json
  8. import asyncio
  9. from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
  10. # Set event loop policy for Windows
  11. if sys.platform == 'win32':
  12. asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
  13. def build_headers():
  14. config_path = os.path.join(os.path.dirname(__file__), "config.json")
  15. if os.path.exists(config_path):
  16. try:
  17. with open(config_path, "r", encoding="utf-8") as f:
  18. config = json.load(f)
  19. headers = config.get("headers", {})
  20. if headers:
  21. return headers
  22. except Exception as e:
  23. print(f"Warning: Failed to load config.json: {e}", file=sys.stderr)
  24. # Fallback to default if config loading fails
  25. cookie = os.environ.get("BAIDU_COOKIE", "").strip()
  26. headers = {
  27. "Accept": "text/html",
  28. "Accept-Language": "zh-CN,zh;q=0.9",
  29. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36",
  30. }
  31. if cookie:
  32. headers["Cookie"] = cookie
  33. return headers
  34. def clean_text(s):
  35. s = re.sub(r"\s+", " ", s or "")
  36. return s.strip()
  37. def absolutize_url(u):
  38. if not u:
  39. return ""
  40. if u.startswith("//"):
  41. return "https:" + u
  42. if u.startswith("/"):
  43. return urllib.parse.urljoin("https://www.baidu.com", u)
  44. return u
  45. def extract_result_item(container):
  46. title = ""
  47. link = ""
  48. cover = ""
  49. source = ""
  50. h3 = container.find("h3")
  51. if h3:
  52. a = h3.find("a")
  53. if a:
  54. title = clean_text(a.get_text())
  55. link = absolutize_url(a.get("href", ""))
  56. if not title:
  57. a = container.find("a")
  58. if a and a.get("href"):
  59. title = clean_text(a.get_text())
  60. link = absolutize_url(a.get("href", ""))
  61. abstract = ""
  62. abs_div = container.select_one("div.c-abstract")
  63. if not abs_div:
  64. abs_div = container.find("div", attrs={"class": lambda x: isinstance(x, str) and "abstract" in x})
  65. if not abs_div:
  66. abs_div = container.find("div", attrs={"class": lambda x: isinstance(x, str) and ("content" in x or "summary" in x)})
  67. if abs_div:
  68. abstract = clean_text(abs_div.get_text())
  69. else:
  70. p = container.find("p")
  71. if p:
  72. abstract = clean_text(p.get_text())
  73. img = container.find("img")
  74. if img:
  75. cover = absolutize_url(img.get("data-src") or img.get("src") or "")
  76. src_span = None
  77. f13 = container.select_one("div.f13")
  78. if f13:
  79. src_span = f13.find("span")
  80. if not src_span:
  81. src_span = container.find("span", attrs={"class": lambda x: isinstance(x, str) and "c-color-gray" in x})
  82. if src_span:
  83. source = clean_text(src_span.get_text())
  84. if not source and link:
  85. try:
  86. parsed = urllib.parse.urlparse(link)
  87. source = parsed.netloc
  88. except Exception:
  89. source = ""
  90. # Try to extract date
  91. date = ""
  92. # Strategy 1: Look for specific class patterns commonly used by Baidu
  93. # .newTimeFactor_... is common in new PMD, .c-color-gray2 is legacy
  94. date_candidates = container.find_all("span", attrs={"class": lambda x: isinstance(x, str) and (
  95. "TimeFactor" in x or
  96. "c-color-gray2" in x or
  97. "c-gray" in x or
  98. "cos-color-text-minor" in x or
  99. "source-time" in x
  100. )})
  101. for candidate in date_candidates:
  102. text = clean_text(candidate.get_text())
  103. # Check if it matches date pattern
  104. # Matches: 2023年10月1日, 2023-10-01, 3小时前, 5分钟前, 2天前
  105. match = re.search(r"(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{1,2}-\d{1,2}|\d+(小时|分钟|天)前)", text)
  106. if match:
  107. date = match.group(0).strip()
  108. break
  109. # Strategy 2: Fallback for Aladdin cards (often in div with specific structure)
  110. if not date:
  111. # Check for source-date wrapper often found in news results
  112. source_date = container.select_one(".c-span-last")
  113. if source_date:
  114. text = clean_text(source_date.get_text())
  115. match = re.search(r"(\d{4}年\d{1,2}月\d{1,2}日|\d{4}-\d{1,2}-\d{1,2}|\d+(小时|分钟|天)前)", text)
  116. if match:
  117. date = match.group(0).strip()
  118. return {
  119. "title": title,
  120. "abstract": abstract,
  121. "source": source,
  122. "date": date,
  123. "cover": cover,
  124. "link": link,
  125. }
  126. def parse_results(html, limit=10):
  127. try:
  128. soup = BeautifulSoup(html, "lxml")
  129. except Exception:
  130. soup = BeautifulSoup(html, "html.parser")
  131. items = []
  132. seen = set()
  133. anchors = soup.select("#content_left h3.t a, #content_left h3 a")
  134. for a in anchors:
  135. c = a
  136. container = None
  137. for _ in range(10):
  138. c = c.find_parent("div")
  139. if not c:
  140. break
  141. cls = " ".join(c.get("class", []))
  142. if "result" in cls:
  143. container = c
  144. break
  145. if not container:
  146. container = a.find_parent("div")
  147. item = extract_result_item(container or a)
  148. key = (item["title"], item["abstract"])
  149. if not item["title"]:
  150. continue
  151. if key in seen:
  152. continue
  153. seen.add(key)
  154. items.append(item)
  155. if len(items) >= limit:
  156. break
  157. if not items:
  158. containers = soup.select("div.result, div[class*=result]")
  159. for c in containers:
  160. item = extract_result_item(c)
  161. key = (item["title"], item["abstract"])
  162. if not item["title"]:
  163. continue
  164. if key in seen:
  165. continue
  166. seen.add(key)
  167. items.append(item)
  168. if len(items) >= limit:
  169. break
  170. return items
  171. async def _fetch_with_crawl4ai(url):
  172. """
  173. Fetch page content using Crawl4AI (Playwright) to handle JS and basic anti-bot checks
  174. """
  175. print(f"Fallback to Crawl4AI for: {url}")
  176. browser_config = BrowserConfig(
  177. headless=True,
  178. verbose=False,
  179. java_script_enabled=True,
  180. )
  181. run_config = CrawlerRunConfig(
  182. cache_mode=CacheMode.BYPASS,
  183. # Wait for the main content container
  184. wait_for="#content_left"
  185. )
  186. async with AsyncWebCrawler(config=browser_config) as crawler:
  187. result = await crawler.arun(
  188. url=url,
  189. config=run_config
  190. )
  191. return result.html
  192. def fetch_html(wd, pn):
  193. params = {"wd": wd, "pn": str(pn)}
  194. headers = build_headers()
  195. url_https = "https://www.baidu.com/s"
  196. # Construct full URL for Crawl4AI if needed
  197. full_url = f"{url_https}?{urllib.parse.urlencode(params)}"
  198. try:
  199. r = requests.get(url_https, params=params, headers=headers, timeout=15)
  200. r.raise_for_status()
  201. # Force UTF-8 encoding as Baidu sometimes returns ISO-8859-1 header
  202. r.encoding = "utf-8"
  203. text = r.text
  204. # Check for security verification or simple redirect
  205. if "location.replace" in text or "http-equiv=\"refresh\"" in text or "百度安全验证" in text or "wappass.baidu.com" in text:
  206. print("Detected redirect/security check, trying Crawl4AI...")
  207. return asyncio.run(_fetch_with_crawl4ai(full_url))
  208. return text
  209. except Exception as e:
  210. print(f"Requests failed ({e}), trying Crawl4AI...")
  211. try:
  212. return asyncio.run(_fetch_with_crawl4ai(full_url))
  213. except Exception as ai_e:
  214. print(f"Crawl4AI failed: {ai_e}", file=sys.stderr)
  215. raise e
  216. def compute_pn(page):
  217. if page < 1:
  218. return 0
  219. return (page - 1) * 10
  220. def run_spider(keyword, page=1, limit=10):
  221. """
  222. Executes the Baidu spider
  223. :param keyword: Search keyword
  224. :param page: Page number
  225. :param limit: Result limit
  226. :return: List of results
  227. """
  228. try:
  229. pn = compute_pn(page)
  230. html = fetch_html(keyword, pn)
  231. items = parse_results(html, limit=limit)
  232. return items
  233. except Exception as e:
  234. print(f"Error running spider: {e}", file=sys.stderr)
  235. return []