# engine.py
import asyncio
import json
import re
import sys
import time
import urllib.parse

import requests
import urllib3
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
  10. # Disable SSL warnings
  11. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  12. if sys.platform == 'win32':
  13. asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
  14. def clean_text(s):
  15. s = re.sub(r"\s+", " ", s or "")
  16. return s.strip()
  17. def absolutize_url(base_url, u):
  18. if not u:
  19. return ""
  20. if u.startswith("//"):
  21. return "https:" + u
  22. if u.startswith("/"):
  23. parsed = urllib.parse.urlparse(base_url)
  24. return urllib.parse.urljoin(f"{parsed.scheme}://{parsed.netloc}", u)
  25. if not u.startswith("http"):
  26. return urllib.parse.urljoin(base_url, u)
  27. return u
  28. class GenericSpiderEngine:
  29. def __init__(self, source_config):
  30. """
  31. :param source_config: SpiderSource model instance or dictionary
  32. """
  33. self.config = source_config
  34. async def _crawl_with_ai(self, url, wait_for=None):
  35. browser_config = BrowserConfig(
  36. headless=True,
  37. verbose=False,
  38. java_script_enabled=True,
  39. channel="chromium",
  40. )
  41. run_config = CrawlerRunConfig(
  42. cache_mode=CacheMode.BYPASS
  43. )
  44. if wait_for:
  45. run_config.wait_for = wait_for
  46. async with AsyncWebCrawler(config=browser_config) as crawler:
  47. try:
  48. result = await crawler.arun(
  49. url=url,
  50. config=run_config
  51. )
  52. return result.html
  53. except Exception as e:
  54. # If wait_for failed, retry without it but with a delay
  55. if wait_for and "Wait condition failed" in str(e):
  56. print(f"Warning: Wait for selector '{wait_for}' timed out. Retrying with 5s delay...")
  57. run_config.wait_for = None
  58. # Use js_code to wait for 5 seconds as a fallback
  59. run_config.js_code = "await new Promise(r => setTimeout(r, 5000));"
  60. result = await crawler.arun(
  61. url=url,
  62. config=run_config
  63. )
  64. return result.html
  65. else:
  66. raise e
  67. def run(self, keyword, limit=10, pages=1):
  68. url = self.config.url
  69. method = self.config.method or 'GET'
  70. headers = json.loads(self.config.headers) if self.config.headers else {
  71. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36"
  72. }
  73. base_params = json.loads(self.config.params) if self.config.params else {}
  74. search_key = self.config.search_param_key or 'q'
  75. # Add keyword to params
  76. base_params[search_key] = keyword
  77. # Pagination Config
  78. has_pagination = getattr(self.config, 'has_pagination', False)
  79. pagination_param = getattr(self.config, 'pagination_param', 'pn')
  80. pagination_step = getattr(self.config, 'pagination_step', 10)
  81. pagination_start = getattr(self.config, 'pagination_start', 0)
  82. pages_to_crawl = pages if has_pagination else 1
  83. all_results = []
  84. for page_idx in range(pages_to_crawl):
  85. # Calculate current pagination value
  86. current_pagination_val = pagination_start + (page_idx * pagination_step)
  87. # Update params
  88. current_params = base_params.copy()
  89. if has_pagination and pagination_param:
  90. current_params[pagination_param] = current_pagination_val
  91. print(f"Crawling page {page_idx+1}/{pages_to_crawl} with {pagination_param}={current_pagination_val}")
  92. # Construct full URL for logging and fallback
  93. full_url = url
  94. if current_params:
  95. query_string = urllib.parse.urlencode(current_params)
  96. if '?' in full_url:
  97. full_url += '&' + query_string
  98. else:
  99. full_url += '?' + query_string
  100. html_content = ""
  101. current_url = full_url
  102. try:
  103. if method.upper() == 'GET':
  104. response = requests.get(url, headers=headers, params=current_params, timeout=15, verify=False)
  105. else:
  106. response = requests.post(url, headers=headers, data=current_params, timeout=15, verify=False)
  107. response.raise_for_status()
  108. # Robust decoding strategy for Chinese websites (Baidu, Gov sites, etc.)
  109. try:
  110. html_content = response.content.decode('utf-8')
  111. except UnicodeDecodeError:
  112. try:
  113. html_content = response.content.decode('gb18030')
  114. except UnicodeDecodeError:
  115. response.encoding = response.apparent_encoding
  116. html_content = response.text
  117. current_url = response.url
  118. except Exception as e:
  119. print(f"Standard fetch failed ({e}). Activating intelligent crawler for {full_url}")
  120. try:
  121. # Extract wait_for selector from config
  122. wait_for = None
  123. try:
  124. if self.config.selectors:
  125. sel_json = json.loads(self.config.selectors)
  126. wait_for = sel_json.get('list')
  127. except Exception as ex:
  128. print(f"Error parsing selectors for wait_for: {ex}")
  129. html_content = asyncio.run(self._crawl_with_ai(full_url, wait_for=wait_for))
  130. current_url = full_url # Crawl4AI result url might be different but we start with full_url
  131. except Exception as ai_e:
  132. print(f"Crawl4AI also failed: {ai_e}")
  133. continue # Skip this page
  134. if not html_content:
  135. continue
  136. page_results = self.parse(html_content, current_url, limit)
  137. all_results.extend(page_results)
  138. if page_idx < pages_to_crawl - 1:
  139. import time
  140. time.sleep(1)
  141. return all_results
  142. def parse(self, html, current_url, limit):
  143. if not self.config.selectors:
  144. return []
  145. selectors = json.loads(self.config.selectors)
  146. list_selector = selectors.get('list')
  147. if not list_selector:
  148. return []
  149. soup = BeautifulSoup(html, 'lxml')
  150. items = []
  151. containers = soup.select(list_selector)
  152. for container in containers:
  153. if len(items) >= limit:
  154. break
  155. item = {}
  156. # Helper to extract text or attr
  157. def extract(field, sel_config):
  158. if not sel_config:
  159. return ""
  160. # sel_config can be "selector" or {"selector": "...", "attr": "..."}
  161. selector = sel_config
  162. attr = None
  163. if isinstance(sel_config, dict):
  164. selector = sel_config.get('selector')
  165. attr = sel_config.get('attr')
  166. element = container.select_one(selector) if selector else container
  167. if not element:
  168. return ""
  169. if attr:
  170. return element.get(attr, "")
  171. return clean_text(element.get_text())
  172. item['title'] = extract('title', selectors.get('title'))
  173. item['link'] = absolutize_url(current_url, extract('link', selectors.get('link')))
  174. item['abstract'] = extract('abstract', selectors.get('abstract'))
  175. item['source'] = extract('source', selectors.get('source'))
  176. item['cover'] = extract('cover', selectors.get('cover'))
  177. item['date'] = extract('date', selectors.get('date'))
  178. if item['title']:
  179. items.append(item)
  180. return items
  181. def run_generic_spider(source_model, keyword, limit=10, pages=1):
  182. engine = GenericSpiderEngine(source_model)
  183. return engine.run(keyword, limit, pages)