engine.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import json
  4. import urllib.parse
  5. import re
  6. import asyncio
  7. import sys
  8. from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
  9. import urllib3
  # The requests calls in this module pass verify=False, which makes urllib3
  # emit an InsecureRequestWarning per request; silence that category.
  urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

  # On Windows, install the Proactor event loop policy before any loop exists.
  # NOTE(review): presumably needed so the browser-based crawler can spawn
  # subprocesses -- confirm against the crawl4ai requirements.
  if sys.platform == "win32":
      asyncio.set_event_loop_policy(
          asyncio.WindowsProactorEventLoopPolicy()
      )
  def clean_text(s):
      """Collapse each run of whitespace in *s* to one space and trim both ends.

      A falsy *s* (``None`` or ``""``) is treated as the empty string.
      """
      raw = s if s else ""
      squeezed = re.sub(r"\s+", " ", raw)
      return squeezed.strip()
  def absolutize_url(base_url, u):
      """Resolve *u*, as found on the page at *base_url*, to an absolute URL.

      Rules, in order:
        * falsy or whitespace-only *u*      -> ``""``
        * protocol-relative (``//host/x``)  -> prefixed with ``https:``
        * already absolute ``http(s)://``   -> returned unchanged
        * anything else (root-relative, path-relative, query-only, other
          schemes)                          -> ``urljoin(base_url, u)``

      :param base_url: URL of the page the reference was found on.
      :param u: raw reference (typically an ``href``/``src`` value) or None.
      :return: the resolved URL as a string.
      """
      if not u:
          return ""
      # Attribute values scraped from HTML often carry stray whitespace,
      # which would otherwise end up inside the joined URL.
      u = u.strip()
      if not u:
          return ""
      if u.startswith("//"):
          return "https:" + u
      # Test for a real scheme prefix: a bare startswith("http") would also
      # match relative paths such as "http-intro.html" and leave them
      # unresolved.
      if u.lower().startswith(("http://", "https://")):
          return u
      # urljoin handles root-relative ("/x") and path-relative ("x") alike.
      return urllib.parse.urljoin(base_url, u)
  class GenericSpiderEngine:
      """Config-driven keyword-search scraper.

      The source config supplies the search URL, HTTP method, headers, extra
      query parameters, pagination settings and the CSS selectors used to pull
      result items out of the returned HTML. Each page is fetched with
      ``requests`` first; if that raises, the page is re-fetched through a
      headless crawl4ai browser.

      Settings read from the config: ``url``, ``method``, ``headers``,
      ``params``, ``search_param_key``, ``selectors``, ``has_pagination``,
      ``pagination_param``, ``pagination_step``, ``pagination_start``.
      ``headers``, ``params`` and ``selectors`` may be JSON text or an
      already-parsed object.
      """

      # Headers used when the config does not provide any.
      _DEFAULT_HEADERS = {
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.97 Safari/537.36"
      }

      def __init__(self, source_config):
          """
          :param source_config: SpiderSource model instance or dictionary
          """
          self.config = source_config

      def _setting(self, key, default=None):
          """Read one setting from the config, whether it is a dict or an object.

          *default* is returned only when the key/attribute is absent; a stored
          ``None`` is returned as-is.
          """
          cfg = self.config
          if isinstance(cfg, dict):
              return cfg.get(key, default)
          return getattr(cfg, key, default)

      def _json_setting(self, key):
          """Return setting *key* parsed from JSON, or None when it is unset/empty.

          Text values go through ``json.loads`` (which may raise on malformed
          JSON); values that are already parsed are returned unchanged.
          """
          raw = self._setting(key)
          if not raw:
              return None
          if isinstance(raw, (str, bytes, bytearray)):
              return json.loads(raw)
          return raw

      async def _crawl_with_ai(self, url, wait_for=None):
          """Fetch *url* with a headless crawl4ai browser and return its HTML.

          :param url: full URL to load.
          :param wait_for: optional wait condition handed to the crawler run
              config. If the crawl fails with a "Wait condition failed" error,
              the page is loaded once more without the condition, using a
              5-second JavaScript sleep instead.
          :raises Exception: any crawler error other than the wait failure, or
              an error raised by the retry itself.
          """
          browser_config = BrowserConfig(
              headless=True,
              verbose=False,
              java_script_enabled=True,
          )
          run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
          if wait_for:
              run_config.wait_for = wait_for

          async with AsyncWebCrawler(config=browser_config) as crawler:
              try:
                  result = await crawler.arun(url=url, config=run_config)
                  return result.html
              except Exception as e:
                  if not (wait_for and "Wait condition failed" in str(e)):
                      # Bare raise keeps the original traceback intact.
                      raise
                  print(f"Warning: Wait for selector '{wait_for}' timed out. Retrying with 5s delay...")
                  run_config.wait_for = None
                  # Use js_code to wait for 5 seconds as a fallback
                  run_config.js_code = "await new Promise(r => setTimeout(r, 5000));"
                  result = await crawler.arun(url=url, config=run_config)
                  return result.html

      @staticmethod
      def _decode_body(response):
          """Decode a requests response body: UTF-8, then GB18030, then detection.

          The strict decodes come first so that pages with a wrong or missing
          charset header are not mis-decoded; only if both fail is requests'
          ``apparent_encoding`` guess used.
          """
          for codec in ('utf-8', 'gb18030'):
              try:
                  return response.content.decode(codec)
              except UnicodeDecodeError:
                  continue
          response.encoding = response.apparent_encoding
          return response.text

      def _fetch_page(self, url, method, headers, params, full_url):
          """Fetch one result page and return ``(html, final_url)``.

          Tries ``requests`` first (GET with query params, otherwise POST with
          form data). On any exception it falls back to the headless crawler on
          *full_url*. Returns ``("", full_url)`` when both strategies fail.
          """
          try:
              # NOTE: verify=False disables TLS certificate verification.
              if method.upper() == 'GET':
                  response = requests.get(url, headers=headers, params=params, timeout=15, verify=False)
              else:
                  response = requests.post(url, headers=headers, data=params, timeout=15, verify=False)
              response.raise_for_status()
              return self._decode_body(response), response.url
          except Exception as e:
              print(f"Standard fetch failed ({e}). Activating intelligent crawler for {full_url}")

          try:
              # Use the list selector as the browser's wait condition, if any.
              wait_for = None
              try:
                  selectors = self._json_setting('selectors')
                  if selectors:
                      wait_for = selectors.get('list')
              except Exception as ex:
                  print(f"Error parsing selectors for wait_for: {ex}")
              # asyncio.run raises RuntimeError when an event loop is already
              # running in this thread; that is reported as a crawler failure.
              html = asyncio.run(self._crawl_with_ai(full_url, wait_for=wait_for))
              # The crawler may end up on a different URL; full_url is used as
              # the base for resolving links regardless.
              return html, full_url
          except Exception as ai_e:
              print(f"Crawl4AI also failed: {ai_e}")
              return "", full_url

      def run(self, keyword, limit=10, pages=1):
          """Search the configured source for *keyword* and return parsed items.

          :param keyword: search term, sent under ``search_param_key``
              (default ``'q'``).
          :param limit: maximum number of items taken from EACH page (it is
              passed to :meth:`parse` per page, not applied to the total).
          :param pages: number of pages to fetch; ignored (treated as 1) unless
              the config enables ``has_pagination``.
          :return: list of item dicts from all pages that could be fetched.
              Pages that fail to download are skipped.
          """
          import time  # local import: not part of the module-level imports

          url = self._setting('url')
          method = self._setting('method') or 'GET'

          headers = self._json_setting('headers')
          if headers is None:
              headers = dict(self._DEFAULT_HEADERS)

          # Copy so that injecting the keyword never mutates the caller's config.
          base_params = dict(self._json_setting('params') or {})
          search_key = self._setting('search_param_key') or 'q'
          # Add keyword to params
          base_params[search_key] = keyword

          # Pagination Config
          has_pagination = self._setting('has_pagination', False)
          pagination_param = self._setting('pagination_param', 'pn')
          pagination_step = self._setting('pagination_step', 10)
          pagination_start = self._setting('pagination_start', 0)
          # A field stored as None must not break the arithmetic below,
          # which runs even when pagination is disabled.
          if pagination_step is None:
              pagination_step = 10
          if pagination_start is None:
              pagination_start = 0

          pages_to_crawl = pages if has_pagination else 1
          all_results = []

          for page_idx in range(pages_to_crawl):
              # Calculate current pagination value
              current_pagination_val = pagination_start + (page_idx * pagination_step)

              current_params = base_params.copy()
              if has_pagination and pagination_param:
                  current_params[pagination_param] = current_pagination_val
                  print(f"Crawling page {page_idx+1}/{pages_to_crawl} with {pagination_param}={current_pagination_val}")

              # Construct full URL for logging and fallback
              full_url = url
              if current_params:
                  query_string = urllib.parse.urlencode(current_params)
                  separator = '&' if '?' in full_url else '?'
                  full_url += separator + query_string

              html_content, current_url = self._fetch_page(
                  url, method, headers, current_params, full_url
              )
              if not html_content:
                  continue  # Skip this page

              all_results.extend(self.parse(html_content, current_url, limit))

              # Pause between pages, but not after the last one.
              if page_idx < pages_to_crawl - 1:
                  time.sleep(1)

          return all_results

      @staticmethod
      def _extract_field(container, sel_config):
          """Extract one field from *container* according to *sel_config*.

          *sel_config* is either a CSS selector string or a dict of the form
          ``{"selector": "...", "attr": "..."}``. With an empty selector the
          container itself is used. Returns the attribute value when ``attr``
          is given, otherwise the element's whitespace-normalised text;
          ``""`` when nothing matches.
          """
          if not sel_config:
              return ""

          if isinstance(sel_config, dict):
              selector = sel_config.get('selector')
              attr = sel_config.get('attr')
          else:
              selector, attr = sel_config, None

          element = container.select_one(selector) if selector else container
          if element is None:
              return ""

          if attr:
              value = element.get(attr, "")
              # BeautifulSoup returns multi-valued attributes (e.g. class, rel)
              # as lists; flatten so every field is a plain string.
              if isinstance(value, (list, tuple)):
                  value = " ".join(value)
              return value
          return clean_text(element.get_text())

      def parse(self, html, current_url, limit):
          """Extract up to *limit* result items from one page of HTML.

          :param html: page markup.
          :param current_url: URL the page came from; relative links are
              resolved against it.
          :param limit: maximum number of items to return for this page.
          :return: list of dicts with keys ``title``, ``link``, ``abstract``,
              ``source``, ``cover``, ``date``. Containers without a title are
              dropped. Empty list when no selectors or no ``list`` selector
              are configured.
          """
          selectors = self._json_setting('selectors')
          if not selectors:
              return []

          list_selector = selectors.get('list')
          if not list_selector:
              return []

          soup = BeautifulSoup(html, 'lxml')
          extract = self._extract_field
          items = []

          for container in soup.select(list_selector):
              if len(items) >= limit:
                  break

              title = extract(container, selectors.get('title'))
              if not title:
                  continue

              items.append({
                  'title': title,
                  'link': absolutize_url(current_url, extract(container, selectors.get('link'))),
                  'abstract': extract(container, selectors.get('abstract')),
                  'source': extract(container, selectors.get('source')),
                  # NOTE(review): cover is returned as found, not resolved
                  # against current_url -- confirm callers expect that.
                  'cover': extract(container, selectors.get('cover')),
                  'date': extract(container, selectors.get('date')),
              })

          return items
  def run_generic_spider(source_model, keyword, limit=10, pages=1):
      """Build a GenericSpiderEngine for *source_model* and run one search.

      Forwards *keyword*, *limit* and *pages* to the engine's ``run`` method
      and returns its result unchanged.
      """
      return GenericSpiderEngine(source_model).run(keyword, limit, pages)