Build a Web Crawler That Bypasses Anti-Crawler Technology Using Python

Method 1: Simple requests program to get all URLs (applies to most websites)

import requests
from bs4 import BeautifulSoup

domain = 'https://www.google.com/search?q='
search = 'Web Scraping'
response = requests.get(domain+search)
soup = BeautifulSoup(response.content, 'html.parser')

# Google wraps results as '/url?q=<target>&sa=...'; pull out the target
# URL and drop the tracking parameters after the first '&'. Selecting
# 'a[href]' avoids a KeyError on anchors that have no href attribute.
elm = [x['href'][x['href'].find('https'):].split('&')[0]
       for x in soup.select('a[href]') if '/url?q=' in x['href']]

for e in elm:
    print('Main URL', e)
    response = requests.get(e)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect every absolute link on the result page.
    url = [x['href'] for x in soup.select('a[href]') if x['href'].startswith('https')]
    print('Sub URL', url)
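
The default requests User-Agent identifies the client as a Python script, which many sites block outright. A minimal sketch of sending browser-like headers instead; the header values below are illustrative examples, not strings any site specifically requires:

import requests
from bs4 import BeautifulSoup

# Example browser-like headers (assumed values; any recent browser
# User-Agent string will do).
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/120.0 Safari/537.36'),
    'Accept-Language': 'en-US,en;q=0.9',
}

response = requests.get('https://www.google.com/search',
                        params={'q': 'Web Scraping'},
                        headers=headers, timeout=10)
soup = BeautifulSoup(response.content, 'html.parser')
print(response.status_code, len(soup.select('a[href]')), 'links')

Sending headers this way is the simplest first step against anti-crawler checks; it does not help with JavaScript-based detection, which is where the Selenium methods below come in.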

Method 2: Build a Selenium app to crawl all URLs (applies to most websites)

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()
domain = 'https://www.google.com/search?q='
search = 'Web Scraping'
driver.get(domain+search)
time.sleep(5)

# find_elements_by_tag_name was removed in Selenium 4;
# use find_elements(By.TAG_NAME, ...) instead.
elm = [x.get_attribute('href') for x in driver.find_elements(By.TAG_NAME, 'a')
       if x.get_attribute('href') is not None]

for e in elm:
    print('Main URL', e)
    driver.get(e)
    time.sleep(5)
    # Deduplicate the sub-links with a set; numpy is unnecessary here.
    url = sorted({x.get_attribute('href') for x in driver.find_elements(By.TAG_NAME, 'a')
                  if x.get_attribute('href') is not None and x.get_attribute('href').startswith('https')})
    print('Sub URL', url)

driver.quit()
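
Because Selenium drives a real Chrome instance, you can pass ChromeOptions to make it look less like an automated client. A minimal sketch; these flags are common choices, not a guarantee against every anti-bot check:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless=new')  # run Chrome without a visible window
options.add_argument('--disable-blink-features=AutomationControlled')
# Assumed example User-Agent; substitute any recent browser string.
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                     'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36')

driver = webdriver.Chrome(options=options)
driver.get('https://www.google.com/search?q=Web+Scraping')
print(driver.title)
driver.quit()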

Method 3: Build a Selenium app with BS4 to crawl all URLs (applies to most websites)

from selenium import webdriver
import time
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
domain = 'https://www.google.com/search?q='
search = 'Web Scraping'
driver.get(domain+search)
time.sleep(5)

# Hand the JavaScript-rendered DOM to BeautifulSoup for parsing.
DOM = driver.page_source
soup = BeautifulSoup(DOM, 'html.parser')

elm = [x['href'] for x in soup.select('a[href]') if x['href'].startswith('https')]

for e in elm:
    print('Main URL', e)
    driver.get(e)
    time.sleep(5)
    DOM = driver.page_source
    soup = BeautifulSoup(DOM, 'html.parser')
    url = [x['href'] for x in soup.select('a[href]') if x['href'].startswith('https')]
    print('Sub URL', url)

driver.quit()
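
The fixed time.sleep(5) in Methods 2 and 3 either wastes time or fires too early on slow pages. A sketch of replacing it with an explicit wait using Selenium's WebDriverWait, which polls until the links are actually present:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.google.com/search?q=Web+Scraping')

# Wait up to 10 seconds for at least one <a> tag to appear, then
# continue immediately; raises TimeoutException if none shows up.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, 'a'))
)
print(len(driver.find_elements(By.TAG_NAME, 'a')), 'links loaded')
driver.quit()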