Method 1: Simple program to get all URLs (applies to all websites)
import requests
from bs4 import BeautifulSoup

domain = 'https://www.google.com/search?q='
search = 'Web Scraping'

# Fetch the Google search results page
response = requests.get(domain + search)
soup = BeautifulSoup(response.content, 'html.parser')

# Result links are wrapped in '/url?q=' redirects; keep the target URL and drop the trailing parameters
elm = [x['href'].split('/url?q=')[1].split('&')[0]
       for x in soup.select('a')
       if x.has_attr('href') and '/url?q=' in x['href']]

for e in elm:
    print('Main URL', e)
    # Visit each result page and collect every link that points to an https address
    response = requests.get(e)
    soup = BeautifulSoup(response.content, 'html.parser')
    url = [x['href'] for x in soup.select('a') if x.has_attr('href') and 'https' in x['href']]
    print('Sub URL', url)
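Note: Google may block the default python-requests User-Agent or answer with a consent page, in which case the list above comes back empty. Below is a minimal sketch of the same fetch with a browser-style header and basic error handling; the User-Agent string is only a placeholder, not something required by the library.

import requests
from bs4 import BeautifulSoup

domain = 'https://www.google.com/search?q='
search = 'Web Scraping'

# Placeholder browser-style User-Agent; any common one should work
headers = {'User-Agent': 'Mozilla/5.0'}

try:
    response = requests.get(domain + search, headers=headers, timeout=10)
    response.raise_for_status()
except requests.RequestException as err:
    print('Request failed:', err)
else:
    soup = BeautifulSoup(response.content, 'html.parser')
    print(len(soup.select('a')), 'anchors found')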
Method 2: Build a Selenium app to crawl all URLs (applies to all websites)
from selenium import webdriver
from selenium.webdriver.common.by import By
import numpy as np
import time

driver = webdriver.Chrome()
domain = 'https://www.google.com/search?q='
search = 'Web Scraping'

# Load the Google search results page and give it time to render
driver.get(domain + search)
time.sleep(5)

# Collect every anchor that actually carries an href
elm = [x.get_attribute('href') for x in driver.find_elements(By.TAG_NAME, 'a')
       if x.get_attribute('href') is not None]

for e in elm:
    print('Main URL', e)
    driver.get(e)
    time.sleep(5)
    # Keep only the unique https links found on the visited page
    url = np.unique([x.get_attribute('href') for x in driver.find_elements(By.TAG_NAME, 'a')
                     if x.get_attribute('href') is not None
                     and x.get_attribute('href').startswith('https')]).tolist()
    print('Sub URL', url)

driver.quit()
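The fixed time.sleep(5) calls work, but they either waste time or fail on slow pages. Below is a minimal sketch of the first step rewritten with Selenium's explicit waits (WebDriverWait plus the presence_of_all_elements_located condition), assuming a 10-second ceiling is acceptable.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.google.com/search?q=Web Scraping')

# Block for at most 10 seconds until at least one <a> element is present
anchors = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.TAG_NAME, 'a')))

links = [a.get_attribute('href') for a in anchors if a.get_attribute('href')]
print(links)
driver.quit()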
Method 3: Build a Selenium app with BS4 to crawl all URLs (applies to all websites)
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
domain = 'https://www.google.com/search?q='
search = 'Web Scraping'

# Load the search results page and give it time to render
driver.get(domain + search)
time.sleep(5)

# Hand the rendered DOM to BeautifulSoup for parsing
DOM = driver.page_source
soup = BeautifulSoup(DOM, 'html.parser')
elm = [x['href'] for x in soup.select('a') if x.has_attr('href') and x['href'].startswith('https')]

for e in elm:
    print('Main URL', e)
    driver.get(e)
    time.sleep(5)
    DOM = driver.page_source
    soup = BeautifulSoup(DOM, 'html.parser')
    url = [x['href'] for x in soup.select('a') if x.has_attr('href') and 'https' in x['href']]
    print('Sub URL', url)

driver.quit()
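All three methods stop after one level: a Main URL and the Sub URLs found on it. If you want to keep following the Sub URLs as well, you need a visited set so the crawler never revisits a page or loops forever. Below is a minimal breadth-first sketch using requests and BeautifulSoup; the get_links helper and the max_pages cap of 50 are illustrative choices, not part of the methods above.

from collections import deque
import requests
from bs4 import BeautifulSoup

def get_links(page_url):
    """Return all https links on a page, or an empty list if the request fails."""
    try:
        resp = requests.get(page_url, timeout=10)
        resp.raise_for_status()
    except requests.RequestException:
        return []
    soup = BeautifulSoup(resp.content, 'html.parser')
    return [a['href'] for a in soup.select('a')
            if a.has_attr('href') and a['href'].startswith('https')]

def crawl(start_urls, max_pages=50):
    """Breadth-first crawl that skips URLs it has already visited."""
    visited = set()
    queue = deque(start_urls)
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print('Crawling', url)
        queue.extend(get_links(url))
    return visited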