Book Depository Web Scraping with Python

Tuan_H · 21 Tháng Sáu 2022 14:07

# import libraries 

from bs4 import BeautifulSoup
import requests
import time
import datetime
import csv
import pandas as pd

import smtplib
# Connect to Website and pull in data

URL = 'https://www.bookdepository.com/bestbooksever'

# A browser-like User-Agent is sent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being parsed as if
# it were the book list.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

soup1 = BeautifulSoup(page.content, "html.parser")

# prettify() re-serialises the tree with each node on its own line. The second
# parse is kept because the price extraction below splits on whitespace and
# presumably relies on that separation between adjacent nodes -- TODO confirm.
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

## Loop over the books and print a preview of each one
product = soup2.find_all('div', 'book-item')

for x in product:
    title_tag = x.find('h3', class_='title')
    author_tag = x.find('p', class_='author')
    published_tag = x.find('p', class_='published')
    price_tag = x.find('p', class_='price')

    # find() returns None when a tag is absent; fall back to a placeholder
    # instead of crashing the whole preview on one incomplete entry.
    title = title_tag.get_text().strip() if title_tag is not None else 'N/A'
    author = author_tag.get_text().strip() if author_tag is not None else 'N/A'
    date_published = (
        published_tag.get_text().strip() if published_tag is not None else 'N/A'
    )

    # Only the first whitespace-separated token of the price text is shown.
    price_tokens = price_tag.get_text().split() if price_tag is not None else []
    price = price_tokens[0] if price_tokens else 'N/A'

    print('Title :', title)
    print('Author :', author)
    print('Published :', date_published)
    print('Price:', price, '\n')

def _field_text(node, tag, css_class):
    """Return the stripped text of the first matching child tag, or '' if absent."""
    match = node.find(tag, class_=css_class)
    return match.get_text().strip() if match is not None else ''


def _parse_price(price_text):
    """Convert price text such as '$1,234.56' to a whole-number int.

    Only the first whitespace-separated token is considered. Any leading
    non-digit characters (the currency prefix) are dropped, thousands
    separators are removed, and the fractional part is truncated.
    Returns '' when no number can be extracted, so the CSV cell is left blank.
    """
    tokens = price_text.split()
    if not tokens:
        return ''
    first = tokens[0]
    # Skip the currency prefix whatever its length ("$", "US$", ...).
    start = next((i for i, ch in enumerate(first) if ch.isdigit()), None)
    if start is None:
        return ''
    try:
        # NOTE(review): int() truncates the cents -- confirm this is intended.
        return int(float(first[start:].replace(',', '')))
    except ValueError:
        return ''


def check_book():
    """Scrape the book list page and append one row per book to BestBook.csv.

    Each row holds the title, author, publication date, whole-number price and
    the date of the scrape. The header row is written only when the file is
    new or empty, so repeated calls keep appending data to a single table.
    Fields missing from a book entry are written as empty cells.

    Raises:
        requests.RequestException: if the page cannot be fetched or the server
            answers with an HTTP error status.
    """
    import os  # local import: os is not among this file's top-level imports

    ## Connect to the website

    URL = 'https://www.bookdepository.com/bestbooksever'

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

    # Fail fast on a stalled connection or an HTTP error instead of hanging
    # or silently writing nothing.
    page = requests.get(URL, headers=headers, timeout=30)
    page.raise_for_status()

    soup1 = BeautifulSoup(page.content, "html.parser")

    # The prettified re-parse puts each node on its own line; kept because the
    # whitespace-based price split presumably relies on it -- TODO confirm.
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

    ## Create a column header

    header = ['Title', 'Author', 'Published', 'Price', 'Scraped Date']

    csv_path = 'BestBook.csv'

    # The file is opened in append mode on every call, so the header must only
    # be written the first time, otherwise it is repeated between data rows.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    # Same date for every row of this run.
    scraped_date = datetime.date.today()

    ## Write the data into csv
    # newline='' is required by the csv module (avoids blank lines on Windows);
    # an explicit encoding keeps non-ASCII titles from failing under a
    # non-UTF-8 locale.
    with open(csv_path, 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        if needs_header:
            writer.writerow(header)

        for div in soup2.find_all('div', 'book-item'):
            title = _field_text(div, 'h3', 'title')
            author = _field_text(div, 'p', 'author')
            date_published = _field_text(div, 'p', 'published')
            price = _parse_price(_field_text(div, 'p', 'price'))

            writer.writerow([title, author, date_published, price, scraped_date])
        
        
        
## Automate the function to check the books daily
SECONDS_PER_DAY = 24 * 60 * 60

while True:
    try:
        check_book()
    except requests.RequestException as error:
        # A transient network failure must not end the long-running job;
        # report it and try again at the next scheduled run.
        print('Scrape failed:', error)
    time.sleep(SECONDS_PER_DAY)