from bs4 import BeautifulSoup
import requests
import unicodedata
import re
import csv
# Starting URL (explicitly using IMDb's Advanced Search pages with Detail listings):
# feature films, English, sorted by release date ascending, 250 results per page.
#url = 'https://www.imdb.com/search/title/?title_type=feature&languages=en&sort=release_date,asc&count=250'
# Where the previous run left off (the "after" query param is IMDb's pagination cursor).
url = 'https://www.imdb.com/search/title/?title_type=feature&languages=en&sort=release_date,asc&count=250&after=WzE1OTkyNjQwMDAwMDAsInR0MTM3Nzk2OTQiLDEzMDc1MV0%3D'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
movies = soup.select('div.lister-item-content')
# Page descriptor text (e.g. "1-250 of 160,190 titles.") used for progress display and logging.
currPage = soup.find('div', {'class': 'desc'}).find('span').text
# Creates and preps csv file.
# encoding='utf-8' is required: movie titles routinely contain non-ASCII characters,
# and Windows' default codec (cp1252) would raise UnicodeEncodeError on writerow.
file = open('C:/Users/omara/Desktop/IMDB-INFO.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(file)
# Writes header row
writer.writerow(['Movie-Title', 'Year-Released', 'Month-Released', 'Day-Released', 'IMDb-Rating',
'IMDb-Users-Participated', 'Metascore', 'Age-Rating', 'Gross (US & Canada)', 'Director(s)', 'Star(s)', 'URL-Link'])
# Will log either the last url accessed or the next url in the list
fileLog = open('./Desktop/IMDB-INFO-LOG.txt', 'w', newline='', encoding='utf-8')
urlStatusLogString = "Next Url to be Accessed ("
# Print iterations progress
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    # Guard: an empty result set (total == 0) would otherwise raise
    # ZeroDivisionError below; treat it as already complete.
    if total <= 0:
        print(f'\r{prefix} |{fill * length}| 100.0% {suffix}', end=printEnd)
        print()
        return
    percent = ("{0:." + str(decimals) + "f}").format(100 *
                                                     (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()
# Iterate through each search page (250 movies per page);
# Specifically for 160,190 titles (641~ pages in total), though can be convert to dynamic with while loop.
# Hard-coded to 120 pages per run; 'page' intentionally distinct from the inner
# 'index' (the original shadowed one loop variable with the other).
for page in range(0, 120):
    printProgressBar(0, len(movies), prefix='Progress (' +
                     currPage[:-1] + '):', suffix='Complete', length=50)
    for index in range(0, len(movies)):
        try:
            movie = movies[index]  # hoist the repeated movies[index] lookups
            # Get title of movie and its canonical IMDb link
            title = movie.a.text
            link = 'https://www.imdb.com' + movie.a.get('href')
            # Gets release year of movie & earliest release date of movie; "UNK" if the data does not exist
            year = "UNK"
            month = "UNK"
            day = "UNK"
            if movie.find_all("span", {"class": "lister-item-year"}):
                # Year; removes roman numerals "IV" and "()"
                year = re.sub(
                    "[^0-9]", "", movie.find(class_='lister-item-year').text)
                # Month and Day (SLOW): requires a second request per title,
                # jumping to the movie's release-info page.
                releaseURL = link + "releaseinfo?ref_=tt_ov_rdat"
                response = requests.get(releaseURL)
                rSoup = BeautifulSoup(response.text, 'html.parser')
                # Gets first instance of release date (earliest premiere listed)
                if rSoup.find("td", {"class": "release-date-item__date"}):
                    date = rSoup.find(
                        "td", {"class": "release-date-item__date"}).text
                    if re.sub('[^a-zA-Z]', '', date) != "":
                        month = re.sub('[^a-zA-Z]', "", date)
                    if date[:2].strip().isnumeric():
                        day = date[:2].strip()
            # Gets age rating (certificate) of movie; "UNK" if the data does not exist
            age = "UNK"
            if movie.find_all("span", {"class": "certificate"}):
                age = movie.find(class_='certificate').text
            # Gets runtime of movie; "UNK" if missing. (Collected but not written to the CSV.)
            runtime = "UNK"
            if movie.find_all("span", {"class": "runtime"}):
                runtime = movie.find(class_='runtime').text
            # Gets the genres of movie; "UNK" if missing. (Collected but not written to the CSV.)
            genre = "UNK"
            if movie.find_all("span", {"class": "genre"}):
                genre = movie.find(class_='genre').text
            # Gets the rating of movie based on imdb users; "UNK" if the data does not exist
            imdbRating = "UNK"
            if movie.find_all('strong'):
                imdbRating = movie.find('strong').text
            # Gets the rating of movie based on metascore; "UNK" if the data does not exist
            metascore = "UNK"
            if movie.find_all("span", {"class": "metascore"}):
                metascore = movie.find(class_='metascore').text
            # Gets the cast and crew; "UNK" if the data does not exist
            directors = "UNK"
            stars = "UNK"
            if movie.find_all('p', {'class': ''}):
                # Flatten the credits paragraph into a list of text nodes,
                # stripping combining accents (NFD + U+0300..U+036F removal).
                crew = []
                for c in list(movie.find('p', {'class': ''})):
                    name = unicodedata.normalize("NFD", c.text.strip())
                    name = re.sub("[\u0300-\u036f]", "", name)
                    crew.append(name)
                crew = list(filter(",".__ne__, crew))
                # Checks if there is one director
                if crew[0] == 'Director:':
                    directors = crew[1]
                    # Checks if Stars are listed (they follow the "|" separator)
                    if len(crew) > crew.index("") + 1:
                        s = ""
                        for i in range(crew.index("|") + 2, crew.index("", crew.index("|"))):
                            s = s + ", " + crew[i]
                        stars = s[2:]
                # Checks if there are multiple directors
                elif crew[0] == 'Directors:':
                    d = ""
                    for i in range(1, crew.index("")):
                        d = d + ", " + crew[i]
                    directors = d[2:]
                    # Checks if Stars are listed
                    if len(crew) > crew.index("") + 1:
                        s = ""
                        for i in range(crew.index("|") + 2, crew.index("", crew.index("|"))):
                            s = s + ", " + crew[i]
                        stars = s[2:]
                # Checks if there are no directors / only Stars (or in the rare case that there's just a "Star")
                elif crew[0] == "Stars:" or crew[0] == "Star:":
                    s = ""
                    for i in range(1, crew.index("")):
                        s = s + ", " + crew[i]
                    stars = s[2:]
            # Gets amount of users who voted and gross estimate; "UNK" if the data does not exist.
            # With two "nv" spans the order is votes then gross; with one, the
            # adjacent text-muted label disambiguates which value is present.
            votingSize = "UNK"
            grossAmount = "UNK"
            nv = movie.find_all('span', {"name": "nv"})
            if nv:
                if len(nv) == 2:
                    votingSize = nv[0].text
                    grossAmount = nv[1].text
                elif len(nv) == 1:
                    label = str(movie.find(
                        class_='sort-num_votes-visible').find_all(class_='text-muted')[0].text)
                    if label == "Votes:":
                        votingSize = nv[0].text
                    elif label == "Gross:":
                        grossAmount = nv[0].text
            writer.writerow([title, year, month, day, imdbRating, votingSize,
                             metascore, age, grossAmount, directors, stars, url])
            printProgressBar(index, len(movies), prefix='Progress (' + currPage[:-1] + '):', suffix='Complete', length=50)
        except KeyboardInterrupt:
            # Ctrl+C: flush and close both output files, log where we stopped,
            # then re-raise so the script actually terminates. (The original
            # 'pass' resumed the loop and crashed at the next writerow with
            # "I/O operation on closed file".)
            printProgressBar(index, len(movies), prefix='Progress (' + currPage[:-1] + '):', suffix='Complete', length=50, printEnd="\n")
            print("Broke out of loop at this page (" + currPage[:-1] + "): " + url)
            urlStatusLogString = "Broke out of loop at this page ("
            file.close()
            fileLog.write(urlStatusLogString + currPage[:-1] + "): " + url)
            fileLog.close()
            raise
    printProgressBar(len(movies), len(movies), prefix='Progress (' + currPage[:-1] + '):', suffix='Complete', length=50)
    # Follow the "Next »" pagination link; if absent, this was the last page.
    if soup.find('a', {'class': 'lister-page-next next-page'}):
        url = 'https://www.imdb.com' + soup.find('a', {'class': 'lister-page-next next-page'}).get('href')
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        movies = soup.select('div.lister-item-content')
        currPage = soup.find('div', {'class': 'desc'}).find('span').text
    else:
        print("Broke out of loop at this page (" + currPage[:-1] + "): " + url)
        urlStatusLogString = "Last Url Accessed ("
        break
###################################
# Final cleanup after a full (uninterrupted) run: close the CSV, then record
# in the log file which URL the scraper stopped at so the next run can resume.
file.close()
logEntry = f"{urlStatusLogString}{currPage[:-1]}): {url}"
fileLog.write(logEntry)
fileLog.close()