Book Recommendation System

Problem statement

Datasets

Amazon books

# Importing necessary libraries
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.neighbors import NearestNeighbors
import re
import string
import random
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Amazon books dataset.
df = pd.read_csv('amazon_books_data.csv')
df.head()
# Before regex: Product_Details of the first row as read from the CSV.
df.Product_Details[0]
# Delete every newline character from Product_Details, then trim leading and
# trailing whitespace.
df['Product_Details'] = df.Product_Details.str.replace(r'\n', '', regex=True).str.strip()
# After regex: the same row once newlines have been removed.
df.Product_Details[0]
# Publisher feature: the shortest run of text that follows "Publisher:" and
# precedes the next ";" in Product_Details.
df['Publisher'] = df['Product_Details'].str.extract(r'((?<=Publisher:).+?(?=;))', expand=False)
# Hyphens are removed from the publisher text and the result is trimmed.
df['Publisher'] = df.Publisher.str.replace(r'-', '', regex=True).str.strip()
# ISBN feature: digits, an optional single hyphen, then more digits directly
# after "ISBN-13:". The pattern is a raw string so "\d" reaches the regex
# engine instead of being an invalid string escape.
df['ISBN'] = df['Product_Details'].str.extract(r'ISBN-13:(\d+-?\d*)', expand=False)
# Drop the hyphen so only the digits remain (ISBN consists of 13 digits).
df['ISBN'] = df.ISBN.str.replace(r'-', '', regex=True).str.strip()
# Null count per column, restricted to columns that contain at least one
# null and ordered from most to fewest nulls.
hasNAN = (
    df.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
print(hasNAN, df.shape)
Null values
# Keep only the columns used from here on.
df1 = df[['ISBN', 'Name', 'Description', 'Rating_out_of_5', 'No_of_Ratings', 'Publisher']]
df1.head()
# Shape before dropping nulls
df1.shape  # the article's run reported (1200, 6)
# Drop every row that has a null in any of the selected columns. The index is
# not reset, so the surviving rows keep their original labels.
df1 = df1.dropna(how='any', axis=0)
# Shape after dropping nulls
df1.shape  # the article's run reported (873, 6)

Text Preprocessing

# Function for removing NonAscii characters
def _removeNonAscii(s):
return "".join(i for i in s if ord(i)<128)
# Function for converting into lower case
def make_lower_case(text: str) -> str:
    """Return ``text`` converted to lower case.

    Args:
        text: Text to convert.

    Returns:
        The result of ``text.lower()``.
    """
    return text.lower()
# Function for removing stop words
def remove_stop_words(text: str) -> str:
    """Return ``text`` without the words found in NLTK's English stop-word list.

    The text is split on whitespace and each piece is compared to the list by
    exact match, so a word with attached punctuation or different casing than
    the list entry is kept.

    Args:
        text: Text to filter.

    Returns:
        The surviving words joined by single spaces.
    """
    stops = set(stopwords.words("english"))
    return " ".join(word for word in text.split() if word not in stops)
# Function for removing punctuation
# Compiled once at import time instead of building a tokenizer on every call.
_WORD_PATTERN = re.compile(r'\w+')


def remove_punctuation(text: str) -> str:
    """Return ``text`` reduced to its runs of word characters.

    Every maximal run matching ``\\w+`` (letters, digits, underscore) is kept
    and the runs are joined by single spaces, so punctuation and any other
    non-word characters act as separators and disappear.

    Args:
        text: Text to filter.

    Returns:
        The word-character runs of ``text`` joined by single spaces.
    """
    return " ".join(_WORD_PATTERN.findall(text))
# Function for removing the html tags
# Shortest "<" ... ">" span; "." does not match a newline, so a tag broken
# across lines is left in place.
_HTML_TAG_PATTERN = re.compile(r'<.*?>')


def remove_html(text: str) -> str:
    """Return ``text`` with every ``<...>`` span deleted.

    Args:
        text: Text that may contain HTML tags.

    Returns:
        ``text`` with each match of ``<.*?>`` replaced by the empty string.
    """
    return _HTML_TAG_PATTERN.sub('', text)
# Applying all the functions in description and storing as a cleaned_desc
# The order of the steps matters:
#   * remove_html runs before remove_punctuation, because remove_punctuation
#     keeps only \w+ runs and would delete the "<" and ">" that remove_html
#     needs in order to recognise a tag;
#   * remove_stop_words runs last, because it compares whitespace-separated
#     tokens by exact match and would miss a stop word that still has
#     punctuation attached (e.g. "the,").
df1['cleaned_desc'] = (
    df1['Description']
    .apply(_removeNonAscii)
    .apply(make_lower_case)
    .apply(remove_html)
    .apply(remove_punctuation)
    .apply(remove_stop_words)
)
df1.head()

Recommendation engine

# TF-IDF vectoriser that also drops scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Replace any missing cleaned description with an empty string before vectorising.
df1['cleaned_desc'] = df1['cleaned_desc'].fillna('')
# Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
overview_matrix = tfidf.fit_transform(df1['cleaned_desc'])
# Output the shape of tfidf_matrix
overview_matrix.shape
# Pairwise dot products between all description vectors; row/column i
# corresponds to the i-th row of df1 in its current order.
similarity_matrix = linear_kernel(overview_matrix, overview_matrix)
similarity_matrix
# book index mapping: book name -> index label of that book in df1.
# NOTE: the values are df1's index labels, not row positions; the two differ
# whenever df1's index is not 0..n-1 (rows were dropped earlier without a
# reset_index), while similarity_matrix is addressed by row position.
mapping = pd.Series(df1.index, index=df1['Name'])
mapping[:3]
def recommend_books_based_on_plot(book_input, top_n=5):
    """Return the names of the books most similar to ``book_input``.

    Similarity is read from the module-level ``similarity_matrix``; the title
    is resolved through the module-level ``mapping`` and ``df1``.

    Args:
        book_input: Title of the book to find neighbours for; must be a key
            of ``mapping``.
        top_n: Number of recommendations to return (default 5).

    Returns:
        A ``pandas.Series`` of book names (a slice of ``df1['Name']``) ordered
        from most to least similar, never containing the queried row itself.

    Raises:
        KeyError: If ``book_input`` is not present in ``mapping``.
    """
    try:
        book_label = mapping[book_input]
    except KeyError:
        raise KeyError(f"No book titled {book_input!r} in the catalogue") from None
    # A title that occurs more than once yields a Series; use its first entry.
    if isinstance(book_label, pd.Series):
        book_label = book_label.iloc[0]
    # mapping stores df1 index labels, whereas similarity_matrix rows are
    # positional, so translate label -> position before indexing.
    # Assumes df1's index labels are unique - TODO confirm.
    book_position = df1.index.get_loc(book_label)
    # get similarity values with other books, skipping the queried row
    # explicitly rather than assuming it sorts to the front
    similarity_score = [
        (position, score)
        for position, score in enumerate(similarity_matrix[book_position])
        if position != book_position
    ]
    # sort in descending order the similarity score of book inputted with all the other books
    similarity_score.sort(key=lambda pair: pair[1], reverse=True)
    # return book names for the top_n most similar rows
    book_indices = [position for position, _ in similarity_score[:top_n]]
    return df1['Name'].iloc[book_indices]

Test the model

# Recommendations for one title, shown as a one-column DataFrame.
recommend_books_based_on_plot(book_input='If Animals Kissed Good Night').to_frame()
# Original description of the row labelled 649 in df1.
df1['Description'][649]
"A must-have board book for all babies. Good night, Gorilla. Good night, Elephant.  It's bedtime at the zoo, and all the animals are going to sleep. Or are they? Who's that short, furry guy with the key in his hand and the mischievous grin? Good night, Giraffe. Good night, Hyena.  Sneak along behind the zookeeper's back, and see who gets the last laugh in this riotous good-night romp."
# Original description of the row labelled 458 in df1.
df1['Description'][458]
'Shaped, die-cut, picture-changing pages add a subtly interactive element to this peaceful, rhyming bedtime book―say "Night, night" and turn the page to watch the animals transform from being awake to asleep.It’s nighttime down on the farm. The animals are in the barn and it’s time to say a soft and cozy, "Night, night." Say goodnight to the horse, the dog, and all their farm friends, as you turn the shaped pages and watch as the animals go to sleep, one by one. The gentle rhymes and sleepy tone make Night Night Farm perfect for settling your little one into bed and ending with your own, quiet, "Night, night." With irresistibly sweet illustrations and a magical sky of glow-in-the-dark stars, Night Night Farm is the perfect way to end the day.'

Goodreads

# Load the Goodreads dataset; this rebinds df, replacing the Amazon frame.
df = pd.read_csv('goodreads_books_data.csv')
df.head()
# Null count per column, restricted to columns with at least one null and
# ordered from most to fewest nulls.
hasNAN = df.isnull().sum()
hasNAN = hasNAN[hasNAN > 0]
hasNAN = hasNAN.sort_values(ascending=False)
print(hasNAN, df.shape)
# Null count for every column, including those without nulls.
df.isnull().sum()

EDA

# Drop the Book_Id column in place.
df.drop(['Book_Id'], axis=1, inplace=True)
df.shape
# Title used to inspect the rows before and after de-duplication.
duplicate_check_title = 'Anime and the Visual Novel: Narrative Structure Design and Play at the Crossroads of Animation and Computer Games'
df[df['book_Title'] == duplicate_check_title]
# Remove rows that are exact duplicates across all remaining columns.
df.drop_duplicates(inplace=True)
df.shape
df[df['book_Title'] == duplicate_check_title]
# Generate a new id: the integer category code of book_Title, so rows with
# the same title share an id.
df = df.assign(id=(df['book_Title']).astype('category').cat.codes)
df.head()

Visualisation

# Bar chart of the ten most frequent Author_Name values.
plt.subplots(figsize=(10, 7))
df['Author_Name'].value_counts().head(10).plot.bar()
plt.show()

Content-Based Recommender Systems

# Test point for ratings_count=40, Avg_Rating=4.5, Publish_year=2002, Pages_no=523
test_point = [40, 4.5, 2002, 523]
# Feature columns are selected by position; presumably they are the four
# features named above, in that order - verify against df.columns.
X = df.iloc[:, [2, 3, 4, 6]].values
# build a nearest neighbor object, we are searching for just 3 neighbors so n_neighbors=3
nn = NearestNeighbors(n_neighbors=3).fit(X)
# kneighbors returns the distances to, and the row positions in X of, the
# neighbors of test_point
distances, neighbor_positions = nn.kneighbors([test_point])
print((distances, neighbor_positions))
# Show the books found above. The positions come from the query result rather
# than from row numbers copied out of one particular run, so this stays
# correct when the data or the test point changes.
df.iloc[neighbor_positions[0]]

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store