NAIVE BAYES CLASSIFIER USING PYTHON

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string
import re
import os
import math
import collections
import operator

from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

Pre-processing dataset

  1. Remove punctuations.
  2. Remove trailing \t and \n
  3. Converting to lower case
  4. Splitting positive and negative reviews
In [2]:
def read_file(file_name):
    pos_reviews = []
    neg_reviews = []
    f = open(file_name,'r');
    lines = f.readlines()
    for line in lines:
        if line[-2] == '1':
            line = line.replace(line[-2], '.')
            line = line.translate(str.maketrans('', '', string.punctuation + string.digits))
            line = line.rstrip()
#             line = re.sub(r"[^a-zA-Z0-9']+", ' ', line)
            line = line.lower()
            pos_reviews.append([line, 1])
        else:
            line = line.replace(line[-2], '.')
#             line = re.sub(r"[^a-zA-Z0-9']+", ' ', line)
            line = line.translate(str.maketrans('', '', string.punctuation + string.digits))
            line = line.rstrip()
            line = line.lower()
            neg_reviews.append([line, 0])
    f.close();
    return pos_reviews, neg_reviews;

Load dataset

In [3]:
pos_reviews, neg_reviews = read_file('./imdb_labelled.txt')
In [4]:
df_dataset = pd.DataFrame(pos_reviews, columns=['review', 'sentiment'])
df_neg_rev = pd.DataFrame(neg_reviews, columns=['review', 'sentiment'])
df_dataset = df_dataset.append(df_neg_rev)

#  Shuffling dataframe
df_dataset = df_dataset.sample(frac=1).reset_index(drop=True)

df_train = pd.DataFrame()
df_dev = pd.DataFrame()
df_test = pd.DataFrame()
In [5]:
df_dataset
Out[5]:
review sentiment
0 it was a riot to see hugo weaving play a sexob... 1
1 but other than that the movie seemed to drag a... 0
2 ursula burtons portrayal of the nun is both to... 1
3 the football scenes at the end were perplexing 0
4 it showed exactly how the government and the s... 1
... ... ...
995 his losing his marbles so early in the proceed... 0
996 the fact is this film is a wonderful heartwarm... 1
997 this second appearance of mickey mouse followi... 1
998 it deserves strong love 1
999 but in terms of the writing its very fresh and... 1

1000 rows × 2 columns

Dividing the dataset into train, dev and test

In [6]:
df_train, test = train_test_split(df_dataset, test_size=0.3)
In [7]:
spdf = np.random.rand(len(test)) < 0.5

df_dev = test[spdf]
df_test = test[~spdf]
In [8]:
df_train_len = len(df_train)
print('# Train Records:', df_train_len)

df_train_pos_len = len(df_train[df_train['sentiment'] == 1])
df_train_neg_len = len(df_train[df_train['sentiment'] == 0])
prob_pos_train = df_train_pos_len / df_train_len
prob_neg_train = df_train_neg_len / df_train_len

print ('# Positive Records:', df_train_pos_len)
print ('# Negative Records:', df_train_neg_len)
print()

print ('P(Positive Records):', prob_pos_train)
print ('P(Negative Records):', prob_neg_train)

print()
print('# Dev Records:', len(df_dev))
print('# Test Records:', len(df_test))
# Train Records: 700
# Positive Records: 341
# Negative Records: 359

P(Positive Records): 0.48714285714285716
P(Negative Records): 0.5128571428571429

# Dev Records: 137
# Test Records: 163
In [9]:
df_train
Out[9]:
review sentiment
233 it has everything you could want suspense dram... 1
232 what makes this story different are the terri... 1
738 i just saw this film and i recommend it 1
223 and generally the lines and plot is weaker tha... 0
407 the characters are interesting even if a bit p... 1
... ... ...
67 not frightening in the least and barely compre... 0
842 the acting was bad 0
977 id advise anyone to go and see it 1
915 is it possible for a movie to get any worse th... 0
656 it is just the sort of pap that is screened in... 0

700 rows × 2 columns

Building the vocabulary list

In [10]:
def create_vocab_list(dataset):   
    count_from_set = collections.defaultdict(int)
    rev_list = dataset.iloc[:,0].to_list()
    doc_words = []
    for rev in rev_list:
        words = re.findall('[^\W\d_]+', rev)
        words = [w for w in words if len(w) > 1]
        doc_words.append(words)
    for vlist in doc_words:
        for i in set(vlist):
            count_from_set[i] += 1
    return count_from_set
In [11]:
vocab_dict = create_vocab_list(df_train)
vocab_dict = dict(sorted(vocab_dict.items(), key=operator.itemgetter(1), reverse=True))

# Remove words with occurences less than 5
vocabulary_list = [k for k, v in vocab_dict.items() if v >= 5]
print('First 50 words in vocabulary list are :\n\n', vocabulary_list[:50])
First 50 words in vocabulary list are :

 ['the', 'and', 'is', 'of', 'this', 'it', 'to', 'in', 'was', 'movie', 'film', 'that', 'for', 'with', 'are', 'but', 'as', 'one', 'on', 'not', 'all', 'you', 'very', 'just', 'its', 'so', 'bad', 'at', 'good', 'like', 'from', 'if', 'even', 'acting', 'be', 'his', 'by', 'there', 'have', 'an', 'or', 'about', 'really', 'time', 'out', 'has', 'who', 'characters', 'great', 'he']

Reverse Indexing

In [12]:
rev_idx_dict = {}
value = 0
for idx in vocabulary_list:
    rev_idx_dict[idx] = value
    value += 1
print('First 50 words with reverse indexing are :\n\n', {k: rev_idx_dict[k] for k in list(rev_idx_dict)[:50]})
First 50 words with reverse indexing are :

 {'the': 0, 'and': 1, 'is': 2, 'of': 3, 'this': 4, 'it': 5, 'to': 6, 'in': 7, 'was': 8, 'movie': 9, 'film': 10, 'that': 11, 'for': 12, 'with': 13, 'are': 14, 'but': 15, 'as': 16, 'one': 17, 'on': 18, 'not': 19, 'all': 20, 'you': 21, 'very': 22, 'just': 23, 'its': 24, 'so': 25, 'bad': 26, 'at': 27, 'good': 28, 'like': 29, 'from': 30, 'if': 31, 'even': 32, 'acting': 33, 'be': 34, 'his': 35, 'by': 36, 'there': 37, 'have': 38, 'an': 39, 'or': 40, 'about': 41, 'really': 42, 'time': 43, 'out': 44, 'has': 45, 'who': 46, 'characters': 47, 'great': 48, 'he': 49}

Vocabulary count

In [13]:
print("Total vocab:",len(vocab_dict))
print("Total vocab after removing words with less than 5 occurence:",len(vocabulary_list))
Total vocab: 2419
Total vocab after removing words with less than 5 occurence: 273

Most occured word in the vocabulary list

In [14]:
def generate_freq_word_graph(n_words):
    print('25 most frequent words:')
    most_common_words = {k: v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}
    plt.figure(figsize=(8, 8))
    plt.barh(*zip(*{k: most_common_words[k] for k in list(most_common_words)[-n_words:]}.items()))
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.show()

generate_freq_word_graph(25)
25 most frequent words:

Calculating the probability

In [15]:
def create_conditional_prob(dataset, prob_word=None, disp_prob=False):
        
    probability_dict = {}
    
    docs_w_pos_rev = dataset[dataset.sentiment == 1]
    no_of_docs_w_pos_rev = docs_w_pos_rev.shape[0]

    docs_w_neg_rev = dataset[dataset.sentiment == 0]
    no_of_docs_w_neg_rev = docs_w_neg_rev.shape[0]
    

    for ind in dataset.index:
        ## Tokenizing
        review_set = set(dataset['review'][ind].split())
        
        for word in review_set:
            p_word_w_pos_rev = None
            p_word_w_neg_rev = None
            
            if word in probability_dict.keys():
                p_word_w_pos_rev = probability_dict[word]['p_pos']
                p_word_w_neg_rev = probability_dict[word]['p_neg']
            else:
                no_of_docs_w_word_pos_rev = docs_w_pos_rev[docs_w_pos_rev.review.apply(lambda x: word in x)].shape[0]
                no_of_docs_w_word_neg_rev = docs_w_neg_rev[docs_w_neg_rev.review.apply(lambda x: word in x)].shape[0]
                
                p_word_w_pos_rev = round(no_of_docs_w_word_pos_rev / no_of_docs_w_pos_rev, 4)
                p_word_w_neg_rev = round(no_of_docs_w_word_neg_rev / no_of_docs_w_neg_rev, 4)
                
                probability_dict[word] = {'p_pos': p_word_w_pos_rev, 'p_neg': p_word_w_neg_rev}
    if disp_prob:
        print('Calculating the probability of occurence:')
        print('# of documents containing the word "'+prob_word.upper()+'": ', vocab_dict[prob_word])
        print('Total number of documents: ', len(df_train))
        prob = vocab_dict[prob_word] / len(df_train)
        print('P(THE): %.3f \n' % prob)
        
        print('Calculating the conditional probability:')
        print('# of positive documents containing the word "'+prob_word.upper()+'": ', docs_w_pos_rev[docs_w_pos_rev.review.apply(lambda x: prob_word in x)].shape[0])
        print('# of positive documents: ', df_train_pos_len)
        print( 'P(THE | POSITIVE): %.3f' % probability_dict[prob_word]['p_pos'])
                
    return probability_dict
    
    
conditional_probability = create_conditional_prob(df_train, 'the', True)
Calculating the probability of occurence:
# of documents containing the word "THE":  358
Total number of documents:  700
P(THE): 0.511 

Calculating the conditional probability:
# of positive documents containing the word "THE":  194
# of positive documents:  341
P(THE | POSITIVE): 0.569

Prediction Function

In [16]:
def classify(train, test, smoothing = False):
    
    train_word_probs = create_conditional_prob(train)
    correct = 0
    smoothing_param = 0

    if smoothing:
        smoothing_param = 1 / (1 * len(vocab_dict))
        
        
    for row in test.itertuples():
        data = row.review
        s_split = set(data.split())
        
        pos_prob = 1.0
        neg_prob = 1.0
        
        for word in s_split:
            p_word_w_pos_rev = 0.0
            p_word_w_neg_rev = 0.0
            
        if word in train_word_probs.keys():
            probs_word = train_word_probs[word]
            p_word_w_pos_rev = probs_word['p_pos']
            p_word_w_neg_rev = probs_word['p_neg']
        
        
        pos_prob = pos_prob * (p_word_w_pos_rev + smoothing_param)
        neg_prob = neg_prob * (p_word_w_neg_rev + smoothing_param)

        total_train_docs = train.shape[0]
    
        no_of_docs_w_pos_rev = train[train.sentiment == 1].shape[0]
        no_of_docs_w_neg_rev = train[train.sentiment == 0].shape[0]
        
        p_pos_rev = round(no_of_docs_w_pos_rev / total_train_docs, 4)
        p_neg_rev = round(no_of_docs_w_neg_rev / total_train_docs, 4)
        
        pos_prob = p_pos_rev * pos_prob
        neg_prob = p_neg_rev * neg_prob

        predicted_rev = 0

        if pos_prob > neg_prob:
            predicted_rev = 1
        elif pos_prob < neg_prob:
            predicted_rev = 0

        if row.sentiment == predicted_rev:
            correct += 1

        test.at[row.Index, 'p_pos'] = pos_prob
        test.at[row.Index, 'p_neg'] = neg_prob
        test.at[row.Index, 'predicted_sentiment'] = predicted_rev

    accuracy = round(correct / test.shape[0] * 100, 2)
    return accuracy

Accuracy of dev dataset

In [17]:
acc = classify(df_train, df_dev)
print("Predicting setiment of review using Naive Bayes Classifer")
print("Accuracy: {}%".format(acc))
Predicting setiment of review using Naive Bayes Classifer
Accuracy: 55.47%

Five Fold Cross Validation

In [18]:
def cross_validation(train, k, smoothing=False):
    dev = 1/k
    result =0    
    split_acc = []

    for i in range(1, k + 1):
        new_dev_set = train.sample(frac=dev, replace=False, random_state=i).copy(deep=True)
        new_train_set = train.drop(new_dev_set.index, axis=0).copy(deep=True)

        if smoothing:
            print("Cross-validation Pass", i, "with smoothing")
        else:
            print("Cross-validation Pass", i)
            
        acc = classify(new_train_set, new_dev_set, smoothing)
            
        split_acc.append(acc)
        print("accuracy: ", acc, "\n")
        result += acc
    
    print("\n*********predict accuracy*********")
    print(result/5)
    return split_acc

non_smoothin_pred = cross_validation(df_train, 5)
Cross-validation Pass 1
accuracy:  62.14 

Cross-validation Pass 2
accuracy:  55.71 

Cross-validation Pass 3
accuracy:  57.86 

Cross-validation Pass 4
accuracy:  57.14 

Cross-validation Pass 5
accuracy:  56.43 


*********predict accuracy*********
57.855999999999995

Experiment

Performing laplace smoothing on dev dataset

In [19]:
print("Predicting setiment of review using Naive Bayes Classifer with smoothing")
print()

acc = classify(df_train, df_dev, smoothing=True)
print("Accuracy: {}%".format(acc))
Predicting setiment of review using Naive Bayes Classifer with smoothing

Accuracy: 55.47%

Five fold cross validation with smooothing

In [20]:
smoothin_pred = cross_validation(df_train, 5, True)
Cross-validation Pass 1 with smoothing
accuracy:  63.57 

Cross-validation Pass 2 with smoothing
accuracy:  55.71 

Cross-validation Pass 3 with smoothing
accuracy:  57.86 

Cross-validation Pass 4 with smoothing
accuracy:  57.14 

Cross-validation Pass 5 with smoothing
accuracy:  56.43 


*********predict accuracy*********
58.141999999999996

Visualization

Comparing the effect of smoothing

In [21]:
plt.figure()

x = [1, 2, 3, 4, 5]
y1 = non_smoothin_pred
y2 = smoothin_pred

plt.plot(x, y1, marker='.', label='Smoothing = No')
plt.plot(x, y2, marker='.', label='Smoothing = Yes')
plt.xlabel('Folds')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

Analysis to find which is better

In [22]:
betterNormal = 0
betterSmoothing = 0

for i in range(len(non_smoothin_pred)):
    if non_smoothin_pred[i] > smoothin_pred[i]:
        betterNormal += 1
    else:
        betterSmoothing +=1

if(betterNormal > betterSmoothing):
    print("For the given dev dataset, accuracy is better without smoothing")
else:
    print("For the given dev dataset, accuracy is better with smoothing")
For the given dev dataset, accuracy is better with smoothing

Top 10 review of each class

In [23]:
accurate_predictions = df_dev[(df_dev.sentiment == df_dev.predicted_sentiment)]

pos_preds = accurate_predictions[accurate_predictions.sentiment == 1].sort_values(by=['p_pos'], ascending=False)
top_ten_pos = pos_preds.explode('review').review.unique()[:10].tolist()

print("Top 10 words that predicts a positive review:")
for i, word in enumerate(top_ten_pos):
    print("{}. {}".format(i + 1, word))
print()

neg_preds = accurate_predictions[accurate_predictions.sentiment == 0].sort_values(by=['p_neg'], ascending=False)
top_ten_neg = neg_preds.explode('review').review.unique()[:10].tolist()

print("Top 10 words that predicts a negative review:")
for i, word in enumerate(top_ten_neg):
    print("{}. {}".format(i + 1, word))
Top 10 words that predicts a positive review:
1. this is an excellent film
2. it has northern humour and positive about the community it represents
3. conclusion  i loved it
4. i advise you to look out for it
5. i wont spoil it but the ending in pretty amazing
6. the warmth it generates is in contrast to its austere backdrop
7. it looked like a wonderful story
8. it was clear that she had the range and ability to pull off this part
9. go and see it
10. they could be used as exemplars for any set designer

Top 10 words that predicts a negative review:
1. the story line is totally predictable
2. the kids are annoying
3. the acting is beyond abysmal
4. this was a flick doomed from its conception
5. maybe there would be a reasonable explanation for this atrocity
6. almost everyone involved must be return to school acting is utterly predictable and bad script is pile of garbage all round
7. wow what a bad film
8. this is not moviemaking
9. so bad
10. in short  this was a monumental waste of time and energy and i would not recommend anyone to ever see this film

Calculating accuracy on test dataset using optimal hyperparameter

In [24]:
acc = classify(df_train, df_test, smoothing=True)

print("Predicting setiment of review using Naive Bayes Classifer with smoothing")
print("Accuracy on test set: {}%".format(acc))
Predicting setiment of review using Naive Bayes Classifer with smoothing
Accuracy on test set: 61.96%