Multi-Class Text Classification using NLP :D

Soumil Nitin Shah¶

Bachelor in Electronic Engineering | Masters in Electrical Engineering | Master in Computer Engineering |

Website : https://soumilshah.herokuapp.com
Github: https://github.com/soumilshah1995
Linkedin: https://www.linkedin.com/in/shah-soumil/
Blog: https://soumilshah1995.blogspot.com/
Youtube : https://www.youtube.com/channel/UC_eOodxvwS_H7x2uLQa-svw?view_as=subscriber
Facebook Page : https://www.facebook.com/soumilshah1995/
Email : shahsoumil519@gmail.com
projects : https://soumilshah.herokuapp.com/project

Extensive experience building scalable, high-performance software applications, combining distinctive skill sets in Internet of Things (IoT), Machine Learning, and Full Stack Web Development in Python.

try:
    import json
    import os
    
    import pandas as  pd
    import spacy
    
    import seaborn as sns
    import string

    from tqdm import tqdm
    from textblob import TextBlob
    
    from nltk.corpus import stopwords
    import nltk
    from nltk.stem import WordNetLemmatizer
    from nltk import word_tokenize
    import re
    
    
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline
    
    
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import FeatureUnion
    from sklearn.feature_extraction import DictVectorizer
    
    import swifter
    
    tqdm.pandas()
except Exception as e:
    print("Error : {} ".format(e))

# Fetch the NLTK resources this file relies on. A call only reports
# "already up-to-date" when the package is present locally.
import nltk
nltk.download('punkt')  # tokenizer models needed by word_tokenize
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download('wordnet')  # lexical database behind WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger; not referenced by the code visible in this file

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!

True

A¶

Data Exploration

Dataset: https://www.kaggle.com/rmisra/news-category-dataset

# Load the dataset; lines=True tells pandas the file holds one JSON object per line.
df = pd.read_json("News_Category_Dataset_v2.json", lines=True)

# Plot the boolean missing-value mask (rows x columns) to spot null cells at a glance.
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')

<matplotlib.axes._subplots.AxesSubplot at 0x2f7e7ed3f70>

# Bar chart of the number of rows per category, to show how balanced the classes are.
df['category'].value_counts().plot( kind='bar', figsize=(15,10))

<matplotlib.axes._subplots.AxesSubplot at 0x2f7ecaede80>

#df = df.head(6000)

# List the column names of the loaded frame.
df.columns

Index(['category', 'headline', 'authors', 'link', 'short_description', 'date'], dtype='object')

# Per-column summary statistics of the frame.
df.describe()

# Count the missing values in each column.
df.isna().sum()

category             0
headline             0
authors              0
link                 0
short_description    0
date                 0
dtype: int64

# Peek at the first two rows.
df.head(2)

# Distinct category labels; these are the target classes for the classifier.
df['category'].unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST', 'FIFTY', 'ARTS',
       'WELLNESS', 'PARENTING', 'HOME & LIVING', 'STYLE & BEAUTY',
       'DIVORCE', 'WEDDINGS', 'FOOD & DRINK', 'MONEY', 'ENVIRONMENT',
       'CULTURE & ARTS'], dtype=object)

B¶

Preprocessing

Remove punctuation and digits from the text.
Convert the text to lowercase, because we assume that punctuation and letter case don't influence the meaning of words.
Lemmatize the remaining words and drop stop words and very short tokens.

# The snippets below are adapted from
# https://github.com/ArmandDS/news_category/blob/master/News_Analysis_AO.ipynb

# Lemmatizer shared by the text-cleaning helpers.
wn = WordNetLemmatizer()

# NLTK's English stop words, held in a set for fast membership tests.
stop_words_ = set(stopwords.words('english'))

# Extra words filtered out in addition to the NLTK stop words.
my_sw = [
    'make',
    'amp',
    'news',
    'new',
    'time',
    'u',
    's',
    'photos',
    'get',
    'say',
]

def black_txt(token):
    """Return True when ``token`` should be kept as a feature word.

    A token is kept only if it is longer than two characters and is neither
    an NLTK English stop word (``stop_words_``) nor one of the extra stop
    words in ``my_sw``.

    Args:
        token: A single token string.

    Returns:
        bool: True to keep the token, False to drop it.
    """
    # No explicit punctuation test is needed: every entry of
    # ``string.punctuation`` is a single character, so the length test
    # already rejects all of them. Building ``list(string.punctuation)`` for
    # each token was pure overhead. The cheap length test runs first.
    return len(token) > 2 and token not in stop_words_ and token not in my_sw
  
def clean_txt(text):
    """Normalize raw text into a space-separated string of lemmatized words.

    Processing steps, in order:
      1. Delete straight apostrophes so contractions stay in one piece.
      2. Replace every run of digits / non-word characters with one space.
      3. Lower-case and tokenize the result.
      4. Keep only tokens accepted by ``black_txt`` and lemmatize them as verbs.
      5. Filter the lemmas with ``black_txt`` again, because lemmatizing can
         shorten a word into something the filter rejects.

    Args:
        text: The raw document as a string.

    Returns:
        str: The surviving lemmas joined by single spaces (possibly empty).
    """
    text = re.sub(r"'", "", text)
    text = re.sub(r"(\d|\W)+", " ", text)
    lemmas = (
        wn.lemmatize(word, pos="v")
        for word in word_tokenize(text.lower())
        if black_txt(word)
    )
    return " ".join(lemma for lemma in lemmas if black_txt(lemma))

def subj_txt(text):
    """Return the TextBlob subjectivity score of ``text``."""
    sentiment = TextBlob(text).sentiment
    # Second field of the (polarity, subjectivity) sentiment tuple.
    return sentiment.subjectivity

def polarity_txt(text):
    """Return the TextBlob polarity score of ``text``."""
    sentiment = TextBlob(text).sentiment
    # First field of the (polarity, subjectivity) sentiment tuple.
    return sentiment.polarity

def len_text(text):
    """Return the ratio of distinct cleaned words to raw whitespace tokens.

    The numerator counts the unique words left after ``clean_txt``; the
    denominator counts the whitespace-separated tokens of the raw ``text``.
    Text with no tokens yields 0.
    """
    raw_tokens = text.split()
    if not raw_tokens:
        return 0
    distinct_clean_words = set(clean_txt(text).split())
    return len(distinct_clean_words) / len(raw_tokens)

# One document per article: headline and short description joined by a space,
# then normalized with clean_txt.
raw_text = df['headline'] + " " + df['short_description']
df['text'] = raw_text.swifter.apply(clean_txt)

# Derived columns, all computed from the *cleaned* text: sentiment polarity,
# sentiment subjectivity, and character length.
for column, feature in (
    ('polarity', polarity_txt),
    ('subjectivity', subj_txt),
    ('len', len),
):
    df[column] = df['text'].swifter.apply(feature)

Label Encoding

# Feature frame and target column.
X = df[['text', 'polarity', 'subjectivity', 'len']]
y = df['category']

# Encode the category names as integer class codes.
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Hold out 20% of the rows; stratify keeps the class proportions the same in
# both splits.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Lookup from integer class code back to the category name. LabelEncoder
# assigns code i to encoder.classes_[i], so the mapping can be read straight
# off the encoder instead of zipping every row's code with its label.
v = dict(enumerate(encoder.classes_))

Trying Model¶

# Bag-of-words -> TF-IDF -> multinomial Naive Bayes text classifier.
# The pasted "..." REPL continuation prompts were removed: IPython strips
# them, but plain Python parses them as Ellipsis and the statement fails.
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer="word", stop_words="english")),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB(alpha=.01)),
])

# Train the pipeline on the cleaned training documents and their class codes.
train_docs = x_train['text'].to_list()
train_labels = list(y_train)
text_clf.fit(train_docs, train_labels)

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB(alpha=0.01))])

Testing Model¶

import numpy as np

# Held-out documents and their true class codes.
X_TEST = x_test['text'].to_list()
Y_TEST = list(y_test)

predicted = text_clf.predict(X_TEST)

# Show the first two test documents with their predicted category names.
# Slicing replaces the manual counter and `break`.
for doc, category in zip(X_TEST[:2], predicted[:2]):
    print("-"*55)
    print(doc)
    print(v[category])
    print("-"*55)

-------------------------------------------------------
twiggy model leather collection prove hasnt lose edge twiggy check model piece collection available shop thursday
STYLE & BEAUTY
-------------------------------------------------------
-------------------------------------------------------
cities rally around paris deal reminder global problems local solutions cla europe lead germany france also step fray planet great french president emmanuel
THE WORLDPOST
-------------------------------------------------------

Accuracy¶

# Accuracy: the fraction of test documents whose predicted code equals the true code.
(predicted == np.asarray(Y_TEST)).mean()

0.553035772074382

Prediction¶

docs_new = ['Ten Months After George Floyd’s Death, Minneapolis Residents Are at War Over Policing']

# The pipeline was fitted on text normalized by clean_txt, so new documents
# must go through the same normalization before prediction. Feeding raw
# headlines gives the vectorizer tokens it was never trained on.
predicted = text_clf.predict([clean_txt(doc) for doc in docs_new])

v[predicted[0]]

'POLITICS'

Saving the Model¶

import pickle

# Only the fitted pipeline is persisted; the clean_txt preprocessing and the
# code -> category lookup `v` are not part of the pickle.
MODEL_PATH = 'model.pkl'

with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

# Load the pipeline back. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(MODEL_PATH, 'rb') as model_file:
    clf2 = pickle.load(model_file)

docs_new = ['Ten Months After George Floyd’s Death, Minneapolis Residents Are at War Over Policing']
# Apply the same clean_txt normalization the model was trained on before
# asking the reloaded pipeline for a prediction.
predicted = clf2.predict([clean_txt(doc) for doc in docs_new])

v[predicted[0]]

'POLITICS'

	category	headline	authors	link	short_description	date
count	200853	200853	200853	200853	200853	200853
unique	41	199344	27993	200812	178353	2309
top	POLITICS	Sunday Roundup		https://www.huffingtonpost.comhttp://testkitch...		2013-01-17 00:00:00
freq	32739	90	36620	2	19712	100
first	NaN	NaN	NaN	NaN	NaN	2012-01-28 00:00:00
last	NaN	NaN	NaN	NaN	NaN	2018-05-26 00:00:00

	category	headline	authors	link	short_description	date
0	CRIME	There Were 2 Mass Shootings In Texas Last Week...	Melissa Jeltsen	https://www.huffingtonpost.com/entry/texas-ama...	She left her husband. He killed their children...	2018-05-26
1	ENTERTAINMENT	Will Smith Joins Diplo And Nicky Jam For The 2...	Andy McDonald	https://www.huffingtonpost.com/entry/will-smit...	Of course it has a song.	2018-05-26

Pythonist

Sunday, April 4, 2021

Simple Machine Learning Model to Predict News Category | Multi-Class Classification