Multi-Class Text Classification using NLP :D¶
Soumil Nitin Shah¶
Bachelor in Electronic Engineering | Masters in Electrical Engineering | Master in Computer Engineering |
- Website : https://soumilshah.herokuapp.com
- Github: https://github.com/soumilshah1995
- Linkedin: https://www.linkedin.com/in/shah-soumil/
- Blog: https://soumilshah1995.blogspot.com/
- Youtube : https://www.youtube.com/channel/UC_eOodxvwS_H7x2uLQa-svw?view_as=subscriber
- Facebook Page : https://www.facebook.com/soumilshah1995/
- Email : shahsoumil519@gmail.com
- projects : https://soumilshah.herokuapp.com/project
Excellent experience of building scalable and high-performance Software Applications combining distinctive skill sets in Internet of Things (IoT), Machine Learning and Full Stack Web Development in Python.
# Import all dependencies inside a try/except so a missing package is
# reported (rather than crashing the notebook with a bare traceback).
try:
    import json
    import os
    import re
    import string

    import pandas as pd
    import seaborn as sns
    import spacy
    import swifter
    from textblob import TextBlob
    from tqdm import tqdm

    import nltk
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import FeatureUnion
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.preprocessing import LabelEncoder

    # Register tqdm with pandas so .progress_apply() shows progress bars.
    tqdm.pandas()
except Exception as e:
    print("Error : {} ".format(e))

import nltk

# One-time downloads of the NLTK resources used below:
# tokenizer models, stop-word list, WordNet (lemmatizer), POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Load the Kaggle "News Category" dataset (one JSON record per line).
df = pd.read_json("News_Category_Dataset_v2.json", lines=True)
# Heatmap of missing values: any null cell would appear as a bright stripe.
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
# Bar chart of how many articles fall into each target category.
df['category'].value_counts().plot( kind='bar', figsize=(15,10))
#df = df.head(6000)
# Quick exploration: column names, summary stats, null counts, sample rows,
# and the distinct class labels.
df.columns
df.describe()
df.isna().sum()
df.head(2)
df['category'].unique()
- Remove the punctuation from the text
- Convert to lowercase, assuming that punctuation and letter case do not change the meaning of words
- Lemmatize each word
# Used this snippets of code from
# https://github.com/ArmandDS/news_category/blob/master/News_Analysis_AO.ipynb
# English stop words from NLTK, used by the token filter below.
stop_words_ = set(stopwords.words('english'))
wn = WordNetLemmatizer()
# Extra corpus-specific stop words that add noise in this news dataset.
my_sw = ['make', 'amp', 'news', 'new', 'time', 'u', 's', 'photos', 'get', 'say']


def black_txt(token):
    """Return True if *token* should be KEPT.

    A token is kept when it is not an English stop word, not a punctuation
    character, longer than two characters, and not in the custom stop list.
    """
    return (token not in stop_words_
            and token not in list(string.punctuation)
            and len(token) > 2
            and token not in my_sw)
def clean_txt(text):
    """Normalize *text* for vectorization.

    Strips apostrophes, replaces digits/non-word characters with spaces,
    lowercases, tokenizes, lemmatizes each kept token as a verb, and drops
    stop words.  Returns the cleaned tokens joined into a single string.
    """
    # Remove apostrophes first so contractions collapse ("don't" -> "dont").
    text = re.sub(r"'", "", text)
    # Collapse runs of digits and non-word characters into a single space.
    text = re.sub(r"(\d|\W)+", " ", text)
    # Tokenize the lowercased text, filter, then lemmatize as verbs.
    lemmas = [wn.lemmatize(word, pos="v")
              for word in word_tokenize(text.lower()) if black_txt(word)]
    # Filter a second time: lemmatization can map a token onto a stop word.
    kept = [word for word in lemmas if black_txt(word)]
    return " ".join(kept)
def subj_txt(text):
    """Return the TextBlob subjectivity score of *text* (0.0 = objective,
    1.0 = subjective); sentiment is a (polarity, subjectivity) pair."""
    return TextBlob(text).sentiment[1]
def polarity_txt(text):
    """Return the TextBlob polarity score of *text* (-1.0 = negative,
    +1.0 = positive); sentiment is a (polarity, subjectivity) pair."""
    return TextBlob(text).sentiment[0]
def len_text(text):
    """Return the ratio of unique cleaned tokens to raw token count.

    Returns 0 for empty or whitespace-only input to avoid division by zero.
    """
    words = text.split()  # hoisted: the original split the text twice
    if not words:
        return 0
    return len(set(clean_txt(text).split())) / len(words)
# Combine headline and short description into one text field, then derive
# features: cleaned text, sentiment polarity/subjectivity, and text length.
# swifter parallelizes the pandas .apply calls where possible.
df['text'] = df['headline'] + " " + df['short_description']
df['text'] = df['text'].swifter.apply(clean_txt)
# NOTE: polarity/subjectivity are computed on the already-cleaned text.
df['polarity'] = df['text'].swifter.apply(polarity_txt)
df['subjectivity'] = df['text'].swifter.apply(subj_txt)
df['len'] = df['text'].swifter.apply(lambda x: len(x))
Label Encoding¶
# Feature matrix and target vector.
X = df[['text', 'polarity', 'subjectivity','len']]
y =df['category']
# Encode string category labels as integers for the classifier.
encoder = LabelEncoder()
y = encoder.fit_transform(y)
# Stratified split preserves the class distribution in train and test sets.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# Reverse lookup table: encoded label int -> original category name.
v = dict(zip(list(y), df['category'].to_list()))
Trying Model¶
# Bag-of-words -> TF-IDF -> multinomial Naive Bayes, trained on the cleaned
# text column.  (The stray "... " REPL continuation prompts from the original
# paste are removed so this block is valid Python.)
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer="word", stop_words="english")),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB(alpha=.01)),
])
text_clf.fit(x_train['text'].to_list(), list(y_train))
Testing Model¶
import numpy as np

# Run the fitted pipeline on the held-out test documents.
X_TEST = x_test['text'].to_list()
Y_TEST = list(y_test)
predicted = text_clf.predict(X_TEST)

# Preview the first two test documents with their predicted category names
# (v maps the encoded label int back to the category string).
c = 0
for doc, category in zip(X_TEST, predicted):
    if c == 2:
        break
    print("-" * 55)
    print(doc)
    print(v[category])
    print("-" * 55)
    c = c + 1
Accuracy¶
# Test-set accuracy: fraction of predictions equal to the true labels.
np.mean(predicted == Y_TEST)
Prediction¶
# Classify a new, unseen headline and map the predicted label back to
# its human-readable category name.
docs_new = ['Ten Months After George Floyd’s Death, Minneapolis Residents Are at War Over Policing']
predicted = text_clf.predict(docs_new)
v[predicted[0]]
Saving the Model¶
import pickle

# Serialize the fitted pipeline to disk.
# NOTE(security): pickle is unsafe on untrusted data — only ever load
# model.pkl files you produced yourself.
with open('model.pkl', 'wb') as f:
    pickle.dump(text_clf, f)

# Reload the model and verify it predicts the same as the in-memory one.
with open('model.pkl', 'rb') as f:
    clf2 = pickle.load(f)

docs_new = ['Ten Months After George Floyd’s Death, Minneapolis Residents Are at War Over Policing']
predicted = clf2.predict(docs_new)
v[predicted[0]]
Very Nice Model
ReplyDeletePythonist: Simple Machine Learning Model To Predict New Category >>>>> Download Now
Delete>>>>> Download Full
Pythonist: Simple Machine Learning Model To Predict New Category >>>>> Download LINK
>>>>> Download Now
Pythonist: Simple Machine Learning Model To Predict New Category >>>>> Download Full
>>>>> Download LINK 4A
Hey Soumil Could You Please Upload a series of videos on ML for beginners, I'll like it if you will make it.
ReplyDeletePythonist: Simple Machine Learning Model To Predict New Category >>>>> Download Now
ReplyDelete>>>>> Download Full
Pythonist: Simple Machine Learning Model To Predict New Category >>>>> Download LINK
>>>>> Download Now
Pythonist: Simple Machine Learning Model To Predict New Category >>>>> Download Full
>>>>> Download LINK