Using BERT with Scikit-Learn for Text Classification
Soumil Nitin Shah¶
Bachelor's in Electronic Engineering | Master's in Electrical Engineering | Master's in Computer Engineering
- Website : https://soumilshah.herokuapp.com
- Github: https://github.com/soumilshah1995
- Linkedin: https://www.linkedin.com/in/shah-soumil/
- Blog: https://soumilshah1995.blogspot.com/
- Youtube : https://www.youtube.com/channel/UC_eOodxvwS_H7x2uLQa-svw?view_as=subscriber
- Facebook Page : https://www.facebook.com/soumilshah1995/
- Email : shahsoumil519@gmail.com
- projects : https://soumilshah.herokuapp.com/project
Extensive experience building scalable, high-performance software applications, combining skills in the Internet of Things (IoT), machine learning, and full-stack web development in Python.
# Dependencies are imported unguarded on purpose: every later cell needs
# them, so a missing package should fail here with a clear ImportError
# instead of being swallowed and resurfacing later as a NameError.
import warnings

import numpy as np
import pandas as pd
import swifter  # imported for its side effect: registers the `.swifter` pandas accessor
import torch
import tqdm
import transformers as ppb  # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

# `pandas()` is a classmethod of the `tqdm.tqdm` class, not a function of the
# `tqdm` module; calling `tqdm.pandas()` on the module raises AttributeError
# (which previously was silently swallowed, skipping the line below as well).
tqdm.tqdm.pandas()
warnings.filterwarnings('ignore')
Reading Dataset¶
# Load the tab-separated SST2 training file. It has no header row, so the
# columns are addressed by position: column 0 is used as the input text and
# column 1 as the label.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    delimiter='\t',
    header=None,
)
df = df.dropna(how='all')  # drop rows in which every column is missing
df.head(2)

X = df[0]
Y = df[1]

# Map the raw label values onto consecutive integer class ids.
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)

# A fixed seed makes the 70/30 split -- and therefore the accuracy reported
# at the end -- reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
Preprocessing
class BertTokenizer:
    """Convert a collection of sentences into DistilBERT feature vectors.

    The pretrained ``distilbert-base-uncased`` tokenizer and model are loaded
    when the object is constructed.  ``get()`` tokenizes every sentence in
    ``self.text``, runs the model without gradient tracking, and returns the
    last-layer hidden state at the first token position of each sentence
    (the special token prepended by ``add_special_tokens=True``).
    """

    def __init__(self, text=None):
        """Store the sentences and load the pretrained tokenizer and model.

        Args:
            text: Iterable of sentences (e.g. a list or a pandas Series).
                Defaults to an empty list.
        """
        # Use a None sentinel: a literal ``[]`` default would be a single
        # list object shared by every instance created without `text`.
        self.text = [] if text is None else text
        # For DistilBERT:
        self.model_class, self.tokenizer_class, self.pretrained_weights = (
            ppb.DistilBertModel,
            ppb.DistilBertTokenizer,
            'distilbert-base-uncased',
        )
        # Load pretrained model/tokenizer
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        self.model = self.model_class.from_pretrained(self.pretrained_weights)

    def get(self, batch_size=64):
        """Return one feature vector per sentence in ``self.text``.

        Args:
            batch_size: Number of sentences sent through the model per
                forward pass.  Bounds peak memory; all sentences are still
                padded to the same (global) maximum length.

        Returns:
            A 2-D NumPy array with one row per sentence.  For empty input
            the array has shape ``(0, self.model.config.dim)``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        token_ids = [
            self.tokenizer.encode(sentence, add_special_tokens=True)
            for sentence in self.text
        ]
        if not token_ids:
            # Nothing to embed: return an empty matrix with the right width
            # instead of feeding an empty tensor to the model.
            return np.empty((0, self.model.config.dim), dtype=np.float32)

        max_len = max(len(ids) for ids in token_ids)

        # Right-pad every sequence with id 0 and build the attention mask
        # from the true sequence lengths, so the mask does not depend on
        # which token id happens to be used for padding.
        padded = np.zeros((len(token_ids), max_len), dtype=np.int64)
        attention_mask = np.zeros((len(token_ids), max_len), dtype=np.int64)
        for row, ids in enumerate(token_ids):
            padded[row, :len(ids)] = ids
            attention_mask[row, :len(ids)] = 1

        # Run the model in slices so memory use is bounded by `batch_size`
        # rather than by the size of the whole corpus.
        feature_chunks = []
        with torch.no_grad():
            for start in range(0, len(token_ids), batch_size):
                stop = start + batch_size
                last_hidden_states = self.model(
                    torch.tensor(padded[start:stop]),
                    attention_mask=torch.tensor(attention_mask[start:stop]),
                )
                # Element 0 is the last hidden state; keep only the vector
                # at the first token position of every sentence.
                feature_chunks.append(last_hidden_states[0][:, 0, :].numpy())

        return np.concatenate(feature_chunks, axis=0)
# Build the feature extractor over the training sentences and compute one
# feature vector per sentence; `tokens` is the training matrix for the
# classifier.
_instance = BertTokenizer(text=x_train)
tokens = _instance.get()
Model¶
# Fit a logistic-regression classifier on the extracted training features.
# `fit` returns the estimator itself, so construction and training can be
# written as one expression.
lr_clf = LogisticRegression().fit(tokens, y_train)
Test¶
# Reuse the extractor that is already loaded instead of constructing a second
# BertTokenizer, which would load the pretrained tokenizer and model again.
_instance.text = x_test
tokensTest = _instance.get()

predicted = lr_clf.predict(tokensTest)

# Fraction of held-out sentences whose predicted label matches the true one.
# Bind and print it so the result is visible when run as a plain script too,
# not only as the last expression of a notebook cell.
accuracy = np.mean(predicted == y_test)
print(accuracy)
Pythonist: Using Bert With Scikit Learn To Do Text Classification¶ >>>>> Download Now
ReplyDelete>>>>> Download Full
Pythonist: Using Bert With Scikit Learn To Do Text Classification¶ >>>>> Download LINK
>>>>> Download Now
Pythonist: Using Bert With Scikit Learn To Do Text Classification¶ >>>>> Download Full
>>>>> Download LINK