Authors
- Soumil Nitin Shah
Soumil Nitin Shah
Bachelor's in Electronic Engineering | Master's in Electrical Engineering | Master's in Computer Engineering
- Website : https://soumilshah.herokuapp.com
- Github: https://github.com/soumilshah1995
- Linkedin: https://www.linkedin.com/in/shah-soumil/
- Blog: https://soumilshah1995.blogspot.com/
- Youtube : https://www.youtube.com/channel/UC_eOodxvwS_H7x2uLQa-svw?view_as=subscriber
- Facebook Page : https://www.facebook.com/soumilshah1995/
- Email : shahsoumil519@gmail.com
- projects : https://soumilshah.herokuapp.com/project
Extensive experience building scalable, high-performance software applications, combining a distinctive skill set in Internet of Things (IoT), Machine Learning, and Full Stack Web Development in Python.
Define Imports¶
try:
    import json
    import os
    import string
    
    import pandas as  pd
    import numpy as np
    
    import re
    import swifter
    # Import various Models
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier
    from sklearn.svm import LinearSVC
    # Import pre processing libraries
    from sklearn.preprocessing import LabelEncoder
    # Import  Test Modules
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.metrics import classification_report
    
    import seaborn as sns
    import matplotlib as mpl
    
    import torch
    import transformers as ppb # pytorch transformers
    
    import matplotlib.pyplot as plt
except Exception as e:
    print("Error : {} ".format(e))
# List the working directory so the spreadsheet's presence can be checked.
os.listdir()
# Load the "Industry Skills Needs" sheet into a DataFrame.
df = pd.read_excel(
    "public_use-industry-skills-needs.xlsx",
    sheet_name='Industry Skills Needs',
)
df.shape
df.head(2)
# Features are the skill names; targets are the categories they belong to.
X = df["skill_group_name"]
Y = df["skill_group_category"]
# Bar chart of how many rows fall into each skill category.
Y.value_counts().plot(kind='bar', figsize=(15, 10))
plt.title("Skill category Count ")
# Number of distinct categories, followed by the categories themselves.
Y.nunique()
list(Y.unique())
Pre Processing¶
# Encode the category names as integer class ids for the classifiers.
encoder = LabelEncoder()
Y = encoder.fit_transform(Y)
# Seed the shuffle so the split, and every score computed from it, is
# reproducible between runs.
# NOTE(review): if the same skill_group_name appears in several rows, identical
# texts can land in both the train and the test split and inflate the scores --
# confirm against the data and consider de-duplicating before splitting.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=40)
# Lookup from class id back to category name. LabelEncoder assigns each class
# its index in ``classes_``, so enumerating that array yields the mapping
# directly instead of zipping over every row of the DataFrame.
v = dict(enumerate(encoder.classes_))
v
BERT for tokenization¶
class BertTokenizer(object):
    """Turn raw texts into DistilBERT feature vectors.

    Despite the name, this class does more than tokenize: ``get`` encodes every
    text with the DistilBERT tokenizer, runs the padded ids through the
    pretrained DistilBERT model and returns, for each text, the hidden state at
    position 0 of the sequence as a NumPy row.
    """

    def __init__(self, text=None):
        """Store the texts and load the pretrained tokenizer and model.

        Args:
            text: Iterable of strings (list, pandas Series, ...) or a single
                string. Defaults to no texts.
        """
        # ``None`` replaces the former mutable ``[]`` default, which was one
        # list object shared by every instance created without ``text``.
        self.text = [] if text is None else text
        # For DistilBERT:
        self.model_class, self.tokenizer_class, self.pretrained_weights = (
            ppb.DistilBertModel,
            ppb.DistilBertTokenizer,
            'distilbert-base-uncased',
        )
        # Load pretrained model/tokenizer
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        self.model = self.model_class.from_pretrained(self.pretrained_weights)

    def get(self, batch_size=None):
        """Return one feature vector per stored text.

        Args:
            batch_size: Optional number of texts sent through the model per
                forward pass. ``None`` (the default) keeps the previous
                behaviour of a single pass over all texts; a smaller value
                bounds peak memory on large inputs.

        Returns:
            numpy.ndarray with one row per text, taken from position 0 of the
            model's first output.

        Raises:
            ValueError: If there are no texts to encode, or ``batch_size`` is
                given but smaller than 1.
        """
        # A bare string would otherwise be iterated character by character.
        texts = [self.text] if isinstance(self.text, str) else list(self.text)
        if not texts:
            raise ValueError("BertTokenizer.get() needs at least one text to encode")
        if batch_size is not None and batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        tokenized = [self.tokenizer.encode(t, add_special_tokens=True) for t in texts]
        max_len = max(len(ids) for ids in tokenized)

        # Right-pad every sequence with id 0 up to the longest one. The mask is
        # built from the true lengths rather than from ``padded != 0``, so it
        # does not depend on which ids the tokenizer happens to emit.
        padded = np.zeros((len(tokenized), max_len), dtype=np.int64)
        mask = np.zeros_like(padded)
        for row, ids in enumerate(tokenized):
            padded[row, :len(ids)] = ids
            mask[row, :len(ids)] = 1

        input_ids = torch.tensor(padded)
        attention_mask = torch.tensor(mask)

        step = len(texts) if batch_size is None else batch_size
        chunks = []
        # Inference only: no gradients are needed.
        with torch.no_grad():
            for start in range(0, len(texts), step):
                stop = start + step
                last_hidden_states = self.model(
                    input_ids[start:stop],
                    attention_mask=attention_mask[start:stop],
                )
                # Keep only the vector at sequence position 0 for each text.
                chunks.append(last_hidden_states[0][:, 0, :].numpy())
        features = np.concatenate(chunks, axis=0)
        return features
# Embed the training texts, then fit a linear SVM on the resulting features.
_instance = BertTokenizer(text=x_train)
tokens = _instance.get()
clf_sv = LinearSVC(
    C=1,
    class_weight='balanced',
    multi_class='ovr',
    random_state=40,
    max_iter=400,
)
clf_sv.fit(tokens, y_train)
Test how good our model is¶
# Embed the held-out texts the same way the training texts were embedded.
_instance = BertTokenizer(text=x_test)
tokensTest = _instance.get()
# Fraction of held-out samples the LinearSVC labels correctly.
predicted = clf_sv.predict(tokensTest)
(predicted == y_test).mean()
Almost 100% accuracy
Model 2:¶
from sklearn.neighbors import KNeighborsClassifier
errorrate = []
for i in range(1,60, 10):
    print(i)
    newmodel = KNeighborsClassifier(n_neighbors = i)
    newmodel.fit(tokens, y_train)
    pred = newmodel.predict(tokensTest)
    errorrate.append(np.mean(pred != y_test))
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(10,6))
plt.plot(range(1, 60,10), errorrate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()
A suitable K value for KNN would be around 2-8
# Refit KNN with the chosen K and score it on the held-out set.
newmodel = KNeighborsClassifier(n_neighbors = 4)
newmodel.fit(tokens, y_train)
pred = newmodel.predict(tokensTest)
# Score the KNN predictions (`pred`). The previous code compared `predicted`,
# the earlier LinearSVC output, so it re-reported the SVM accuracy instead.
np.mean(pred == y_test)
y_test
Great¶
Test on unseen Data¶
# Classify one skill name that is supplied by hand.
SkillName = "Python"
# Embed the single text; the result has one row.
_instance = BertTokenizer(text=[SkillName])
tokens_ = _instance.get()
tokens_.shape
# Predict the class id with the KNN model, then translate it to a category name.
pred = newmodel.predict(tokens_)
list(pred)
v[pred[0]]
Wooohh !!!!¶
 
 
No comments:
Post a Comment