Saturday, May 23, 2020

GenSim Word2vec Visualization helper class in python | plotting made easy for genism word2vec NLP

Untitled

An Open Source GenSIm Word2Vec Plotting Libarary in python

Step 1:

laod the model
In [15]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline



model = Word2Vec.load("word2vec.model")

Step 2:

Create a instance of GensimWord2vec class

In [32]:
helper = GensimWord2vecPlotter(model=model)
C:\Users\s.shah\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:44: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

Step 3:

Scatter plot

In [33]:
helper.plot_catter()
C:\Users\s.shah\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:70: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

Step 4:

plot with Words ]

you change the ize attribute depending on how many words you want in plot
In [34]:
helper.plot_scatter_words(Size=30)
No handles with labels found to put in legend.

Step 5:

Get the pandas Dataframe

In [35]:
helper.getPandasDF().head(5)
Out[35]:
words x y
0 strategy 2.565056 18.415142
1 strategic partnerships 3.836778 -5.085860
2 start ups 13.493507 0.638033
3 business strategy -12.380363 -3.359112
4 strategic planning 4.011899 13.737834

Class

This is actual Python class that does all the hard work for you

In [29]:
from sklearn.decomposition import PCA
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec


class GensimWord2vecPlotter(object):

    __slots__ = ["model", "words", "result", "_tem"]

    def __init__(self, model):
        self.model = model
        self.words = None
        self.result = None
        self._tem = self.preprocess()

    def plot_scatter_words(self, Size=80):
        SIZE = Size
        words = self.words
        result = self.result

        for i, word in enumerate(words):
            if i == SIZE:
                break
            
            plt.annotate(word,
                         xy=(result[i, 0], result[i, 1]),
                         horizontalalignment='left',verticalalignment='bottom')

        plt.scatter(result[:SIZE, 0], result[:SIZE, 1],s=(40,))
        plt.title("Skills of Candidates")
        plt.grid(True, alpha=1)
        plt.legend()
        plt.show()


    def preprocess(self):
        """
        Pre Processing sets all the variable in constructor
        :return: None
        """
        model = self.model
        X = model[model.wv.vocab]
        pca = PCA(n_components=2)
        result = pca.fit_transform(X)
        words = list(model.wv.vocab)
        self.words = words
        self.result = result

    def getPandasDF(self):
        result = self.result
        words = self.words
        
        x = [result[i, 0]  for i, word in enumerate(words)  ]
        y = [result[i, 1]  for i, word in enumerate(words)  ]

        wordsdf = pd.DataFrame(data={
            "words":words,
            "x":x,
            "y":y
        })
        
        return wordsdf


    def plot_catter(self):

        model = self.model
        X = model[model.wv.vocab]
        result = self.result
        words = self.words

        x = [result[i, 0]  for i, word in enumerate(words)  ]
        y = [result[i, 1]  for i, word in enumerate(words)  ]

        wordsdf = pd.DataFrame(data={
            "words":words,
            "x":x,
            "y":y
        })
        wordsdf.plot.scatter("x", "y", s=10, figsize=(20, 12))

Learn How to configure your Spark Session to Join Managed (S3 Table Buckets) and Unmanaged Iceberg Tables | Hands on Labs

test-tble-bucket-joins Learn How to configure your Spark Session to Join Managed (S...