Saturday, May 2, 2020

Lets Build a simple NLP Model that Predict Similar Words from Dataset you provide

Untitled1

Step 1:

Get the Text

In [25]:
import bs4 as bs
import urllib.request
import re
import nltk
In [38]:
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Machine_learning')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

Step 2:

Reomve Stop Words

In [27]:
try:
    import string
    from nltk.corpus import stopwords
    import nltk
except Exception as e:
    print(e)


class PreProcessText(object):

    def __init__(self):
        pass

    def __remove_punctuation(self, text):
        """
        Takes a String
        return : Return a String
        """
        message = []
        for x in text:
            if x in string.punctuation:
                pass
            else:
                message.append(x)
        message = ''.join(message)

        return message

    def __remove_stopwords(self, text):
        """
        Takes a String
        return List
        """
        words= []
        for x in text.split():

            if x.lower() in stopwords.words('english'):
                pass
            else:
                words.append(x)
        return words


    def token_words(self,text=''):
        """
        Takes String
        Return Token also called  list of words that is used to
        Train the Model
        """
        message = self.__remove_punctuation(text)
        words = self.__remove_stopwords(message)
        return words
In [28]:
import nltk
flag = nltk.download("stopwords")

if (flag == "False" or flag == False):
    print("Failed to Download Stop Words")
else:
    print("Downloaded Stop words ...... ")
    helper = PreProcessText()
    #words = helper.token_words(text=txt)
    words = helper.token_words(text=article_text)
    
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloaded Stop words ...... 

Step3:

Train Model

In [15]:
from gensim.models import Word2Vec
In [30]:
#model = Word2Vec([words], min_count=1)
model = Word2Vec([words], size=100, window=5, min_count=1, workers=4)
In [31]:
vocabulary = model.wv.vocab
In [39]:
sim_words = model.wv.most_similar('machine')
In [40]:
sim_words
Out[40]:
[('categorisation', 0.3522621989250183),
 ('favor18', 0.30931341648101807),
 ('provide', 0.3048184812068939),
 ('n', 0.3024228811264038),
 ('would', 0.29657861590385437),
 ('supervised', 0.28355103731155396),
 ('species', 0.2742482125759125),
 ('collectively', 0.27363914251327515),
 ('transformative', 0.27254971861839294),
 ('advanced', 0.27198636531829834)]
In [35]:
X = model[model.wv.vocab]
C:\Users\s.shah\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  """Entry point for launching an IPython kernel.

No comments:

Post a Comment

Learn How to Connect to the Glue Data Catalog using AWS Glue Iceberg REST endpoint

gluecat Learn How to Connect to the Glue Data Catalog using AWS Glue Iceberg REST e...