Saturday, May 2, 2020

Let's Build a Simple NLP Model That Predicts Similar Words from a Dataset You Provide


Step 1:

Get the Text

In [25]:
import bs4 as bs          # BeautifulSoup, for parsing the page HTML
import urllib.request     # for fetching the Wikipedia article
import nltk               # used later for stop-word removal
In [38]:
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Machine_learning')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text
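
A quick sanity check (not in the original notebook) is to confirm the scrape actually returned text:

print(len(article_text))     # total characters scraped
print(article_text[:200])    # the opening of the article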

Step 2:

Remove Stop Words

In [27]:
try:
    import string
    from nltk.corpus import stopwords
    import nltk
except Exception as e:
    print(e)


class PreProcessText(object):

    def __init__(self):
        pass

    def __remove_punctuation(self, text):
        """
        Takes a string and returns it with all punctuation stripped out.
        """
        message = []
        for x in text:
            if x not in string.punctuation:
                message.append(x)
        message = ''.join(message)

        return message

    def __remove_stopwords(self, text):
        """
        Takes a string and returns a list of its words with
        English stop words removed.
        """
        # Build the stop-word set once instead of on every loop iteration.
        stop_words = set(stopwords.words('english'))
        words = []
        for x in text.split():
            if x.lower() not in stop_words:
                words.append(x)
        return words

    def token_words(self, text=''):
        """
        Takes a string and returns the list of tokens (words)
        used to train the model.
        """
        message = self.__remove_punctuation(text)
        words = self.__remove_stopwords(message)
        return words
In [28]:
import nltk
# nltk.download returns True on success (or if already present), else False.
flag = nltk.download("stopwords")

if not flag:
    print("Failed to Download Stop Words")
else:
    print("Downloaded Stop words ...... ")
    helper = PreProcessText()
    words = helper.token_words(text=article_text)
    
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloaded Stop words ...... 
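
Before training, it helps to peek at the token list (a quick check, not part of the original run):

print(len(words))    # number of tokens kept after preprocessing
print(words[:10])    # the first few tokens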

Step 3:

Train Model

In [15]:
from gensim.models import Word2Vec
In [30]:
# Train Word2Vec on the token list: 100-dimensional vectors, a context
# window of 5 words, keep every word (min_count=1), and 4 worker threads.
model = Word2Vec([words], size=100, window=5, min_count=1, workers=4)
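
Note that Word2Vec expects a list of sentences, each a list of tokens; the call above feeds it the whole article as one long "sentence". A minimal sketch of sentence-level training instead, assuming nltk's 'punkt' tokenizer is available (the name sentence_model is mine):

nltk.download('punkt')     # sentence-boundary models

# Split the article into sentences, then preprocess each one separately.
sentences = [helper.token_words(text=s) for s in nltk.sent_tokenize(article_text)]
sentence_model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)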
In [31]:
vocabulary = model.wv.vocab
In [39]:
sim_words = model.wv.most_similar('machine')
In [40]:
sim_words
Out[40]:
[('categorisation', 0.3522621989250183),
 ('favor18', 0.30931341648101807),
 ('provide', 0.3048184812068939),
 ('n', 0.3024228811264038),
 ('would', 0.29657861590385437),
 ('supervised', 0.28355103731155396),
 ('species', 0.2742482125759125),
 ('collectively', 0.27363914251327515),
 ('transformative', 0.27254971861839294),
 ('advanced', 0.27198636531829834)]
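
You can also score a specific pair directly (a quick sketch; it assumes both words ended up in the vocabulary, which they should for this article):

model.wv.similarity('machine', 'learning')   # cosine similarity of the two word vectors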
In [35]:
# Collect the embedding matrix: one 100-dimensional row per vocabulary word.
# (Indexing the model directly is deprecated; go through model.wv instead.)
X = model.wv[model.wv.vocab]
