Saturday, May 2, 2020

Let's Build a Simple NLP Model That Predicts Similar Words from a Dataset You Provide


Step 1:

Get the Text

We fetch the Wikipedia article on machine learning and collect the text of every paragraph.

In [25]:
import bs4 as bs          # BeautifulSoup, for parsing the page HTML
import urllib.request     # for fetching the Wikipedia article
import nltk               # tokenization helpers and the stopword corpus
In [38]:
# Fetch the raw HTML of the Wikipedia article on machine learning
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Machine_learning')
article = scraped_data.read()

# Parse the HTML and keep only the paragraph tags
parsed_article = bs.BeautifulSoup(article, 'lxml')

paragraphs = parsed_article.find_all('p')

# Concatenate the text of every paragraph into one string
article_text = ""

for p in paragraphs:
    article_text += p.text
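A quick sanity check (not in the original notebook) confirms the scrape actually returned text:

In [ ]:
# Illustrative check: the article should be a long block of prose
print(len(article_text))     # number of characters scraped
print(article_text[:200])    # the opening of the article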

Step 2:

Remove Stop Words

Next we strip punctuation and filter out common English stopwords, since they carry little signal for the word-embedding model.

In [27]:
try:
    import string
    from nltk.corpus import stopwords
    import nltk
except Exception as e:
    print(e)


class PreProcessText(object):

    def __init__(self):
        pass

    def __remove_punctuation(self, text):
        """
        Takes a string.
        Returns the string with all punctuation removed.
        """
        message = [x for x in text if x not in string.punctuation]
        return ''.join(message)

    def __remove_stopwords(self, text):
        """
        Takes a string.
        Returns a list of words with English stopwords removed.
        """
        # Build the stopword set once rather than on every iteration
        stop_words = set(stopwords.words('english'))
        return [x for x in text.split() if x.lower() not in stop_words]

    def token_words(self, text=''):
        """
        Takes a string.
        Returns the list of tokens (words) that is used to
        train the model.
        """
        message = self.__remove_punctuation(text)
        words = self.__remove_stopwords(message)
        return words
In [28]:
import nltk
flag = nltk.download("stopwords")  # returns True on success, False on failure

if not flag:
    print("Failed to Download Stop Words")
else:
    print("Downloaded Stop words ...... ")
    helper = PreProcessText()
    words = helper.token_words(text=article_text)
    
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloaded Stop words ...... 
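As a quick illustration of what token_words gives back (a hypothetical example, assuming the standard English stopword list):

In [ ]:
# Punctuation is stripped first, then stopwords are filtered out
helper.token_words(text='The cat sat on the mat!')
# -> ['cat', 'sat', 'mat']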

Step 3:

Train the Model

We train a gensim Word2Vec model on the tokens: 100-dimensional vectors, a context window of 5 words, and min_count=1 so every word in the article makes it into the vocabulary.

In [15]:
from gensim.models import Word2Vec
In [30]:
# size = embedding dimensionality, window = context size, min_count=1 keeps every word
model = Word2Vec([words], size=100, window=5, min_count=1, workers=4)
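Note that we pass [words], so Word2Vec sees the entire article as one long sentence. A common refinement, sketched below rather than taken from the original post, is to tokenize sentence by sentence so that context windows don't span sentence boundaries:

In [ ]:
# Sketch: sentence-level training input (needs the nltk 'punkt' tokenizer data)
# alt_model is just an illustrative name
nltk.download('punkt')
sentences = [helper.token_words(text=s) for s in nltk.sent_tokenize(article_text)]
alt_model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)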
In [31]:
vocabulary = model.wv.vocab  # dict mapping each word to its vocabulary entry
In [39]:
sim_words = model.wv.most_similar('machine')
In [40]:
sim_words
Out[40]:
[('categorisation', 0.3522621989250183),
 ('favor18', 0.30931341648101807),
 ('provide', 0.3048184812068939),
 ('n', 0.3024228811264038),
 ('would', 0.29657861590385437),
 ('supervised', 0.28355103731155396),
 ('species', 0.2742482125759125),
 ('collectively', 0.27363914251327515),
 ('transformative', 0.27254971861839294),
 ('advanced', 0.27198636531829834)]
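Besides most_similar, the trained word vectors support a few other handy queries; for example (the words used here are illustrative and must exist in the vocabulary):

In [ ]:
# Cosine similarity between two in-vocabulary words
model.wv.similarity('machine', 'learning')
# Only the top 3 neighbours instead of the default 10
model.wv.most_similar('learning', topn=3)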
In [35]:
# Index the KeyedVectors directly; calling model[...] is deprecated in gensim
X = model.wv[model.wv.vocab]
