Thursday, April 30, 2020

How to remove Stop Words from Text using Python

TextPreProcessing

How to remove Stop Words from Text using Python

In [11]:
try:
    import string
    from nltk.corpus import stopwords
    import nltk
except Exception as e:
    print(e)


class PreProcessText(object):
    """
    Turn raw text into a list of word tokens by stripping ASCII
    punctuation and then dropping stop words.

    Only the characters in ``string.punctuation`` are removed, so
    non-ASCII marks such as curly quotes or em dashes stay in the tokens.
    """

    # Lazily-filled cache of the stop words; None means "not loaded yet".
    # Kept as a class-level default so instances created without running
    # __init__ still work.
    _stop_words = None

    def __init__(self, stop_words=None):
        """
        stop_words : optional iterable of words to drop. Tokens are
                     lower-cased before the lookup, so entries should be
                     lower-case. When omitted, NLTK's English stop-word
                     list is loaded the first time it is needed.
        """
        if stop_words is not None:
            self._stop_words = frozenset(stop_words)

    def __remove_punctuation(self, text):
        """
        Takes a String
        return : Return a String with every character listed in
                 string.punctuation removed
        """
        # One pass over the text instead of a character-by-character loop.
        return text.translate(str.maketrans('', '', string.punctuation))

    def __remove_stopwords(self, text):
        """
        Takes a String
        return List of the whitespace-separated words that are not stop
               words (original casing is kept; the lookup is
               case-insensitive)
        """
        if self._stop_words is None:
            # Load the corpus once per instance. The previous code called
            # stopwords.words('english') again for every single token and
            # searched the resulting list linearly.
            self._stop_words = frozenset(stopwords.words('english'))

        stop_words = self._stop_words
        return [word for word in text.split()
                if word.lower() not in stop_words]

    def token_words(self, text=''):
        """
        Takes String
        Return Token also called list of words that is used to
        Train the Model: punctuation is removed first, then stop words.
        """
        message = self.__remove_punctuation(text)
        return self.__remove_stopwords(message)


def main():
    """
    Demo entry point: fetch the NLTK "stopwords" package, then print the
    tokens that PreProcessText produces for a sample news paragraph.
    Prints a failure message and stops if the download reports failure.
    """
    import nltk

    downloaded = nltk.download("stopwords")

    # Guard clause: bail out early when the download did not succeed.
    if downloaded in ("False", False):
        print("Failed to Download Stop Words")
        return

    print("Downloaded Stop words ...... ")
    news1 = """
        Louisiana reported 27 new deaths statewide on Monday, but none in Orleans Parish, the first time the Big Easy reported no new deaths from the virus since March 22. 
        
        Orleans Parish and the neighboring Jefferson Parish, the hardest hit areas of Louisiana, counted a combined 68 new cases Monday, with two new deaths reported in Jefferson.
        
        However, it wasn’t to last—Orleans Parish reported four new deaths on Tuesday, with 9 in Jefferson Parish, though health officials have pointed out that daily figures can be skewed by late reporting.
        
        The state’s coronavirus peak—for now—appears to have been in early April, when the state was seeing upwards of 1,000 new cases per day.
        """
    tokens = PreProcessText().token_words(text=news1)
    print(tokens)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Downloaded Stop words ...... 
['Louisiana', 'reported', '27', 'new', 'deaths', 'statewide', 'Monday', 'none', 'Orleans', 'Parish', 'first', 'time', 'Big', 'Easy', 'reported', 'new', 'deaths', 'virus', 'since', 'March', '22', 'Orleans', 'Parish', 'neighboring', 'Jefferson', 'Parish', 'hardest', 'hit', 'areas', 'Louisiana', 'counted', 'combined', '68', 'new', 'cases', 'Monday', 'two', 'new', 'deaths', 'reported', 'Jefferson', 'However', 'wasn’t', 'last—Orleans', 'Parish', 'reported', 'four', 'new', 'deaths', 'Tuesday', '9', 'Jefferson', 'Parish', 'though', 'health', 'officials', 'pointed', 'daily', 'figures', 'skewed', 'late', 'reporting', 'state’s', 'coronavirus', 'peak—for', 'now—appears', 'early', 'April', 'state', 'seeing', 'upwards', '1000', 'new', 'cases', 'per', 'day']
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [ ]:
 

1 comment:

  1. Lucky Club Online Casino Site | Online Casino UK
    Our online casino website is a one-stop-shop for all your favourite games! · Lucky Club Casino: Live Games, Slots, Table Games, Poker and all your favourite 카지노사이트luckclub

    ReplyDelete

How to Use Publish-Audit-Merge Workflow in Apache Iceberg: A Beginner’s Guide

publish How to Use Publish-Audit-Merge Workflow in Apache Iceberg: A Beginner’s Guide ¶ In [24]: from ...