Thursday, April 30, 2020

How to remove Stop Words from Text using Python

TextPreProcessing

How to remove Stop Words from Text using Python

In [11]:
try:
    import string
    from nltk.corpus import stopwords
    import nltk
except Exception as e:
    print(e)


class PreProcessText(object):
    """
    Turns raw text into a list of word tokens by stripping punctuation
    and dropping English stop words.
    """

    def __init__(self):
        pass

    def __remove_punctuation(self, text):
        """
        Takes a String
        return : Return a String with every character listed in
                 string.punctuation removed.

        Only the ASCII punctuation in string.punctuation is stripped;
        typographic characters such as curly quotes or em dashes are
        left in place.
        """
        # One translate pass replaces the per-character loop and join.
        return text.translate(str.maketrans('', '', string.punctuation))

    def __remove_stopwords(self, text):
        """
        Takes a String
        return List of the whitespace-separated words of the string whose
               lower-cased form is not an English stop word. The original
               casing of the kept words is preserved.
        """
        # Fetch the stop-word list once per call instead of once per token,
        # and keep it in a set so each membership test is a hash lookup
        # rather than a scan of the whole list.
        stop_words = frozenset(stopwords.words('english'))
        return [word for word in text.split()
                if word.lower() not in stop_words]

    def token_words(self, text=''):
        """
        Takes String
        Return Token also called list of words that is used to
        Train the Model

        Punctuation is removed first, then stop words are filtered out of
        the remaining whitespace-separated words.
        """
        message = self.__remove_punctuation(text)
        return self.__remove_stopwords(message)


def main():
    """
    Demo entry point: downloads the NLTK stop-word corpus, then tokenizes
    a sample news paragraph with PreProcessText and prints the tokens.

    Prints a failure message and returns without tokenizing when the
    download does not succeed.
    """
    import nltk

    # nltk.download reports success as a truthy value; test it directly
    # rather than comparing against the string "False" or using == False.
    downloaded = nltk.download("stopwords")
    if not downloaded:
        print("Failed to Download Stop Words")
        return

    print("Downloaded Stop words ...... ")
    news1 = """
        Louisiana reported 27 new deaths statewide on Monday, but none in Orleans Parish, the first time the Big Easy reported no new deaths from the virus since March 22. 
        
        Orleans Parish and the neighboring Jefferson Parish, the hardest hit areas of Louisiana, counted a combined 68 new cases Monday, with two new deaths reported in Jefferson.
        
        However, it wasn’t to last—Orleans Parish reported four new deaths on Tuesday, with 9 in Jefferson Parish, though health officials have pointed out that daily figures can be skewed by late reporting.
        
        The state’s coronavirus peak—for now—appears to have been in early April, when the state was seeing upwards of 1,000 new cases per day.
        """
    helper = PreProcessText()
    words = helper.token_words(text=news1)
    print(words)


# Run the demo only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Downloaded Stop words ...... 
['Louisiana', 'reported', '27', 'new', 'deaths', 'statewide', 'Monday', 'none', 'Orleans', 'Parish', 'first', 'time', 'Big', 'Easy', 'reported', 'new', 'deaths', 'virus', 'since', 'March', '22', 'Orleans', 'Parish', 'neighboring', 'Jefferson', 'Parish', 'hardest', 'hit', 'areas', 'Louisiana', 'counted', 'combined', '68', 'new', 'cases', 'Monday', 'two', 'new', 'deaths', 'reported', 'Jefferson', 'However', 'wasn’t', 'last—Orleans', 'Parish', 'reported', 'four', 'new', 'deaths', 'Tuesday', '9', 'Jefferson', 'Parish', 'though', 'health', 'officials', 'pointed', 'daily', 'figures', 'skewed', 'late', 'reporting', 'state’s', 'coronavirus', 'peak—for', 'now—appears', 'early', 'April', 'state', 'seeing', 'upwards', '1000', 'new', 'cases', 'per', 'day']
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s.shah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [ ]:
 

Learn How to configure your Spark Session to Join Managed (S3 Table Buckets) and Unmanaged Iceberg Tables | Hands on Labs

test-tble-bucket-joins Learn How to configure your Spark Session to Join Managed (S...