# How to remove stop words from text using Python
In [11]:
try:
import string
from nltk.corpus import stopwords
import nltk
except Exception as e:
print(e)
class PreProcessText:
    """Turn raw text into a list of word tokens.

    Processing is two steps: ASCII punctuation characters (those in
    ``string.punctuation``) are deleted, then the remaining text is split on
    whitespace and every word found in the stop-word list is dropped.
    Non-ASCII punctuation such as curly quotes or em dashes is not part of
    ``string.punctuation`` and is therefore left in place.

    Args:
        language: Name of the NLTK stop-word list to load when
            ``stop_words`` is not given. Defaults to ``"english"``.
        stop_words: Optional iterable of words to filter out instead of the
            NLTK list. Matching is case-insensitive.
    """

    # Translation table that deletes every ASCII punctuation character in a
    # single pass over the text.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self, language="english", stop_words=None):
        self._language = language
        # None means "load from NLTK on first use", so merely constructing
        # the helper never touches the corpus.
        if stop_words is None:
            self._stop_words = None
        else:
            self._stop_words = frozenset(word.lower() for word in stop_words)

    def __remove_punctuation(self, text):
        """Return ``text`` with all ASCII punctuation characters removed.

        Characters are deleted, not replaced by spaces, so ``"1,000"``
        becomes ``"1000"``.
        """
        return text.translate(self._PUNCTUATION_TABLE)

    def __stop_word_set(self):
        """Return the stop words as a frozenset, loading them on first use."""
        if self._stop_words is None:
            # Load once and cache: asking NLTK for the list on every token
            # repeats the same work and forces a linear scan per lookup.
            self._stop_words = frozenset(stopwords.words(self._language))
        return self._stop_words

    def __remove_stopwords(self, text):
        """Split ``text`` on whitespace and drop stop words.

        The comparison is done on the lower-cased word; the words that are
        kept retain their original casing.

        Returns:
            A list of the remaining words, in their original order.
        """
        stop_words = self.__stop_word_set()
        return [word for word in text.split() if word.lower() not in stop_words]

    def token_words(self, text=''):
        """Return the tokens of ``text`` with punctuation and stop words removed.

        Args:
            text: The string to tokenize. Defaults to the empty string.

        Returns:
            A list of words; empty when ``text`` contains no words.
        """
        message = self.__remove_punctuation(text)
        return self.__remove_stopwords(message)
def main():
    """Download the NLTK stop-word corpus and print the tokens of a sample text.

    Prints a status line for the download, then runs ``PreProcessText`` over
    a short news excerpt and prints the resulting list of words.
    """
    import nltk

    # The result is tested for truthiness rather than compared against
    # False or the string "False".
    downloaded = nltk.download("stopwords")
    if not downloaded:
        print("Failed to Download Stop Words")
    else:
        print("Downloaded Stop words ...... ")
    # NOTE(review): execution continues even when the download failed;
    # presumably a previously installed copy of the corpus may still be
    # usable. Confirm this best-effort behavior is intended.
    news1 = """
Louisiana reported 27 new deaths statewide on Monday, but none in Orleans Parish, the first time the Big Easy reported no new deaths from the virus since March 22.
Orleans Parish and the neighboring Jefferson Parish, the hardest hit areas of Louisiana, counted a combined 68 new cases Monday, with two new deaths reported in Jefferson.
However, it wasn’t to last—Orleans Parish reported four new deaths on Tuesday, with 9 in Jefferson Parish, though health officials have pointed out that daily figures can be skewed by late reporting.
The state’s coronavirus peak—for now—appears to have been in early April, when the state was seeing upwards of 1,000 new cases per day.
"""
    helper = PreProcessText()
    words = helper.token_words(text=news1)
    print(words)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
In [ ]: