📝 Edit page
➕ Add page
Basic
"""
Tokenize a sentence and remove stopwords.
Based on
https://stackoverflow.com/questions/17390326/getting-rid-of-stop-words-and-document-tokenization-using-nltk
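See
http://www.nltk.org/data.html
for how to install the NLTK data this script needs (the stopwords corpus and
the Punkt tokenizer models).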
Required:
- nltk
"""
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
# Tokens to discard: NLTK's English stopword list plus every ASCII punctuation
# character (punctuation shows up as its own token after word_tokenize, as the
# expected output below demonstrates).
STOP_WORDS = stopwords.words('english') + list(string.punctuation)
# Set copy of STOP_WORDS so each membership test in the filter below is a hash
# lookup instead of a linear scan of the list. STOP_WORDS itself stays a list.
_STOP_WORD_SET = frozenset(STOP_WORDS)

message = "this is a foo bar, bar black sheep."
# Lowercase before tokenizing so tokens compare equal to the lowercase
# stopword entries.
tokens = word_tokenize(message.lower())
print([token for token in tokens if token not in _STOP_WORD_SET])
# => ['foo', 'bar', 'bar', 'black', 'sheep']