Added NLP utils (for example, to remove stopwords)

This commit is contained in:
Mathieu B 2021-07-27 17:00:06 +02:00
parent 17acd5e05d
commit 3bf6737f45
3 changed files with 22 additions and 0 deletions

View File

View File

@ -14,6 +14,8 @@ def get_language():
def get_language_full_name(name=None): def get_language_full_name(name=None):
""" """
Return for example french for fr-fr, english for en-en, etc (saved in languages.json in the config folder) Return for example french for fr-fr, english for en-en, etc (saved in languages.json in the config folder)
Return english if the language isn't found in the languages.json file
""" """
config_json = json.load(open(path + "/config/languages.json", encoding='utf-8', mode='r')) config_json = json.load(open(path + "/config/languages.json", encoding='utf-8', mode='r'))
@ -22,3 +24,5 @@ def get_language_full_name(name=None):
if name in config_json: if name in config_json:
return config_json.get(name) return config_json.get(name)
return 'english'

18
utils/nlp_utils.py Normal file
View File

@ -0,0 +1,18 @@
from nltk.corpus import stopwords
from utils.languages_utils import get_language_full_name
def get_text_without_stopwords(sentence, language='english'):
    """
    Return the sentence lowercased, with the stopwords of the given language removed

    :param sentence: text to filter (lowercased, then split on whitespace)
    :param language: name passed to the NLTK stopwords corpus (e.g. 'english'); a value
        containing a dash (e.g. 'en-us') is first resolved with get_language_full_name
    :return: the remaining words joined by single spaces
    """
    # a dash means a locale-style code (en-us): resolve it to the full language name
    corpus_name = get_language_full_name(language) if '-' in language else language

    ignored_words = set(stopwords.words(corpus_name))

    kept_words = []
    for word in sentence.lower().split():
        if word not in ignored_words:
            kept_words.append(word)

    return " ".join(kept_words)
if __name__ == '__main__':
    # quick manual check: print the demo sentence filtered with the default language
    demo_sentence = "Hey give me some info about Elon Musk please"
    print(get_text_without_stopwords(demo_sentence))