From 3bf6737f4595777d07c16e210ae56c6daecd5c2e Mon Sep 17 00:00:00 2001
From: Mathieu B
Date: Tue, 27 Jul 2021 17:00:06 +0200
Subject: [PATCH] Added NLP Utils to remove stopwords for example

---
 skills/test/wikipedia_search/__init__.py |  0
 utils/languages_utils.py                 |  4 ++++
 utils/nlp_utils.py                       | 18 ++++++++++++++++++
 3 files changed, 22 insertions(+)
 create mode 100644 skills/test/wikipedia_search/__init__.py
 create mode 100644 utils/nlp_utils.py

diff --git a/skills/test/wikipedia_search/__init__.py b/skills/test/wikipedia_search/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/languages_utils.py b/utils/languages_utils.py
index 531ff4f..e47ab48 100644
--- a/utils/languages_utils.py
+++ b/utils/languages_utils.py
@@ -14,6 +14,8 @@ def get_language():
 def get_language_full_name(name=None):
     """
     Return for exemple french for fr-fr, english for en-en, etc (savec in languages.json in the config folder)
+
+    Return english if the language isn't found in the languages.json file
     """
 
     config_json = json.load(open(path + "/config/languages.json", encoding='utf-8', mode='r'))
@@ -22,3 +24,5 @@ def get_language_full_name(name=None):
 
     if name in config_json:
         return config_json.get(name)
+
+    return 'english'
diff --git a/utils/nlp_utils.py b/utils/nlp_utils.py
new file mode 100644
index 0000000..8c244f8
--- /dev/null
+++ b/utils/nlp_utils.py
@@ -0,0 +1,18 @@
+from nltk.corpus import stopwords
+
+from utils.languages_utils import get_language_full_name
+
+
+def get_text_without_stopwords(sentence, language='english'):
+    # if the language given is something like en-us, get the full variant (english)
+    if '-' in language:
+        language = get_language_full_name(language)
+
+    stop_words = set(stopwords.words(language))
+    filtered_sentence = [w for w in sentence.lower().split() if w not in stop_words]
+    filtered_sentence = " ".join(filtered_sentence)
+    return filtered_sentence
+
+
+if __name__ == '__main__':
+    print(get_text_without_stopwords("Hey give me some info about Elon Musk please"))