# Reconstructed from patch b0cb9b7 (Mathieu B, 2021-07-26):
# "Implemented all (most) of the NLTK supported languages from the config".
# The patch touches config/languages.json, ia/nltk_utils.py and
# utils/languages_utils.py; the two Python definitions it adds or changes are
# listed below, each under the file it belongs to.
#
# config/languages.json (new file) maps locale codes to the language names
# that are handed to NLTK:
#   fr-fr french      en-en english     en-us english     de-de german
#   tr-tr turkish     cs-cz czech       el-gr greek       da-dk danish
#   et-ee estonian    pt-pt portuguese  es-es spanish     nl-nl dutch
#   fi-fi finnish     it-it italian     pl-pl polish      sl-si slovene
#   sv-se swedish

import json
import os


# --- ia/nltk_utils.py -------------------------------------------------------

# TODO : have a look to replace nltk by spacy or the other way (use only one of them)


def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.

    A token can be a word, a punctuation character, or a number. The
    tokenizer language is taken from the "LANGUAGE" entry of the project
    config and translated to an NLTK language name through
    config/languages.json.

    :param sentence: text to tokenize
    :return: whatever nltk.word_tokenize returns for the sentence
    :raises LookupError: if the configured language code has no entry in
        config/languages.json
    """
    # Imported at call time so this listing can be imported without the
    # project packages; in ia/nltk_utils.py these are top-of-file imports.
    import nltk
    from utils import config_utils, languages_utils

    # Languages listed by the patch: English, Danish, Estonian, French, Greek,
    # Norwegian, Portuguese, Spanish, Turkish, Czech, Dutch, Finnish, German,
    # Italian, Polish, Slovene, and Swedish.
    # NOTE(review): Norwegian is listed here but has no entry in
    # config/languages.json in this patch.

    # Resolve the language once per call (the original resolved it twice, the
    # first time only to feed a leftover debug print).
    language_code = config_utils.get_in_config("LANGUAGE")
    language = languages_utils.get_language_name(language_code)
    if language is None:
        # Fail here with a readable message rather than handing None to NLTK.
        raise LookupError(
            f"No NLTK language configured for LANGUAGE={language_code!r}; "
            "add it to config/languages.json"
        )
    return nltk.word_tokenize(sentence, language=language)


# --- utils/languages_utils.py ------------------------------------------------


def get_language_name(name, config_file=None):
    """
    Return the language name mapped to a locale code, for example "french"
    for "fr-fr" or "english" for "en-en".

    The mapping is read from a JSON file on every call.

    :param name: locale code to look up (a key of the JSON mapping)
    :param config_file: path of the JSON mapping to read; defaults to
        config/languages.json under the directory that contains the
        get_path_file module
    :return: the mapped language name, or None when the code is not in the
        mapping
    """
    if config_file is None:
        # Project-local module, imported only when the default location is
        # actually needed.
        import get_path_file

        config_file = os.path.join(
            os.path.dirname(get_path_file.__file__), "config", "languages.json"
        )

    # The context manager closes the file even if json.load raises.
    with open(config_file, encoding="utf-8") as config:
        languages = json.load(config)

    # dict.get already returns None for a missing key.
    return languages.get(name)