Implemented most of the NLTK-supported languages, selected from the config

2021-07-26 21:49:03 +02:00 · 2021-07-26 21:49:03 +02:00 · b0cb9b7875
commit b0cb9b7875
parent 1e7f1529d2
3 changed files with 44 additions and 2 deletions
--- a/config/languages.json
+++ b/config/languages.json
@ -0,0 +1,19 @@
+{
+  "fr-fr": "french",
+  "en-en": "english",
+  "en-us": "english",
+  "de-de": "german",
+  "tr-tr": "turkish",
+  "cs-cz": "czech",
+  "el-gr": "greek",
+  "da-dk": "danish",
+  "et-ee": "estonian",
+  "pt-pt": "portuguese",
+  "es-es": "spanish",
+  "nl-nl": "dutch",
+  "fi-fi": "finnish",
+  "it-it": "italian",
+  "pl-pl": "polish",
+  "sl-si": "slovene",
+  "sv-se": "swedish"
+}
--- a/ia/nltk_utils.py
+++ b/ia/nltk_utils.py
@ -2,16 +2,24 @@ import nltk
 import numpy as np
 from nltk.stem.porter import PorterStemmer

+from utils import config_utils, languages_utils
+
 stemmer = PorterStemmer()


+# TODO: consider replacing nltk with spacy, or the other way around (use only one of them)
+
 def tokenize(sentence):
     """
     split sentence into array of words/tokens
     a token can be a word or punctuation character, or number
     """
-    # TODO: implement english and others languages
-    return nltk.word_tokenize(sentence, language="french")
+    # Languages NLTK can tokenize: English, Danish, Estonian, French, Greek, Norwegian, Portuguese,
+    # Spanish, Turkish, Czech, Dutch, Finnish, German, Italian, Polish, Slovene, and Swedish.
+
+    # Resolve the configured "LANGUAGE" code to its NLTK language name once per call.
+    language = languages_utils.get_language_name(config_utils.get_in_config("LANGUAGE"))
+    return nltk.word_tokenize(sentence, language=language)


 def stem(word):
--- a/utils/languages_utils.py
+++ b/utils/languages_utils.py
@ -0,0 +1,15 @@
+import json
+import os
+
+import get_path_file
+
+path = os.path.dirname(get_path_file.__file__)
+
+
+def get_language_name(name):
+    """
+    Return the language name for a locale code, e.g. "french" for "fr-fr" or "english" for "en-en".
+    The mapping is read from languages.json in the config folder; an unknown code returns None.
+    """
+    with open(path + "/config/languages.json", encoding='utf-8', mode='r') as config_file:
+        return json.load(config_file).get(name)