Implemented all (most) of the NLTK supported languages from the config

2021-07-26 21:49:03 +02:00 · 2021-07-26 21:49:03 +02:00 · b0cb9b7875
commit b0cb9b7875
parent 1e7f1529d2
3 changed files with 44 additions and 2 deletions
--- a/config/languages.json
+++ b/config/languages.json
@ -0,0 +1,19 @@
 {
  "fr-fr": "french",
  "en-en": "english",
  "en-us": "english",
  "de-de": "german",
  "tr-tr": "turkish",
  "cs-cz": "czech",
  "el-gr": "greek",
  "da-dk": "danish",
  "et-ee": "estonian",
  "pt-pt": "portuguese",
  "es-es": "spanish",
  "nl-nl": "dutch",
  "fi-fi": "finnish",
  "it-it": "italian",
  "pl-pl": "polish",
  "sl-si": "slovene",
  "sv-se": "swedish"
 }
--- a/ia/nltk_utils.py
+++ b/ia/nltk_utils.py
@ -2,16 +2,24 @@ import nltk
 import numpy as np
 from nltk.stem.porter import PorterStemmer
 from utils import config_utils, languages_utils
 # Module-level Porter stemmer, instantiated once when this module is imported.
 # NOTE(review): the Porter algorithm targets English; confirm it is still the
 # right choice now that the tokenizer language comes from the config.
 stemmer = PorterStemmer()
 # TODO: consider replacing nltk with spaCy, or the other way around (use only one of them)
 def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.

    A token can be a word, a punctuation character, or a number. The tokenizer
    language is resolved from the "LANGUAGE" config entry through
    languages_utils.get_language_name and passed to nltk.word_tokenize.

    :param sentence: the text to tokenize
    :return: the result of nltk.word_tokenize for the sentence
    """
    # Languages targeted for NLTK tokenization:
    # English, Danish, Estonian, French, Greek, Norwegian, Portuguese, Spanish, Turkish,
    # Czech, Dutch, Finnish, German, Italian, Polish, Slovene, and Swedish
    # Resolve the language once per call: the lookup re-reads the JSON config file,
    # and no debug output is written to stdout.
    # get_language_name yields None for a code missing from languages.json;
    # that value is passed through to nltk unchanged.
    language = languages_utils.get_language_name(config_utils.get_in_config("LANGUAGE"))
    return nltk.word_tokenize(sentence, language=language)
 def stem(word):
--- a/utils/languages_utils.py
+++ b/utils/languages_utils.py
@ -0,0 +1,15 @@
 import json
 import os
 import get_path_file
 path = os.path.dirname(get_path_file.__file__)
 def get_language_name(name):
    """
    Return the language name mapped to a language code, for example "french"
    for "fr-fr" or "english" for "en-en" (the mapping is saved in
    languages.json in the config folder).

    :param name: language code used as a key in languages.json
    :return: the mapped language name, or None when the code is not in the mapping
    """
    config_file_path = os.path.join(path, "config", "languages.json")
    # Use a context manager so the file handle is closed as soon as the JSON is parsed.
    with open(config_file_path, encoding='utf-8', mode='r') as config_file:
        config_json = json.load(config_file)
    # dict.get returns None for unknown codes, making the "not found" result explicit.
    return config_json.get(name)