Implemented most of the NLTK-supported languages from the config

This commit is contained in:
Mathieu B 2021-07-26 21:49:03 +02:00
parent 1e7f1529d2
commit b0cb9b7875
3 changed files with 44 additions and 2 deletions

19
config/languages.json Normal file
View File

@ -0,0 +1,19 @@
{
"fr-fr": "french",
"en-en": "english",
"en-us": "english",
"de-de": "german",
"tr-tr": "turkish",
"cs-cz": "czech",
"el-gr": "greek",
"da-dk": "danish",
"et-ee": "estonian",
"pt-pt": "portuguese",
"es-es": "spanish",
"nl-nl": "dutch",
"fi-fi": "finnish",
"it-it": "italian",
"pl-pl": "polish",
"sl-si": "slovene",
"sv-se": "swedish"
}

View File

@ -2,16 +2,24 @@ import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer
from utils import config_utils, languages_utils
# Module-level PorterStemmer instance, created once when this module is imported.
stemmer = PorterStemmer()
# TODO: consider replacing nltk with spaCy, or the other way around (use only one of them)
def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.

    A token can be a word, a punctuation character, or a number.

    The tokenizer language is not hard-coded: the "LANGUAGE" entry of the
    config (a code such as "fr-fr") is translated to a language name (such as
    "french") by languages_utils.get_language_name and passed to NLTK.

    :param sentence: the text to split into tokens
    :return: the tokens produced by nltk.word_tokenize for that language
    """
    # Languages expected to be available for tokenization:
    # English, Danish, Estonian, French, Greek, Norwegian, Portuguese, Spanish, Turkish,
    # Czech, Dutch, Finnish, German, Italian, Polish, Slovene, and Swedish
    # Resolve the language once per call (no debug output, no duplicate lookup).
    language = languages_utils.get_language_name(config_utils.get_in_config("LANGUAGE"))
    return nltk.word_tokenize(sentence, language=language)
def stem(word):

15
utils/languages_utils.py Normal file
View File

@ -0,0 +1,15 @@
import json
import os
import get_path_file
# Directory that contains the get_path_file module; used as the base directory
# when building the path to config/languages.json.
path = os.path.dirname(get_path_file.__file__)
def get_language_name(name):
    """
    Return the language name matching a language code.

    For example "french" for "fr-fr", "english" for "en-en", etc. The mapping
    is saved in languages.json in the config folder.

    :param name: the language code to look up (e.g. "fr-fr")
    :return: the mapped language name, or None when the code is not present
        in languages.json
    """
    # The context manager guarantees the file handle is closed once the JSON
    # has been parsed, even if parsing raises.
    with open(path + "/config/languages.json", encoding='utf-8', mode='r') as languages_file:
        config_json = json.load(languages_file)

    # dict.get yields None for an unknown code, matching the previous
    # implicit fall-through behavior.
    return config_json.get(name)