Implemented all (most) of the NLTK supported languages from the config
This commit is contained in:
parent
1e7f1529d2
commit
b0cb9b7875
19
config/languages.json
Normal file
19
config/languages.json
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
{
|
||||||
|
"fr-fr": "french",
|
||||||
|
"en-en": "english",
|
||||||
|
"en-us": "english",
|
||||||
|
"de-de": "german",
|
||||||
|
"tr-tr": "turkish",
|
||||||
|
"cs-cz": "czech",
|
||||||
|
"el-gr": "greek",
|
||||||
|
"da-dk": "danish",
|
||||||
|
"et-ee": "estonian",
|
||||||
|
"pt-pt": "portuguese",
|
||||||
|
"es-es": "spanish",
|
||||||
|
"nl-nl": "dutch",
|
||||||
|
"fi-fi": "finnish",
|
||||||
|
"it-it": "italian",
|
||||||
|
"pl-pl": "polish",
|
||||||
|
"sl-si": "slovene",
|
||||||
|
"sv-se": "swedish"
|
||||||
|
}
|
@ -2,16 +2,24 @@ import nltk
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from nltk.stem.porter import PorterStemmer
|
from nltk.stem.porter import PorterStemmer
|
||||||
|
|
||||||
|
from utils import config_utils, languages_utils
|
||||||
|
|
||||||
stemmer = PorterStemmer()
|
stemmer = PorterStemmer()
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: consider replacing nltk with spaCy, or the other way around, so that only one of the two libraries is used
|
||||||
|
|
||||||
def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.

    A token can be a word, a punctuation character, or a number.
    The tokenizer language is resolved from the "LANGUAGE" config entry through
    languages_utils.get_language_name.

    :param sentence: the text to tokenize
    :return: the result of nltk.word_tokenize for the configured language
    """
    # Languages listed for the NLTK tokenizer:
    # English, Danish, Estonian, French, Greek, Norwegian, Portuguese, Spanish, Turkish,
    # Czech, Dutch, Finnish, German, Italian, Polish, Slovene, and Swedish

    # Resolve the language once; the previous version looked it up twice and
    # printed it to stdout on every call (leftover debug output).
    language = languages_utils.get_language_name(config_utils.get_in_config("LANGUAGE"))
    # NOTE(review): get_language_name yields None for codes missing from languages.json;
    # presumably nltk cannot tokenize with language=None - confirm the desired handling.
    return nltk.word_tokenize(sentence, language=language)
|
||||||
|
|
||||||
|
|
||||||
def stem(word):
|
def stem(word):
|
||||||
|
15
utils/languages_utils.py
Normal file
15
utils/languages_utils.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
import get_path_file
|
||||||
|
|
||||||
|
# Directory that contains the get_path_file module; get_language_name below
# uses it as the base directory for locating "config/languages.json".
path = os.path.dirname(get_path_file.__file__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_language_name(name):
    """
    Return the language name mapped to a language code in config/languages.json.

    For example "french" for "fr-fr", "english" for "en-en", etc.

    :param name: language code used as a key in languages.json (e.g. "fr-fr")
    :return: the mapped language name, or None when the code is not listed
    """
    config_file = os.path.join(path, "config", "languages.json")
    # Context manager so the file handle is closed even if parsing fails
    # (the handle was previously opened inline and never closed).
    with open(config_file, encoding='utf-8', mode='r') as json_file:
        config_json = json.load(json_file)
    # dict.get returns None for unknown codes, which matches the former implicit return.
    return config_json.get(name)
|
Reference in New Issue
Block a user