Implemented most of the NLTK-supported languages, selected through the config
This commit is contained in:
parent
1e7f1529d2
commit
b0cb9b7875
19
config/languages.json
Normal file
19
config/languages.json
Normal file
@ -0,0 +1,19 @@
|
||||
{
|
||||
"fr-fr": "french",
|
||||
"en-en": "english",
|
||||
"en-us": "english",
|
||||
"de-de": "german",
|
||||
"tr-tr": "turkish",
|
||||
"cs-cz": "czech",
|
||||
"el-gr": "greek",
|
||||
"da-dk": "danish",
|
||||
"et-ee": "estonian",
|
||||
"pt-pt": "portuguese",
|
||||
"es-es": "spanish",
|
||||
"nl-nl": "dutch",
|
||||
"fi-fi": "finnish",
|
||||
"it-it": "italian",
|
||||
"pl-pl": "polish",
|
||||
"sl-si": "slovene",
|
||||
"sv-se": "swedish"
|
||||
}
|
@ -2,16 +2,24 @@ import nltk
|
||||
import numpy as np
|
||||
from nltk.stem.porter import PorterStemmer
|
||||
|
||||
from utils import config_utils, languages_utils
|
||||
|
||||
# Module-level Porter stemmer instance, created once when this module is imported.
stemmer = PorterStemmer()
|
||||
|
||||
|
||||
# TODO: consider standardizing on a single NLP library (either nltk or spaCy) instead of using both
|
||||
|
||||
def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.

    A token can be a word, a punctuation character, or a number.

    The tokenizer language is no longer hard-coded to French: the "LANGUAGE"
    entry of the config (a code such as "fr-fr") is translated to an NLTK
    language name (such as "french") by languages_utils.get_language_name.
    """
    # Languages listed as supported by the NLTK tokenizer:
    # English, Danish, Estonian, French, Greek, Norwegian, Portuguese, Spanish, Turkish,
    # Czech, Dutch, Finnish, German, Italian, Polish, Slovene, and Swedish

    # Resolve the language once per call (the lookup reads the config and the
    # languages.json mapping), instead of once for a debug print and once more
    # for the tokenizer.
    # NOTE(review): get_language_name appears to return None for a code that is
    # missing from languages.json - confirm how nltk.word_tokenize reacts to that.
    language = languages_utils.get_language_name(config_utils.get_in_config("LANGUAGE"))
    return nltk.word_tokenize(sentence, language=language)
|
||||
|
||||
|
||||
def stem(word):
|
||||
|
15
utils/languages_utils.py
Normal file
15
utils/languages_utils.py
Normal file
@ -0,0 +1,15 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
import get_path_file
|
||||
|
||||
# Directory that contains the get_path_file module; get_language_name() builds the
# location of config/languages.json relative to this directory.
path = os.path.dirname(get_path_file.__file__)
|
||||
|
||||
|
||||
def get_language_name(name):
    """
    Return the language name for a language code, for example "french" for
    "fr-fr" or "english" for "en-en".

    The mapping is read from config/languages.json, located under the
    module-level ``path`` directory.

    name: the language code to look up (a key of languages.json).
    Returns the mapped language name, or None when the code is not in the file.
    """
    languages_file = os.path.join(path, "config", "languages.json")
    # Use a context manager so the file handle is closed as soon as the JSON is
    # parsed, rather than being left open until garbage collection.
    with open(languages_file, mode='r', encoding='utf-8') as config_file:
        config_json = json.load(config_file)
    # dict.get already yields None for an unknown code, so no separate
    # membership test is needed.
    return config_json.get(name)
|
Reference in New Issue
Block a user