Added auto spacy model based on language
This commit is contained in:
parent
ab86509e61
commit
d9cf5e56de
19
jarvis/config/spacy.json
Normal file
19
jarvis/config/spacy.json
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
{
|
||||||
|
"fr-fr": "fr_core_news_sm",
|
||||||
|
"en-en": "en_core_web_sm",
|
||||||
|
"en-us": "en_core_web_sm",
|
||||||
|
"de-de": "de_core_news_sm",
|
||||||
|
"tr-tr": "xx_ent_wiki_sm",
|
||||||
|
"cs-cz": "xx_ent_wiki_sm",
|
||||||
|
"el-gr": "xx_ent_wiki_sm",
|
||||||
|
"da-dk": "da_core_news_sm",
|
||||||
|
"et-ee": "xx_ent_wiki_sm",
|
||||||
|
"pt-pt": "pt_core_news_sm",
|
||||||
|
"es-es": "es_core_news_sm",
|
||||||
|
"nl-nl": "nl_core_news_sm",
|
||||||
|
"fi-fi": "xx_ent_wiki_sm",
|
||||||
|
"it-it": "it_core_news_sm",
|
||||||
|
"pl-pl": "pl_core_news_sm",
|
||||||
|
"sl-si": "xx_ent_wiki_sm",
|
||||||
|
"sv-se": "xx_ent_wiki_sm"
|
||||||
|
}
|
@ -26,3 +26,15 @@ def get_language_full_name(name=None):
|
|||||||
return config_json.get(name)
|
return config_json.get(name)
|
||||||
|
|
||||||
return 'english'
|
return 'english'
|
||||||
|
|
||||||
|
|
||||||
|
def get_spacy_model(language=None):
    """Return the name of the spaCy model configured for a language.

    The language-to-model mapping is read from ``config/spacy.json`` located
    under the module-level ``path``.

    :param language: key to look up in the mapping (e.g. ``"fr-fr"``); when
        ``None``, the value returned by ``get_language()`` is used instead.
    :return: the model name mapped to ``language``, or ``'xx_ent_wiki_sm'``
        when the mapping has no entry for it.
    """
    # Use a context manager so the config file handle is always closed,
    # instead of leaving it to the garbage collector.
    with open(path + "/config/spacy.json", encoding='utf-8', mode='r') as config_file:
        spacy_model = json.load(config_file)

    if language is None:
        language = get_language()

    # multi-language model (for unsupported languages)
    return spacy_model.get(language, 'xx_ent_wiki_sm')
|
||||||
|
@ -1,13 +1,10 @@
|
|||||||
import spacy
|
import spacy
|
||||||
from nltk.corpus import stopwords
|
|
||||||
|
from jarvis.utils import languages_utils
|
||||||
|
|
||||||
|
|
||||||
def get_spacy_nlp():
|
def get_spacy_nlp():
|
||||||
"""
|
nlp = spacy.load(languages_utils.get_spacy_model())
|
||||||
|
|
||||||
:return: spacy
|
|
||||||
"""
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@ -18,10 +15,3 @@ def get_text_without_stopwords(sentence):
|
|||||||
filtered_sentence = [w for w in sentence.lower().split() if w not in stop_words]
|
filtered_sentence = [w for w in sentence.lower().split() if w not in stop_words]
|
||||||
filtered_sentence = " ".join(filtered_sentence)
|
filtered_sentence = " ".join(filtered_sentence)
|
||||||
return filtered_sentence
|
return filtered_sentence
|
||||||
|
|
||||||
|
|
||||||
def get_text_without_stopwords_nltk(sentence, language='english'):
|
|
||||||
stop_words = set(stopwords.words(language))
|
|
||||||
filtered_sentence = [w for w in sentence.lower().split() if w not in stop_words]
|
|
||||||
filtered_sentence = " ".join(filtered_sentence)
|
|
||||||
return filtered_sentence
|
|
||||||
|
@ -4,4 +4,5 @@ nltk~=3.6.2
|
|||||||
torch~=1.9.0
|
torch~=1.9.0
|
||||||
numpy~=1.21.1
|
numpy~=1.21.1
|
||||||
requests~=2.26.0
|
requests~=2.26.0
|
||||||
adapt-parser
|
adapt-parser
|
||||||
|
spacy~=3.1.1
|
Reference in New Issue
Block a user