import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

from jarvis.utils import languages_utils

stemmer = PorterStemmer()


# TODO: consider replacing nltk with spacy or the other way around (use only one of them)


def tokenize(sentence):
    """
    split a sentence into a list of words/tokens
    a token can be a word, a punctuation character or a number
    """
    # supported languages: English, Danish, Estonian, French, Greek, Norwegian,
    # Portuguese, Spanish, Turkish, Czech, Dutch, Finnish, German, Italian,
    # Polish, Slovene and Swedish
    return nltk.word_tokenize(sentence,
                              language=languages_utils.get_language_full_name())
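
# example (assuming the configured language is English and NLTK's "punkt"
# tokenizer data is downloaded): tokenize("How are you?") -> ["How", "are", "you", "?"]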


def stem(word):
    """
    stemming = find the root form of a word
    example:
        words = ["organize", "organizes", "organizing"]
        words = [stem(w) for w in words]
        -> ["organ", "organ", "organ"]
    """
    return stemmer.stem(word.lower())


def bag_of_words(tokenized_sentence, words):
    """
    return a bag-of-words array:
    1 for each known word that exists in the sentence, 0 otherwise
    example:
        sentence = ["hello", "how", "are", "you"]
        words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag = [ 0 , 1 , 0 , 1 , 0 , 0 , 0]
    """
    # stem each word of the tokenized sentence
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize the bag with a 0 for each known word
    bag = np.zeros(len(words), dtype='float32')
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1
    return bag
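

# minimal usage sketch, not part of the module itself: it assumes the jarvis
# package is importable, the configured language is English, and NLTK's "punkt"
# tokenizer data has been downloaded via nltk.download("punkt")
if __name__ == "__main__":
    # stem the vocabulary first so it matches the stemmed sentence tokens
    vocabulary = [stem(w) for w in ["hi", "hello", "I", "you", "bye", "thank", "cool"]]
    tokens = tokenize("Hello, how are you?")
    print(tokens)                            # expected: ['Hello', ',', 'how', 'are', 'you', '?']
    print(bag_of_words(tokens, vocabulary))  # expected: [0. 1. 0. 1. 0. 0. 0.]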