(IMPORTANT) Removed old AI training model and old intents

Mathieu B 2021-07-30 12:37:16 +02:00
parent 06f8d60d26
commit 6e28b7076e
7 changed files with 0 additions and 367 deletions

View File

@@ -1,19 +0,0 @@
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        # no activation and no softmax at the end
        return out
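
For reference, a minimal illustrative sketch of how this classifier is meant to be used: the constructor sizes below are placeholders, and the raw logits are turned into probabilities outside the model (as the inference code further down does with torch.softmax).

import torch

# Placeholder sizes: input_size = vocabulary length, num_classes = number of tags
model = NeuralNet(input_size=54, hidden_size=8, num_classes=7)
x = torch.zeros(1, 54)                 # one bag-of-words vector, shape (batch, input_size)
logits = model(x)                      # raw scores, shape (1, 7)
probs = torch.softmax(logits, dim=1)   # probabilities are computed outside the model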

View File

@@ -1,52 +0,0 @@
import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

from jarvis.utils import languages_utils

stemmer = PorterStemmer()


# TODO: consider replacing nltk with spacy (or the other way around) so that only one of them is used
def tokenize(sentence):
    """
    Split the sentence into an array of words/tokens.
    A token can be a word, a punctuation character, or a number.
    """
    # Supported languages: English, Danish, Estonian, French, Greek, Norwegian, Portuguese, Spanish, Turkish,
    # Czech, Dutch, Finnish, German, Italian, Polish, Slovene, and Swedish
    return nltk.word_tokenize(sentence,
                              language=languages_utils.get_language_full_name())


def stem(word):
    """
    Stemming = find the root form of the word.
    Example:
    words = ["organize", "organizes", "organizing"]
    words = [stem(w) for w in words]
    -> ["organ", "organ", "organ"]
    """
    return stemmer.stem(word.lower())


def bag_of_words(tokenized_sentence, words):
    """
    Return the bag-of-words array:
    1 for each known word that exists in the sentence, 0 otherwise.
    Example:
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
    bag   = [  0 ,    1   ,  0 ,   1  ,   0  ,    0    ,   0   ]
    """
    # stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype='float32')
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1
    return bag
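
A short illustrative sketch of how these three helpers chain together; it assumes the jarvis language utilities resolve to English and uses a made-up vocabulary.

tokens = tokenize("Hello, how are you?")             # ['Hello', ',', 'how', 'are', 'you', '?']
print(stem("organizes"))                             # 'organ'
vocabulary = ["hi", "hello", "i", "you", "bye", "thank", "cool"]
vector = bag_of_words(tokens, vocabulary)            # float32 array like [0., 1., 0., 1., 0., 0., 0.]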

View File

@@ -1,60 +0,0 @@
import os

import torch
from unidecode import unidecode

from jarvis import get_path_file
from jarvis.ia.model import NeuralNet
from jarvis.ia.nltk_utils import bag_of_words, tokenize

print("Loading, might take a few seconds...")

path = os.path.dirname(get_path_file.__file__)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

file = path + "/ia/trained_model.pth"
data = torch.load(file, map_location=device)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()


def get_tag_for_sentence(input_sentence):
    """
    Return the tag matching the input_sentence given as parameter.
    The sentence is usually what the STT engine recognised, or what the user typed when using no-voice mode.

    Parameters
    ----------
    input_sentence : the sentence to classify

    Returns
    -------
    The matching tag from the skills.json file, or 'dont_understand' if the prediction is not confident enough.
    """
    sentence = unidecode(input_sentence)  # strip accents for better recognition
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75 and len(sentence) > 2:
        return tag
    else:
        return 'dont_understand'
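
A hypothetical call into this module, with a made-up sentence and assuming the trained model file is present on disk:

tag = get_tag_for_sentence("What time is it?")
if tag != 'dont_understand':
    print("Matched intent:", tag)    # a tag declared in some skill's info.json
else:
    print("Low confidence, ask the user to rephrase")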

View File

@@ -1,138 +0,0 @@
import os

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from jarvis import get_path_file
from jarvis.ia.model import NeuralNet
from jarvis.ia.nltk_utils import bag_of_words, tokenize, stem
from jarvis.utils import intents_utils

path = os.path.dirname(get_path_file.__file__)


def train():
    intents_utils.register_all_intents()  # important
    all_intents_patterns = intents_utils.get_all_patterns()

    all_words = []
    tags = []
    xy = []

    # loop through each sentence in our skills patterns
    for intent in all_intents_patterns:
        tag = intent
        # add to tag list
        tags.append(tag)
        for pattern in all_intents_patterns[intent]:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            all_words.extend(w)
            # add to xy pair
            xy.append((w, tag))

    # stem and lower each word
    ignore_words = ['?', '.', '!']
    all_words = [stem(w) for w in all_words if w not in ignore_words]
    # remove duplicates and sort
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    print(len(xy), "patterns")
    print(len(tags), "tags:", tags)
    print(len(all_words), "unique stemmed words:", all_words)

    # create training data
    X_train = []
    y_train = []
    for (pattern_sentence, tag) in xy:
        # X: bag of words for each pattern_sentence
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)
        # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
        label = tags.index(tag)
        y_train.append(label)

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(X_train[0])
    hidden_size = 8
    output_size = len(tags)
    print(input_size, output_size)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = y_train

        # support indexing such that dataset[i] can be used to get the i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would have to apply
            # labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    print(f'Final loss: {loss.item():.4f}')

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "tags": tags
    }

    file = path + "/ia/trained_model.pth"
    torch.save(data, file)

    print(f'Training complete. File saved to {file}')


if __name__ == '__main__':
    train()
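
Since the checkpoint is a plain dictionary, it can be restored exactly the way the inference module above does; a compressed sketch using the same keys that train() saves:

checkpoint = torch.load(path + "/ia/trained_model.pth", map_location='cpu')
restored = NeuralNet(checkpoint["input_size"],
                     checkpoint["hidden_size"],
                     checkpoint["output_size"])
restored.load_state_dict(checkpoint["model_state"])
restored.eval()
# checkpoint["all_words"] and checkpoint["tags"] rebuild the bag-of-words encoding and the label names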

Binary file not shown.

View File

@@ -1,98 +0,0 @@
import glob
import json
import os
import random

from jarvis import get_path_file
from jarvis.utils import languages_utils

all_intents = dict()
path = os.path.dirname(get_path_file.__file__)


def register_all_intents():
    global all_intents
    result = {}

    files = glob.glob(path + "/skills/**/info.json", recursive=True)
    for f in files:
        with open(f, "rb") as infile:
            intent_info_json = json.load(infile)
            intents_in_info = intent_info_json['intents']
            intent_path = str(f).replace('info.json', '')
            for intent in intents_in_info:
                result[intent] = intent_path

    all_intents = result


def get_all_intents():
    if len(all_intents) >= 1:
        return all_intents
    else:
        register_all_intents()
        return get_all_intents()


def get_all_patterns():
    all_patterns = {}

    # need to run register first
    if not all_intents:
        print("Warning: no intents found at all, don't forget to register them!")
        return {}

    for intent in all_intents:
        all_patterns[intent] = get_patterns(intent)

    return all_patterns


def get_patterns(intent_tag):
    if exists(intent_tag):
        patterns = get_lang_for_intent(intent_tag).get(intent_tag).get('patterns')
        return patterns
    else:
        return {}


def get_path(intent_tag):
    if exists(intent_tag):
        return get_all_intents().get(intent_tag)


def get_response(intent_tag):
    if exists(intent_tag):
        responses = get_responses(intent_tag)
        return random.choice(responses)


def get_responses(intent_tag):
    if exists(intent_tag):
        responses = get_lang_for_intent(intent_tag).get(intent_tag).get('responses')
        return responses
    else:
        return {}


def get_lang_for_intent(intent_tag):
    # first we check that the intent exists
    if exists(intent_tag):
        lang_path = str(get_all_intents().get(intent_tag))
        lang_path = lang_path + 'lang/' + languages_utils.get_language() + '.json'

        if os.path.exists(lang_path):
            lang_file = open(lang_path)
            json_lang = json.load(lang_file)
            return json_lang
    else:
        return {}


def exists(intent_tag):
    if intent_tag in get_all_intents():
        return True
    else:
        return False
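
From the lookups above, each skill is expected to ship an info.json listing its intent tags plus a lang/&lt;language&gt;.json mapping each tag to its patterns and responses. A hypothetical layout (skill name and file contents invented for illustration, assuming languages_utils.get_language() returns 'en'):

# skills/greetings/info.json     -> {"intents": ["hello", "goodbye"]}
# skills/greetings/lang/en.json  -> {"hello":   {"patterns": ["hi", "hello there"],
#                                                "responses": ["Hello!", "Hi!"]},
#                                    "goodbye": {"patterns": ["bye", "see you"],
#                                                "responses": ["Goodbye!"]}}

register_all_intents()
print(get_all_intents())        # {'hello': '.../skills/greetings/', 'goodbye': '.../skills/greetings/'}
print(get_patterns("hello"))    # ['hi', 'hello there']
print(get_response("hello"))    # 'Hello!' or 'Hi!', chosen at random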