Fully working training and sentence recognition from the model, added stuff to README and requirements

2021-07-26 19:39:24 +02:00 · 2021-07-26 19:39:24 +02:00 · 06b8be8026
commit 06b8be8026
parent a6ad82e284
13 changed files with 351 additions and 60 deletions
--- a/README.md
+++ b/README.md
@ -22,10 +22,19 @@ The server can run on anything that runs Python 3+ *(linux recommended)*
 If not already installed, you will need Python 3.9, you can install it with these commands.
 ```shell
-sudo add-apt-repository ppa:deadsnakes/ppa
+$ sudo add-apt-repository ppa:deadsnakes/ppa
-sudo apt-get update
+$ sudo apt-get update
-sudo apt install python3.9 python3.9-dev python3.9-distutils
+$ sudo apt install python3.9 python3.9-dev python3.9-distutils
 ```
 After that, run the command `python -m pip install -r requirements.txt` to install the basic requirements for the
 project.
 Then we need to train our model, but before that we need to download "punkt" and "stopwords" from the NLTK downloader,
 go to the Python Console and enter the following commands :
 ```python
 >>> import nltk
 >>> nltk.download('punkt')
 >>> nltk.download('stopwords')
 ```
--- a/ia/init.py
+++ b/ia/init.py
--- a/ia/model.py
+++ b/ia/model.py
@ -0,0 +1,19 @@
 import torch.nn as nn
 class NeuralNet(nn.Module):
    """
    Feed-forward classifier made of three linear layers with a ReLU after the first two.
    The last layer returns raw scores: no activation and no softmax is applied to them.
    """
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # attribute names and creation order are kept as-is so saved state dicts stay loadable
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
    def forward(self, x):
        """Return the raw class scores computed for the input tensor ``x``."""
        hidden = self.relu(self.l1(x))
        hidden = self.relu(self.l2(hidden))
        return self.l3(hidden)
--- a/ia/nltk_utils.py
+++ b/ia/nltk_utils.py
@ -0,0 +1,45 @@
 import nltk
 import numpy as np
 from nltk.stem.porter import PorterStemmer
 # Single Porter stemmer instance shared by every call to stem().
 stemmer = PorterStemmer()
 def tokenize(sentence, language="french"):
    """
    Split a sentence into a list of words/tokens.
    A token can be a word, a punctuation character or a number.
    Parameters
    ----------
    sentence : the text to split
    language : language name handed to nltk.word_tokenize, "french" by default so
        existing callers keep their behaviour
    Returns
    -------
    The list of tokens produced by nltk.word_tokenize
    """
    # TODO: the callers visible in this commit never pass a language, so French is still the only one used
    return nltk.word_tokenize(sentence, language=language)
 def stem(word):
    """
    Return the stem (root form) of a word.
    The word is lower-cased first, then handed to the shared Porter stemmer.
    example:
    [stem(w) for w in ["organize", "organizes", "organizing"]]
    -> ["organ", "organ", "organ"]
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
 def bag_of_words(tokenized_sentence, words):
    """
    Return the bag-of-words vector of a tokenized sentence:
    1 for each known word that exists in the sentence, 0 otherwise.
    The sentence tokens are stemmed before the comparison, the entries of
    ``words`` are compared as they are (so they have to be stemmed already).
    example:
    sentence = ["hello", "how", "are", "you"]
    words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
    bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    Returns
    -------
    A float32 numpy array with one entry per element of ``words``
    """
    # stem each token once and keep them in a set, so each lookup below does not scan a list
    sentence_words = {stem(word) for word in tokenized_sentence}
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype='float32')
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1
    return bag
--- a/ia/process.py
+++ b/ia/process.py
@ -0,0 +1,63 @@
 import os
 import torch
 from unidecode import unidecode
 import get_path_file
 from ia.model import NeuralNet
 from ia.nltk_utils import bag_of_words, tokenize
 # Directory containing the get_path_file module, used as the base of the checkpoint path.
 path = os.path.dirname(get_path_file.__file__)
 # Use the GPU when CUDA is available, the CPU otherwise.
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 # Same location ia/train.py saves its checkpoint to; everything below runs at import time.
 file = path + "/ia/trained_model.pth"
 # NOTE(review): torch.load relies on pickle, only load checkpoint files coming from a trusted source.
 data = torch.load(file, map_location=device)
 # Layer sizes, vocabulary, tags and weights stored in the checkpoint dictionary.
 input_size = data["input_size"]
 hidden_size = data["hidden_size"]
 output_size = data["output_size"]
 all_words = data['all_words']
 tags = data['tags']
 model_state = data["model_state"]
 # Rebuild the network with the stored sizes and restore its weights.
 model = NeuralNet(input_size, hidden_size, output_size).to(device)
 model.load_state_dict(model_state)
 # Put the module in evaluation mode, it is only used for prediction here.
 model.eval()
 def get_tag_for_sentence(input_sentence):
    """
    Classify a sentence with the loaded model.
    The sentence usually is what the STT engine recognised, or what the user typed
    when using no-voice mode.
    Parameters
    ----------
    input_sentence : str
        The sentence to classify
    Returns
    -------
    str
        "MATCHING INTENT : <tag> (<probability>)" when the best tag has a probability
        above 0.75 and the sentence has more than two tokens, 'dont_understand' otherwise
    """
    sentence = unidecode(input_sentence)  # strip accents for a better recognition
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    # the model expects a batch, so turn the vector into a batch of one row
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)
    # prediction only: no gradient is needed, so do not record the autograd graph
    with torch.no_grad():
        output = model(X)
        _, predicted = torch.max(output, dim=1)
        probs = torch.softmax(output, dim=1)
    best = predicted.item()
    tag = tags[best]
    prob = probs[0][best].item()
    if prob > 0.75 and len(sentence) > 2:
        # return intents.intents.get_matching_intent_for_tag(tag).get('tag')
        return "MATCHING INTENT : " + tag + " (" + str(prob) + ")"
    return 'dont_understand'
 # Manual check: classify a sample sentence when this module is run as a script.
 if __name__ == '__main__':
    print(get_tag_for_sentence("Hey, est il"))
--- a/ia/train.py
+++ b/ia/train.py
@ -0,0 +1,138 @@
 import os
 import numpy as np
 import torch
 import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader
 import get_path_file
 from ia.model import NeuralNet
 from ia.nltk_utils import bag_of_words, tokenize, stem
 from utils import intents_utils
 # Directory containing the get_path_file module, train() saves the checkpoint below it.
 path = os.path.dirname(get_path_file.__file__)
 def train():
    """
    Train the intent classifier and save it to <path>/ia/trained_model.pth.
    Every intent tag is a class and every pattern of that intent is one training
    sample, encoded as a bag of words over the stemmed vocabulary.
    The saved dictionary holds the model weights, the layer sizes, the vocabulary
    and the sorted list of tags.
    Raises
    ------
    ValueError
        If no pattern is available, since there is nothing to train on
    """
    intents_utils.register_all_intents()  # important: get_all_patterns() returns nothing before this
    all_intents_patterns = intents_utils.get_all_patterns()
    all_words = []
    tags = []
    xy = []
    # loop through each sentence in our intents patterns
    for tag, patterns in all_intents_patterns.items():
        # add to tag list
        tags.append(tag)
        for pattern in patterns:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            all_words.extend(w)
            # add to xy pair
            xy.append((w, tag))
    if not xy:
        # fail early with a clear message instead of an IndexError on X_train[0] further down
        raise ValueError("No training pattern found: register at least one intent with patterns before training")
    # stem and lower each word
    ignore_words = ['?', '.', '!']
    all_words = [stem(w) for w in all_words if w not in ignore_words]
    # remove duplicates and sort
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))
    print(len(xy), "patterns")
    print(len(tags), "tags:", tags)
    print(len(all_words), "unique stemmed words:", all_words)
    # create training data
    # X: bag of words for each pattern_sentence
    X_train = np.array([bag_of_words(pattern_sentence, all_words) for pattern_sentence, _ in xy])
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label_of = {tag: label for label, tag in enumerate(tags)}
    y_train = np.array([label_of[tag] for _, tag in xy])
    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(X_train[0])
    hidden_size = 8
    output_size = len(tags)
    print(input_size, output_size)
    class ChatDataset(Dataset):
        """Expose the training arrays built above as a torch Dataset."""
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = y_train
        # support indexing such that dataset[i] can be used to get i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]
        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples
    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NeuralNet(input_size, hidden_size, output_size).to(device)
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            # CrossEntropyLoss expects the class indices as int64
            labels = labels.to(dtype=torch.long).to(device)
            # Forward pass
            outputs = model(words)
            loss = criterion(outputs, labels)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 100 == 0:
            # loss of the last batch of the epoch, not an average over the epoch
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')
    print(f'Final loss: {loss.item():.4f}')
    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "tags": tags
    }
    file = path + "/ia/trained_model.pth"
    torch.save(data, file)
    print(f'Training complete. file saved to {file}')
 # Run the training when this module is executed as a script.
 if __name__ == '__main__':
    train()
--- a/ia/trained_model.pth
+++ b/ia/trained_model.pth
--- a/intents/daily/init.py
+++ b/intents/daily/init.py
--- a/intents/daily/date_and_time/info.json
+++ b/intents/daily/date_and_time/info.json
@ -1,6 +1,15 @@
 {
  "name": "Date and Time",
-  "languages": ["FR-FR", "EN-EN"],
+  "languages": [
-  "intents": ["what_time_is_it", "what_day_is_it"],
+    "FR-FR",
-  "variables": ["time", "day"]
+    "EN-EN"
  ],
  "intents": [
    "what_time_is_it",
    "what_day_is_it"
  ],
  "variables": [
    "time",
    "day"
  ]
 }
--- a/intents/daily/date_and_time/intent.py
+++ b/intents/daily/date_and_time/intent.py
@ -2,12 +2,10 @@ import utils.intents_utils
 def what_time_is_it():
-    return ""
+    response = utils.intents_utils.get_response("what_time_is_it")
    response.replace("{time}", "18:41")
    return response
 def what_day_is_it():
    """Placeholder for the "what_day_is_it" intent: currently always returns an empty string."""
    return ""
 # Manual check: print one response of the "what_time_is_it" intent when run as a script.
 if __name__ == '__main__':
    print(utils.intents_utils.get_response("what_time_is_it"))
--- a/intents/intents.py
+++ b/intents/intents.py
@ -1,38 +0,0 @@
 import glob
 import json
 import os
 import get_path_file
 # Registry filled by register_all_intents(): intent tag -> folder of the info.json declaring it.
 intents = dict()
 # Directory containing the get_path_file module, the intents folder is searched below it.
 path = os.path.dirname(get_path_file.__file__)
 def register_all_intents():
    """
    Rebuild the module-level ``intents`` registry.
    Every info.json found below <path>/intents is read, and each tag of its
    'intents' list is mapped to the folder that contains this info.json.
    """
    global intents
    found = {}
    search_pattern = path + "/intents/**/info.json"
    for info_file in glob.glob(search_pattern, recursive=True):
        with open(info_file, "rb") as infile:
            info = json.load(infile)
        folder = str(info_file).replace('info.json', '')
        found.update(dict.fromkeys(info['intents'], folder))
    intents = found
 def get_all_intents():
    """
    Return the registry of intents, registering them first when it is still empty.
    Returns
    -------
    The ``intents`` dictionary, empty when no intent could be found
    """
    if not intents:
        # register once and return whatever was found: calling ourselves again
        # would never end when there is no info.json at all
        register_all_intents()
    return intents
 # Manual check: print the registered intents when this module is run as a script.
 if __name__ == '__main__':
    print(get_all_intents())
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1,5 @@
-flask
+flask~=2.0.1
 Unidecode~=1.2.0
 nltk~=3.6.2
 torch~=1.9.0
 numpy~=1.21.1
--- a/utils/intents_utils.py
+++ b/utils/intents_utils.py
@ -1,8 +1,52 @@
 import glob
 import json
 import os
 import random
-import intents.intents
+import get_path_file
 # Registry filled by register_all_intents(): intent tag -> folder of the info.json declaring it.
 all_intents = dict()
 # Directory containing the get_path_file module, the intents folder is searched below it.
 path = os.path.dirname(get_path_file.__file__)
 def register_all_intents():
    """
    Rebuild the module-level ``all_intents`` registry.
    Every info.json found below <path>/intents is read, and each tag of its
    'intents' list is mapped to the folder (with its trailing separator) that
    contains this info.json.
    """
    global all_intents
    info_name = 'info.json'
    result = {}
    files = glob.glob(path + "/intents/**/" + info_name, recursive=True)
    for f in files:
        with open(f, "rb") as infile:
            intent_info_json = json.load(infile)
        # drop only the trailing file name: str.replace would also remove an
        # 'info.json' that appears earlier in the path, e.g. in a folder name
        intent_path = str(f)[:-len(info_name)]
        for intent in intent_info_json['intents']:
            result[intent] = intent_path
    all_intents = result
 def get_all_intents():
    """
    Return the registry of intents, registering them first when it is still empty.
    Returns
    -------
    The ``all_intents`` dictionary, empty when no intent could be found
    """
    if not all_intents:
        # register once and return whatever was found: calling ourselves again
        # would never end when there is no info.json at all
        register_all_intents()
    return all_intents
 def get_all_patterns():
    """
    Return a dictionary mapping every registered intent tag to its patterns.
    The intents are not registered here: when the registry is empty a warning
    is printed and an empty dictionary is returned.
    """
    # need to run register first
    if all_intents:
        return {intent: get_patterns(intent) for intent in all_intents}
    print("Warning : No intent found at all, don't forget to register them!")
    return {}
 def get_patterns(intent_tag):
@ -13,6 +57,12 @@ def get_patterns(intent_tag):
        return {}
 def get_response(intent_tag):
    """Return one randomly chosen response of ``intent_tag``, or None when the intent does not exist."""
    if not exists(intent_tag):
        return None
    return random.choice(get_responses(intent_tag))
 def get_responses(intent_tag):
    if exists(intent_tag):
        responses = get_lang_for_intent(intent_tag).get(intent_tag).get('responses')
@ -21,18 +71,12 @@ def get_responses(intent_tag):
        return {}
 def get_response(intent_tag):
    """Pick a random response for ``intent_tag``; None is returned for an intent that does not exist."""
    known = exists(intent_tag)
    if not known:
        return None
    candidates = get_responses(intent_tag)
    return random.choice(candidates)
 def get_lang_for_intent(intent_tag):
    language = "fr-fr"  # TODO: use config value
    # first we check the intent
    if exists(intent_tag):
-        lang_path = str(intents.intents.get_all_intents().get(intent_tag))
+        lang_path = str(get_all_intents().get(intent_tag))
        lang_path = lang_path + 'lang/' + language + '.json'
        if os.path.exists(lang_path):
@ -44,7 +88,7 @@ def get_lang_for_intent(intent_tag):
 def exists(intent_tag):
    """Return True when ``intent_tag`` is a key of the registered intents, False otherwise."""
-    if intent_tag in intents.intents.get_all_intents():
+    if intent_tag in get_all_intents():
        return True
    else:
        return False