Fully working training and sentence recognition from the model; updated the README and requirements

This commit is contained in:
Mathieu B 2021-07-26 19:39:24 +02:00
parent a6ad82e284
commit 06b8be8026
13 changed files with 351 additions and 60 deletions

View File

@ -22,10 +22,19 @@ The server can run on anything that runs Python 3+ *(linux recommended)*
If it is not already installed, you will need Python 3.9; you can install it with these commands.
```shell
$ sudo add-apt-repository ppa:deadsnakes/ppa
$ sudo apt-get update
$ sudo apt install python3.9 python3.9-dev python3.9-distutils
```
After that, run the command `python3.9 -m pip install -r requirements.txt` to install the basic requirements for the
project.
Then we need to train our model. Before that, download "punkt" and "stopwords" through the NLTK downloader: open a
Python console and enter the following commands:
```python
>>> import nltk
>>> nltk.download('punkt')
>>> nltk.download('stopwords')
```
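Once the NLTK data is in place, you can train the intent model. A minimal sketch, assuming `get_path_file.py` and the `ia`/`utils` packages sit at the project root (as the imports in `ia/train.py` suggest) and that you run the command from that root:
```shell
$ python3.9 -m ia.train
```
This writes the checkpoint to `ia/trained_model.pth`, which `ia/process.py` loads for sentence recognition.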

0
ia/__init__.py Normal file
View File

19
ia/model.py Normal file
View File

@ -0,0 +1,19 @@
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        # no activation and no softmax at the end
        return out
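The model stays intentionally small: two hidden layers and no final activation, because `nn.CrossEntropyLoss` in `ia/train.py` expects raw logits. A minimal sketch of how it is driven; the sizes below are illustrative only, the real ones come from the training data:

```python
import torch

from ia.model import NeuralNet

# Illustrative sizes: input_size = vocabulary length, num_classes = number of intent tags.
model = NeuralNet(input_size=54, hidden_size=8, num_classes=7)

bag = torch.zeros(1, 54)              # one bag-of-words vector, as built by ia/nltk_utils.py
logits = model(bag)                   # raw scores, shape (1, 7)
probs = torch.softmax(logits, dim=1)  # softmax is applied by the caller (see ia/process.py)
```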

45
ia/nltk_utils.py Normal file
View File

@ -0,0 +1,45 @@
import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.
    A token can be a word, a punctuation character or a number.
    """
    # TODO: support English and other languages
    return nltk.word_tokenize(sentence, language="french")


def stem(word):
    """
    Stemming = find the root form of the word.
    Example:
        words = ["organize", "organizes", "organizing"]
        words = [stem(w) for w in words]
        -> ["organ", "organ", "organ"]
    """
    return stemmer.stem(word.lower())


def bag_of_words(tokenized_sentence, words):
    """
    Return a bag-of-words array:
    1 for each known word that exists in the sentence, 0 otherwise.
    Example:
        sentence = ["hello", "how", "are", "you"]
        words    = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag      = [  0,     1,     0,    1,     0,      0,      0  ]
    """
    # stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype='float32')
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1
    return bag
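A short usage sketch of these helpers chained together, the way `ia/train.py` and `ia/process.py` use them (the exact tokens depend on the NLTK French data downloaded earlier):

```python
from ia.nltk_utils import tokenize, stem, bag_of_words

tokens = tokenize("Quelle heure est-il ?")  # list of word/punctuation tokens
stems = [stem(t) for t in tokens]           # lower-cased root forms
vocab = sorted(set(stems))                  # illustrative vocabulary; train.py builds this from all patterns
bag = bag_of_words(tokens, vocab)           # float32 numpy array of 0s and 1s
print(tokens, stems, bag, sep="\n")
```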

63
ia/process.py Normal file
View File

@ -0,0 +1,63 @@
import os

import torch
from unidecode import unidecode

import get_path_file
from ia.model import NeuralNet
from ia.nltk_utils import bag_of_words, tokenize

path = os.path.dirname(get_path_file.__file__)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

file = path + "/ia/trained_model.pth"
data = torch.load(file, map_location=device)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()


def get_tag_for_sentence(input_sentence):
    """
    Return the tag matching the input_sentence given in parameter.
    The sentence is usually what the STT engine recognised, or what the user typed when using no-voice mode.

    Parameters
    ----------
    input_sentence : the sentence to classify

    Returns
    -------
    The matching tag from the intents.json file, or 'dont_understand'.
    """
    sentence = unidecode(input_sentence)  # strip accents for better recognition
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75 and len(sentence) > 2:
        return "MATCHING INTENT : " + tag + " (" + str(prob.item()) + ")"
        # return intents.intents.get_matching_intent_for_tag(tag).get('tag')
    else:
        return 'dont_understand'


if __name__ == '__main__':
    print(get_tag_for_sentence("Hey, est il"))
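Note that the checkpoint is loaded at import time, so `ia/train.py` must have been run at least once before this module is imported. A minimal call sketch, assuming a model trained on this commit's date/time intents (the printed values are illustrative):

```python
from ia.process import get_tag_for_sentence

print(get_tag_for_sentence("Quelle heure est-il ?"))  # e.g. "MATCHING INTENT : what_time_is_it (0.98...)"
print(get_tag_for_sentence("blah"))                   # 'dont_understand' (too short / below the 0.75 threshold)
```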

138
ia/train.py Normal file
View File

@ -0,0 +1,138 @@
import os

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import get_path_file
from ia.model import NeuralNet
from ia.nltk_utils import bag_of_words, tokenize, stem
from utils import intents_utils

path = os.path.dirname(get_path_file.__file__)


def train():
    intents_utils.register_all_intents()  # important
    all_intents_patterns = intents_utils.get_all_patterns()

    all_words = []
    tags = []
    xy = []

    # loop through each sentence in our intents patterns
    for intent in all_intents_patterns:
        tag = intent
        # add to tag list
        tags.append(tag)
        for pattern in all_intents_patterns[intent]:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            all_words.extend(w)
            # add to xy pair
            xy.append((w, tag))

    # stem and lower each word
    ignore_words = ['?', '.', '!']
    all_words = [stem(w) for w in all_words if w not in ignore_words]
    # remove duplicates and sort
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    print(len(xy), "patterns")
    print(len(tags), "tags:", tags)
    print(len(all_words), "unique stemmed words:", all_words)

    # create training data
    X_train = []
    y_train = []
    for (pattern_sentence, tag) in xy:
        # X: bag of words for each pattern_sentence
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)
        # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
        label = tags.index(tag)
        y_train.append(label)

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(X_train[0])
    hidden_size = 8
    output_size = len(tags)
    print(input_size, output_size)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = y_train

        # support indexing such that dataset[i] can be used to get i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would have to apply
            # labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    print(f'Final loss: {loss.item():.4f}')

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "tags": tags
    }

    file = path + "/ia/trained_model.pth"
    torch.save(data, file)

    print(f'Training complete. File saved to {file}')


if __name__ == '__main__':
    train()
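The checkpoint written at the end of `train()` is a plain dictionary, which `ia/process.py` reads back with `torch.load`. A quick sanity check of its contents, assuming it is run from the project root:

```python
import torch

data = torch.load("ia/trained_model.pth", map_location="cpu")
print(data["input_size"], data["hidden_size"], data["output_size"])
print(data["tags"])            # sorted intent tags, e.g. ['what_day_is_it', 'what_time_is_it']
print(len(data["all_words"]))  # size of the stemmed vocabulary
```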

BIN
ia/trained_model.pth Normal file

Binary file not shown.

View File

View File

@ -1,6 +1,15 @@
{
  "name": "Date and Time",
  "languages": [
    "FR-FR",
    "EN-EN"
  ],
  "intents": [
    "what_time_is_it",
    "what_day_is_it"
  ],
  "variables": [
    "time",
    "day"
  ]
}

View File

@ -2,12 +2,10 @@ import utils.intents_utils
def what_time_is_it():
    response = utils.intents_utils.get_response("what_time_is_it")
    response = response.replace("{time}", "18:41")
    return response


def what_day_is_it():
    return ""


if __name__ == '__main__':
    print(utils.intents_utils.get_response("what_time_is_it"))

View File

@ -1,38 +0,0 @@
import glob
import json
import os

import get_path_file

intents = dict()
path = os.path.dirname(get_path_file.__file__)


def register_all_intents():
    global intents
    result = {}
    files = glob.glob(path + "/intents/**/info.json", recursive=True)
    for f in files:
        with open(f, "rb") as infile:
            intent_info_json = json.load(infile)
            intents_in_info = intent_info_json['intents']
            intent_path = str(f).replace('info.json', '')
            for intent in intents_in_info:
                result[intent] = intent_path
    intents = result


def get_all_intents():
    if len(intents) >= 1:
        return intents
    else:
        register_all_intents()
        return get_all_intents()


if __name__ == '__main__':
    print(get_all_intents())

View File

@ -1 +1,5 @@
flask~=2.0.1
Unidecode~=1.2.0
nltk~=3.6.2
torch~=1.9.0
numpy~=1.21.1

View File

@ -1,8 +1,52 @@
import glob
import json
import os
import random

import intents.intents
import get_path_file

all_intents = dict()
path = os.path.dirname(get_path_file.__file__)


def register_all_intents():
    global all_intents
    result = {}
    files = glob.glob(path + "/intents/**/info.json", recursive=True)
    for f in files:
        with open(f, "rb") as infile:
            intent_info_json = json.load(infile)
            intents_in_info = intent_info_json['intents']
            intent_path = str(f).replace('info.json', '')
            for intent in intents_in_info:
                result[intent] = intent_path
    all_intents = result


def get_all_intents():
    if len(all_intents) >= 1:
        return all_intents
    else:
        register_all_intents()
        return get_all_intents()


def get_all_patterns():
    all_patterns = {}

    # need to run register first
    if not all_intents:
        print("Warning : No intent found at all, don't forget to register them!")
        return {}

    for intent in all_intents:
        all_patterns[intent] = get_patterns(intent)

    return all_patterns


def get_patterns(intent_tag):
@ -13,6 +57,12 @@ def get_patterns(intent_tag):
    return {}


def get_response(intent_tag):
    if exists(intent_tag):
        responses = get_responses(intent_tag)
        return random.choice(responses)


def get_responses(intent_tag):
    if exists(intent_tag):
        responses = get_lang_for_intent(intent_tag).get(intent_tag).get('responses')
@ -21,18 +71,12 @@ def get_responses(intent_tag):
    return {}


def get_lang_for_intent(intent_tag):
    language = "fr-fr"  # TODO: use config value

    # first we check the intent
    if exists(intent_tag):
        lang_path = str(get_all_intents().get(intent_tag))
        lang_path = lang_path + 'lang/' + language + '.json'

        if os.path.exists(lang_path):
@ -44,7 +88,7 @@ def get_lang_for_intent(intent_tag):
def exists(intent_tag):
    if intent_tag in get_all_intents():
        return True
    else:
        return False
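For reference, `get_lang_for_intent` builds a path like `<intent dir>/lang/fr-fr.json`, and `get_responses` reads `.get(intent_tag).get('responses')` from that file, so a language file presumably has a shape along these lines. This is a hypothetical sketch inferred from those accessors, not a file shipped in this commit: the `patterns` key is an assumption about what `get_patterns` feeds to training, and `{time}` is the placeholder substituted by the `what_time_is_it` intent script above.

```python
# Hypothetical contents of an intent's lang/fr-fr.json (directory name and keys are assumptions):
fr_fr = {
    "what_time_is_it": {
        "patterns": ["Quelle heure est-il", "Donne moi l'heure"],
        "responses": ["Il est {time}"]
    }
}
```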