Fully working training and sentence recognition from the model; updated the README and requirements

This commit is contained in:
Mathieu B 2021-07-26 19:39:24 +02:00
parent a6ad82e284
commit 06b8be8026
13 changed files with 351 additions and 60 deletions

View File

@ -22,10 +22,19 @@ The server can run on anything that runs Python 3+ *(linux recommended)*
If it is not already installed, you will need Python 3.9; you can install it with these commands.
```shell
$ sudo add-apt-repository ppa:deadsnakes/ppa
$ sudo apt-get update
$ sudo apt install python3.9 python3.9-dev python3.9-distutils
```
After that, run the command `python3.9 -m pip install -r requirements.txt` to install the basic requirements for the
project.
Then we need to train our model. Before that, download "punkt" and "stopwords" through the NLTK downloader: open a
Python console and enter the following commands:
```python
>>> import nltk
>>> nltk.download('punkt')
>>> nltk.download('stopwords')
```
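Once the NLTK data is in place, you can train the intent model. A minimal sketch, assuming `get_path_file.py` and the `ia`/`utils` packages sit at the project root (as the imports in `ia/train.py` suggest) and that you run the command from that root:
```shell
$ python3.9 -m ia.train
```
This writes the checkpoint to `ia/trained_model.pth`, which `ia/process.py` loads for sentence recognition.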

0
ia/__init__.py Normal file
View File

19
ia/model.py Normal file
View File

@ -0,0 +1,19 @@
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        self.l2 = nn.Linear(hidden_size, hidden_size)
        self.l3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        out = self.relu(out)
        out = self.l3(out)
        # no activation and no softmax at the end
        return out
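The model stays intentionally small: two hidden layers and no final activation, because `nn.CrossEntropyLoss` in `ia/train.py` expects raw logits. A minimal sketch of how it is driven; the sizes below are illustrative only, the real ones come from the training data:

```python
import torch

from ia.model import NeuralNet

# Illustrative sizes: input_size = vocabulary length, num_classes = number of intent tags.
model = NeuralNet(input_size=54, hidden_size=8, num_classes=7)

bag = torch.zeros(1, 54)              # one bag-of-words vector, as built by ia/nltk_utils.py
logits = model(bag)                   # raw scores, shape (1, 7)
probs = torch.softmax(logits, dim=1)  # softmax is applied by the caller (see ia/process.py)
```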

45
ia/nltk_utils.py Normal file
View File

@ -0,0 +1,45 @@
import nltk
import numpy as np
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def tokenize(sentence):
    """
    Split a sentence into an array of words/tokens.
    A token can be a word, a punctuation character or a number.
    """
    # TODO: support English and other languages
    return nltk.word_tokenize(sentence, language="french")


def stem(word):
    """
    Stemming = find the root form of the word.
    Example:
        words = ["organize", "organizes", "organizing"]
        words = [stem(w) for w in words]
        -> ["organ", "organ", "organ"]
    """
    return stemmer.stem(word.lower())


def bag_of_words(tokenized_sentence, words):
    """
    Return a bag-of-words array:
    1 for each known word that exists in the sentence, 0 otherwise.
    Example:
        sentence = ["hello", "how", "are", "you"]
        words    = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag      = [  0,     1,     0,    1,     0,      0,      0  ]
    """
    # stem each word
    sentence_words = [stem(word) for word in tokenized_sentence]
    # initialize bag with 0 for each word
    bag = np.zeros(len(words), dtype='float32')
    for idx, w in enumerate(words):
        if w in sentence_words:
            bag[idx] = 1
    return bag
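A short usage sketch of these helpers chained together, the way `ia/train.py` and `ia/process.py` use them (the exact tokens depend on the NLTK French data downloaded earlier):

```python
from ia.nltk_utils import tokenize, stem, bag_of_words

tokens = tokenize("Quelle heure est-il ?")  # list of word/punctuation tokens
stems = [stem(t) for t in tokens]           # lower-cased root forms
vocab = sorted(set(stems))                  # illustrative vocabulary; train.py builds this from all patterns
bag = bag_of_words(tokens, vocab)           # float32 numpy array of 0s and 1s
print(tokens, stems, bag, sep="\n")
```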

63
ia/process.py Normal file
View File

@ -0,0 +1,63 @@
import os

import torch
from unidecode import unidecode

import get_path_file
from ia.model import NeuralNet
from ia.nltk_utils import bag_of_words, tokenize

path = os.path.dirname(get_path_file.__file__)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

file = path + "/ia/trained_model.pth"
data = torch.load(file, map_location=device)

input_size = data["input_size"]
hidden_size = data["hidden_size"]
output_size = data["output_size"]
all_words = data['all_words']
tags = data['tags']
model_state = data["model_state"]

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()


def get_tag_for_sentence(input_sentence):
    """
    Return the tag matching the input_sentence given in parameter.
    The sentence is usually what the STT engine recognised, or what the user typed when using no-voice mode.

    Parameters
    ----------
    input_sentence : the sentence to classify

    Returns
    -------
    The matching tag from the intents.json file, or 'dont_understand'.
    """
    sentence = unidecode(input_sentence)  # strip accents for better recognition
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.75 and len(sentence) > 2:
        return "MATCHING INTENT : " + tag + " (" + str(prob.item()) + ")"
        # return intents.intents.get_matching_intent_for_tag(tag).get('tag')
    else:
        return 'dont_understand'


if __name__ == '__main__':
    print(get_tag_for_sentence("Hey, est il"))
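Note that the checkpoint is loaded at import time, so `ia/train.py` must have been run at least once before this module is imported. A minimal call sketch, assuming a model trained on this commit's date/time intents (the printed values are illustrative):

```python
from ia.process import get_tag_for_sentence

print(get_tag_for_sentence("Quelle heure est-il ?"))  # e.g. "MATCHING INTENT : what_time_is_it (0.98...)"
print(get_tag_for_sentence("blah"))                   # 'dont_understand' (too short / below the 0.75 threshold)
```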

138
ia/train.py Normal file
View File

@ -0,0 +1,138 @@
import os

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import get_path_file
from ia.model import NeuralNet
from ia.nltk_utils import bag_of_words, tokenize, stem
from utils import intents_utils

path = os.path.dirname(get_path_file.__file__)


def train():
    intents_utils.register_all_intents()  # important
    all_intents_patterns = intents_utils.get_all_patterns()

    all_words = []
    tags = []
    xy = []

    # loop through each sentence in our intents patterns
    for intent in all_intents_patterns:
        tag = intent
        # add to tag list
        tags.append(tag)
        for pattern in all_intents_patterns[intent]:
            # tokenize each word in the sentence
            w = tokenize(pattern)
            # add to our words list
            all_words.extend(w)
            # add to xy pair
            xy.append((w, tag))

    # stem and lower each word
    ignore_words = ['?', '.', '!']
    all_words = [stem(w) for w in all_words if w not in ignore_words]
    # remove duplicates and sort
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    print(len(xy), "patterns")
    print(len(tags), "tags:", tags)
    print(len(all_words), "unique stemmed words:", all_words)

    # create training data
    X_train = []
    y_train = []
    for (pattern_sentence, tag) in xy:
        # X: bag of words for each pattern_sentence
        bag = bag_of_words(pattern_sentence, all_words)
        X_train.append(bag)
        # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
        label = tags.index(tag)
        y_train.append(label)

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    # Hyper-parameters
    num_epochs = 1000
    batch_size = 8
    learning_rate = 0.001
    input_size = len(X_train[0])
    hidden_size = 8
    output_size = len(tags)
    print(input_size, output_size)

    class ChatDataset(Dataset):
        def __init__(self):
            self.n_samples = len(X_train)
            self.x_data = X_train
            self.y_data = y_train

        # support indexing such that dataset[i] can be used to get i-th sample
        def __getitem__(self, index):
            return self.x_data[index], self.y_data[index]

        # we can call len(dataset) to return the size
        def __len__(self):
            return self.n_samples

    dataset = ChatDataset()
    train_loader = DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = NeuralNet(input_size, hidden_size, output_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        for (words, labels) in train_loader:
            words = words.to(device)
            labels = labels.to(dtype=torch.long).to(device)

            # Forward pass
            outputs = model(words)
            # if y were one-hot, we would have to apply
            # labels = torch.max(labels, 1)[1]
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    print(f'Final loss: {loss.item():.4f}')

    data = {
        "model_state": model.state_dict(),
        "input_size": input_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "all_words": all_words,
        "tags": tags
    }

    file = path + "/ia/trained_model.pth"
    torch.save(data, file)

    print(f'Training complete. File saved to {file}')


if __name__ == '__main__':
    train()
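The checkpoint written at the end of `train()` is a plain dictionary, which `ia/process.py` reads back with `torch.load`. A quick sanity check of its contents, assuming it is run from the project root:

```python
import torch

data = torch.load("ia/trained_model.pth", map_location="cpu")
print(data["input_size"], data["hidden_size"], data["output_size"])
print(data["tags"])            # sorted intent tags, e.g. ['what_day_is_it', 'what_time_is_it']
print(len(data["all_words"]))  # size of the stemmed vocabulary
```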

BIN
ia/trained_model.pth Normal file

Binary file not shown.

View File

View File

@ -1,6 +1,15 @@
{
  "name": "Date and Time",
  "languages": [
    "FR-FR",
    "EN-EN"
  ],
  "intents": [
    "what_time_is_it",
    "what_day_is_it"
  ],
  "variables": [
    "time",
    "day"
  ]
}

View File

@ -2,12 +2,10 @@ import utils.intents_utils
def what_time_is_it():
    response = utils.intents_utils.get_response("what_time_is_it")
    response = response.replace("{time}", "18:41")
    return response


def what_day_is_it():
    return ""


if __name__ == '__main__':
    print(utils.intents_utils.get_response("what_time_is_it"))

View File

@ -1,38 +0,0 @@
import glob
import json
import os

import get_path_file

intents = dict()
path = os.path.dirname(get_path_file.__file__)


def register_all_intents():
    global intents
    result = {}
    files = glob.glob(path + "/intents/**/info.json", recursive=True)
    for f in files:
        with open(f, "rb") as infile:
            intent_info_json = json.load(infile)
            intents_in_info = intent_info_json['intents']
            intent_path = str(f).replace('info.json', '')
            for intent in intents_in_info:
                result[intent] = intent_path
    intents = result


def get_all_intents():
    if len(intents) >= 1:
        return intents
    else:
        register_all_intents()
        return get_all_intents()


if __name__ == '__main__':
    print(get_all_intents())

View File

@ -1 +1,5 @@
flask~=2.0.1
Unidecode~=1.2.0
nltk~=3.6.2
torch~=1.9.0
numpy~=1.21.1

View File

@ -1,8 +1,52 @@
import glob
import json
import os
import random

import intents.intents
import get_path_file

all_intents = dict()
path = os.path.dirname(get_path_file.__file__)


def register_all_intents():
    global all_intents
    result = {}
    files = glob.glob(path + "/intents/**/info.json", recursive=True)
    for f in files:
        with open(f, "rb") as infile:
            intent_info_json = json.load(infile)
            intents_in_info = intent_info_json['intents']
            intent_path = str(f).replace('info.json', '')
            for intent in intents_in_info:
                result[intent] = intent_path
    all_intents = result


def get_all_intents():
    if len(all_intents) >= 1:
        return all_intents
    else:
        register_all_intents()
        return get_all_intents()


def get_all_patterns():
    all_patterns = {}

    # need to run register first
    if not all_intents:
        print("Warning : No intent found at all, don't forget to register them!")
        return {}

    for intent in all_intents:
        all_patterns[intent] = get_patterns(intent)

    return all_patterns


def get_patterns(intent_tag):
@ -13,6 +57,12 @@ def get_patterns(intent_tag):
    return {}


def get_response(intent_tag):
    if exists(intent_tag):
        responses = get_responses(intent_tag)
        return random.choice(responses)


def get_responses(intent_tag):
    if exists(intent_tag):
        responses = get_lang_for_intent(intent_tag).get(intent_tag).get('responses')
@ -21,18 +71,12 @@ def get_responses(intent_tag):
    return {}


def get_lang_for_intent(intent_tag):
    language = "fr-fr"  # TODO: use config value

    # first we check the intent
    if exists(intent_tag):
        lang_path = str(get_all_intents().get(intent_tag))
        lang_path = lang_path + 'lang/' + language + '.json'

        if os.path.exists(lang_path):
@ -44,7 +88,7 @@ def get_lang_for_intent(intent_tag):
def exists(intent_tag):
    if intent_tag in get_all_intents():
        return True
    else:
        return False
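For reference, `get_lang_for_intent` builds a path like `<intent dir>/lang/fr-fr.json`, and `get_responses` reads `.get(intent_tag).get('responses')` from that file, so a language file presumably has a shape along these lines. This is a hypothetical sketch inferred from those accessors, not a file shipped in this commit: the `patterns` key is an assumption about what `get_patterns` feeds to training, and `{time}` is the placeholder substituted by the `what_time_is_it` intent script above.

```python
# Hypothetical contents of an intent's lang/fr-fr.json (directory name and keys are assumptions):
fr_fr = {
    "what_time_is_it": {
        "patterns": ["Quelle heure est-il", "Donne moi l'heure"],
        "responses": ["Il est {time}"]
    }
}
```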