From ee2eca484fdbeb6d890350741e43d910135bce58 Mon Sep 17 00:00:00 2001
From: Mathieu B
Date: Sat, 25 Mar 2023 12:25:47 +0100
Subject: [PATCH] added gevent for websockets server and reworked main structure

---
 jarvis/api.py                 | 92 +++++++++++------------------------
 jarvis/start.py               | 11 ++++-
 jarvis/utils/chat_utils.py    | 13 +++++
 jarvis/utils/whisper_utils.py | 69 ++++++++++++++++++++++++++
 requirements.txt              |  4 +-
 5 files changed, 123 insertions(+), 66 deletions(-)
 create mode 100644 jarvis/utils/chat_utils.py
 create mode 100644 jarvis/utils/whisper_utils.py

diff --git a/jarvis/api.py b/jarvis/api.py
index 0c4eb52..ac93c94 100644
--- a/jarvis/api.py
+++ b/jarvis/api.py
@@ -1,16 +1,15 @@
 import json
+import logging
 import sys
 import tempfile
 from threading import Lock
 
 import openai
-import requests
 from flask import Flask, request
 from flask_socketio import SocketIO, emit, join_room, leave_room, \
     rooms
-from pywhispercpp.model import Model
 
-from jarvis.utils.chatgpt_utils import chatgpt_recognise
+from jarvis.utils import chat_utils, whisper_utils
 
 # Set this variable to "threading", "eventlet" or "gevent" to test the
 # different async modes, or leave it set to None for the application to choose
@@ -22,7 +21,7 @@ app.config['SECRET_KEY'] = 'secret!'
 socketio = SocketIO(app, async_mode=async_mode)
 thread = None
 thread_lock = Lock()
-model = Model('base', n_threads=16, suppress_non_speech_tokens=True)
+
 
 openai.api_key = sys.argv[1]
 
@@ -34,25 +33,34 @@ def index():
 @socketio.event
 def process_message(message):
     message = json.loads(message)
-    print("New PROCESS request from room " + message['uuid'])
+    logging.info("New PROCESS request from room " + message['uuid'])
+    logging.info("Message : " + message['data'])
 
-    print("Message : " + message['data'])
-    # TODO: maybe implement grammar check ?
+    # TODO: maybe implement grammar check and correction ?
     # intent_manager.recognise(message['data'], message['uuid'])
 
-    send_jarvis_message_to_room("I don't know how to respond to that...", message['uuid'])
-
-    response = chatgpt_recognise(message['data'])
-    if 'comment' in response:
-        send_user_message_to_room(response['comment'], message['uuid'])
-    else:
-        send_jarvis_message_to_room("I don't know how to respond to that...", message['uuid'])
+    if message['data'] != "":
+        # response = chatgpt_recognise(message['data'])
+        response = {'action': 'answer',
+                    'answer': "Hello! As an AI, I don't have emotions, but I'm always here to help you with your smart home needs. "
+                              "How can I assist you today?"}
+
+        if response['action'] == 'clarify':
+            chat_utils.send_jarvis_message_to_room(response['question'], message['uuid'])
+        elif response['action'] == 'command':
+            chat_utils.send_jarvis_message_to_room(response['comment'], message['uuid'])
+        elif response['action'] == 'query':
+            chat_utils.send_jarvis_message_to_room(response['device_description'], message['uuid'])
+        elif response['action'] == 'answer':
+            chat_utils.send_jarvis_message_to_room(response['answer'], message['uuid'])
+        else:
+            chat_utils.send_jarvis_message_to_room("I don't know how to respond to that...", message['uuid'])
 
 
 @socketio.event
 def join(message):
     message = json.loads(message)
-    print("New client joined room " + message['uuid'])
+    logging.info("New client joined room " + message['uuid'])
     join_room(message['uuid'])
 
 
@@ -68,27 +76,18 @@ def connect():
     emit('my_response', {'data': 'Connected', 'count': 0})
 
 
-def send_user_message_to_room(text, room_id):
-    socketio.emit('message_from_user', {'data': text, "uuid": room_id}, to=room_id)
-
-
-def send_jarvis_message_to_room(text, room_id):
-    socketio.emit('message_from_jarvis', {'data': text, "uuid": room_id}, to=room_id)
-
-
 # .WAV (i.e.) FILE REQUEST
 @app.route("/get_text_from_audio", methods=['POST'])
 def get_text_from_audio():
-    print("[" + request.remote_addr + "] - New STT request")
+    logging.info("New STT request from " + request.remote_addr)
 
     audio_temp_file = tempfile.NamedTemporaryFile(prefix='jarvis-audio_', suffix='_client')
     audio_temp_file.write(request.data)
 
-    # text = whisper_stt(audio_temp_file.name)
-    text = whisper_cpp_stt(audio_temp_file.name)
-    print(text)
+    text = whisper_utils.whisper_cpp_stt(audio_temp_file.name)
+    logging.info("STT result for " + request.remote_addr + " : " + text)
 
-    return {"data": text, "uuid": "null"}
+    return {"data": text}
 
 
 """
@@ -103,39 +102,6 @@ def process_text():
     return {"transcription": text, "answer": answer}"""
 
 
-# send request to whisper-asr server (docker)
-def whisper_stt(audio_file):
-    headers = {
-        'accept': 'application/json',
-        # 'Content-Type': 'multipart/form-data',
-    }
-
-    params = {
-        'task': 'transcribe',
-        # TODO: add to config
-        'language': 'fr',
-        'output': 'json',
-    }
-
-    files = {
-        'audio_file': open(audio_file, 'rb'),
-    }
-
-    # TODO: add to config
-    response = requests.post('https://whisper.broillet.ch/asr', params=params, headers=headers, files=files)
-    return json.loads(response.text)['text']
-
-
-def whisper_cpp_stt(audio_file):
-    segments = model.transcribe(audio_file, speed_up=False, translate=False)
-
-    # combines all segments in one string
-    text = ''
-    for segment in segments:
-        text += segment.text + ' '
-
-    return text
-
-
-def start_server():
-    socketio.run(app, host='0.0.0.0', port=6000, allow_unsafe_werkzeug=True)
+def start_api():
+    logging.info("Starting Jarvis Server API...")
+    socketio.run(app, host='0.0.0.0', port=6000)
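The reworked process_message and join handlers above both parse a JSON string payload and reply through room-scoped emits. A minimal sketch of a client exercising them, assuming the server from this patch listens on localhost:6000 and that the python-socketio client package (not listed in requirements.txt) is installed:

    import json
    import uuid

    import socketio  # assumed dependency: pip install "python-socketio[client]"

    room_id = str(uuid.uuid4())
    sio = socketio.Client()


    @sio.on('message_from_jarvis')
    def on_jarvis_message(data):
        # chat_utils emits {'data': <text>, 'uuid': <room>} to the room
        print("Jarvis:", data['data'])


    sio.connect('http://localhost:6000')
    # both handlers call json.loads() on their argument, so payloads are JSON strings
    sio.emit('join', json.dumps({'uuid': room_id}))
    sio.emit('process_message', json.dumps({'uuid': room_id, 'data': 'Hello Jarvis'}))
    sio.wait()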
diff --git a/jarvis/start.py b/jarvis/start.py
index 080d629..7cc0a18 100644
--- a/jarvis/start.py
+++ b/jarvis/start.py
@@ -1,10 +1,14 @@
-import api
+import logging
+
 import lingua_franca
 
+import api
 from jarvis.skills.cocktails import CocktailSkill
 from jarvis.skills.intent_services import intent_manager
+from jarvis.utils import whisper_utils
 
 if __name__ == '__main__':
+    logging.getLogger().setLevel(logging.DEBUG)
 
     # Load lingua franca in the memory
     lingua_franca.load_language(lang="fr")
@@ -15,5 +19,8 @@ if __name__ == '__main__':
     # Load the skills
     intent_manager.load_all_skills()
 
+    # Load the STT (whisper) model
+    whisper_utils.load_model()
+
     # Start the api endpoint
-    api.start_server()
+    api.start_api()
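Because async_mode stays None in jarvis/api.py, Flask-SocketIO picks the best installed async framework at startup, trying eventlet first, then gevent (with gevent-websocket providing the WebSocket transport), then falling back to Werkzeug threading; adding gevent and gevent-websocket to requirements.txt is what actually switches the server to gevent. A quick sanity-check sketch of that selection:

    from flask import Flask
    from flask_socketio import SocketIO

    app = Flask(__name__)
    socketio = SocketIO(app, async_mode=None)  # same default as jarvis/api.py

    # Prints 'gevent' when gevent + gevent-websocket are installed and eventlet
    # is absent; 'threading' if neither async framework is available.
    print(socketio.async_mode)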
diff --git a/jarvis/utils/chat_utils.py b/jarvis/utils/chat_utils.py
new file mode 100644
index 0000000..c82524e
--- /dev/null
+++ b/jarvis/utils/chat_utils.py
@@ -0,0 +1,13 @@
+import logging
+
+from jarvis.api import socketio
+
+
+def send_user_message_to_room(text, room_id):
+    logging.debug("Sending message from user to room " + room_id + " : " + text)
+    socketio.emit('message_from_user', {'data': text, "uuid": room_id}, to=room_id)
+
+
+def send_jarvis_message_to_room(text, room_id):
+    logging.debug("Sending message from jarvis to room " + room_id + " : " + text)
+    socketio.emit('message_from_jarvis', {'data': text, "uuid": room_id}, to=room_id)
diff --git a/jarvis/utils/whisper_utils.py b/jarvis/utils/whisper_utils.py
new file mode 100644
index 0000000..1acea11
--- /dev/null
+++ b/jarvis/utils/whisper_utils.py
@@ -0,0 +1,69 @@
+import json
+import logging
+
+import requests
+from pywhispercpp.model import Model
+
+from jarvis.utils import languages_utils
+
+
+def load_model():
+    log_level = logging.getLogger().level
+    global model
+    model = Model('base', n_threads=8, suppress_non_speech_tokens=True, log_level=logging.ERROR)
+    logging.getLogger().setLevel(log_level)
+
+
+def get_model():
+    return model
+
+
+def whisper_cpp_stt(audio_file):
+    """
+    Transcribe audio file using whisper-cpp, no additional server/service needed, runs on CPU.
+
+    :param audio_file:
+    :param model:
+    :return: text
+    """
+    if model is None:
+        logging.error("Model is not loaded")
+        load_model()
+
+    segments = model.transcribe(audio_file, speed_up=False, translate=False)
+
+    # combines all segments in one string
+    text = ''
+    for segment in segments:
+        text += segment.text + ' '
+
+    return text
+
+
+def whisper_asr_stt(audio_file):
+    """
+    Transcribe audio file using whisper-asr (docker), a server is needed, runs on GPU.
+    See : https://github.com/ahmetoner/whisper-asr-webservice
+
+    :param audio_file:
+    :return: text
+    """
+    headers = {
+        'accept': 'application/json',
+        # 'Content-Type': 'multipart/form-data',
+    }
+
+    params = {
+        'task': 'transcribe',
+        # TODO: add to config
+        'language': languages_utils.get_language(),
+        'output': 'json',
+    }
+
+    files = {
+        'audio_file': open(audio_file, 'rb'),
+    }
+
+    # TODO: add to config
+    response = requests.post('https://whisper.broillet.ch/asr', params=params, headers=headers, files=files)
+    return json.loads(response.text)['text']
diff --git a/requirements.txt b/requirements.txt
index 58d3492..844ff1f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,6 @@ lingua-franca
 Flask-SocketIO
 pywhispercpp
 padatious
-openai
\ No newline at end of file
+openai
+gevent
+gevent-websocket
\ No newline at end of file
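The new /get_text_from_audio route reads the audio bytes straight from request.data, so clients should POST the raw WAV body rather than a multipart form. A minimal sketch of a call, assuming the server runs on localhost:6000 ('sample.wav' is a hypothetical file name):

    import requests

    with open('sample.wav', 'rb') as f:
        response = requests.post('http://localhost:6000/get_text_from_audio', data=f.read())

    # the endpoint now returns {"data": "<transcribed text>"}
    print(response.json()['data'])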