From 7ce3df75f7188f1da6d762f042d17cb1deeaab12 Mon Sep 17 00:00:00 2001 From: Mathieu Broillet Date: Mon, 6 Nov 2023 20:38:25 +0100 Subject: [PATCH] massive rework of the whole api and sockets handling, will need rework and android app but more dynamic now, more usable for others clients and external apps now --- basic_test_client.py | 44 +++++++ run.py | 12 +- src/api.py | 121 ------------------ src/audio/__init__.py | 0 src/audio/audio_utils.py | 58 +++++++++ src/audio/stt/__init__.py | 0 .../stt/faster_whisper.py} | 27 +++- src/audio/tts/__init__.py | 0 src/declarations.py | 50 ++++++++ src/network/__init__.py | 0 src/network/api.py | 83 ++++++++++++ src/network/endpoint_handler.py | 77 +++++++++++ src/network/interactions.py | 13 ++ src/network/socket_handler.py | 74 +++++++++++ .../db-jarvis-commands-memory.sqlite | Bin 12288 -> 12288 bytes src/skills/intent_services/intent_manager.py | 4 +- src/utils/chat_utils.py | 13 -- src/utils/homeassistant_utils.py | 17 +++ 18 files changed, 445 insertions(+), 148 deletions(-) create mode 100644 basic_test_client.py delete mode 100644 src/api.py create mode 100644 src/audio/__init__.py create mode 100644 src/audio/audio_utils.py create mode 100644 src/audio/stt/__init__.py rename src/{utils/faster_whisper_utils.py => audio/stt/faster_whisper.py} (59%) create mode 100644 src/audio/tts/__init__.py create mode 100644 src/declarations.py create mode 100644 src/network/__init__.py create mode 100644 src/network/api.py create mode 100644 src/network/endpoint_handler.py create mode 100644 src/network/interactions.py create mode 100644 src/network/socket_handler.py delete mode 100644 src/utils/chat_utils.py create mode 100644 src/utils/homeassistant_utils.py diff --git a/basic_test_client.py b/basic_test_client.py new file mode 100644 index 0000000..eb69390 --- /dev/null +++ b/basic_test_client.py @@ -0,0 +1,44 @@ +# BASIC CLIENT FOR INTERACTING WITH THE SERVER +# This client is used to test the server and to interact with it +import json + +import socketio + +HOST = "localhost" +PORT = 6000 +waiting = False + +if __name__ == '__main__': + + # Create a TCP/IP socket + sock = socketio.Client() + sock.connect(f"http://{HOST}:{PORT}") + + # Join the room + sock.emit('join', json.dumps({'uuid': 'clientpc'})) + + # Listen for messages from the server + @sock.on('message_from_assistant') + def on_message_from_jarvis(data): + print("Assistant says: " + data['data']) + global waiting + waiting = False + + + # Chat with the server + while True: + + while not waiting: + message = input("Enter a message to send to the server: ") + + # Exit when CTRL+C is pressed + if message == "exit": + print("Exiting") + + # Leave the room + sock.emit('leave', json.dumps({'uuid': 'clientpc'})) + exit(0) + + waiting = True + + sock.emit('process_message', json.dumps({'data': message, 'uuid': 'clientpc'})) diff --git a/run.py b/run.py index e08983d..65fc9f3 100644 --- a/run.py +++ b/run.py @@ -1,8 +1,9 @@ import logging -from src import api +from src.audio import audio_utils from src.database import db_utils -from src.utils import faster_whisper_utils +from src.declarations import TTSEngine, STTEngine +from src.network import api # import lingua_franca @@ -22,9 +23,8 @@ if __name__ == '__main__': # Load the skills # intent_manager.load_all_skills() - # Load the STT (whisper) model - # whisper_utils.load_model() - faster_whisper_utils.load_model() + # Load the audio model(s) + audio_utils.load_models(stt_engine=STTEngine.FASTER_WHISPER, tts_engine=TTSEngine.PIPER) - # Start the api endpoint + # Start the api server api.start_api(6000) diff --git a/src/api.py b/src/api.py deleted file mode 100644 index 32277c2..0000000 --- a/src/api.py +++ /dev/null @@ -1,121 +0,0 @@ -import json -import logging - -import openai -import sys -import tempfile -from threading import Lock -from flask import Flask, request -from flask_socketio import SocketIO, emit, join_room, leave_room, \ - rooms - -from src.utils import chat_utils, chatgpt_utils, faster_whisper_utils - -# Set this variable to "threading", "eventlet" or "gevent" to test the -# different async modes, or leave it set to None for the application to choose -# the best option based on installed packages. -async_mode = None - -app = Flask(__name__) -app.config['SECRET_KEY'] = 'secret!' -socketio = SocketIO(app, async_mode=async_mode) -thread = None -thread_lock = Lock() - -openai.api_key = sys.argv[1] - - -@app.route('/') -def index(): - return "Welcome to Jarvis Server API !" - - -@socketio.event -def process_message(message): - message = json.loads(message) - logging.info("New PROCESS request from room " + message['uuid']) - logging.info("Message : " + message['data']) - - if message['uuid'] not in rooms(): - logging.warning("Room not found, creating it") - join_room(message['uuid']) - - # TODO: maybe implement grammar check and correction ? - - # intent_manager.recognise(message['data'], message['uuid']) - if message['data'] != "": - response = chatgpt_utils.chatgpt_recognise(message['data'], message['uuid']) - # text_response = "Tokens are expensive ya know?" - - chat_utils.send_jarvis_message_to_room(response['response'], message['uuid']) - - -@socketio.event -def join(message): - message = json.loads(message) - - logging.info("New client joined room " + message['uuid']) - join_room(message['uuid']) - - -@socketio.event -def leave(message): - leave_room(message['uuid']) - - -@socketio.event -def connect(): - global thread - emit('my_response', {'data': 'Connected', 'count': 0}) - - -@socketio.event -def clear_chat(uuid): - """ - Clear chat history for a specific room. - :param uuid: uuid - :return: - """ - # uuid = json.loads(uuid) - - emit('clear_chat', {}, to=uuid) - chatgpt_utils.clear_chat(uuid) - - -# .WAV (i.e.) FILE REQUEST -@app.route("/get_text_from_audio", methods=['POST']) -def get_text_from_audio(): - """ - Transcribe audio file using whisper. - - :return: transcription text - """ - - logging.info("New STT request from " + request.remote_addr) - - audio_temp_file = tempfile.NamedTemporaryFile(prefix='jarvis-audio_', suffix='_client') - audio_temp_file.write(request.data) - - # text = whisper_utils.whisper_cpp_stt(audio_temp_file.name) - text = faster_whisper_utils.faster_whisper_stt(audio_temp_file.name) - - logging.info("STT result for " + request.remote_addr + " : " + text) - - return {"data": text} - - -""" -@src.route("/process_text", methods=['POST']) -def process_text(): - print("[" + request.remote_addr + "] - New TXT request") - - text = request.values['text'] - - answer = intent_manager.recognise(text, request.headers.get('Client-Ip'), request.headers.get('Client-Port')) - - return {"transcription": text, "answer": answer}""" - - -def start_api(port=6000): - logging.info("Starting Jarvis Server API on port " + str(port) + "...") - socketio.run(app, host='0.0.0.0', port=port) diff --git a/src/audio/__init__.py b/src/audio/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/audio/audio_utils.py b/src/audio/audio_utils.py new file mode 100644 index 0000000..ebd6755 --- /dev/null +++ b/src/audio/audio_utils.py @@ -0,0 +1,58 @@ +import tempfile + +from src.audio.stt import faster_whisper +from src.declarations import STTEngine, TTSEngine + + +def load_models(stt_engine: STTEngine, tts_engine: TTSEngine): + """ + Load the STT and TTS models in the memory. + :return: + """ + + if stt_engine is STTEngine.FASTER_WHISPER: + faster_whisper.load_model() + elif stt_engine is STTEngine.WHISPER: + pass + else: + raise Exception("Unknown STT engine: " + stt_engine.name) + + if tts_engine is TTSEngine.PIPER: + pass + else: + raise Exception("Unknown TTS engine: " + tts_engine.name) + + +def get_text_from_audio(audio_bytes, stt_engine): + """ + Transcribe audio file. + + :param audio_bytes: + :param stt_engine: + :return: + """ + + audio_temp_file = tempfile.NamedTemporaryFile(prefix='jarvis-audio_', suffix='_client') + audio_temp_file.write(audio_bytes) + + if stt_engine is STTEngine.FASTER_WHISPER: + return faster_whisper.speech_to_text(audio_temp_file.name) + # text = whisper_utils.whisper_cpp_stt(audio_temp_file.name) + elif stt_engine is STTEngine.WHISPER: + # TODO: implement whisper + pass + else: + raise Exception("Unknown STT engine: " + stt_engine.name) + + +def get_speech_from_text(text, tts_engine): + """ + Speak text using Piper. + :return: audio file + """ + + # TODO: implement TTS + if tts_engine is TTSEngine.PIPER: + pass + else: + raise Exception("Unknown TTS engine: " + tts_engine.name) diff --git a/src/audio/stt/__init__.py b/src/audio/stt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/faster_whisper_utils.py b/src/audio/stt/faster_whisper.py similarity index 59% rename from src/utils/faster_whisper_utils.py rename to src/audio/stt/faster_whisper.py index 9312742..40b6ba8 100644 --- a/src/utils/faster_whisper_utils.py +++ b/src/audio/stt/faster_whisper.py @@ -2,8 +2,19 @@ import logging from faster_whisper import WhisperModel +model = None + def load_model(model_size='small', device="cpu", cpu_threads=8, compute_type="int8"): + """ + Load the whisper model in the memory. + + :param model_size: small, medium or large + :param device: cpu or cuda + :param cpu_threads: number of cpu threads + :param compute_type: use int8 (haven't tested others) + :return: None + """ log_level = logging.getLogger().level global model model = WhisperModel(model_size_or_path=model_size, device=device, cpu_threads=cpu_threads, @@ -12,22 +23,26 @@ def load_model(model_size='small', device="cpu", cpu_threads=8, compute_type="in def get_model(): + """ + Get the whisper model. + :return: the whisper model + :rtype: WhisperModel + """ return model -def faster_whisper_stt(audio_file): +def speech_to_text(audio_file): """ Transcribe audio file using faster_whisper, no additional server/service needed, runs on CPU. - :param audio_file: - :param model: - :return: text + :param audio_file: path to audio file + :return: transcription text """ - if model is None: + if get_model() is None: logging.error("Model is not loaded") load_model() - segments, info = model.transcribe(audio_file, beam_size=5, language='fr') + segments, info = get_model().transcribe(audio_file, beam_size=5, language='fr') print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) # combines all segments in one string diff --git a/src/audio/tts/__init__.py b/src/audio/tts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/declarations.py b/src/declarations.py new file mode 100644 index 0000000..818cdb7 --- /dev/null +++ b/src/declarations.py @@ -0,0 +1,50 @@ +from enum import Enum + + +class Sockets(Enum): + """ + List of sockets + """ + PROCESS_MESSAGE = "process_message" + JOIN = "join" + LEAVE = "leave" + CONNECT = "connect" + CLEAR_CHAT = "clear_chat" + + +class Endpoints(Enum): + """ + List of endpoints + """ + DEFAULT = "/" + STATUS = "/status" + STT = "/stt" + TTS = "/tts" + + +class TTSEngine(Enum): + """ + List of TTS engines + """ + PIPER = "piper" + + +class STTEngine(Enum): + """ + List of STT engines + """ + WHISPER = "whisper" + FASTER_WHISPER = "faster_whisper" + + +def get_enum_from_str(enumclass, name): + """ + Get enum from string + :param enumclass: + :param name: + :return: + """ + for enum in enumclass: + if enum.name == name: + return enum + raise Exception("Unknown enum " + name) diff --git a/src/network/__init__.py b/src/network/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/network/api.py b/src/network/api.py new file mode 100644 index 0000000..2ed13ba --- /dev/null +++ b/src/network/api.py @@ -0,0 +1,83 @@ +import logging +import os + +import openai +from flask import Flask, request +from flask_socketio import SocketIO + +from src.network import endpoint_handler, socket_handler +from src.declarations import Endpoints, Sockets + +# Set this variable to "threading", "eventlet" or "gevent" to test the +# different async modes, or leave it set to None for the application to choose +# the best option based on installed packages. +async_mode = None + +app = Flask(__name__) +app.config['SECRET_KEY'] = 'secret!' + +socketio = SocketIO(app, async_mode=async_mode) +# thread = None +# thread_lock = Lock() + +openai.api_key = os.getenv("OPENAI_API_KEY") + + +# +# ENDPOINTS +# + + +@app.route('/') +def index(): + return endpoint_handler.handle_endpoint(Endpoints.DEFAULT, request) + + +@app.route('/status') +def status(): + return endpoint_handler.handle_endpoint(Endpoints.STATUS, request) + + +@app.route("/stt", methods=['POST']) +def speech_to_text(): + return endpoint_handler.handle_endpoint(Endpoints.STT, request) + + +@app.route("/tts", methods=['POST']) +def text_to_speech(): + return endpoint_handler.handle_endpoint(Endpoints.TTS, request) + + +# +# SOCKETS +# + + +@socketio.event +def process_message(message): + return socket_handler.handle_socket(Sockets.PROCESS_MESSAGE, message) + + +@socketio.event +def join(message): + return socket_handler.handle_socket(Sockets.JOIN, message) + + +@socketio.event +def leave(message): + return socket_handler.handle_socket(Sockets.LEAVE, message) + + +@socketio.event +def connect(): + return socket_handler.handle_socket(Sockets.CONNECT, None) + + +@socketio.event +def clear_chat(message): + return socket_handler.handle_socket(Sockets.CLEAR_CHAT, message) + + +def start_api(port=6000): + logging.info("Starting Jarvis Server API on port " + str(port) + "...") + socketio.run(app, host='0.0.0.0', port=port) diff --git a/src/network/endpoint_handler.py b/src/network/endpoint_handler.py new file mode 100644 index 0000000..437047c --- /dev/null +++ b/src/network/endpoint_handler.py @@ -0,0 +1,77 @@ +import logging + +from src.audio import audio_utils +from src.declarations import Endpoints, Sockets + + +def handle_endpoint(endpoint, request): + if endpoint is Endpoints.DEFAULT: + return default_endpoint(request) + elif endpoint is Endpoints.STATUS: + return status_endpoint(request) + elif endpoint is Endpoints.STT: + return speech_to_text(request) + elif endpoint is Endpoints.TTS: + return text_to_speech(request) + else: + return default_endpoint(request) + + +def default_endpoint(request): + list_endpoints = [] + for endpoint in Endpoints: + list_endpoints.append(endpoint.value) + + list_sockets = [] + for socket in Sockets: + list_sockets.append(socket.value) + + return {"message": "Welcome to Jarvis Server API !", + "endpoints": list_endpoints, + "sockets": list_sockets, + "version": "1.0.0" # TODO: get version from somewhere + } + + +def status_endpoint(request): + return {"status": "ok"} + + +def speech_to_text(request): + """ + Transcribe audio file using whisper. + + Exemple of request: + { + "data": "base64 encoded audio file", + "engine": "faster-whisper" + } + + :return: transcription text + """ + + logging.info("New STT request from " + request.remote_addr) + text = audio_utils.get_text_from_audio(request.data, request.engine) + logging.info("STT result for " + request.remote_addr + " : " + text) + + return {"text": text} + + +def text_to_speech(request): + """ + Speak text using Piper. + + Exemple of request: + { + "data": "Hello World !", + "engine": "piper" + } + + :return: audio data + """ + + logging.info("New TTS request from " + request.remote_addr) + + # TODO: implement TTS + + return {"audio": ""} diff --git a/src/network/interactions.py b/src/network/interactions.py new file mode 100644 index 0000000..e77e7e4 --- /dev/null +++ b/src/network/interactions.py @@ -0,0 +1,13 @@ +import logging + +from flask_socketio import emit + + +def add_message_from_user(text, room_id): + logging.debug("Sending message from user to room " + room_id + " : " + text) + emit('message_from_user', {'data': text, "uuid": room_id}, to=room_id) + + +def add_message_from_assistant(text, room_id): + logging.debug("Sending message from assistant to room " + room_id + " : " + text) + emit('message_from_assistant', {'data': text, "uuid": room_id}, to=room_id) diff --git a/src/network/socket_handler.py b/src/network/socket_handler.py new file mode 100644 index 0000000..23190b0 --- /dev/null +++ b/src/network/socket_handler.py @@ -0,0 +1,74 @@ +import json +import logging + +from flask_socketio import rooms, join_room, leave_room, emit + +from src.declarations import Sockets +from src.network import interactions + + +def handle_socket(socket, message): + if socket is Sockets.CONNECT: + connect() + elif socket is Sockets.JOIN: + join(message) + elif socket is Sockets.LEAVE: + leave(message) + elif socket is Sockets.PROCESS_MESSAGE: + process_message(message) + elif socket is Sockets.CLEAR_CHAT: + clear_chat(message) + else: + logging.warning("Unknown socket " + socket) + + +def connect(): + emit('connection', {'data': 'Connected', 'count': 0}) + + +def join(message): + message = json.loads(message) + + logging.info("New client joined room " + message['uuid']) + join_room(message['uuid']) + + +def leave(message): + message = json.loads(message) + + logging.info("Client left room " + message['uuid']) + leave_room(message['uuid']) + + +def process_message(message): + message = json.loads(message) + + logging.info("New process request from room " + message['uuid']) + logging.info("Message : " + message['data']) + + if message['uuid'] not in rooms(): + logging.warning("Room not found, creating it") + join_room(message['uuid']) + + # TODO: maybe implement grammar check and correction ? + + # intent_manager.recognise(message['data'], message['uuid']) + if message['data'] != "": + # response = chatgpt_utils.chatgpt_recognise(message['data'], message['uuid']) + text_response = "Tokens are expensive ya know?" + + print(text_response) + interactions.add_message_from_assistant(text_response, message['uuid']) + # chat_utils.send_jarvis_message_to_room(response['response'], message['uuid']) + + +def clear_chat(message): + """ + Clear chat history for a specific room. + :param uuid: uuid + :return: + """ + message = json.loads(message) + + emit('clear_chat', {}, to=message['uuid']) + # chatgpt_utils.clear_chat(uuid) diff --git a/src/resources/db-jarvis-commands-memory.sqlite b/src/resources/db-jarvis-commands-memory.sqlite index cc295d627b8b8dd90b7ee315f08f25a237536cdb..91e668f7b6b654c203bc657630448738d005a64b 100644 GIT binary patch delta 89 zcmZojXh@hKEx3q*fq@x_VPM`w9b=%NUe*O(pb!(^83ukwzB8Kz1!DM^iUTH}<11zK YpX|bafQdl>D7c1yVuK$WNF{;`0A@cF%m4rY delta 234 zcmXwyy-EW?6oqHv4_LTkDvLCy6D6kc0c62OL?j|sf?3Vo%!HktFmo?r7vDh1V_1fi z0UsdvOpZkkoaTIRzWHcA`s{Tdl~Oyo+uVQkFY-8e?j7c-Q*4wei_Nlo^i*`x+2Xel zeY&_ZKl|FOm04%7G`FSRQ^WEAL!kytewI*Ij?Z0#N~S^L_{56CB_1TU2z8CdO)Mw4 z7B4i8A^3lrHY;UT`SRP`T