diff --git a/jarvis/api.py b/jarvis/api.py
index 0c4eb52..ac93c94 100644
--- a/jarvis/api.py
+++ b/jarvis/api.py
@@ -1,16 +1,15 @@
 import json
+import logging
 import sys
 import tempfile
 from threading import Lock
 
 import openai
-import requests
 from flask import Flask, request
 from flask_socketio import SocketIO, emit, join_room, leave_room, \
     rooms
-from pywhispercpp.model import Model
 
-from jarvis.utils.chatgpt_utils import chatgpt_recognise
+from jarvis.utils import chat_utils, whisper_utils
 
 # Set this variable to "threading", "eventlet" or "gevent" to test the
 # different async modes, or leave it set to None for the application to choose
@@ -22,7 +21,7 @@ app.config['SECRET_KEY'] = 'secret!'
 socketio = SocketIO(app, async_mode=async_mode)
 thread = None
 thread_lock = Lock()
-model = Model('base', n_threads=16, suppress_non_speech_tokens=True)
+
 
 openai.api_key = sys.argv[1]
 
@@ -34,25 +33,34 @@ def index():
 @socketio.event
 def process_message(message):
     message = json.loads(message)
-    print("New PROCESS request from room " + message['uuid'])
+    logging.info("New PROCESS request from room " + message['uuid'])
+    logging.info("Message : " + message['data'])
 
-    print("Message : " + message['data'])
-    # TODO: maybe implement grammar check ?
+    # TODO: maybe implement grammar check and correction ?
 
     # intent_manager.recognise(message['data'], message['uuid'])
-    send_jarvis_message_to_room("I don't know how to respond to that...", message['uuid'])
 
-    response = chatgpt_recognise(message['data'])
-    if 'comment' in response:
-        send_user_message_to_room(response['comment'], message['uuid'])
-    else:
-        send_jarvis_message_to_room("I don't know how to respond to that...", message['uuid'])
+    if message['data'] != "":
+        # response = chatgpt_recognise(message['data'])
+        response = {'action': 'answer',
+                    'answer': "Hello! As an AI, I don't have emotions, but I'm always here to help you with your smart home needs. How can I assist you today?"}
+
+        if response['action'] == 'clarify':
+            chat_utils.send_jarvis_message_to_room(response['question'], message['uuid'])
+        elif response['action'] == 'command':
+            chat_utils.send_jarvis_message_to_room(response['comment'], message['uuid'])
+        elif response['action'] == 'query':
+            chat_utils.send_jarvis_message_to_room(response['device_description'], message['uuid'])
+        elif response['action'] == 'answer':
+            chat_utils.send_jarvis_message_to_room(response['answer'], message['uuid'])
+        else:
+            chat_utils.send_jarvis_message_to_room("I don't know how to respond to that...", message['uuid'])
 
 
 @socketio.event
 def join(message):
     message = json.loads(message)
-    print("New client joined room " + message['uuid'])
+    logging.info("New client joined room " + message['uuid'])
     join_room(message['uuid'])
 
 
@@ -68,27 +76,18 @@ def connect():
     emit('my_response', {'data': 'Connected', 'count': 0})
 
 
-def send_user_message_to_room(text, room_id):
-    socketio.emit('message_from_user', {'data': text, "uuid": room_id}, to=room_id)
-
-
-def send_jarvis_message_to_room(text, room_id):
-    socketio.emit('message_from_jarvis', {'data': text, "uuid": room_id}, to=room_id)
-
-
 # .WAV (i.e.) FILE REQUEST
 @app.route("/get_text_from_audio", methods=['POST'])
 def get_text_from_audio():
-    print("[" + request.remote_addr + "] - New STT request")
+    logging.info("New STT request from " + request.remote_addr)
 
     audio_temp_file = tempfile.NamedTemporaryFile(prefix='jarvis-audio_', suffix='_client')
     audio_temp_file.write(request.data)
 
-    # text = whisper_stt(audio_temp_file.name)
-    text = whisper_cpp_stt(audio_temp_file.name)
-    print(text)
+    text = whisper_utils.whisper_cpp_stt(audio_temp_file.name)
+    logging.info("STT result for " + request.remote_addr + " : " + text)
 
-    return {"data": text, "uuid": "null"}
+    return {"data": text}
 
 
 """
@@ -103,39 +102,6 @@ def process_text():
     return {"transcription": text, "answer": answer}"""
 
 
-# send request to whisper-asr server (docker)
-def whisper_stt(audio_file):
-    headers = {
-        'accept': 'application/json',
-        # 'Content-Type': 'multipart/form-data',
-    }
-
-    params = {
-        'task': 'transcribe',
-        # TODO: add to config
-        'language': 'fr',
-        'output': 'json',
-    }
-
-    files = {
-        'audio_file': open(audio_file, 'rb'),
-    }
-
-    # TODO: add to config
-    response = requests.post('https://whisper.broillet.ch/asr', params=params, headers=headers, files=files)
-    return json.loads(response.text)['text']
-
-
-def whisper_cpp_stt(audio_file):
-    segments = model.transcribe(audio_file, speed_up=False, translate=False)
-
-    # combines all segments in one string
-    text = ''
-    for segment in segments:
-        text += segment.text + ' '
-
-    return text
-
-
-def start_server():
-    socketio.run(app, host='0.0.0.0', port=6000, allow_unsafe_werkzeug=True)
+def start_api():
+    logging.info("Starting Jarvis Server API...")
+    socketio.run(app, host='0.0.0.0', port=6000)
diff --git a/jarvis/start.py b/jarvis/start.py
index 080d629..7cc0a18 100644
--- a/jarvis/start.py
+++ b/jarvis/start.py
@@ -1,10 +1,14 @@
-import api
+import logging
+
 import lingua_franca
+import api
 from jarvis.skills.cocktails import CocktailSkill
 from jarvis.skills.intent_services import intent_manager
+from jarvis.utils import whisper_utils
 
 
 if __name__ == '__main__':
+    logging.getLogger().setLevel(logging.DEBUG)
 
     # Load lingua franca in the memory
     lingua_franca.load_language(lang="fr")
@@ -15,5 +19,8 @@ if __name__ == '__main__':
     # Load the skills
     intent_manager.load_all_skills()
 
+    # Load the STT (whisper) model
+    whisper_utils.load_model()
+
     # Start the api endpoint
-    api.start_server()
+    api.start_api()
diff --git a/jarvis/utils/chat_utils.py b/jarvis/utils/chat_utils.py
new file mode 100644
index 0000000..c82524e
--- /dev/null
+++ b/jarvis/utils/chat_utils.py
@@ -0,0 +1,13 @@
+import logging
+
+from jarvis.api import socketio
+
+
+def send_user_message_to_room(text, room_id):
+    logging.debug("Sending message from user to room " + room_id + " : " + text)
+    socketio.emit('message_from_user', {'data': text, "uuid": room_id}, to=room_id)
+
+
+def send_jarvis_message_to_room(text, room_id):
+    logging.debug("Sending message from jarvis to room " + room_id + " : " + text)
+    socketio.emit('message_from_jarvis', {'data': text, "uuid": room_id}, to=room_id)
diff --git a/jarvis/utils/whisper_utils.py b/jarvis/utils/whisper_utils.py
new file mode 100644
index 0000000..1acea11
--- /dev/null
+++ b/jarvis/utils/whisper_utils.py
@@ -0,0 +1,70 @@
+import json
+import logging
+
+import requests
+from pywhispercpp.model import Model
+
+from jarvis.utils import languages_utils
+
+model = None
+
+
+def load_model():
+    log_level = logging.getLogger().level
+    global model
+    model = Model('base', n_threads=8, suppress_non_speech_tokens=True, log_level=logging.ERROR)
+    logging.getLogger().setLevel(log_level)
+
+
+def get_model():
+    return model
+
+
+def whisper_cpp_stt(audio_file):
+    """
+    Transcribe audio file using whisper-cpp, no additional server/service needed, runs on CPU.
+
+    :param audio_file:
+    :return: text
+    """
+    if model is None:
+        logging.error("Model is not loaded")
+        load_model()
+
+    segments = model.transcribe(audio_file, speed_up=False, translate=False)
+
+    # combines all segments in one string
+    text = ''
+    for segment in segments:
+        text += segment.text + ' '
+
+    return text
+
+
+def whisper_asr_stt(audio_file):
+    """
+    Transcribe audio file using whisper-asr (docker), a server is needed, runs on GPU.
+    See : https://github.com/ahmetoner/whisper-asr-webservice
+
+    :param audio_file:
+    :return: text
+    """
+    headers = {
+        'accept': 'application/json',
+        # 'Content-Type': 'multipart/form-data',
+    }
+
+    params = {
+        'task': 'transcribe',
+        # TODO: add to config
+        'language': languages_utils.get_language(),
+        'output': 'json',
+    }
+
+    files = {
+        'audio_file': open(audio_file, 'rb'),
+    }
+
+    # TODO: add to config
+    response = requests.post('https://whisper.broillet.ch/asr', params=params, headers=headers, files=files)
+    return json.loads(response.text)['text']
diff --git a/requirements.txt b/requirements.txt
index 58d3492..844ff1f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,6 @@ lingua-franca
 Flask-SocketIO
 pywhispercpp
 padatious
-openai
\ No newline at end of file
+openai
+gevent
+gevent-websocket
\ No newline at end of file