From 74afc2f9f0db3e589ad9b4541ed7d2685eed2344 Mon Sep 17 00:00:00 2001 From: Mathieu Broillet Date: Wed, 31 May 2023 17:16:09 +0200 Subject: [PATCH] replaced whispercpp by faster-whisper --- jarvis/api.py | 6 +++-- jarvis/utils/faster_whisper_utils.py | 37 ++++++++++++++++++++++++++++ start.py | 11 ++++----- 3 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 jarvis/utils/faster_whisper_utils.py diff --git a/jarvis/api.py b/jarvis/api.py index 227a83c..c5a8205 100644 --- a/jarvis/api.py +++ b/jarvis/api.py @@ -9,7 +9,7 @@ from flask import Flask, request from flask_socketio import SocketIO, emit, join_room, leave_room, \ rooms -from jarvis.utils import chat_utils, whisper_utils, chatgpt_utils +from jarvis.utils import chat_utils, whisper_utils, chatgpt_utils, faster_whisper_utils # Set this variable to "threading", "eventlet" or "gevent" to test the # different async modes, or leave it set to None for the application to choose @@ -86,7 +86,9 @@ def get_text_from_audio(): audio_temp_file = tempfile.NamedTemporaryFile(prefix='jarvis-audio_', suffix='_client') audio_temp_file.write(request.data) - text = whisper_utils.whisper_cpp_stt(audio_temp_file.name) + # text = whisper_utils.whisper_cpp_stt(audio_temp_file.name) + text = faster_whisper_utils.faster_whisper_stt(audio_temp_file.name) + logging.info("STT result for " + request.remote_addr + " : " + text) return {"data": text} diff --git a/jarvis/utils/faster_whisper_utils.py b/jarvis/utils/faster_whisper_utils.py new file mode 100644 index 0000000..15a93a0 --- /dev/null +++ b/jarvis/utils/faster_whisper_utils.py @@ -0,0 +1,37 @@ +import logging + +from faster_whisper import WhisperModel + + +def load_model(): + log_level = logging.getLogger().level + global model + model = WhisperModel('small', device="cpu", cpu_threads=8, compute_type="int8") + logging.getLogger().setLevel(log_level) + + +def get_model(): + return model + + +def faster_whisper_stt(audio_file): + """ + Transcribe audio file using 
faster-whisper, no additional server/service needed, runs on CPU. + + :param audio_file: path of the audio file to transcribe + :param model: + :return: text + """ + if model is None: + logging.error("Model is not loaded") + load_model() + + segments, info = model.transcribe(audio_file, beam_size=5) + print("Detected language '%s' with probability %f" % (info.language, info.language_probability)) + + # combines all segments in one string + text = '' + for segment in segments: + text += segment.text + ' ' + + return text diff --git a/start.py b/start.py index 9ced225..a201c52 100644 --- a/start.py +++ b/start.py @@ -3,9 +3,7 @@ import logging import lingua_franca import jarvis.api -from jarvis.skills.cocktails import CocktailSkill -from jarvis.skills.intent_services import intent_manager -from jarvis.utils import whisper_utils +from jarvis.utils import faster_whisper_utils if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) @@ -14,13 +12,14 @@ if __name__ == '__main__': lingua_franca.load_language(lang="fr") # Register each skills - CocktailSkill().register() + # CocktailSkill().register() # Load the skills - intent_manager.load_all_skills() + # intent_manager.load_all_skills() # Load the STT (whisper) model - whisper_utils.load_model() + # whisper_utils.load_model() + faster_whisper_utils.load_model() # Start the api endpoint jarvis.api.start_api()