replaced whispercpp with faster-whisper

Mathieu Broillet 2023-05-31 17:16:09 +02:00
parent 78d706cfcc
commit 74afc2f9f0
3 changed files with 46 additions and 8 deletions

View File

@@ -9,7 +9,7 @@ from flask import Flask, request
 from flask_socketio import SocketIO, emit, join_room, leave_room, \
     rooms
-from jarvis.utils import chat_utils, whisper_utils, chatgpt_utils
+from jarvis.utils import chat_utils, whisper_utils, chatgpt_utils, faster_whisper_utils
 # Set this variable to "threading", "eventlet" or "gevent" to test the
 # different async modes, or leave it set to None for the application to choose
@@ -86,7 +86,9 @@ def get_text_from_audio():
     audio_temp_file = tempfile.NamedTemporaryFile(prefix='jarvis-audio_', suffix='_client')
     audio_temp_file.write(request.data)
-    text = whisper_utils.whisper_cpp_stt(audio_temp_file.name)
+    # text = whisper_utils.whisper_cpp_stt(audio_temp_file.name)
+    text = faster_whisper_utils.faster_whisper_stt(audio_temp_file.name)
     logging.info("STT result for " + request.remote_addr + " : " + text)
     return {"data": text}

View File

@@ -0,0 +1,37 @@
import logging

from faster_whisper import WhisperModel

# Module-level model handle, set by load_model()
model = None


def load_model():
    # Preserve the current root logger level across model creation
    log_level = logging.getLogger().level
    global model
    model = WhisperModel('small', device="cpu", cpu_threads=8, compute_type="int8")
    logging.getLogger().setLevel(log_level)


def get_model():
    return model


def faster_whisper_stt(audio_file):
    """
    Transcribe an audio file using faster-whisper; no additional server/service needed, runs on CPU.

    :param audio_file: path to the audio file to transcribe
    :return: transcribed text
    """
    if model is None:
        logging.error("Model is not loaded")
        load_model()

    segments, info = model.transcribe(audio_file, beam_size=5)
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    # Combine all segments into one string
    text = ''
    for segment in segments:
        text += segment.text + ' '

    return text
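A minimal usage sketch of the new module, with 'recording.wav' as a placeholder audio path:

    from jarvis.utils import faster_whisper_utils

    faster_whisper_utils.load_model()  # builds the small int8 CPU WhisperModel once at startup
    text = faster_whisper_utils.faster_whisper_stt("recording.wav")  # placeholder file path
    print(text)

Note that faster-whisper returns segments lazily from transcribe(), so the actual decoding work happens while the loop in faster_whisper_stt() consumes them.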

View File

@@ -3,9 +3,7 @@ import logging
 import lingua_franca
 import jarvis.api
-from jarvis.skills.cocktails import CocktailSkill
-from jarvis.skills.intent_services import intent_manager
-from jarvis.utils import whisper_utils
+from jarvis.utils import faster_whisper_utils
 if __name__ == '__main__':
     logging.getLogger().setLevel(logging.INFO)
@@ -14,13 +12,14 @@ if __name__ == '__main__':
     lingua_franca.load_language(lang="fr")
     # Register each skills
-    CocktailSkill().register()
+    # CocktailSkill().register()
     # Load the skills
-    intent_manager.load_all_skills()
+    # intent_manager.load_all_skills()
     # Load the STT (whisper) model
-    whisper_utils.load_model()
+    # whisper_utils.load_model()
+    faster_whisper_utils.load_model()
     # Start the api endpoint
     jarvis.api.start_api()