jarvis-server-v2/jarvis/utils/whisper_utils.py


import json
import logging

import requests
from pywhispercpp.model import Model

from jarvis.utils import languages_utils
# Module-level whisper-cpp model instance, initialized lazily by load_model().
model = None


def load_model():
    # Preserve the root logger level across model initialization.
    log_level = logging.getLogger().level
    global model
    model = Model('base', n_threads=8, suppress_non_speech_tokens=True, log_level=logging.ERROR)
    logging.getLogger().setLevel(log_level)


def get_model():
    return model


def whisper_cpp_stt(audio_file):
    """
    Transcribe an audio file using whisper-cpp; no additional server/service needed, runs on the CPU.
    :param audio_file: path to the audio file to transcribe
    :return: transcribed text
    """
    if model is None:
        logging.error("Model is not loaded")
        load_model()
    segments = model.transcribe(audio_file, speed_up=False, translate=False)
    # Combine all segments into one string.
    text = ''
    for segment in segments:
        text += segment.text + ' '
    return text
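

# Example (sketch): local CPU transcription with whisper-cpp.
# 'recording.wav' is a placeholder path, not part of this module.
#
#   load_model()
#   print(whisper_cpp_stt('recording.wav'))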


def whisper_asr_stt(audio_file):
    """
    Transcribe an audio file using whisper-asr-webservice (Docker); a separate server is needed, runs on the GPU.
    See: https://github.com/ahmetoner/whisper-asr-webservice
    :param audio_file: path to the audio file to transcribe
    :return: transcribed text
    """
    headers = {
        'accept': 'application/json',
        # 'Content-Type': 'multipart/form-data',
    }
    params = {
        'task': 'transcribe',
        # TODO: move the language setting to the config
        'language': languages_utils.get_language(),
        'output': 'json',
    }
    with open(audio_file, 'rb') as f:
        files = {
            'audio_file': f,
        }
        # TODO: move the server URL to the config
        response = requests.post('https://whisper.yourdomain.xyz/asr', params=params, headers=headers, files=files)
    return json.loads(response.text)['text']
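

# Example (sketch): transcription via the whisper-asr-webservice.
# Assumes the service is reachable at the hardcoded URL above;
# 'recording.wav' is a placeholder path, not part of this module.
#
#   print(whisper_asr_stt('recording.wav'))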