You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
90 lines
2.5 KiB
90 lines
2.5 KiB
import logging
import os

# Any non-empty DEBUG environment variable enables verbose logging.
# NOTE(review): DEBUG='0' or DEBUG='false' still counts as truthy here —
# confirm that is intended.
DEBUG = os.environ.get('DEBUG')

# Configure logging before importing the heavyweight libraries below,
# presumably so their import-time log records honour the chosen level.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
    level=logging.DEBUG if DEBUG else logging.INFO)

import time
from copy import copy

import numpy as np
import pymumble_py3
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
import whisper

# Loading the 'medium' whisper model is slow and memory-hungry; log around
# it so startup progress is visible.
logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')
|
|
|
# Mumble server connection details.  Hardcoded for now; these should
# eventually become command line arguments.
server = "protospace.ca"  # server address
port = 64738              # server port
nick = "python"           # nickname the bot connects with
pwd = ""                  # server password (empty = none)
|
|
|
# Length of one audio chunk, measured in BYTES of 16-bit 48 kHz PCM.
# NOTE(review): the original comment said "48000 Hz * 0.5 s", but len()
# below counts bytes and 16-bit samples are 2 bytes each, so 24000 bytes
# is ~0.25 s of mono audio — confirm which duration was intended.
CHUNK_LENGTH = 24000

# Rolling buffer of the most recent audio chunks, newest last.  Appended to
# by the mumble callback thread and read by the main transcription loop.
audio_chunks = [bytearray()]


def sound_received_handler(user, soundchunk):
    """Mumble sound callback: accumulate incoming PCM into audio_chunks.

    pymumble delivers 16-bit 48000 Hz PCM in ``soundchunk.pcm``.  When the
    current chunk reaches CHUNK_LENGTH bytes a fresh chunk is started, and
    the buffer is capped at the 10 most recent chunks.

    Bug fixed: the original version silently DISCARDED ``soundchunk.pcm``
    whenever it arrived while the current chunk was already full (the else
    branch only appended an empty bytearray).  The triggering packet's
    audio is now always kept.
    """
    if len(audio_chunks[-1]) >= CHUNK_LENGTH:
        # Current chunk is full: start a new one...
        audio_chunks.append(bytearray())
        # ...and drop the oldest chunk so at most 10 are retained.
        if len(audio_chunks) > 10:
            audio_chunks.pop(0)
    # Always keep the incoming packet (unlike the original).
    audio_chunks[-1].extend(soundchunk.pcm)
|
|
|
|
|
# Build the mumble client and configure it fully before connecting.
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
mumble.set_receive_sound(1)  # enable receiving sound from the server
# Route every received-sound (PCS) event to our buffer-filling handler.
mumble.callbacks.set_callback(PCS, sound_received_handler)

# Connect, then block until the client reports it is ready.
mumble.start()
mumble.is_ready()
|
|
|
# constant capturing sound and sending it to mumble server |
|
# --- Anti-aliasing low-pass FIR design (hoisted out of the loop: every
# value here is loop-invariant, so there is no reason to rebuild the
# kernel on each iteration).  Windowed-sinc filter applied before the
# 48 kHz -> 16 kHz downsample; cutoff fs/6 = 8 kHz is exactly the Nyquist
# frequency of the 16 kHz output rate.
fs = 48000
cutoff_freq = fs / 6
num_taps = 101
taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
taps *= np.blackman(num_taps)  # Blackman window tames the sinc ringing
taps /= np.sum(taps)           # normalise to unity DC gain

# Main loop: snapshot the rolling audio buffer, resample it to the 16 kHz
# float32 mono format whisper expects, transcribe, and print the text.
while True:
    # Wait until the rolling buffer is full (the callback caps it at 10).
    if len(audio_chunks) != 10:
        time.sleep(0.05)  # avoid a hot busy-wait while the buffer fills
        continue

    start = time.time()

    # Snapshot the chunk list.  NOTE(review): copy() is shallow, so the
    # callback thread can still extend the newest bytearray while it is
    # being joined — consider a deep copy or a lock if glitches appear.
    snapshot = copy(audio_chunks)
    raw = b''.join(snapshot)
    samples = np.frombuffer(raw, np.int16)

    # Low-pass filter, then keep every 3rd sample: 48 kHz -> 16 kHz.
    filtered = np.convolve(samples, taps, mode='same')
    downsampled = filtered[::3]
    # Scale 16-bit integer range into [-1.0, 1.0) float32 for whisper.
    audio = downsampled.astype(np.float32) / 32768.0

    padded = whisper.pad_or_trim(audio)
    logging.debug('processed audio in %s s', time.time() - start)

    result = model.transcribe(padded)
    print(result['text'])
|
|
|
|
|
|
|
|