# You cannot select more than 25 topics.
# Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
# 73 lines
# 1.9 KiB
# 73 lines
# 1.9 KiB
import time
from copy import copy

import numpy as np
import whisper

# Import-time marker: printed once when this module is loaded.
print('Loaded audio.py')

# 48000 Hz * 0.5 s
CHUNK_LENGTH = 24000

def process_pcm(audio_chunks, data, max_chunks=75):
    """Append one PCM chunk to a rolling buffer, discarding the oldest data.

    Args:
        audio_chunks: Mutable list used as the rolling buffer. It is
            modified in place.
        data: One chunk of raw PCM audio (pymumble PCM is 16-bit, 48000 Hz).
        max_chunks: Maximum number of chunks kept in the buffer. Defaults
            to 75, the window length ``process_stream`` requires.

    Returns:
        None. The buffer is updated in place.
    """
    audio_chunks.append(data)

    # Trim down to the newest ``max_chunks`` entries in one step. Removing
    # only a single element would leave an already-oversized buffer
    # oversized forever.
    excess = len(audio_chunks) - max_chunks
    if excess > 0:
        del audio_chunks[:excess]

def _lowpass_kernel(fs, cutoff_freq, num_taps):
    """Return a Blackman-windowed sinc low-pass FIR kernel that sums to 1."""
    centered = np.arange(num_taps) - (num_taps - 1) / 2
    taps = np.sinc(2 * cutoff_freq / fs * centered)
    taps *= np.blackman(num_taps)
    taps /= np.sum(taps)
    return taps


def _downsample_by_3(samples, fs=48000, num_taps=101):
    """Low-pass filter ``samples`` at ``fs / 6`` and keep every third sample."""
    taps = _lowpass_kernel(fs, fs / 6, num_taps)
    filtered = np.convolve(samples, taps, mode='same')
    return filtered[::3]


def process_stream(audio_chunks, model, expected_chunks=75):
    """Transcribe the buffered audio window and print the recognised text.

    The buffered chunks are joined, read as 16-bit integers, low-pass
    filtered and reduced to every third sample (48000 Hz input becomes
    16000 Hz), scaled to float32 by dividing by 32768, padded or trimmed
    with ``whisper.pad_or_trim`` and passed to ``model.transcribe``.

    Args:
        audio_chunks: Sequence of raw PCM byte chunks (16-bit, 48000 Hz).
            It is copied, never modified.
        model: Object exposing ``transcribe(audio, language=...)`` whose
            result supports ``result['text']`` (presumably a loaded Whisper
            model -- confirm against the caller).
        expected_chunks: Number of chunks the buffer must hold for the
            window to be processed. Defaults to 75.

    Returns:
        None. Output is printed. If the buffer does not hold exactly
        ``expected_chunks`` chunks, a message is printed and the call
        returns after sleeping 0.5 s.

    Side effects:
        When transcription takes longer than 10 s, the downsampled audio is
        written as 16-bit PCM to ``downsampled.pcm`` and the call sleeps
        for 100 s before printing the text.
    """
    # Snapshot first so the length check and the samples processed below
    # refer to the same state of the buffer.
    snapshot = copy(audio_chunks)
    if len(snapshot) != expected_chunks:
        print('Skipping, bad length.')
        time.sleep(0.5)
        return

    pcm = np.frombuffer(b''.join(snapshot), np.int16)
    downsampled_audio_data = _downsample_by_3(pcm)
    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0

    # NOTE: a silence check based on np.mean(np.abs(downsampled ** 3)) was
    # prototyped here and is currently disabled.

    padded = whisper.pad_or_trim(norm_audio)

    start = time.time()
    result = model.transcribe(padded, language='en')
    # Measure once so the slow-path check below uses the value that is
    # reported, not a second reading that includes the print itself.
    elapsed = time.time() - start
    print('transcribed audio in', elapsed, 's')

    if elapsed > 10:
        # Debug aid: keep the audio that made transcription slow.
        with open('downsampled.pcm', 'wb') as f:
            f.write(downsampled_audio_data.astype(np.int16).tobytes())

        print('wrote file, sleeping')
        time.sleep(100)

    print(' ', result['text'])
