"""Buffer raw PCM audio chunks and transcribe the buffered window with Whisper."""

from copy import copy
import time

import numpy as np
import whisper

print('Loaded audio.py')

CHUNK_LENGTH = 24000  # 48000 Hz * 0.5 s

# Number of PCM chunks that make up one transcription window. The buffer is
# capped at this size and transcription only runs on a completely full buffer.
_WINDOW_CHUNKS = 75

# Input sample rate (pymumble PCM is 16-bit, 48000 Hz) and the integer factor
# it is decimated by before transcription (48000 / 3 = 16000 Hz).
_INPUT_RATE = 48000
_DECIMATION = 3

# A transcription slower than this many seconds triggers the diagnostic dump.
_SLOW_TRANSCRIBE_SECONDS = 10
_SLOW_TRANSCRIBE_PAUSE = 100


def _design_lowpass(fs=_INPUT_RATE, cutoff_freq=_INPUT_RATE / 6, num_taps=101):
    """Return unity-gain windowed-sinc low-pass FIR taps.

    Args:
        fs: Sample rate of the signal to be filtered, in Hz.
        cutoff_freq: Cutoff frequency in Hz.
        num_taps: Filter length (odd, so the kernel is symmetric about a tap).

    Returns:
        A float64 array of ``num_taps`` coefficients summing to 1.
    """
    centred = np.arange(num_taps) - (num_taps - 1) / 2
    taps = np.sinc(2 * cutoff_freq / fs * centred)
    taps *= np.blackman(num_taps)  # taper the truncated sinc
    taps /= np.sum(taps)  # normalise to unity DC gain
    return taps


# The kernel depends only on constants, so build it once at import time
# instead of on every process_stream() call.
_LOWPASS_TAPS = _design_lowpass()


def _downsample(samples):
    """Low-pass filter *samples* and keep every ``_DECIMATION``-th value.

    Args:
        samples: 1-D array of PCM samples at ``_INPUT_RATE``.

    Returns:
        A 1-D float64 array at ``_INPUT_RATE / _DECIMATION``, still on the
        input's amplitude scale.
    """
    # Filter before decimating so content above the new Nyquist limit
    # does not alias into the kept band.
    filtered = np.convolve(samples, _LOWPASS_TAPS, mode='same')
    return filtered[::_DECIMATION]


def process_pcm(audio_chunks, data):
    """Append one PCM chunk to the rolling buffer, discarding the oldest.

    Args:
        audio_chunks: Mutable list of raw PCM byte strings, modified in place.
        data: The newly received chunk (pymumble PCM is 16-bit, 48000 Hz).
    """
    audio_chunks.append(data)
    if len(audio_chunks) > _WINDOW_CHUNKS:
        # Trim down to the window size in one step. A single pop(0) would
        # leave an over-long buffer over-long forever, and process_stream()
        # would then skip on every call.
        del audio_chunks[:-_WINDOW_CHUNKS]


def process_stream(audio_chunks, model):
    """Transcribe the current buffer and print the recognised text.

    Does nothing except sleep for 0.5 s when the buffer does not yet hold
    exactly ``_WINDOW_CHUNKS`` chunks. When a transcription takes longer than
    ``_SLOW_TRANSCRIBE_SECONDS``, the downsampled audio is written to
    ``downsampled.pcm`` and the function sleeps for 100 s before printing.

    Args:
        audio_chunks: List of raw 16-bit PCM byte strings filled by
            process_pcm(); it is read but not modified.
        model: Object exposing ``transcribe(audio, language=...)`` that
            returns a mapping with a ``'text'`` entry (a loaded Whisper
            model).
    """
    if len(audio_chunks) != _WINDOW_CHUNKS:
        print('Skipping, bad length.')
        time.sleep(0.5)
        return

    # Snapshot the list first so chunks appended meanwhile cannot change
    # what gets joined.
    raw = b''.join(copy(audio_chunks))
    samples = np.frombuffer(raw, np.int16)

    downsampled_audio_data = _downsample(samples)
    # Scale the int16 range to floats in [-1.0, 1.0).
    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0

    # NOTE: disabled silence-detection experiment, kept for reference.
    #abs_mean = np.mean(np.abs(downsampled_audio_data ** 3))
    #print('abs mean:', abs_mean)
    #if abs_mean < 0.0:
    #    print('silence detected, skipping')
    #    time.sleep(1)
    #    return

    padded = whisper.pad_or_trim(norm_audio)

    start = time.perf_counter()
    result = model.transcribe(padded, language='en')
    elapsed = time.perf_counter() - start
    print('transcribed audio in', elapsed, 's')

    if elapsed > _SLOW_TRANSCRIBE_SECONDS:
        # Keep the offending audio for offline inspection, then back off.
        with open('downsampled.pcm', 'wb') as f:
            f.write(downsampled_audio_data.astype(np.int16).tobytes())
            print('wrote file, sleeping')
        #breakpoint()
        time.sleep(_SLOW_TRANSCRIBE_PAUSE)

    print(' ', result['text'])