From 41bc6015bf61e39decafe87ea2649e5d51fc4ca1 Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Fri, 7 Apr 2023 21:50:01 +0000
Subject: [PATCH] Add a bunch of test files

---
 audio.py             |  73 ++++++++++++++++++++++
 audio_only_client.py |  79 ++++++++++++++++++++++++
 read_file.py         |  61 +++++++++++++++++++
 stream.py            |  90 +++++++++++++++++++++++++++
 test.py              |  79 ++++++++++++++++++++++++
 transcribe_demo.py   | 142 +++++++++++++++++++++++++++++++++++++++++++
 write_file.py        |  33 ++++++++++
 7 files changed, 557 insertions(+)
 create mode 100644 audio.py
 create mode 100644 audio_only_client.py
 create mode 100644 read_file.py
 create mode 100644 stream.py
 create mode 100644 test.py
 create mode 100644 transcribe_demo.py
 create mode 100644 write_file.py

diff --git a/audio.py b/audio.py
new file mode 100644
index 0000000..6546dee
--- /dev/null
+++ b/audio.py
@@ -0,0 +1,73 @@
+from copy import copy
+import numpy as np
+import time
+import whisper
+
+print('Loaded audio.py')
+
+CHUNK_LENGTH = 24000  # 48000 Hz * 0.5 s = 24000 samples (not referenced in this module)
+
+def process_pcm(audio_chunks, data):
+    # pymumble PCM is 16-bit 48000 Hz
+
+    start = time.time()
+
+    audio_chunks.append(data)
+
+    # keep a rolling window of the 75 most recent chunks
+    if len(audio_chunks) > 75:
+        audio_chunks.pop(0)
+
+    #print('finished chunk in', time.time() - start, 's')
+
+def process_stream(audio_chunks, model):
+
+    if len(audio_chunks) != 75:
+        print('Skipping, bad length.')
+        time.sleep(0.5)
+        return
+
+    start = time.time()
+    a = copy(audio_chunks)
+    b = b''.join(a)
+    c = np.frombuffer(b, np.int16)
+
+    # Define a low-pass filter kernel (windowed sinc). The 8 kHz cutoff is the
+    # Nyquist frequency of the 16 kHz target rate, so this doubles as the
+    # anti-aliasing filter for the decimation below.
+    fs = 48000
+    cutoff_freq = fs / 6
+    nyquist_freq = fs / 2
+    num_taps = 101
+    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
+    taps *= np.blackman(num_taps)
+    taps /= np.sum(taps)
+
+    # Apply the filter kernel to the audio using convolution
+    filtered_audio_data = np.convolve(c, taps, mode='same')
+    # Downsample by a factor of 3 (48 kHz -> 16 kHz, whisper's input rate)
+    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3)).flatten()
+
+    # Normalize 16-bit integers to [-1.0, 1.0) floats
+    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0
+
+    #abs_mean = np.mean(np.abs(downsampled_audio_data ** 3))
+    #print('abs mean:', abs_mean)
+    #if abs_mean < 0.0:
+    #    print('silence detected, skipping')
+    #    time.sleep(1)
+    #    return
+
+    d = whisper.pad_or_trim(norm_audio)
+
+    #print('processed audio in', time.time() - start, 's')
+
+    start = time.time()
+    e = model.transcribe(d, language='en')
+    print('transcribed audio in', time.time() - start, 's')
+
+    # If transcription was unusually slow, dump the audio for offline debugging
+    if time.time() - start > 10:
+        with open('downsampled.pcm', 'wb') as f:
+            f.write(downsampled_audio_data.astype(np.int16).tobytes())
+
+        print('wrote file, sleeping')
+        #breakpoint()
+        time.sleep(100)
+
+    print('    ', e['text'])
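+
+# A minimal usage sketch (illustrative, not part of this repo): feed raw
+# 16-bit 48 kHz mono PCM through process_pcm() until the rolling window is
+# full, then transcribe it. The capture filename below is hypothetical.
+#
+#   import whisper
+#   model = whisper.load_model('medium')
+#   chunks = []
+#   with open('capture.pcm', 'rb') as f:  # hypothetical raw PCM capture
+#       while True:
+#           data = f.read(1920)  # 20 ms of 16-bit 48 kHz mono
+#           if not data:
+#               break
+#           process_pcm(chunks, data)
+#           if len(chunks) == 75:  # process_stream() skips partial windows
+#               process_stream(chunks, model)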
diff --git a/audio_only_client.py b/audio_only_client.py
new file mode 100644
index 0000000..2dc65f6
--- /dev/null
+++ b/audio_only_client.py
@@ -0,0 +1,79 @@
+# A python script to do both listening and talking. This is the basic model
+# for an audio-only mumble client.
+
+# Usage:
+
+# Install pyaudio (instructions: https://people.csail.mit.edu/hubert/pyaudio/#downloads)
+# If `fatal error: 'portaudio.h' file not found` is encountered while installing
+# pyaudio even after following the instructions, this solution might help:
+# https://stackoverflow.com/questions/33513522/when-installing-pyaudio-pip-cannot-find-portaudio-h-in-usr-local-include
+#
+# Install dependencies for pymumble.
+#
+# Set up a Mumble server. For testing purposes, you can use https://guildbit.com/
+# to spin up a free server. Hard code the server details in this file.
+#
+# Run `python3 ./audio_only_client.py`. Now an audio-only mumble client is
+# connected to the server.
+#
+# To test its functionality, use an official mumble client
+# (https://www.mumble.com/mumble-download.php) on a separate device to verbally
+# communicate with this audio-only client.
+#
+# Works on macOS. Does NOT work on RPi 3B+ (I cannot figure out why. Help will
+# be much appreciated)
+
+import pymumble_py3 as pymumble_py3
+from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
+#import pyaudio
+
+# Connection details for the mumble server. Hardcoded for now, will have to be
+# command line arguments eventually
+pwd = ""  # password
+server = "protospace.ca"  # server address
+nick = "python"
+port = 64738  # port number
+
+
+# pyaudio set up (disabled for now; this client only counts received bytes)
+#CHUNK = 1024
+#FORMAT = pyaudio.paInt16  # pymumble soundchunk.pcm is 16 bits
+#CHANNELS = 1
+#RATE = 48000  # pymumble soundchunk.pcm is 48000 Hz
+
+#p = pyaudio.PyAudio()
+#stream = p.open(format=FORMAT,
+#                channels=CHANNELS,
+#                rate=RATE,
+#                input=True,   # enable both talk
+#                output=True,  # and listen
+#                frames_per_buffer=CHUNK)
+
+
+# mumble client set up
+def sound_received_handler(user, soundchunk):
+    """ play sound received from mumble server upon its arrival """
+    #stream.write(soundchunk.pcm)
+    print(len(soundchunk.pcm))
+
+
+# Spin up a client and connect to the mumble server
+mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
+# set up callback called when PCS event occurs
+mumble.callbacks.set_callback(PCS, sound_received_handler)
+mumble.set_receive_sound(1)  # Enable receiving sound from the mumble server
+mumble.start()
+mumble.is_ready()  # Wait until the client is ready
+
+
+# Keep the process alive while sound arrives via the callback (the pyaudio
+# send path is commented out above)
+while True:
+    #data = stream.read(CHUNK, exception_on_overflow=False)
+    #mumble.sound_output.add_sound(data)
+    pass
+
+
+# Unreachable while the loop above runs; left commented out because the
+# pyaudio stream is disabled and these names would be undefined.
+#stream.stop_stream()
+#stream.close()
+#p.terminate()
diff --git a/read_file.py b/read_file.py
new file mode 100644
index 0000000..8179253
--- /dev/null
+++ b/read_file.py
@@ -0,0 +1,61 @@
+import os
+import importlib
+import threading
+import time
+import whisper
+import traceback
+
+import audio
+
+print('Loading whisper model...')
+start = time.time()
+model = whisper.load_model('medium')
+print('Done after', time.time() - start, 's')
+
+# array of audio chunks
+audio_chunks = [bytearray()]
+
+def read_audio_thread():
+    global audio_chunks
+
+    # Loop over the sample file forever, feeding 1920-byte chunks
+    # (20 ms of 16-bit 48 kHz mono) into the rolling buffer.
+    while True:
+        with open('whispercppexample.pcm', 'rb') as f:
+            while True:
+                data = f.read(1920)
+                if not data:
+                    break
+                audio.process_pcm(audio_chunks, data)
+                time.sleep(0.04)
+
+def process_stream_thread():
+    global audio_chunks
+
+    while True:
+        try:
+            audio.process_stream(audio_chunks, model)
+        except BaseException as e:
+            print('exception')
+            traceback.print_exc()
+            print('sleeping...')
+            time.sleep(5)
+
+def monitor_module():
+    # Hot-reload audio.py whenever it changes on disk
+    mod_time = os.path.getmtime('audio.py')
+
+    while True:
+        if os.path.getmtime('audio.py') > mod_time:
+            mod_time = os.path.getmtime('audio.py')
+            print('Change detected, reloading.')
+            importlib.reload(audio)
+        time.sleep(1)
+
+t1 = threading.Thread(target=read_audio_thread)
+t2 = threading.Thread(target=process_stream_thread)
+t3 = threading.Thread(target=monitor_module)
+t1.start()
+t2.start()
+t3.start()
+
+while True:
+    time.sleep(1)  # idle instead of busy-waiting
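+
+# Note on the hot-reload trick above: process_stream_thread() resolves
+# audio.process_stream through the module object on every iteration, so once
+# importlib.reload(audio) re-executes audio.py and rebinds its attributes,
+# the next iteration runs the edited code without restarting this script.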
diff --git a/stream.py b/stream.py
new file mode 100644
index 0000000..7d6894c
--- /dev/null
+++ b/stream.py
@@ -0,0 +1,90 @@
+import os
+
+DEBUG = os.environ.get('DEBUG')
+
+import logging
+logging.basicConfig(
+    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
+    level=logging.DEBUG if DEBUG else logging.INFO)
+
+import pymumble_py3 as pymumble_py3
+from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
+import whisper
+from copy import copy
+import numpy as np
+import time
+
+logging.info('Loading whisper model...')
+model = whisper.load_model('medium')
+logging.info('Done.')
+
+# Connection details for the mumble server. Hardcoded for now, will have to be
+# command line arguments eventually
+pwd = ""  # password
+server = "protospace.ca"  # server address
+nick = "python"
+port = 64738  # port number
+
+# Compared against a byte count below: 24000 bytes of 16-bit mono at
+# 48000 Hz is 0.25 s of audio (0.5 s would be 48000 bytes).
+CHUNK_LENGTH = 24000
+
+# rolling buffer of audio chunks (10 chunks of 0.25 s each)
+audio_chunks = [bytearray()]
+
+def sound_received_handler(user, soundchunk):
+    # pymumble PCM is 16-bit 48000 Hz
+
+    # Start a fresh chunk once the current one is full, and always keep the
+    # incoming data (previously the chunk arriving on the boundary was dropped)
+    if len(audio_chunks[-1]) >= CHUNK_LENGTH:
+        audio_chunks.append(bytearray())
+    audio_chunks[-1].extend(soundchunk.pcm)
+
+    if len(audio_chunks) > 10:
+        audio_chunks.pop(0)
+
+
+# Spin up a client and connect to the mumble server
+mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
+# set up callback called when PCS event occurs
+mumble.callbacks.set_callback(PCS, sound_received_handler)
+mumble.set_receive_sound(1)  # Enable receiving sound from the mumble server
+mumble.start()
+mumble.is_ready()  # Wait until the client is ready
+
+# continuously transcribe the rolling buffer of received audio
+while True:
+    if len(audio_chunks) != 10:
+        time.sleep(0.1)  # buffer not full yet; avoid spinning the CPU
+        continue
+
+    start = time.time()
+    a = copy(audio_chunks)
+    b = b''.join(a)
+    c = np.frombuffer(b, np.int16)
+
+    # Define a low-pass filter kernel (windowed sinc). The 8 kHz cutoff is the
+    # Nyquist frequency of the 16 kHz target rate, so this doubles as the
+    # anti-aliasing filter for the decimation below.
+    fs = 48000
+    cutoff_freq = fs / 6
+    nyquist_freq = fs / 2
+    num_taps = 101
+    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
+    taps *= np.blackman(num_taps)
+    taps /= np.sum(taps)
+
+    # Apply the filter kernel to the audio using convolution
+    filtered_audio_data = np.convolve(c, taps, mode='same')
+    # Downsample by a factor of 3 (48 kHz -> 16 kHz, whisper's input rate)
+    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3))
+    # Normalize 16-bit integers to [-1.0, 1.0) floats
+    downsampled_audio_data = downsampled_audio_data.flatten().astype(np.float32) / 32768.0
+
+    d = whisper.pad_or_trim(downsampled_audio_data)
+
+    #print('processed audio in', time.time() - start, 's')
+
+    e = model.transcribe(d)
+
+    print(e['text'])
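+
+# Buffer math, for reference: 10 chunks * 24000 bytes = 240000 bytes
+# = 120000 samples of 16-bit mono = 2.5 s at 48 kHz, which becomes
+# 40000 samples after decimation by 3 (2.5 s at 16 kHz).
+# whisper.pad_or_trim() then zero-pads that to whisper's fixed 30 s
+# input window (480000 samples at 16 kHz) before transcription.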
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..31da3c1
--- /dev/null
+++ b/test.py
@@ -0,0 +1,79 @@
+import os
+
+DEBUG = os.environ.get('DEBUG')
+
+import logging
+logging.basicConfig(
+    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
+    level=logging.DEBUG if DEBUG else logging.INFO)
+logging.getLogger('aiohttp').setLevel(logging.DEBUG if DEBUG else logging.WARNING)
+
+
+import ffmpeg
+import whisper
+import time
+import asyncio
+from aiohttp import web, ClientSession, ClientError
+import numpy as np
+
+app = web.Application()
+PORT = 3002
+SAMPLE_RATE = 16000
+
+logging.info('Loading whisper model...')
+model = whisper.load_model('medium')
+logging.info('Done.')
+
+#start = time.time()
+#result = model.transcribe('whisper-test.ogg')
+#print('finished in', time.time() - start, 's')
+#
+#print(result['text'])
+
+def load_audio(binary_file, sr=SAMPLE_RATE):
+    # stolen from https://github.com/ckaytev/tgisper/blob/main/tgisper/tgisperbot.py
+    try:
+        # This launches a subprocess to decode audio while down-mixing and
+        # resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input("pipe:", threads=0)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=binary_file)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+async def index(request):
+    return web.Response(text='hello world', content_type='text/html')
+
+async def post_whisper(request):
+    data = await request.post()
+    audio = load_audio(data['audio'].file.read())
+
+    logging.info('Starting audio transcription...')
+    result = model.transcribe(audio)
+    logging.info('Done.')
+
+    return web.json_response(result)
+
+async def run_webserver():
+    logging.info('Starting webserver on port: %s', PORT)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, '0.0.0.0', PORT)
+    await site.start()
+
+    while True:
+        await asyncio.sleep(10)
+
+if __name__ == '__main__':
+    app.router.add_get('/', index)
+    app.router.add_post('/whisper', post_whisper)
+
+    loop = asyncio.get_event_loop()
+    a = loop.create_task(run_webserver())
+    loop.run_forever()
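+
+# Example request (a sketch; clip.ogg stands in for any audio file ffmpeg
+# can decode). The multipart field name must be 'audio' to match
+# post_whisper() above; the response is whisper's result dict as JSON:
+#
+#   curl -F audio=@clip.ogg http://localhost:3002/whisper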
diff --git a/transcribe_demo.py b/transcribe_demo.py
new file mode 100644
index 0000000..35b0db4
--- /dev/null
+++ b/transcribe_demo.py
@@ -0,0 +1,142 @@
+#! python3.7
+
+import argparse
+import io
+import os
+import speech_recognition as sr
+import whisper
+import torch
+
+from datetime import datetime, timedelta
+from queue import Queue
+from tempfile import NamedTemporaryFile
+from time import sleep
+from sys import platform
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # args.model is used below, so the argument must be defined
+    parser.add_argument("--model", default="medium",
+                        help="Whisper model to use.",
+                        choices=["tiny", "base", "small", "medium", "large"])
+    if 'linux' in platform:
+        parser.add_argument("--default_microphone", default='pulse',
+                            help="Default microphone name for SpeechRecognition. "
+                                 "Run this with 'list' to view available Microphones.", type=str)
+    args = parser.parse_args()
+
+    # The last time a recording was retrieved from the queue.
+    phrase_time = None
+    # Current raw audio bytes.
+    last_sample = bytes()
+    # Thread-safe Queue for passing data from the threaded recording callback.
+    data_queue = Queue()
+    # We use SpeechRecognizer to record our audio because it has a nice feature
+    # where it can detect when speech ends.
+    recorder = sr.Recognizer()
+    recorder.energy_threshold = 1000
+    # Definitely do this. Dynamic energy compensation lowers the energy threshold
+    # dramatically, to a point where the SpeechRecognizer never stops recording.
+    recorder.dynamic_energy_threshold = False
+
+    # Important for linux users.
+    # Prevents a permanent application hang or crash from using the wrong microphone.
+    if 'linux' in platform:
+        mic_name = args.default_microphone
+        if not mic_name or mic_name == 'list':
+            print("Available microphone devices are: ")
+            for index, name in enumerate(sr.Microphone.list_microphone_names()):
+                print(f"Microphone with name \"{name}\" found")
+            return
+        else:
+            for index, name in enumerate(sr.Microphone.list_microphone_names()):
+                if mic_name in name:
+                    source = sr.Microphone(sample_rate=16000, device_index=index)
+                    break
+    else:
+        source = sr.Microphone(sample_rate=16000)
+
+    # Load / Download model
+    model = args.model
+    non_english = False
+    if args.model != "large" and not non_english:
+        model = model + ".en"
+    audio_model = whisper.load_model(model)
+
+    record_timeout = 2  # seconds
+    phrase_timeout = 3  # seconds between new lines
+
+    temp_file = NamedTemporaryFile().name
+    transcription = ['']
+
+    with source:
+        recorder.adjust_for_ambient_noise(source)
+
+    def record_callback(_, audio: sr.AudioData) -> None:
+        """
+        Threaded callback function to receive audio data when recordings finish.
+        audio: An AudioData containing the recorded bytes.
+        """
+        # Grab the raw bytes and push them into the thread-safe queue.
+        data = audio.get_raw_data()
+        data_queue.put(data)
+
+    # Create a background thread that will pass us raw audio bytes.
+    # We could do this manually but SpeechRecognizer provides a nice helper.
+    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
+
+    # Cue the user that we're ready to go.
+    print("Model loaded.\n")
+
+    while True:
+        try:
+            now = datetime.utcnow()
+            # Pull raw recorded audio from the queue.
+            if not data_queue.empty():
+                phrase_complete = False
+                # If enough time has passed between recordings, consider the phrase complete.
+                # Clear the current working audio buffer to start over with the new data.
+                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
+                    last_sample = bytes()
+                    phrase_complete = True
+                # This is the last time we received new audio data from the queue.
+                phrase_time = now
+
+                # Concatenate our current audio data with the latest audio data.
+                while not data_queue.empty():
+                    data = data_queue.get()
+                    last_sample += data
+
+                # Use AudioData to convert the raw data to wav data.
+                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
+                wav_data = io.BytesIO(audio_data.get_wav_data())
+
+                # Write wav data to the temporary file as bytes.
+                with open(temp_file, 'w+b') as f:
+                    f.write(wav_data.read())
+
+                # Read the transcription.
+                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                text = result['text'].strip()
+
+                # If we detected a pause between recordings, add a new item to our transcription.
+                # Otherwise edit the existing one.
+                if phrase_complete:
+                    transcription.append(text)
+                else:
+                    transcription[-1] = text
+
+                # Clear the console to reprint the updated transcription.
+                os.system('cls' if os.name == 'nt' else 'clear')
+                for line in transcription:
+                    print(line)
+                # Flush stdout.
+                print('', end='', flush=True)
+
+            # Infinite loops are bad for processors, must sleep.
+            sleep(0.25)
+        except KeyboardInterrupt:
+            break
+
+    print("\n\nTranscription:")
+    for line in transcription:
+        print(line)
+
+
+if __name__ == "__main__":
+    main()
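+
+# Possible simplification (untested sketch): whisper can also transcribe an
+# in-memory float32 array, which would avoid the temp-file round trip. The
+# microphone is opened at 16 kHz 16-bit mono, matching whisper's input rate:
+#
+#   import numpy as np
+#   audio_np = np.frombuffer(last_sample, np.int16).astype(np.float32) / 32768.0
+#   result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())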
diff --git a/write_file.py b/write_file.py
new file mode 100644
index 0000000..b0f0358
--- /dev/null
+++ b/write_file.py
@@ -0,0 +1,33 @@
+import time
+
+import pymumble_py3 as pymumble_py3
+from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
+
+# Connection details for the mumble server. Hardcoded for now, will have to be
+# command line arguments eventually
+pwd = ""  # password
+server = "protospace.ca"  # server address
+nick = "python"
+port = 64738  # port number
+
+# NOTE: despite the .wav name, this writes raw headerless PCM
+# (16-bit, 48000 Hz mono, as received from pymumble).
+audio_file = open('audio.wav', 'wb')
+
+# mumble client set up
+def sound_received_handler(user, soundchunk):
+    """ write sound received from the mumble server to disk upon its arrival """
+    print(len(soundchunk.pcm))
+
+    audio_file.write(soundchunk.pcm)
+
+# Spin up a client and connect to the mumble server
+mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
+# set up callback called when PCS event occurs
+mumble.callbacks.set_callback(PCS, sound_received_handler)
+mumble.set_receive_sound(1)  # Enable receiving sound from the mumble server
+mumble.start()
+mumble.is_ready()  # Wait until the client is ready
+
+# keep recording until interrupted
+try:
+    while True:
+        time.sleep(1)  # idle instead of busy-waiting
+finally:
+    audio_file.close()
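+
+# To produce a playable WAV instead of raw PCM, a sketch using only the
+# stdlib wave module (the output filename is illustrative):
+#
+#   import wave
+#   with wave.open('audio_with_header.wav', 'wb') as w:
+#       w.setnchannels(1)        # pymumble PCM is mono
+#       w.setsampwidth(2)        # 16-bit samples
+#       w.setframerate(48000)    # 48 kHz
+#       with open('audio.wav', 'rb') as raw:
+#           w.writeframes(raw.read())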