From 41bc6015bf61e39decafe87ea2649e5d51fc4ca1 Mon Sep 17 00:00:00 2001
From: Tanner Collin
Date: Fri, 7 Apr 2023 21:50:01 +0000
Subject: [PATCH] Add a bunch of test files

---
 audio.py             |  73 ++++++++++++++++++++++
 audio_only_client.py |  79 ++++++++++++++++++++++++
 read_file.py         |  61 +++++++++++++++++++
 stream.py            |  90 +++++++++++++++++++++++++++
 test.py              |  79 ++++++++++++++++++++++++
 transcribe_demo.py   | 142 +++++++++++++++++++++++++++++++++++++++++++
 write_file.py        |  33 ++++++++++
 7 files changed, 557 insertions(+)
 create mode 100644 audio.py
 create mode 100644 audio_only_client.py
 create mode 100644 read_file.py
 create mode 100644 stream.py
 create mode 100644 test.py
 create mode 100644 transcribe_demo.py
 create mode 100644 write_file.py

diff --git a/audio.py b/audio.py
new file mode 100644
index 0000000..6546dee
--- /dev/null
+++ b/audio.py
@@ -0,0 +1,73 @@
+from copy import copy
+import numpy as np
+import time
+import whisper
+
+print('Loaded audio.py')
+
+CHUNK_LENGTH = 24000  # 48000 Hz * 0.5 s = 24000 samples (not referenced in this module)
+
+def process_pcm(audio_chunks, data):
+    # pymumble PCM is 16-bit 48000 Hz
+
+    start = time.time()
+
+    audio_chunks.append(data)
+
+    # keep a rolling window of the 75 most recent chunks
+    if len(audio_chunks) > 75:
+        audio_chunks.pop(0)
+
+    #print('finished chunk in', time.time() - start, 's')
+
+def process_stream(audio_chunks, model):
+
+    if len(audio_chunks) != 75:
+        print('Skipping, bad length.')
+        time.sleep(0.5)
+        return
+
+    start = time.time()
+    a = copy(audio_chunks)
+    b = b''.join(a)
+    c = np.frombuffer(b, np.int16)
+
+    # Define a low-pass filter kernel (windowed sinc). The 8 kHz cutoff is the
+    # Nyquist frequency of the 16 kHz target rate, so this doubles as the
+    # anti-aliasing filter for the decimation below.
+    fs = 48000
+    cutoff_freq = fs / 6
+    nyquist_freq = fs / 2
+    num_taps = 101
+    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
+    taps *= np.blackman(num_taps)
+    taps /= np.sum(taps)
+
+    # Apply the filter kernel to the audio using convolution
+    filtered_audio_data = np.convolve(c, taps, mode='same')
+    # Downsample by a factor of 3 (48 kHz -> 16 kHz, whisper's input rate)
+    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3)).flatten()
+
+    # Normalize 16-bit integers to [-1.0, 1.0) floats
+    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0
+
+    #abs_mean = np.mean(np.abs(downsampled_audio_data ** 3))
+    #print('abs mean:', abs_mean)
+    #if abs_mean < 0.0:
+    #    print('silence detected, skipping')
+    #    time.sleep(1)
+    #    return
+
+    d = whisper.pad_or_trim(norm_audio)
+
+    #print('processed audio in', time.time() - start, 's')
+
+    start = time.time()
+    e = model.transcribe(d, language='en')
+    print('transcribed audio in', time.time() - start, 's')
+
+    # If transcription was unusually slow, dump the audio for offline debugging
+    if time.time() - start > 10:
+        with open('downsampled.pcm', 'wb') as f:
+            f.write(downsampled_audio_data.astype(np.int16).tobytes())
+
+        print('wrote file, sleeping')
+        #breakpoint()
+        time.sleep(100)
+
+    print('    ', e['text'])
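+
+# A minimal usage sketch (illustrative, not part of this repo): feed raw
+# 16-bit 48 kHz mono PCM through process_pcm() until the rolling window is
+# full, then transcribe it. The capture filename below is hypothetical.
+#
+#   import whisper
+#   model = whisper.load_model('medium')
+#   chunks = []
+#   with open('capture.pcm', 'rb') as f:  # hypothetical raw PCM capture
+#       while True:
+#           data = f.read(1920)  # 20 ms of 16-bit 48 kHz mono
+#           if not data:
+#               break
+#           process_pcm(chunks, data)
+#           if len(chunks) == 75:  # process_stream() skips partial windows
+#               process_stream(chunks, model)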
diff --git a/audio_only_client.py b/audio_only_client.py
new file mode 100644
index 0000000..2dc65f6
--- /dev/null
+++ b/audio_only_client.py
@@ -0,0 +1,79 @@
+# A python script to do both listening and talking. This is the basic model
+# for an audio-only mumble client.
+
+# Usage:
+
+# Install pyaudio (instructions: https://people.csail.mit.edu/hubert/pyaudio/#downloads)
+# If `fatal error: 'portaudio.h' file not found` is encountered while installing
+# pyaudio even after following the instructions, this solution might help:
+# https://stackoverflow.com/questions/33513522/when-installing-pyaudio-pip-cannot-find-portaudio-h-in-usr-local-include
+#
+# Install dependencies for pymumble.
+#
+# Set up a Mumble server. For testing purposes, you can use https://guildbit.com/
+# to spin up a free server. Hard code the server details in this file.
+#
+# Run `python3 ./audio_only_client.py`. Now an audio-only mumble client is
+# connected to the server.
+#
+# To test its functionality, use an official mumble client
+# (https://www.mumble.com/mumble-download.php) on a separate device to verbally
+# communicate with this audio-only client.
+#
+# Works on macOS. Does NOT work on RPi 3B+ (I cannot figure out why. Help will
+# be much appreciated)
+
+import pymumble_py3 as pymumble_py3
+from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
+#import pyaudio
+
+# Connection details for the mumble server. Hardcoded for now, will have to be
+# command line arguments eventually
+pwd = ""  # password
+server = "protospace.ca"  # server address
+nick = "python"
+port = 64738  # port number
+
+
+# pyaudio set up (disabled for now; this client only counts received bytes)
+#CHUNK = 1024
+#FORMAT = pyaudio.paInt16  # pymumble soundchunk.pcm is 16 bits
+#CHANNELS = 1
+#RATE = 48000  # pymumble soundchunk.pcm is 48000 Hz
+
+#p = pyaudio.PyAudio()
+#stream = p.open(format=FORMAT,
+#                channels=CHANNELS,
+#                rate=RATE,
+#                input=True,   # enable both talk
+#                output=True,  # and listen
+#                frames_per_buffer=CHUNK)
+
+
+# mumble client set up
+def sound_received_handler(user, soundchunk):
+    """ play sound received from mumble server upon its arrival """
+    #stream.write(soundchunk.pcm)
+    print(len(soundchunk.pcm))
+
+
+# Spin up a client and connect to the mumble server
+mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
+# set up callback called when PCS event occurs
+mumble.callbacks.set_callback(PCS, sound_received_handler)
+mumble.set_receive_sound(1)  # Enable receiving sound from the mumble server
+mumble.start()
+mumble.is_ready()  # Wait until the client is ready
+
+
+# Keep the process alive while sound arrives via the callback (the pyaudio
+# send path is commented out above)
+while True:
+    #data = stream.read(CHUNK, exception_on_overflow=False)
+    #mumble.sound_output.add_sound(data)
+    pass
+
+
+# Unreachable while the loop above runs; left commented out because the
+# pyaudio stream is disabled and these names would be undefined.
+#stream.stop_stream()
+#stream.close()
+#p.terminate()
diff --git a/read_file.py b/read_file.py
new file mode 100644
index 0000000..8179253
--- /dev/null
+++ b/read_file.py
@@ -0,0 +1,61 @@
+import os
+import importlib
+import threading
+import time
+import whisper
+import traceback
+
+import audio
+
+print('Loading whisper model...')
+start = time.time()
+model = whisper.load_model('medium')
+print('Done after', time.time() - start, 's')
+
+# array of audio chunks
+audio_chunks = [bytearray()]
+
+def read_audio_thread():
+    global audio_chunks
+
+    # Loop over the sample file forever, feeding 1920-byte chunks
+    # (20 ms of 16-bit 48 kHz mono) into the rolling buffer.
+    while True:
+        with open('whispercppexample.pcm', 'rb') as f:
+            while True:
+                data = f.read(1920)
+                if not data:
+                    break
+                audio.process_pcm(audio_chunks, data)
+                time.sleep(0.04)
+
+def process_stream_thread():
+    global audio_chunks
+
+    while True:
+        try:
+            audio.process_stream(audio_chunks, model)
+        except BaseException as e:
+            print('exception')
+            traceback.print_exc()
+            print('sleeping...')
+            time.sleep(5)
+
+def monitor_module():
+    # Hot-reload audio.py whenever it changes on disk
+    mod_time = os.path.getmtime('audio.py')
+
+    while True:
+        if os.path.getmtime('audio.py') > mod_time:
+            mod_time = os.path.getmtime('audio.py')
+            print('Change detected, reloading.')
+            importlib.reload(audio)
+        time.sleep(1)
+
+t1 = threading.Thread(target=read_audio_thread)
+t2 = threading.Thread(target=process_stream_thread)
+t3 = threading.Thread(target=monitor_module)
+t1.start()
+t2.start()
+t3.start()
+
+while True:
+    time.sleep(1)  # idle instead of busy-waiting
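+
+# Note on the hot-reload trick above: process_stream_thread() resolves
+# audio.process_stream through the module object on every iteration, so once
+# importlib.reload(audio) re-executes audio.py and rebinds its attributes,
+# the next iteration runs the edited code without restarting this script.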
diff --git a/stream.py b/stream.py
new file mode 100644
index 0000000..7d6894c
--- /dev/null
+++ b/stream.py
@@ -0,0 +1,90 @@
+import os
+
+DEBUG = os.environ.get('DEBUG')
+
+import logging
+logging.basicConfig(
+    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
+    level=logging.DEBUG if DEBUG else logging.INFO)
+
+import pymumble_py3 as pymumble_py3
+from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
+import whisper
+from copy import copy
+import numpy as np
+import time
+
+logging.info('Loading whisper model...')
+model = whisper.load_model('medium')
+logging.info('Done.')
+
+# Connection details for the mumble server. Hardcoded for now, will have to be
+# command line arguments eventually
+pwd = ""  # password
+server = "protospace.ca"  # server address
+nick = "python"
+port = 64738  # port number
+
+# Compared against a byte count below: 24000 bytes of 16-bit mono at
+# 48000 Hz is 0.25 s of audio (0.5 s would be 48000 bytes).
+CHUNK_LENGTH = 24000
+
+# rolling buffer of audio chunks (10 chunks of 0.25 s each)
+audio_chunks = [bytearray()]
+
+def sound_received_handler(user, soundchunk):
+    # pymumble PCM is 16-bit 48000 Hz
+
+    # Start a fresh chunk once the current one is full, and always keep the
+    # incoming data (previously the chunk arriving on the boundary was dropped)
+    if len(audio_chunks[-1]) >= CHUNK_LENGTH:
+        audio_chunks.append(bytearray())
+    audio_chunks[-1].extend(soundchunk.pcm)
+
+    if len(audio_chunks) > 10:
+        audio_chunks.pop(0)
+
+
+# Spin up a client and connect to the mumble server
+mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
+# set up callback called when PCS event occurs
+mumble.callbacks.set_callback(PCS, sound_received_handler)
+mumble.set_receive_sound(1)  # Enable receiving sound from the mumble server
+mumble.start()
+mumble.is_ready()  # Wait until the client is ready
+
+# continuously transcribe the rolling buffer of received audio
+while True:
+    if len(audio_chunks) != 10:
+        time.sleep(0.1)  # buffer not full yet; avoid spinning the CPU
+        continue
+
+    start = time.time()
+    a = copy(audio_chunks)
+    b = b''.join(a)
+    c = np.frombuffer(b, np.int16)
+
+    # Define a low-pass filter kernel (windowed sinc). The 8 kHz cutoff is the
+    # Nyquist frequency of the 16 kHz target rate, so this doubles as the
+    # anti-aliasing filter for the decimation below.
+    fs = 48000
+    cutoff_freq = fs / 6
+    nyquist_freq = fs / 2
+    num_taps = 101
+    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
+    taps *= np.blackman(num_taps)
+    taps /= np.sum(taps)
+
+    # Apply the filter kernel to the audio using convolution
+    filtered_audio_data = np.convolve(c, taps, mode='same')
+    # Downsample by a factor of 3 (48 kHz -> 16 kHz, whisper's input rate)
+    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3))
+    # Normalize 16-bit integers to [-1.0, 1.0) floats
+    downsampled_audio_data = downsampled_audio_data.flatten().astype(np.float32) / 32768.0
+
+    d = whisper.pad_or_trim(downsampled_audio_data)
+
+    #print('processed audio in', time.time() - start, 's')
+
+    e = model.transcribe(d)
+
+    print(e['text'])
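+
+# Buffer math, for reference: 10 chunks * 24000 bytes = 240000 bytes
+# = 120000 samples of 16-bit mono = 2.5 s at 48 kHz, which becomes
+# 40000 samples after decimation by 3 (2.5 s at 16 kHz).
+# whisper.pad_or_trim() then zero-pads that to whisper's fixed 30 s
+# input window (480000 samples at 16 kHz) before transcription.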
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..31da3c1
--- /dev/null
+++ b/test.py
@@ -0,0 +1,79 @@
+import os
+
+DEBUG = os.environ.get('DEBUG')
+
+import logging
+logging.basicConfig(
+    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
+    level=logging.DEBUG if DEBUG else logging.INFO)
+logging.getLogger('aiohttp').setLevel(logging.DEBUG if DEBUG else logging.WARNING)
+
+
+import ffmpeg
+import whisper
+import time
+import asyncio
+from aiohttp import web, ClientSession, ClientError
+import numpy as np
+
+app = web.Application()
+PORT = 3002
+SAMPLE_RATE = 16000
+
+logging.info('Loading whisper model...')
+model = whisper.load_model('medium')
+logging.info('Done.')
+
+#start = time.time()
+#result = model.transcribe('whisper-test.ogg')
+#print('finished in', time.time() - start, 's')
+#
+#print(result['text'])
+
+def load_audio(binary_file, sr=SAMPLE_RATE):
+    # stolen from https://github.com/ckaytev/tgisper/blob/main/tgisper/tgisperbot.py
+    try:
+        # This launches a subprocess to decode audio while down-mixing and
+        # resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input("pipe:", threads=0)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=binary_file)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+async def index(request):
+    return web.Response(text='hello world', content_type='text/html')
+
+async def post_whisper(request):
+    data = await request.post()
+    audio = load_audio(data['audio'].file.read())
+
+    logging.info('Starting audio transcription...')
+    result = model.transcribe(audio)
+    logging.info('Done.')
+
+    return web.json_response(result)
+
+async def run_webserver():
+    logging.info('Starting webserver on port: %s', PORT)
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, '0.0.0.0', PORT)
+    await site.start()
+
+    while True:
+        await asyncio.sleep(10)
+
+if __name__ == '__main__':
+    app.router.add_get('/', index)
+    app.router.add_post('/whisper', post_whisper)
+
+    loop = asyncio.get_event_loop()
+    a = loop.create_task(run_webserver())
+    loop.run_forever()
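+
+# Example request (a sketch; clip.ogg stands in for any audio file ffmpeg
+# can decode). The multipart field name must be 'audio' to match
+# post_whisper() above; the response is whisper's result dict as JSON:
+#
+#   curl -F audio=@clip.ogg http://localhost:3002/whisper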
diff --git a/transcribe_demo.py b/transcribe_demo.py
new file mode 100644
index 0000000..35b0db4
--- /dev/null
+++ b/transcribe_demo.py
@@ -0,0 +1,142 @@
+#! python3.7
+
+import argparse
+import io
+import os
+import speech_recognition as sr
+import whisper
+import torch
+
+from datetime import datetime, timedelta
+from queue import Queue
+from tempfile import NamedTemporaryFile
+from time import sleep
+from sys import platform
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # args.model is used below, so the argument must be defined
+    parser.add_argument("--model", default="medium",
+                        help="Whisper model to use.",
+                        choices=["tiny", "base", "small", "medium", "large"])
+    if 'linux' in platform:
+        parser.add_argument("--default_microphone", default='pulse',
+                            help="Default microphone name for SpeechRecognition. "
+                                 "Run this with 'list' to view available Microphones.", type=str)
+    args = parser.parse_args()
+
+    # The last time a recording was retrieved from the queue.
+    phrase_time = None
+    # Current raw audio bytes.
+    last_sample = bytes()
+    # Thread-safe Queue for passing data from the threaded recording callback.
+    data_queue = Queue()
+    # We use SpeechRecognizer to record our audio because it has a nice feature
+    # where it can detect when speech ends.
+    recorder = sr.Recognizer()
+    recorder.energy_threshold = 1000
+    # Definitely do this. Dynamic energy compensation lowers the energy threshold
+    # dramatically, to a point where the SpeechRecognizer never stops recording.
+    recorder.dynamic_energy_threshold = False
+
+    # Important for linux users.
+    # Prevents a permanent application hang or crash from using the wrong microphone.
+    if 'linux' in platform:
+        mic_name = args.default_microphone
+        if not mic_name or mic_name == 'list':
+            print("Available microphone devices are: ")
+            for index, name in enumerate(sr.Microphone.list_microphone_names()):
+                print(f"Microphone with name \"{name}\" found")
+            return
+        else:
+            for index, name in enumerate(sr.Microphone.list_microphone_names()):
+                if mic_name in name:
+                    source = sr.Microphone(sample_rate=16000, device_index=index)
+                    break
+    else:
+        source = sr.Microphone(sample_rate=16000)
+
+    # Load / Download model
+    model = args.model
+    non_english = False
+    if args.model != "large" and not non_english:
+        model = model + ".en"
+    audio_model = whisper.load_model(model)
+
+    record_timeout = 2  # seconds
+    phrase_timeout = 3  # seconds between new lines
+
+    temp_file = NamedTemporaryFile().name
+    transcription = ['']
+
+    with source:
+        recorder.adjust_for_ambient_noise(source)
+
+    def record_callback(_, audio: sr.AudioData) -> None:
+        """
+        Threaded callback function to receive audio data when recordings finish.
+        audio: An AudioData containing the recorded bytes.
+        """
+        # Grab the raw bytes and push them into the thread-safe queue.
+        data = audio.get_raw_data()
+        data_queue.put(data)
+
+    # Create a background thread that will pass us raw audio bytes.
+    # We could do this manually but SpeechRecognizer provides a nice helper.
+    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
+
+    # Cue the user that we're ready to go.
+    print("Model loaded.\n")
+
+    while True:
+        try:
+            now = datetime.utcnow()
+            # Pull raw recorded audio from the queue.
+            if not data_queue.empty():
+                phrase_complete = False
+                # If enough time has passed between recordings, consider the phrase complete.
+                # Clear the current working audio buffer to start over with the new data.
+                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
+                    last_sample = bytes()
+                    phrase_complete = True
+                # This is the last time we received new audio data from the queue.
+                phrase_time = now
+
+                # Concatenate our current audio data with the latest audio data.
+                while not data_queue.empty():
+                    data = data_queue.get()
+                    last_sample += data
+
+                # Use AudioData to convert the raw data to wav data.
+                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
+                wav_data = io.BytesIO(audio_data.get_wav_data())
+
+                # Write wav data to the temporary file as bytes.
+                with open(temp_file, 'w+b') as f:
+                    f.write(wav_data.read())
+
+                # Read the transcription.
+                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
+                text = result['text'].strip()
+
+                # If we detected a pause between recordings, add a new item to our transcription.
+                # Otherwise edit the existing one.
+                if phrase_complete:
+                    transcription.append(text)
+                else:
+                    transcription[-1] = text
+
+                # Clear the console to reprint the updated transcription.
+                os.system('cls' if os.name == 'nt' else 'clear')
+                for line in transcription:
+                    print(line)
+                # Flush stdout.
+                print('', end='', flush=True)
+
+            # Infinite loops are bad for processors, must sleep.
+            sleep(0.25)
+        except KeyboardInterrupt:
+            break
+
+    print("\n\nTranscription:")
+    for line in transcription:
+        print(line)
+
+
+if __name__ == "__main__":
+    main()
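+
+# Possible simplification (untested sketch): whisper can also transcribe an
+# in-memory float32 array, which would avoid the temp-file round trip. The
+# microphone is opened at 16 kHz 16-bit mono, matching whisper's input rate:
+#
+#   import numpy as np
+#   audio_np = np.frombuffer(last_sample, np.int16).astype(np.float32) / 32768.0
+#   result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())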
diff --git a/write_file.py b/write_file.py
new file mode 100644
index 0000000..b0f0358
--- /dev/null
+++ b/write_file.py
@@ -0,0 +1,33 @@
+import time
+
+import pymumble_py3 as pymumble_py3
+from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
+
+# Connection details for the mumble server. Hardcoded for now, will have to be
+# command line arguments eventually
+pwd = ""  # password
+server = "protospace.ca"  # server address
+nick = "python"
+port = 64738  # port number
+
+# NOTE: despite the .wav name, this writes raw headerless PCM
+# (16-bit, 48000 Hz mono, as received from pymumble).
+audio_file = open('audio.wav', 'wb')
+
+# mumble client set up
+def sound_received_handler(user, soundchunk):
+    """ write sound received from the mumble server to disk upon its arrival """
+    print(len(soundchunk.pcm))
+
+    audio_file.write(soundchunk.pcm)
+
+# Spin up a client and connect to the mumble server
+mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
+# set up callback called when PCS event occurs
+mumble.callbacks.set_callback(PCS, sound_received_handler)
+mumble.set_receive_sound(1)  # Enable receiving sound from the mumble server
+mumble.start()
+mumble.is_ready()  # Wait until the client is ready
+
+# keep recording until interrupted
+try:
+    while True:
+        time.sleep(1)  # idle instead of busy-waiting
+finally:
+    audio_file.close()
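+
+# To produce a playable WAV instead of raw PCM, a sketch using only the
+# stdlib wave module (the output filename is illustrative):
+#
+#   import wave
+#   with wave.open('audio_with_header.wav', 'wb') as w:
+#       w.setnchannels(1)        # pymumble PCM is mono
+#       w.setsampwidth(2)        # 16-bit samples
+#       w.setframerate(48000)    # 48 kHz
+#       with open('audio.wav', 'rb') as raw:
+#           w.writeframes(raw.read())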