Add a bunch of test files

master
Tanner Collin 1 year ago
parent 89cb732e42
commit 41bc6015bf
  1. audio.py (+73)
  2. audio_only_client.py (+79)
  3. read_file.py (+61)
  4. stream.py (+90)
  5. test.py (+79)
  6. transcribe_demo.py (+142)
  7. write_file.py (+33)

audio.py
@@ -0,0 +1,73 @@
from copy import copy
import numpy as np
import time
import whisper

print('Loaded audio.py')

CHUNK_LENGTH = 24000  # 48000 Hz * 0.5 s

def process_pcm(audio_chunks, data):
    # pymumble PCM is 16-bit 48000 Hz
    start = time.time()
    audio_chunks.append(data)
    if len(audio_chunks) > 75:
        audio_chunks.pop(0)
    #print('finished chunk in', time.time() - start, 's')

def process_stream(audio_chunks, model):
    if len(audio_chunks) != 75:
        print('Skipping, bad length.')
        time.sleep(0.5)
        return

    start = time.time()
    a = copy(audio_chunks)
    b = b''.join(a)
    c = np.frombuffer(b, np.int16)

    # Define a low-pass filter kernel
    fs = 48000
    cutoff_freq = fs / 6
    nyquist_freq = fs / 2
    num_taps = 101
    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
    taps *= np.blackman(num_taps)
    taps /= np.sum(taps)

    # Apply the filter kernel to the audio data using convolution
    filtered_audio_data = np.convolve(c, taps, mode='same')

    # Downsample filtered_audio_data by a factor of 3 using take
    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3)).flatten()
    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0

    #abs_mean = np.mean(np.abs(downsampled_audio_data ** 3))
    #print('abs mean:', abs_mean)
    #if abs_mean < 0.0:
    #    print('silence detected, skipping')
    #    time.sleep(1)
    #    return

    d = whisper.pad_or_trim(norm_audio)
    #print('processed audio in', time.time() - start, 's')

    start = time.time()
    e = model.transcribe(d, language='en')
    print('transcribed audio in', time.time() - start, 's')

    if time.time() - start > 10:
        with open('downsampled.pcm', 'wb') as f:
            f.write(downsampled_audio_data.astype(np.int16).tobytes())
        print('wrote file, sleeping')
        #breakpoint()
        time.sleep(100)

    print(' ', e['text'])
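The filter above is a 101-tap windowed-sinc low-pass (cutoff fs/6 = 8 kHz) followed by taking every third sample, i.e. a 48 kHz to 16 kHz conversion to match Whisper's expected input rate. As a cross-check, a minimal sketch of the same conversion using scipy.signal.resample_poly, which applies its own anti-aliasing filter (assumes scipy is installed; not part of this commit):

import numpy as np
from scipy.signal import resample_poly

def pcm48k_to_float16k(pcm_bytes):
    # 16-bit little-endian PCM at 48 kHz in -> float32 at 16 kHz out
    samples = np.frombuffer(pcm_bytes, np.int16).astype(np.float32)
    down = resample_poly(samples, up=1, down=3)  # built-in anti-aliasing FIR
    return down / 32768.0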

audio_only_client.py
@@ -0,0 +1,79 @@
# A python script to do both listening and talking. This is the basic model
# for an audio-only mumble client.
# Usage:
# Install pyaudio (instructions: https://people.csail.mit.edu/hubert/pyaudio/#downloads)
# If `fatal error: 'portaudio.h' file not found` is encountered while installing
# pyaudio even after following the instructions, this solution might be of help:
# https://stackoverflow.com/questions/33513522/when-installing-pyaudio-pip-cannot-find-portaudio-h-in-usr-local-include
#
# Install dependencies for pymumble.
#
# Set up a mumble server. For testing purposes, you can use https://guildbit.com/
# to spin up a free server. Hard-code the server details in this file.
#
# Run `python3 ./audio_only_client.py`. Now an audio-only mumble client is
# connected to the server.
#
# To test its functionality, on a separate device, use an official mumble
# client (https://www.mumble.com/mumble-download.php) to verbally communicate
# with this audio-only client.
#
# Works on macOS. Does NOT work on RPi 3B+ (I cannot figure out why. Help will
# be much appreciated)
import pymumble_py3 as pymumble_py3
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
#import pyaudio

# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = ""  # password
server = "protospace.ca"  # server address
nick = "python"
port = 64738  # port number

# pyaudio set up
#CHUNK = 1024
#FORMAT = pyaudio.paInt16  # pymumble soundchunk.pcm is 16 bits
#CHANNELS = 1
#RATE = 48000  # pymumble soundchunk.pcm is 48000Hz
#p = pyaudio.PyAudio()
#stream = p.open(format=FORMAT,
#                channels=CHANNELS,
#                rate=RATE,
#                input=True,   # enable both talk
#                output=True,  # and listen
#                frames_per_buffer=CHUNK)

# mumble client set up
def sound_received_handler(user, soundchunk):
    """ play sound received from mumble server upon its arrival """
    #stream.write(soundchunk.pcm)
    print(len(soundchunk.pcm))

# Spin up a client and connect to mumble server
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# set up callback called when PCS event occurs
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# constantly capture sound and send it to the mumble server
while True:
    #data = stream.read(CHUNK, exception_on_overflow=False)
    #mumble.sound_output.add_sound(data)
    pass

# close the stream and pyaudio instance (kept commented: this is unreachable
# while the loop above runs forever, and it references the pyaudio objects
# that are commented out above)
#stream.stop_stream()
#stream.close()
#p.terminate()
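For reference, a minimal sketch of how this client reads once the pyaudio lines are uncommented, with the cleanup moved into a finally block so it can actually run (assumes a working input/output device; untested here):

import pyaudio
import pymumble_py3 as pymumble_py3
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS

CHUNK = 1024
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=48000,
                input=True, output=True, frames_per_buffer=CHUNK)

def sound_received_handler(user, soundchunk):
    stream.write(soundchunk.pcm)  # play whatever the server sends

mumble = pymumble_py3.Mumble("protospace.ca", "python", password="", port=64738)
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)
mumble.start()
mumble.is_ready()

try:
    while True:
        # capture from the microphone and send to the server
        data = stream.read(CHUNK, exception_on_overflow=False)
        mumble.sound_output.add_sound(data)
finally:
    stream.stop_stream()
    stream.close()
    p.terminate()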

read_file.py
@@ -0,0 +1,61 @@
import os
import importlib
import threading
import time
import whisper
import traceback

import audio

print('Loading whisper model...')
start = time.time()
model = whisper.load_model('medium')
print('Done after', time.time() - start, 's')

# array of audio chunks
audio_chunks = [bytearray()]

def read_audio_thread():
    global audio_chunks
    while True:
        with open('whispercppexample.pcm', 'rb') as f:
            while True:
                data = f.read(1920)
                if not data:
                    break
                audio.process_pcm(audio_chunks, data)
                time.sleep(0.04)

def process_stream_thread():
    global audio_chunks
    while True:
        try:
            audio.process_stream(audio_chunks, model)
        except BaseException as e:
            print('exception')
            traceback.print_exc()
            print('sleeping...')
            time.sleep(5)

def monitor_module():
    mod_time = os.path.getmtime('audio.py')
    while True:
        if os.path.getmtime('audio.py') > mod_time:
            mod_time = os.path.getmtime('audio.py')
            print('Change detected, reloading.')
            importlib.reload(audio)
        time.sleep(1)

t1 = threading.Thread(target=read_audio_thread)
t2 = threading.Thread(target=process_stream_thread)
t3 = threading.Thread(target=monitor_module)
t1.start()
t2.start()
t3.start()

while True:
    pass
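A note on the read pacing above: 1920 bytes of 16-bit mono PCM is 960 samples, or 20 ms of audio at 48 kHz, so the 0.04 s sleep feeds the buffer at roughly half real-time speed. A small sketch of the arithmetic (hypothetical helper, not in this commit):

SAMPLE_RATE = 48000   # Hz, per the pymumble comments
BYTES_PER_SAMPLE = 2  # 16-bit PCM

def chunk_duration_s(num_bytes):
    # seconds of audio represented by a raw PCM chunk
    return num_bytes / BYTES_PER_SAMPLE / SAMPLE_RATE

print(chunk_duration_s(1920))  # 0.02, so sleep(0.02) would replay in real time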

stream.py
@@ -0,0 +1,90 @@
import os
DEBUG = os.environ.get('DEBUG')

import logging
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
    level=logging.DEBUG if DEBUG else logging.INFO)

import pymumble_py3 as pymumble_py3
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
import whisper
from copy import copy
import numpy as np
import time

logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')

# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = ""  # password
server = "protospace.ca"  # server address
nick = "python"
port = 64738  # port number

CHUNK_LENGTH = 24000  # 48000 Hz * 0.5 s

# array of 0.5 sec audio chunks
audio_chunks = [bytearray()]

def sound_received_handler(user, soundchunk):
    # pymumble PCM is 16-bit 48000 Hz
    if len(audio_chunks[-1]) < CHUNK_LENGTH:
        audio_chunks[-1].extend(soundchunk.pcm)
    else:
        # start the next chunk with this packet so no audio is dropped
        audio_chunks.append(bytearray(soundchunk.pcm))
    if len(audio_chunks) > 10:
        audio_chunks.pop(0)

# Spin up a client and connect to mumble server
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# set up callback called when PCS event occurs
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# constantly pull the buffered audio and transcribe it
while True:
    #data = stream.read(CHUNK, exception_on_overflow=False)
    #mumble.sound_output.add_sound(data)
    if len(audio_chunks) != 10:
        continue
    start = time.time()
    a = copy(audio_chunks)
    b = b''.join(a)
    c = np.frombuffer(b, np.int16)

    # Define a low-pass filter kernel
    fs = 48000
    cutoff_freq = fs / 6
    nyquist_freq = fs / 2
    num_taps = 101
    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
    taps *= np.blackman(num_taps)
    taps /= np.sum(taps)

    # Apply the filter kernel to the audio data using convolution
    filtered_audio_data = np.convolve(c, taps, mode='same')

    # Downsample filtered_audio_data by a factor of 3 using take
    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3))
    downsampled_audio_data = downsampled_audio_data.flatten().astype(np.float32) / 32768.0

    d = whisper.pad_or_trim(downsampled_audio_data)
    #print('processed audio in', time.time() - start, 's')
    e = model.transcribe(d)
    print(e['text'])
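sound_received_handler runs on pymumble's receive thread while the main loop reads audio_chunks; list appends are atomic under the GIL, but a queue.Queue makes the handoff explicit. A sketch of that variant (hypothetical, not part of this commit):

import queue

pcm_queue = queue.Queue()

def sound_received_handler(user, soundchunk):
    # runs on pymumble's thread; just hand the bytes off
    pcm_queue.put(soundchunk.pcm)

def drain_queue():
    # runs on the main thread; collect whatever has arrived so far
    chunks = []
    while True:
        try:
            chunks.append(pcm_queue.get_nowait())
        except queue.Empty:
            break
    return b''.join(chunks)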

test.py
@@ -0,0 +1,79 @@
import os
DEBUG = os.environ.get('DEBUG')

import logging
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
    level=logging.DEBUG if DEBUG else logging.INFO)
logging.getLogger('aiohttp').setLevel(logging.DEBUG if DEBUG else logging.WARNING)

import ffmpeg
import whisper
import time
import asyncio
from aiohttp import web, ClientSession, ClientError
import numpy as np

app = web.Application()
PORT = 3002
SAMPLE_RATE = 16000

logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')

#start = time.time()
#result = model.transcribe('whisper-test.ogg')
#print('finished in', time.time() - start, 's')
#
#print(result['text'])

def load_audio(binary_file, sr=SAMPLE_RATE):
    # stolen from https://github.com/ckaytev/tgisper/blob/main/tgisper/tgisperbot.py
    try:
        # This launches a subprocess to decode audio while down-mixing and
        # resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input("pipe:", threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=binary_file)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

async def index(request):
    return web.Response(text='hello world', content_type='text/html')

async def post_whisper(request):
    data = await request.post()
    audio = load_audio(data['audio'].file.read())
    logging.info('Starting audio transcription...')
    result = model.transcribe(audio)
    logging.info('Done.')
    return web.json_response(result)

async def run_webserver():
    logging.info('Starting webserver on port: %s', PORT)
    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, '0.0.0.0', PORT)
    await site.start()
    while True:
        await asyncio.sleep(10)

if __name__ == '__main__':
    app.router.add_get('/', index)
    app.router.add_post('/whisper', post_whisper)

    loop = asyncio.get_event_loop()
    a = loop.create_task(run_webserver())
    loop.run_forever()
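With the server running, the endpoint can be exercised with a multipart POST whose field name matches data['audio'] above; a sketch using requests (sample.ogg is a placeholder file name):

import requests

with open('sample.ogg', 'rb') as f:
    # field name 'audio' must match what post_whisper reads
    resp = requests.post('http://localhost:3002/whisper', files={'audio': f})
print(resp.json()['text'])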

transcribe_demo.py
@@ -0,0 +1,142 @@
#! python3.7

import argparse
import io
import os
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from tempfile import NamedTemporaryFile
from time import sleep
from sys import platform


def main():
    parser = argparse.ArgumentParser()
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Current raw audio bytes.
    last_sample = bytes()
    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = 1000
    # Definitely do this, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for linux users.
    # Prevents permanent application hang and crash by using the wrong Microphone
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model
    model = 'medium'
    non_english = False
    if model != "large" and not non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = 2  # seconds
    phrase_timeout = 3  # seconds between new lines

    temp_file = NamedTemporaryFile().name
    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push it into the thread safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    last_sample = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Concatenate our current audio data with the latest audio data.
                while not data_queue.empty():
                    data = data_queue.get()
                    last_sample += data

                # Use AudioData to convert the raw data to wav data.
                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
                wav_data = io.BytesIO(audio_data.get_wav_data())

                # Write wav data to the temporary file as bytes.
                with open(temp_file, 'w+b') as f:
                    f.write(wav_data.read())

                # Read the transcription.
                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)

                # Infinite loops are bad for processors, must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
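The model name is hardcoded to 'medium' above (only --default_microphone survives in the parser), so the original args.model check was changed to read the local variable. A sketch of what restoring the flag could look like (an assumption based on the hardcoded defaults, not part of this commit):

# inside main(), alongside the existing parser setup
parser.add_argument("--model", default="medium",
                    choices=["tiny", "base", "small", "medium", "large"],
                    help="Whisper model size to load.")
parser.add_argument("--non_english", action='store_true',
                    help="Don't use the English-only variant of the model.")
# then, replacing the hardcoded values:
# model = args.model
# non_english = args.non_english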

write_file.py
@@ -0,0 +1,33 @@
import pymumble_py3 as pymumble_py3
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS

# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = ""  # password
server = "protospace.ca"  # server address
nick = "python"
port = 64738  # port number

# note: this writes raw headerless PCM despite the .wav extension
audio_file = open('audio.wav', 'wb')

# mumble client set up
def sound_received_handler(user, soundchunk):
    """ write sound received from mumble server to disk upon its arrival """
    print(len(soundchunk.pcm))
    audio_file.write(soundchunk.pcm)

# Spin up a client and connect to mumble server
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# set up callback called when PCS event occurs
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# keep running until interrupted, capturing received sound to disk
try:
    while True:
        pass
finally:
    audio_file.close()
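Because the handler dumps raw headerless PCM, the resulting audio.wav won't open in most players despite its extension. A sketch of the same capture using the standard-library wave module to add a proper RIFF header (assumes mono 16-bit 48 kHz, per the pymumble comments in the other files):

import wave

audio_file = wave.open('audio.wav', 'wb')
audio_file.setnchannels(1)      # mono
audio_file.setsampwidth(2)      # 16-bit samples
audio_file.setframerate(48000)  # pymumble PCM rate

def sound_received_handler(user, soundchunk):
    # wave prepends the header and tracks frame counts for us
    audio_file.writeframes(soundchunk.pcm)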