Add a bunch of test files
This commit is contained in:
parent
89cb732e42
commit
41bc6015bf
73
audio.py
Normal file
73
audio.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
from copy import copy
|
||||
import numpy as np
|
||||
import time
|
||||
import whisper
|
||||
|
||||
# Import-time marker: read_file.py hot-reloads this module and this print
# confirms each reload.
print('Loaded audio.py')

# NOTE(review): not referenced by process_pcm/process_stream below —
# possibly leftover from stream.py; confirm before relying on it.
CHUNK_LENGTH = 24000 # 48000 Hz * 0.5 s
|
||||
|
||||
def process_pcm(audio_chunks, data):
    """Append one raw PCM packet to the rolling buffer.

    pymumble delivers 16-bit 48000 Hz PCM. The buffer is capped at 75
    entries; once exceeded, the oldest entry is dropped so the buffer is
    a sliding window over the most recent audio.

    Removed: an unused `start = time.time()` (its only consumer was a
    commented-out debug print).
    """
    audio_chunks.append(data)

    if len(audio_chunks) > 75:
        audio_chunks.pop(0)
|
||||
|
||||
def process_stream(audio_chunks, model):
    """Transcribe the current contents of the rolling PCM buffer.

    Expects `audio_chunks` to hold exactly 75 chunks (the cap maintained
    by process_pcm); otherwise it sleeps briefly and returns so the
    caller's loop can retry.

    Pipeline: join the 16-bit 48000 Hz PCM, low-pass filter it, decimate
    by 3 to 16000 Hz (the rate whisper models expect), normalize int16 to
    float32 in [-1, 1), then pad/trim and transcribe.

    Removed dead code: unused `nyquist_freq`, an unused first `start`
    timestamp, and commented-out silence-detection/debug lines.
    """
    if len(audio_chunks) != 75:
        print('Skipping, bad length.')
        time.sleep(0.5)
        return

    # Snapshot the list so concurrent appends from the reader thread
    # don't affect this pass.
    a = copy(audio_chunks)
    b = b''.join(a)
    c = np.frombuffer(b, np.int16)

    # Windowed-sinc low-pass kernel (cutoff fs/6 = 8 kHz) to anti-alias
    # before decimating by 3.
    fs = 48000
    cutoff_freq = fs / 6
    num_taps = 101
    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
    taps *= np.blackman(num_taps)
    taps /= np.sum(taps)

    # Apply the filter kernel to the PCM samples using convolution.
    filtered_audio_data = np.convolve(c, taps, mode='same')
    # Downsample by a factor of 3: 48 kHz -> 16 kHz.
    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3)).flatten()

    # Scale the int16 range into float32 [-1, 1) as whisper expects.
    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0

    d = whisper.pad_or_trim(norm_audio)

    start = time.time()
    e = model.transcribe(d, language='en')
    print('transcribed audio in', time.time() - start, 's')

    # Debug aid: if transcription was pathologically slow, dump the audio
    # that caused it and stall this worker so the file can be inspected.
    if time.time() - start > 10:
        with open('downsampled.pcm', 'wb') as f:
            f.write(downsampled_audio_data.astype(np.int16).tobytes())

        print('wrote file, sleeping')
        time.sleep(100)

    print(' ', e['text'])
|
79
audio_only_client.py
Normal file
79
audio_only_client.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
# A python script to do both listening and talking. This is the basic model
|
||||
# for an audio-only mumble client.
|
||||
|
||||
# Usage:
|
||||
|
||||
# Install pyaudio (instructions: https://people.csail.mit.edu/hubert/pyaudio/#downloads)
|
||||
# If `fatal error: 'portaudio.h' file not found` is encountered while installing
|
||||
# pyaudio even after following the instruction, this solution might be of help:
|
||||
# https://stackoverflow.com/questions/33513522/when-installing-pyaudio-pip-cannot-find-portaudio-h-in-usr-local-include
|
||||
#
|
||||
# Install dependencies for pymumble.
|
||||
#
|
||||
# Set up a mumble server. For testing purposes, you can use https://guildbit.com/
|
||||
# to spin up a free server. Hard code the server details in this file.
|
||||
#
|
||||
# run `python3 ./listen_n_talk.py`. Now an audio-only mumble client is connected
|
||||
# to the server.
|
||||
#
|
||||
# To test its functionality, in a separate device, use some official mumble
|
||||
# client (https://www.mumble.com/mumble-download.php) to verbally communicate
|
||||
# with this audio-only client.
|
||||
#
|
||||
# Works on MacOS. Does NOT work on RPi 3B+ (I cannot figure out why. Help will
|
||||
# be much appreciated)
|
||||
|
||||
import pymumble_py3 as pymumble_py3
|
||||
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
|
||||
#import pyaudio
|
||||
|
||||
# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = ""  # password
server = "protospace.ca"  # server address
nick = "python"
port = 64738  # port number


# pyaudio set up (disabled)
#CHUNK = 1024
#FORMAT = pyaudio.paInt16  # pymumble soundchunk.pcm is 16 bits
#CHANNELS = 1
#RATE = 48000  # pymumble soundchunk.pcm is 48000Hz

#p = pyaudio.PyAudio()
#stream = p.open(format=FORMAT,
#                channels=CHANNELS,
#                rate=RATE,
#                input=True,   # enable both talk
#                output=True,  # and listen
#                frames_per_buffer=CHUNK)


# mumble client set up
def sound_received_handler(user, soundchunk):
    """Report each received sound chunk (playback is disabled above)."""
    #stream.write(soundchunk.pcm)
    print(len(soundchunk.pcm))


# Spin up a client and connect to mumble server
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# set up callback called when PCS event occurs
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# constant capturing sound and sending it to mumble server
while True:
    #data = stream.read(CHUNK, exception_on_overflow=False)
    #mumble.sound_output.add_sound(data)
    pass

# Close the stream and pyaudio instance. Disabled along with the pyaudio
# setup above: `stream` and `p` are never defined, so executing these lines
# would raise NameError (they were unreachable after the infinite loop, but
# referencing undefined names is a trap for future edits).
#stream.stop_stream()
#stream.close()
#p.terminate()
|
61
read_file.py
Normal file
61
read_file.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
import os
|
||||
import importlib
|
||||
import threading
|
||||
import time
|
||||
import whisper
|
||||
import traceback
|
||||
|
||||
import audio
|
||||
|
||||
# Load the transcription model once at startup; this is slow, so time it.
print('Loading whisper model...')
start = time.time()
model = whisper.load_model('medium')
print('Done after', time.time() - start, 's')

# array of audio chunks
# Shared rolling buffer of raw PCM chunks, mutated concurrently by the
# reader/transcriber threads defined below.
audio_chunks = [bytearray()]
|
||||
|
||||
def read_audio_thread():
    """Replay whispercppexample.pcm forever, feeding it into the shared
    buffer chunk by chunk as if it were live microphone audio."""
    global audio_chunks

    while True:
        with open('whispercppexample.pcm', 'rb') as pcm_file:
            # 1920 bytes per read, paced at 0.04 s per chunk.
            while chunk := pcm_file.read(1920):
                audio.process_pcm(audio_chunks, chunk)
                time.sleep(0.04)
|
||||
|
||||
def process_stream_thread():
    """Continuously run transcription over the shared buffer.

    Errors are logged and the thread backs off for 5 s instead of dying,
    so a broken hot-reload of audio.py doesn't kill transcription.
    """
    global audio_chunks

    while True:
        try:
            audio.process_stream(audio_chunks, model)
        except Exception:
            # Was `except BaseException as e`, which also swallowed
            # KeyboardInterrupt/SystemExit and made shutdown impossible
            # (and never used `e`).
            print('exception')
            traceback.print_exc()
            print('sleeping...')
            time.sleep(5)
|
||||
|
||||
def monitor_module():
    """Poll audio.py's mtime once per second and hot-reload it on change."""
    last_seen = os.path.getmtime('audio.py')

    while True:
        current = os.path.getmtime('audio.py')
        if current > last_seen:
            last_seen = current
            print('Change detected, reloading.')
            importlib.reload(audio)
        time.sleep(1)
|
||||
|
||||
# Run the reader, transcriber, and hot-reload watcher concurrently.
t1 = threading.Thread(target=read_audio_thread)
t2 = threading.Thread(target=process_stream_thread)
t3 = threading.Thread(target=monitor_module)
t1.start()
t2.start()
t3.start()

# Keep the main thread alive. The original `while True: pass` busy-wait
# pegged a CPU core doing nothing; sleeping is equivalent and free.
while True:
    time.sleep(1)
|
||||
|
90
stream.py
Normal file
90
stream.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
import os

# DEBUG env var toggles verbose logging below.
DEBUG = os.environ.get('DEBUG')

import logging
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
    level=logging.DEBUG if DEBUG else logging.INFO)

import pymumble_py3 as pymumble_py3
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
import whisper
from copy import copy
import numpy as np
import time

# Load the transcription model once at startup (slow).
logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')

# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = "" # password
server = "protospace.ca" # server address
nick = "python"
port = 64738 # port number

# NOTE(review): audio_chunks stores *bytes*, and 24000 bytes of 16-bit
# 48 kHz PCM is 0.25 s, not 0.5 s as the comment says — confirm whether
# the intended chunk was 24000 samples (48000 bytes) or 0.25 s.
CHUNK_LENGTH = 24000 # 48000 Hz * 0.5 s

# array of 0.5 sec audio chunks
audio_chunks = [bytearray()]
|
||||
|
||||
def sound_received_handler(user, soundchunk):
    """Accumulate incoming PCM into fixed-size chunks in audio_chunks.

    pymumble delivers 16-bit 48000 Hz PCM packets. Packets are appended
    to the newest chunk until it reaches CHUNK_LENGTH bytes, then a fresh
    chunk is started. Only the 10 most recent chunks are kept.
    """
    # Start a fresh chunk once the current one is full. BUG FIX: the
    # original `else` branch appended the new empty chunk *instead of*
    # storing the packet, silently dropping one packet of audio at every
    # chunk boundary.
    if len(audio_chunks[-1]) >= CHUNK_LENGTH:
        audio_chunks.append(bytearray())
    audio_chunks[-1].extend(soundchunk.pcm)

    if len(audio_chunks) > 10:
        audio_chunks.pop(0)
|
||||
|
||||
|
||||
# Spin up a client and connect to mumble server
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# set up callback called when PCS event occurs
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1) # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready() # Wait for client is ready

# Main loop: once the buffer holds 10 chunks, repeatedly resample the
# whole window and transcribe it.
while True:
    #data = stream.read(CHUNK, exception_on_overflow=False)
    #mumble.sound_output.add_sound(data)

    # NOTE(review): this busy-spins with no sleep until the buffer fills,
    # and then re-transcribes a largely identical window on every pass.
    if len(audio_chunks) != 10:
        continue

    start = time.time()
    # Snapshot the chunk list; the mumble callback mutates it concurrently.
    a = copy(audio_chunks)
    b = b''.join(a)
    c = np.frombuffer(b, np.int16)

    # Define a low-pass filter kernel
    # (windowed sinc, cutoff fs/6 = 8 kHz, anti-aliasing before decimation)
    fs = 48000
    cutoff_freq = fs / 6
    nyquist_freq = fs / 2  # unused below
    num_taps = 101
    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
    taps *= np.blackman(num_taps)
    taps /= np.sum(taps)

    # Apply the filter kernel to the samples using convolution
    filtered_audio_data = np.convolve(c, taps, mode='same')
    # Downsample by a factor of 3 (48 kHz -> 16 kHz, the rate whisper
    # expects) and normalize int16 into float32 [-1, 1).
    downsampled_audio_data = filtered_audio_data.take(np.arange(0, len(filtered_audio_data), 3))
    downsampled_audio_data = downsampled_audio_data.flatten().astype(np.float32) / 32768.0

    d = whisper.pad_or_trim(downsampled_audio_data)

    #print('processed audio in', time.time() - start, 's')

    e = model.transcribe(d)

    print(e['text'])
|
||||
|
||||
|
||||
|
79
test.py
Normal file
79
test.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
import os
|
||||
|
||||
DEBUG = os.environ.get('DEBUG')
|
||||
|
||||
import logging
|
||||
logging.basicConfig(
|
||||
format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
|
||||
level=logging.DEBUG if DEBUG else logging.INFO)
|
||||
logging.getLogger('aiohttp').setLevel(logging.DEBUG if DEBUG else logging.WARNING)
|
||||
|
||||
|
||||
import ffmpeg
|
||||
import whisper
|
||||
import time
|
||||
import asyncio
|
||||
from aiohttp import web, ClientSession, ClientError
|
||||
import numpy as np
|
||||
|
||||
# aiohttp application and server settings.
app = web.Application()
PORT = 3002
SAMPLE_RATE = 16000  # target rate for load_audio (whisper models use 16 kHz)

# Load the transcription model once at startup (slow).
logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')
|
||||
|
||||
#start = time.time()
|
||||
#result = model.transcribe('whisper-test.ogg')
|
||||
#print('finished in', time.time() - start, 's')
|
||||
#
|
||||
#print(result['text'])
|
||||
|
||||
def load_audio(binary_file, sr = SAMPLE_RATE):
    """Decode arbitrary audio bytes to mono float32 PCM at `sr` Hz.

    Spawns an ffmpeg subprocess to decode, down-mix, and resample.
    Requires the ffmpeg CLI and the `ffmpeg-python` package.
    """
    # stolen from https://github.com/ckaytev/tgisper/blob/main/tgisper/tgisperbot.py
    try:
        decoder = ffmpeg.input("pipe:", threads=0)
        decoder = decoder.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        raw, _ = decoder.run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True, input=binary_file)
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    samples = np.frombuffer(raw, np.int16)
    return samples.flatten().astype(np.float32) / 32768.0
|
||||
|
||||
async def index(request):
    """Health-check route: serve a plain greeting."""
    body = 'hello world'
    return web.Response(text=body, content_type='text/html')
|
||||
|
||||
async def post_whisper(request):
    """Transcribe an uploaded `audio` form file; return whisper's result as JSON."""
    form = await request.post()
    pcm = load_audio(form['audio'].file.read())

    logging.info('Starting audio transcription...')
    result = model.transcribe(pcm)
    logging.info('Done.')

    return web.json_response(result)
|
||||
|
||||
async def run_webserver():
    """Start the aiohttp server on PORT and keep the coroutine alive forever."""
    logging.info('Starting webserver on port: %s', PORT)
    runner = web.AppRunner(app)
    await runner.setup()
    await web.TCPSite(runner, '0.0.0.0', PORT).start()

    # The server runs in the background; just park this coroutine.
    while True:
        await asyncio.sleep(10)
|
||||
|
||||
if __name__ == '__main__':
    app.router.add_get('/', index)
    app.router.add_post('/whisper', post_whisper)

    # asyncio.run replaces the deprecated get_event_loop()/create_task()/
    # run_forever() pattern (and the unused task variable it left behind);
    # run_webserver itself loops forever.
    asyncio.run(run_webserver())
|
||||
|
142
transcribe_demo.py
Normal file
142
transcribe_demo.py
Normal file
|
@ -0,0 +1,142 @@
|
|||
#! python3.7
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import os
|
||||
import speech_recognition as sr
|
||||
import whisper
|
||||
import torch
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
from queue import Queue
|
||||
from tempfile import NamedTemporaryFile
|
||||
from time import sleep
|
||||
from sys import platform
|
||||
|
||||
|
||||
def main():
    """Live-transcribe microphone audio with whisper.

    Records via SpeechRecognition in a background thread, accumulates raw
    audio between pauses, and re-transcribes the growing phrase, printing
    the updated transcription after each pass. Ctrl-C prints the final
    transcription and exits.
    """
    parser = argparse.ArgumentParser()
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Current raw audio bytes.
    last_sample = bytes()
    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice
    # feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = 1000
    # Definitely do this: dynamic energy compensation lowers the energy
    # threshold dramatically to a point where the SpeechRecognizer never
    # stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for linux users.
    # Prevents permanent application hang and crash by using the wrong Microphone
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            # NOTE(review): if no device name matches, `source` stays
            # unbound and the `with source:` below raises NameError —
            # consider an explicit error message here.
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model
    model = 'medium'
    non_english = False
    # BUG FIX: this previously tested `args.model`, but no --model argument
    # is ever defined, so it raised AttributeError on every run. Use the
    # local `model` selection instead.
    if model != "large" and not non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = 2  # seconds
    phrase_timeout = 3  # seconds between new lines

    temp_file = NamedTemporaryFile().name
    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio:sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    last_sample = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Concatenate our current audio data with the latest audio data.
                while not data_queue.empty():
                    data = data_queue.get()
                    last_sample += data

                # Use AudioData to convert the raw data to wav data.
                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
                wav_data = io.BytesIO(audio_data.get_wav_data())

                # Write wav data to the temporary file as bytes.
                with open(temp_file, 'w+b') as f:
                    f.write(wav_data.read())

                # Read the transcription.
                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to
                # our transcription. Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name=='nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)

                # Infinite loops are bad for processors, must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
|
33
write_file.py
Normal file
33
write_file.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
import pymumble_py3 as pymumble_py3
|
||||
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS
|
||||
|
||||
# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = "" # password
server = "protospace.ca" # server address
nick = "python"
port = 64738 # port number

# NOTE(review): despite the .wav name, this file receives raw PCM with no
# WAV header (see the handler below) — the result is a headerless stream,
# not a playable .wav; confirm whether a header should be written.
audio_file = open('audio.wav', 'wb')

# mumble client set up
def sound_received_handler(user, soundchunk):
    """Log each received chunk's size and append its raw PCM to audio_file."""
    print(len(soundchunk.pcm))

    audio_file.write(soundchunk.pcm)

# Spin up a client and connect to mumble server
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# set up callback called when PCS event occurs
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1) # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready() # Wait for client is ready

# Keep recording until interrupted; the finally ensures the capture file
# is flushed and closed on exit.
# NOTE(review): `while True: pass` busy-waits and pegs a CPU core; a
# time.sleep inside the loop would behave identically and idle cheaply.
try:
    while True:
        pass
finally:
    audio_file.close()
|
Loading…
Reference in New Issue
Block a user