parent
89cb732e42
commit
41bc6015bf
7 changed files with 557 additions and 0 deletions
@ -0,0 +1,73 @@ |
||||
from copy import copy |
||||
import numpy as np |
||||
import time |
||||
import whisper |
||||
|
||||
# Printed every time this module is imported (or re-imported).
print('Loaded audio.py')

# 48000 Hz * 0.5 s
CHUNK_LENGTH = 48000 // 2
||||
|
||||
def process_pcm(audio_chunks, data, max_chunks=75):
    """Append one PCM packet to the rolling buffer, dropping the oldest ones.

    Args:
        audio_chunks: Mutable list of bytes-like PCM packets, oldest first.
            It is modified in place.
        data: The newly received packet. Per the original note in this file,
            pymumble PCM is 16-bit 48000 Hz.
        max_chunks: Maximum number of packets kept in ``audio_chunks``.
            Defaults to 75, the previously hard-coded limit.
    """
    audio_chunks.append(data)

    # Trim with a single slice deletion so the buffer is brought back to size
    # even if it was already over the limit when this function was called.
    overflow = len(audio_chunks) - max_chunks
    if overflow > 0:
        del audio_chunks[:overflow]
||||
|
||||
def _lowpass_taps(fs=48000, cutoff_freq=48000 / 6, num_taps=101):
    """Return a unity-gain, Blackman-windowed sinc low-pass FIR kernel."""
    taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
    taps *= np.blackman(num_taps)
    taps /= np.sum(taps)
    return taps


def _downsample_by_three(samples):
    """Low-pass filter ``samples`` and keep every third value (48000 -> 16000 Hz)."""
    filtered = np.convolve(samples, _lowpass_taps(), mode='same')
    return filtered[::3]


def process_stream(audio_chunks, model, expected_chunks=75, idle_sleep=0.5):
    """Transcribe the current rolling window of PCM packets and print the text.

    Args:
        audio_chunks: List of bytes-like 16-bit 48000 Hz PCM packets. Another
            thread may be appending to it, so a snapshot is taken first.
        model: Object with a ``transcribe(audio, language=...)`` method that
            returns a mapping containing a ``'text'`` entry.
        expected_chunks: Window size required before transcribing. Defaults
            to 75, the previously hard-coded value.
        idle_sleep: Seconds to wait when the window is not ready yet.

    Returns:
        None. The transcription is printed to stdout.
    """
    # Snapshot before checking the length so the check and the data agree even
    # while the producer thread keeps mutating the list.
    snapshot = list(audio_chunks)
    if len(snapshot) != expected_chunks:
        print('Skipping, bad length.')
        time.sleep(idle_sleep)
        return

    raw = b''.join(snapshot)
    # int16 samples need an even byte count; drop a dangling trailing byte.
    raw = raw[:len(raw) - len(raw) % 2]
    samples = np.frombuffer(raw, np.int16)

    downsampled_audio_data = _downsample_by_three(samples)
    norm_audio = downsampled_audio_data.astype(np.float32) / 32768.0
    padded = whisper.pad_or_trim(norm_audio)

    start = time.time()
    result = model.transcribe(padded, language='en')
    elapsed = time.time() - start
    print('transcribed audio in', elapsed, 's')

    if elapsed > 10:
        # Debugging aid: keep the audio that was slow to transcribe, then
        # pause for a long time so it can be inspected.
        with open('downsampled.pcm', 'wb') as f:
            f.write(downsampled_audio_data.astype(np.int16).tobytes())

        print('wrote file, sleeping')
        time.sleep(100)

    print(' ', result['text'])
@ -0,0 +1,79 @@ |
||||
# A python script to do both listening and talking. This is the basic model |
||||
# for an audio-only mumble client. |
||||
|
||||
# Usage: |
||||
|
||||
# Install pyaudio (instructions: https://people.csail.mit.edu/hubert/pyaudio/#downloads) |
||||
# If `fatal error: 'portaudio.h' file not found` is encountered while installing |
||||
# pyaudio even after following the instruction, this solution might be of help: |
||||
# https://stackoverflow.com/questions/33513522/when-installing-pyaudio-pip-cannot-find-portaudio-h-in-usr-local-include |
||||
# |
||||
# Install dependencies for pymumble. |
||||
# |
||||
# Set up a mumble server. For testing purposes, you can use https://guildbit.com/ |
||||
# to spin up a free server. Hard code the server details in this file. |
||||
# |
||||
# run `python3 ./listen_n_talk.py`. Now an audio-only mumble client is connected |
||||
# to the server. |
||||
# |
||||
# To test its functionality, in a separate device, use some official mumble |
||||
# client (https://www.mumble.com/mumble-download.php) to verbally communicate |
||||
# with this audio-only client. |
||||
# |
||||
# Works on MacOS. Does NOT work on RPi 3B+ (I cannot figure out why. Help will |
||||
# be much appreciated) |
||||
|
||||
import pymumble_py3 as pymumble_py3 |
||||
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS |
||||
#import pyaudio |
||||
|
||||
# Where and how to connect to the mumble server. The values are fixed for
# now; the plan is to take them from the command line eventually.
server, port = "protospace.ca", 64738  # server address, port number
nick = "python"
pwd = ""  # password
||||
|
||||
|
||||
# pyaudio set up |
||||
#CHUNK = 1024 |
||||
#FORMAT = pyaudio.paInt16 # pymumble soundchunk.pcm is 16 bits |
||||
#CHANNELS = 1 |
||||
#RATE = 48000 # pymumble soundchunk.pcm is 48000Hz |
||||
|
||||
#p = pyaudio.PyAudio() |
||||
#stream = p.open(format=FORMAT, |
||||
# channels=CHANNELS, |
||||
# rate=RATE, |
||||
# input=True, # enable both talk |
||||
# output=True, # and listen |
||||
# frames_per_buffer=CHUNK) |
||||
|
||||
|
||||
# mumble client set up |
||||
def sound_received_handler(user, soundchunk):
    """Print the size of each sound chunk received from the mumble server.

    Playback through pyaudio is disabled in this script, so the callback only
    reports the length of ``soundchunk.pcm``. ``user`` is not used.
    """
    pcm_length = len(soundchunk.pcm)
    print(pcm_length)
||||
|
||||
|
||||
# Spin up a client and connect to the mumble server.
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# Register the callback that is called whenever a PCS event occurs.
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# Microphone capture is disabled (the pyaudio setup is commented out), so the
# main thread only needs to stay alive. Sleep rather than spin so the loop
# does not pin a CPU core.
import time

try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass

# The pyaudio teardown that used to follow the loop was unreachable and
# referred to `stream` and `p`, which exist only in commented-out code.
@ -0,0 +1,61 @@ |
||||
import os |
||||
import importlib |
||||
import threading |
||||
import time |
||||
import whisper |
||||
import traceback |
||||
|
||||
import audio |
||||
|
||||
# Load the model once at start-up and report how long it took.
print('Loading whisper model...')
start = time.time()
model = whisper.load_model('medium')
load_seconds = time.time() - start
print('Done after', load_seconds, 's')

# List of audio chunks shared by the threads below; it starts out holding a
# single empty bytearray.
audio_chunks = [bytearray()]
||||
|
||||
def read_audio_thread():
    """Feed 'whispercppexample.pcm' into the shared chunk buffer, forever.

    The file is read 1920 bytes at a time. Each piece is handed to
    ``audio.process_pcm`` together with ``audio_chunks``, followed by a
    0.04 s pause. At end of file the file is reopened and replayed.
    """
    packet_size = 1920

    while True:
        with open('whispercppexample.pcm', 'rb') as pcm_file:
            # iter() with a sentinel stops at the first empty read (EOF).
            for packet in iter(lambda: pcm_file.read(packet_size), b''):
                audio.process_pcm(audio_chunks, packet)
                time.sleep(0.04)
||||
|
||||
def process_stream_thread():
    """Call ``audio.process_stream`` in an endless loop, surviving failures.

    ``audio.process_stream`` is looked up on every iteration, so a reloaded
    ``audio`` module takes effect on the next pass. When a call raises, the
    traceback is printed and the loop pauses for 5 seconds before retrying.
    """
    while True:
        try:
            audio.process_stream(audio_chunks, model)
        except Exception:
            # Exception rather than BaseException, so SystemExit and other
            # interpreter-level exits are not swallowed by the retry loop.
            print('exception')
            traceback.print_exc()
            print('sleeping...')
            time.sleep(5)
||||
|
||||
def monitor_module():
    """Reload the ``audio`` module whenever 'audio.py' changes on disk.

    The file's modification time is polled once per second. A transient
    failure to stat the file, or an error raised while reloading (for example
    a syntax error in a half-edited file), is reported and the watcher keeps
    running instead of dying.
    """
    mod_time = os.path.getmtime('audio.py')

    while True:
        time.sleep(1)

        try:
            current = os.path.getmtime('audio.py')
        except OSError:
            # The file can be briefly missing while it is being rewritten.
            continue

        if current <= mod_time:
            continue

        # Remember the value that was actually compared, instead of reading
        # the mtime a second time.
        mod_time = current
        print('Change detected, reloading.')
        try:
            importlib.reload(audio)
        except Exception:
            traceback.print_exc()
||||
|
||||
# Daemon threads do not keep the interpreter alive, so Ctrl+C in the main
# thread is enough to end the whole program.
t1 = threading.Thread(target=read_audio_thread, daemon=True)
t2 = threading.Thread(target=process_stream_thread, daemon=True)
t3 = threading.Thread(target=monitor_module, daemon=True)

for worker in (t1, t2, t3):
    worker.start()

# Keep the main thread alive without spinning a CPU core.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
||||
|
@ -0,0 +1,90 @@ |
||||
import os |
||||
|
||||
# A non-empty DEBUG environment variable switches logging to DEBUG level.
DEBUG = os.environ.get('DEBUG')

import logging

_log_level = logging.INFO
if DEBUG:
    _log_level = logging.DEBUG

logging.basicConfig(
    level=_log_level,
    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
)
||||
|
||||
import pymumble_py3 as pymumble_py3 |
||||
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS |
||||
import whisper |
||||
from copy import copy |
||||
import numpy as np |
||||
import time |
||||
|
||||
logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')

# Where and how to connect to the mumble server. The values are fixed for
# now; the plan is to take them from the command line eventually.
server, port = "protospace.ca", 64738  # server address, port number
nick = "python"
pwd = ""  # password

# 48000 Hz * 0.5 s
# NOTE(review): the sound handler compares this against a byte count, and
# 16-bit samples take two bytes each, so a chunk may hold closer to 0.25 s of
# audio -- confirm the intended duration.
CHUNK_LENGTH = 48000 // 2

# List of audio chunks, oldest first; it starts with one empty chunk.
audio_chunks = [bytearray()]
||||
|
||||
def sound_received_handler(user, soundchunk):
    """Accumulate received PCM into fixed-size chunks, keeping the newest 10.

    Args:
        user: Sender of the sound; not used.
        soundchunk: Object whose ``pcm`` attribute holds the received bytes
            (per the original note, pymumble PCM is 16-bit 48000 Hz).
    """
    if len(audio_chunks[-1]) < CHUNK_LENGTH:
        audio_chunks[-1].extend(soundchunk.pcm)
    else:
        # Start the next chunk with this packet. Appending an empty chunk
        # here silently dropped the packet that triggered the roll-over.
        audio_chunks.append(bytearray(soundchunk.pcm))

    if len(audio_chunks) > 10:
        audio_chunks.pop(0)
||||
|
||||
|
||||
# Spin up a client and connect to the mumble server.
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# Register the callback that is called whenever a PCS event occurs.
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# Low-pass filter kernel applied before dropping from 48000 Hz to 16000 Hz.
# It never changes, so it is built once instead of on every loop iteration.
fs = 48000
cutoff_freq = fs / 6
num_taps = 101
taps = np.sinc(2 * cutoff_freq / fs * (np.arange(num_taps) - (num_taps - 1) / 2))
taps *= np.blackman(num_taps)
taps /= np.sum(taps)

# Repeatedly transcribe the buffered audio once the window is full.
while True:
    if len(audio_chunks) != 10:
        # Not enough audio yet; sleep briefly rather than spinning a core.
        time.sleep(0.05)
        continue

    # Copy the list first because the sound callback keeps mutating it.
    raw = b''.join(list(audio_chunks))
    # int16 samples need an even byte count; drop a dangling trailing byte.
    raw = raw[:len(raw) - len(raw) % 2]
    samples = np.frombuffer(raw, np.int16)

    # Apply the filter kernel using convolution, then keep every third sample.
    filtered_audio_data = np.convolve(samples, taps, mode='same')
    downsampled_audio_data = filtered_audio_data[::3].astype(np.float32) / 32768.0

    padded = whisper.pad_or_trim(downsampled_audio_data)
    result = model.transcribe(padded)

    print(result['text'])
||||
|
||||
|
||||
|
@ -0,0 +1,79 @@ |
||||
import os |
||||
|
||||
# A non-empty DEBUG environment variable switches logging to DEBUG level.
DEBUG = os.environ.get('DEBUG')

import logging

_log_level = logging.INFO
if DEBUG:
    _log_level = logging.DEBUG

logging.basicConfig(
    level=_log_level,
    format='[%(asctime)s] %(levelname)s %(module)s/%(funcName)s: - %(message)s',
)

# aiohttp's own logger is limited to warnings unless DEBUG is set.
_aiohttp_level = logging.WARNING
if DEBUG:
    _aiohttp_level = logging.DEBUG
logging.getLogger('aiohttp').setLevel(_aiohttp_level)
||||
|
||||
|
||||
import ffmpeg |
||||
import whisper |
||||
import time |
||||
import asyncio |
||||
from aiohttp import web, ClientSession, ClientError |
||||
import numpy as np |
||||
|
||||
# Port the web server listens on, and the default sample rate (Hz) that
# uploaded audio is resampled to.
PORT = 3002
SAMPLE_RATE = 16000

app = web.Application()

logging.info('Loading whisper model...')
model = whisper.load_model('medium')
logging.info('Done.')
||||
|
||||
#start = time.time() |
||||
#result = model.transcribe('whisper-test.ogg') |
||||
#print('finished in', time.time() - start, 's') |
||||
# |
||||
#print(result['text']) |
||||
|
||||
def load_audio(binary_file, sr=SAMPLE_RATE):
    """Decode an encoded audio payload into mono float32 samples.

    The bytes in ``binary_file`` are piped to the ffmpeg CLI, which is asked
    for single-channel signed 16-bit little-endian PCM at ``sr`` Hz. Requires
    the ffmpeg CLI and the `ffmpeg-python` package to be installed.

    Returns:
        A float32 numpy array: the int16 samples divided by 32768.0.

    Raises:
        RuntimeError: If ffmpeg fails; the message carries ffmpeg's stderr.
    """
    # stolen from https://github.com/ckaytev/tgisper/blob/main/tgisper/tgisperbot.py
    pipeline = ffmpeg.input("pipe:", threads=0)
    pipeline = pipeline.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)

    try:
        # Launches a subprocess that decodes, down-mixes and resamples.
        raw_pcm, _ = pipeline.run(
            cmd="ffmpeg",
            capture_stdout=True,
            capture_stderr=True,
            input=binary_file,
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    samples = np.frombuffer(raw_pcm, np.int16).flatten()
    return samples.astype(np.float32) / 32768.0
||||
|
||||
async def index(request):
    """Serve the landing page: a fixed 'hello world' body sent as text/html."""
    body = 'hello world'
    return web.Response(text=body, content_type='text/html')
||||
|
||||
from concurrent.futures import ThreadPoolExecutor

# A single worker thread: decoding and transcription run off the event loop,
# but still one request at a time, as they did when they blocked the loop.
_WHISPER_EXECUTOR = ThreadPoolExecutor(max_workers=1)


async def post_whisper(request):
    """Transcribe the audio file uploaded in the ``audio`` form field.

    Decoding and transcription are blocking calls, so they run in a worker
    thread and the server stays responsive while a request is processed.

    Returns:
        A JSON response containing the result of ``model.transcribe``.

    Raises:
        web.HTTPBadRequest: If the ``audio`` field is missing, is not a file
            upload, or cannot be decoded by ``load_audio``.
    """
    data = await request.post()

    upload = data.get('audio')
    if upload is None or not hasattr(upload, 'file'):
        raise web.HTTPBadRequest(text='Expected a file upload in the "audio" form field.')

    loop = asyncio.get_event_loop()
    payload = upload.file.read()

    try:
        audio = await loop.run_in_executor(_WHISPER_EXECUTOR, load_audio, payload)
    except RuntimeError as err:
        # load_audio raises RuntimeError when ffmpeg cannot decode the data.
        raise web.HTTPBadRequest(text=str(err)) from err

    logging.info('Starting audio transcription...')
    result = await loop.run_in_executor(_WHISPER_EXECUTOR, model.transcribe, audio)
    logging.info('Done.')

    return web.json_response(result)
||||
|
||||
async def run_webserver():
    """Start the aiohttp site on 0.0.0.0:PORT and then never return."""
    logging.info('Starting webserver on port: %s', PORT)

    app_runner = web.AppRunner(app)
    await app_runner.setup()
    await web.TCPSite(app_runner, '0.0.0.0', PORT).start()

    # Keep this coroutine alive indefinitely by sleeping in 10 s steps.
    nap_seconds = 10
    while True:
        await asyncio.sleep(nap_seconds)
||||
|
||||
if __name__ == '__main__':
    app.router.add_get('/', index)
    app.router.add_post('/whisper', post_whisper)

    # run_webserver() never returns, so asyncio.run() keeps serving until the
    # process is interrupted. It also creates and closes the event loop
    # itself, replacing get_event_loop() + create_task() + run_forever().
    asyncio.run(run_webserver())
||||
|
@ -0,0 +1,142 @@ |
||||
#! python3.7 |
||||
|
||||
import argparse |
||||
import io |
||||
import os |
||||
import speech_recognition as sr |
||||
import whisper |
||||
import torch |
||||
|
||||
from datetime import datetime, timedelta |
||||
from queue import Queue |
||||
from tempfile import NamedTemporaryFile |
||||
from time import sleep |
||||
from sys import platform |
||||
|
||||
|
||||
def main():
    """Transcribe microphone audio in near real time and print the result.

    Audio is recorded in the background with SpeechRecognition, queued as raw
    bytes, and re-transcribed with Whisper whenever new data arrives. A gap
    longer than ``phrase_timeout`` seconds between recordings starts a new
    transcript line; otherwise the current line is replaced with the updated
    text. Ctrl+C stops the loop and prints the full transcription.

    Command line options:
        --model: Whisper model name (default "medium"). Unless it is "large"
            or --non_english is given, the ".en" variant is loaded.
        --non_english: Do not switch to the English-only model.
        --default_microphone: (Linux only) part of the microphone name to
            use, or "list" to print the available devices and exit.
    """
    parser = argparse.ArgumentParser()
    # `args.model` is read below, so the option has to exist; the defaults
    # reproduce the previously hard-coded 'medium' / English-only setup.
    parser.add_argument("--model", default="medium", help="Model to use",
                        choices=["tiny", "base", "small", "medium", "large"])
    parser.add_argument("--non_english", action='store_true',
                        help="Don't use the english model.")
    if 'linux' in platform:
        parser.add_argument("--default_microphone", default='pulse',
                            help="Default microphone name for SpeechRecognition. "
                                 "Run this with 'list' to view available Microphones.", type=str)
    args = parser.parse_args()

    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Current raw audio bytes.
    last_sample = bytes()
    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = 1000
    # Definitely do this, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for linux users.
    # Prevents permanent application hang and crash by using the wrong Microphone
    if 'linux' in platform:
        mic_name = args.default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for name in sr.Microphone.list_microphone_names():
                print(f"Microphone with name \"{name}\" found")
            return

        for index, name in enumerate(sr.Microphone.list_microphone_names()):
            if mic_name in name:
                source = sr.Microphone(sample_rate=16000, device_index=index)
                break
        else:
            # Without this, `source` would be unbound further down.
            print(f"No microphone matching \"{mic_name}\" found.")
            return
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model
    model = args.model
    if args.model != "large" and not args.non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = 2  # seconds
    phrase_timeout = 3  # seconds between new lines

    # Create a real temporary file and keep only its path, instead of taking
    # the name of a temporary file that is deleted as soon as it is collected.
    with NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        temp_file = tmp.name
    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push it into the thread safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    last_sample = bytes()
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now

                # Concatenate our current audio data with the latest audio data.
                pieces = []
                while not data_queue.empty():
                    pieces.append(data_queue.get())
                last_sample += b''.join(pieces)

                # Use AudioData to convert the raw data to wav data.
                audio_data = sr.AudioData(last_sample, source.SAMPLE_RATE, source.SAMPLE_WIDTH)

                # Write wav data to the temporary file as bytes.
                with open(temp_file, 'w+b') as f:
                    f.write(audio_data.get_wav_data())

                # Read the transcription.
                result = audio_model.transcribe(temp_file, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                os.system('cls' if os.name == 'nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)

            # Infinite loops are bad for processors, must sleep. Sleeping on
            # every pass keeps the loop from spinning while the queue is empty.
            sleep(0.25)
        except KeyboardInterrupt:
            break

    # Best-effort cleanup of the scratch wav file.
    try:
        os.remove(temp_file)
    except OSError:
        pass

    print("\n\nTranscription:")
    for line in transcription:
        print(line)


if __name__ == "__main__":
    main()
@ -0,0 +1,33 @@ |
||||
import pymumble_py3 as pymumble_py3 |
||||
from pymumble_py3.callbacks import PYMUMBLE_CLBK_SOUNDRECEIVED as PCS |
||||
|
||||
# Connection details for mumble server. Hardcoded for now, will have to be
# command line arguments eventually
pwd = ""  # password
server = "protospace.ca"  # server address
nick = "python"
port = 64738  # port number

import time
import wave

# Write a real WAV container so 'audio.wav' is playable; the received PCM was
# previously dumped into the file with no header.
# NOTE(review): assumes pymumble delivers mono 16-bit PCM at 48000 Hz, as the
# comments elsewhere in this commit state -- confirm.
audio_file = wave.open('audio.wav', 'wb')
audio_file.setnchannels(1)
audio_file.setsampwidth(2)
audio_file.setframerate(48000)


# mumble client set up
def sound_received_handler(user, soundchunk):
    """Print the size of each received sound chunk and append it to 'audio.wav'.

    ``user`` is not used.
    """
    print(len(soundchunk.pcm))

    audio_file.writeframes(soundchunk.pcm)


# Spin up a client and connect to the mumble server.
mumble = pymumble_py3.Mumble(server, nick, password=pwd, port=port)
# Register the callback that is called whenever a PCS event occurs.
mumble.callbacks.set_callback(PCS, sound_received_handler)
mumble.set_receive_sound(1)  # Enable receiving sound from mumble server
mumble.start()
mumble.is_ready()  # Wait until the client is ready

# Keep the main thread alive while sound is recorded. Sleep instead of
# spinning, and always close the file so the WAV header is finalized.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    pass
finally:
    audio_file.close()
Loading…
Reference in new issue