# '##::: ##::'#######::'########:'########:
# ###:: ##:'##.... ##:... ##..:: ##.....::
# ####: ##: ##:::: ##:::: ##:::: ##:::::::
# ## ## ##: ##:::: ##:::: ##:::: ######:::
# ##. ####: ##:::: ##:::: ##:::: ##...::::
# ##:. ###: ##:::: ##:::: ##:::: ##:::::::
# ##::. ##:. #######::::: ##:::: ########:
# ..::::..:::.......::::::..:::::........::
# Both local deployment and the live Streamlit deployment launch the main function in tortoise_tts/voiceProtect_app.py, NOT this file. This file is a duplicate kept for ease of accessibility.
# Deploying from the root directory causes threading issues: tortoise_tts must be launched in the main thread, and on launch the main thread is bound to the parent dir of __file__.
# Error: "ValueError: signal only works in main thread of the main interpreter". See Issues.txt for the full trace; open question: how to fix this, or otherwise avoid having to download locally.
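# For reference, the launch command implied above (run from the repo root) would presumably be:
#   streamlit run tortoise_tts/voiceProtect_app.py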
# //////////////////////////////////////////////////////////////////////////////
# PHASED OUT: Try to move script into tortoise_tts module while executing - threading issues:
# absolute_path = os.path.abspath(__file__)
# subdir_path = os.path.abspath("./tortoise_tts")
# target_path = os.path.join(subdir_path, "voiceProtect_app.py")
# st.info(absolute_path)
# st.info(target_path)
# os.rename(absolute_path, target_path)
# os.replace(absolute_path, target_path)
# shutil.move(absolute_path, target_path)
# //////////////////////////////////////////////////////////////////////////////
# System imports
import wave
import sys
import os
import io
from PIL import Image
from glob import glob
import subprocess
# Add the tortoise_tts directory to the import path:
# resolve the absolute path of the directory containing this file, join it with the submodule directory, and append that to sys.path
app_directory = os.path.dirname(os.path.abspath(__file__))
submodule_directory = os.path.join(app_directory, 'tortoise_tts')
sys.path.append(submodule_directory)
# Set the current working directory to the dir containing the main app file (/tortoise_tts)
# Resolves issue: local deploys launch with CWD in the parent dir, Streamlit deploys launch with CWD in the repo root
os.chdir(app_directory)
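# For illustration, when the real entry point (tortoise_tts/voiceProtect_app.py) runs:
#   app_directory       -> <repo>/tortoise_tts
#   submodule_directory -> <repo>/tortoise_tts/tortoise_tts
#   os.getcwd()         -> <repo>/tortoise_tts, so relative paths like '../classifier.pth' resolve to the repo root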
# Main functionality
from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
import pyaudio
import torchaudio
import torch
import torch.nn.functional as F
import streamlit as st
# Misc imports
import librosa
import plotly.express as px
import numpy as np
from scipy.io.wavfile import read
from pydub import AudioSegment
import matplotlib.pyplot as plt
# App function imports
from inputAudio import inputAudio
from configWavPlot import configWavPlot
# Removed backend matplotlib GUI - caused multithreading issues
# import matplotlib
# matplotlib.use("TkAgg") # Change to "Qt5Agg" or "QtAgg" if you prefer
# Load an audio file as a tensor for model input
def load_audio(audiopath, sampling_rate=22000):
if isinstance(audiopath, str):
if audiopath.endswith('.mp3'):
audio, lsr = librosa.load(audiopath, sr=sampling_rate)
audio = torch.FloatTensor(audio)
else:
            assert False, f"Unsupported audio format: {audiopath[-4:]}"
elif isinstance(audiopath, io.BytesIO):
audio, lsr = torchaudio.load(audiopath)
audio = audio[0]
if lsr != sampling_rate:
audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
    # torch.any returns True if any element of the tensor is True;
    # warn if samples fall outside the expected [-1, 1] range
    if torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Error with audio data, Max: {audio.max()} and min: {audio.min()}")
audio.clip_(-1, 1)
return audio.unsqueeze(0)
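# Minimal usage sketch (hypothetical file name; not executed by the app):
#   clip = load_audio("sample.mp3")          # -> FloatTensor of shape [1, num_samples] at 22 kHz
#   clip = load_audio(io.BytesIO(mp3_bytes)) # in-memory uploads (e.g. from st.file_uploader) also work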
# Classify an audio clip with the tortoise-tts classifier
def classify_audio_clip(clip):
    classifier = AudioMiniEncoderWithClassifierHead(2, spec_dim=1, embedding_dim=512, depth=5, downsample_factor=4,
                                                    resnet_blocks=2, attn_blocks=4, num_attn_heads=4, base_channels=32,
                                                    dropout=0, kernel_size=5, distribute_zero_label=False)
state_dict = torch.load('../classifier.pth', map_location=torch.device('cpu'))
classifier.load_state_dict(state_dict)
    clip = clip.cpu().unsqueeze(dim=0)
    results = F.softmax(classifier(clip), dim=-1)
return results[0][0]
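# Minimal usage sketch (not executed by the app; assumes classifier.pth sits one
# directory above the CWD, as the relative path above expects):
#   prob_fake = classify_audio_clip(load_audio("sample.mp3")).item()
#   print(f"Probability of deepfake audio: {prob_fake:.2%}")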
# Create Streamlit app layout
def setStreamlitGUI():
    # App GUI
    # st.info(f"Current working directory: {os.getcwd()}")
    st.set_page_config(layout="wide")
logo = Image.open('../resources/VoiceProtect-logo.png')
# Resize, maintain aspect ratio
logo.thumbnail((600, 600))
st.image(logo, width=50, use_column_width="auto")
st.title("VoiceProtect - Deepfake Audio Detection")
st.info("Implemented with Tortoise Text to Speech Library. Classifier.pth model can be found below:")
st.info("https://huggingface.co/jbetker/tortoise-tts-v2")
col1, col2 = st.columns(2)
# Upload local mp3 file, use as input to model
with col1:
st.info("Upload a .mp3 file from your local machine to analyze")
        uploaded_file = st.file_uploader("Choose a file", type='mp3')
if uploaded_file is not None:
if st.button("Analyze audio"):
st.info("Analyzing Audio...")
row1, row2, row3 = st.columns(3)
with row1:
audio_clip = load_audio(uploaded_file)
results = classify_audio_clip(audio_clip)
results = results.item()
st.info("your results are below")
st.info(f"The probability of deepfake audio is: {results}")
st.success(f"The uploaded audio is {results * 100: .2f}% likely to be AI generated")
with row2:
st.info("audio player")
st.audio(uploaded_file)
                with row3:
                    st.info("Waveform plot")
                    # Convert the uploaded mp3 to wav; generateWavePlot needs a WAV path for matplotlib
                    uploaded_file.seek(0)  # rewind: the buffer was already consumed by load_audio above
                    output_wav_file = '../resources/upload.wav'
                    AudioSegment.from_mp3(uploaded_file).export(output_wav_file, format="wav")
                    absolute_path = os.path.abspath(output_wav_file)
                    generateWavePlot(absolute_path)
# Record User audio data, use as input to model
with col2:
st.info("Record a live, 5 second audio clip live to analyze")
if st.button("Record audio"):
# Record audio and save as wav file "output.wav"
pyaudioStream()
row1, row2, row3 = st.columns(3)
with row1:
                # Path to the recorded MP3 for model input
relative_path = "../resources/output.mp3"
absolute_path = os.path.abspath(relative_path)
audio_clip = load_audio(absolute_path)
results = classify_audio_clip(audio_clip)
results = results.item()
st.info("your results are below")
st.info(f"The probability of deepfake audio is : {results}")
st.info(f"The uploaded audio is {results * 100: .2f}% likely to be AI generated")
with row2:
st.info("audio player")
st.audio(absolute_path)
with row3:
# Path to WAV for matplotlib
relative_path = "../resources/output.wav"
absolute_path = os.path.abspath(relative_path)
generateWavePlot(absolute_path)
# Record user audio stream with pyaudio
# Convert to mp3 using pydub (load_audio() accepts mp3)
def pyaudioStream():
inputAudio.pyaudioStream()
# Generate and stylize waveform plot from input .WAV path
def generateWavePlot(path: str):
configWavPlot.generateWavePlot(path)
# Run script
def main():
setStreamlitGUI()
if __name__ == "__main__":
main()