Transforms for applying data augmentation to audio tasks. Inspired by FastAudio and torch-audiomentations.
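These transforms take an apply probability p and respond to split_idx (0 for training behaviour, 1 for validation), following the fastai RandTransform convention. As a hedged sketch (the composition below is illustrative, and it assumes duration is given in seconds as the examples on this page suggest), several waveform transforms can be chained with fastcore's Pipeline:

from fastcore.transform import Pipeline

# Illustrative only: chain a crop/pad, a gain change, and added noise for training
aug = Pipeline([RandomCropPad(2), Volume(p=0.5, gain=-6), Noise(p=0.25, noise_level=0.05)],
               split_idx=0)
audio = TensorAudio.create(TEST_AUDIO)
augmented = aug(audio)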
with less_random():
    _,axs = plt.subplots(1,2,figsize=(8,4))
    f = Flip()
    for ax in axs: f(audio, split_idx=0).show(ctx=ax, hear=False)
with less_random():
    _,axs = plt.subplots(1,3,figsize=(12,4))
    f = Roll(p=1,max_roll=0.5)
    for ax in axs: f(audio, split_idx=0).show(ctx=ax, hear=False)
RandomCropPad will pad using one of the following five modes if the input length is less than duration or samples.
with less_random():
    _,axs = plt.subplots(1,4,figsize=(18,4))
    for ax,padmode in zip(axs.flatten(), [AudioPadMode.Constant, AudioPadMode.ConstantPre,
                                          AudioPadMode.ConstantPost, AudioPadMode.Repeat]):
        rcp = RandomCropPad(4, padmode=padmode)
        rcp(audio, split_idx=1).show(ctx=ax, title=padmode, hear=False)
    plt.tight_layout()
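As a quick numeric check of the padding behaviour, a hedged sketch (assuming duration is given in seconds and that the returned TensorAudio keeps its sample rate in .sr, as used further below):

rcp = RandomCropPad(4, padmode=AudioPadMode.Repeat)
padded = rcp(audio, split_idx=1)
# If the input is shorter than 4 seconds, it should be padded up to roughly that length
print(padded.shape[-1] / padded.sr)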
During training, RandomCropPad will randomly crop if the input length is greater than duration or samples.
with less_random():
    _,axs = plt.subplots(1,3,figsize=(12,4))
    rcp = RandomCropPad(1.2)
    for ax in axs: rcp(audio, split_idx=0).show(ctx=ax, hear=False)
During validation or prediction, RandomCropPad will center crop if the input length is greater than duration or samples.
with less_random():
    _,axs = plt.subplots(1,3,figsize=(12,4))
    rcp = RandomCropPad(1.2)
    for ax in axs: rcp(audio, split_idx=1).show(ctx=ax, hear=False)
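To make the train/validation difference concrete, a small hedged sketch under the same assumptions as above:

rcp = RandomCropPad(1.2)
train_crop = rcp(audio, split_idx=0)  # crop position is sampled randomly each call
valid_crop = rcp(audio, split_idx=1)  # deterministic center crop
# Both outputs should be roughly 1.2 seconds long
print(train_crop.shape[-1] / audio.sr, valid_crop.shape[-1] / audio.sr)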
_,axs = plt.subplots(1,4,figsize=(18,4))
for ax,gain in zip(axs.flatten(), [-18, -6, 0, 6]):
    vol = Volume(p=1., gain=gain)
    vol(audio, split_idx=0).show(ctx=ax, title=f'{gain} dB', hear=False)
_,axs = plt.subplots(1,1,figsize=(4,4))
PeakNorm(p=1.)(audio, split_idx=0).show(ctx=axs, hear=False)
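Assuming PeakNorm performs standard peak normalization, i.e. rescales the waveform so its largest absolute sample is 1, a quick check would be:

normed = PeakNorm(p=1.)(TensorAudio.create(TEST_AUDIO), split_idx=0)
print(normed.abs().max())  # expected to be close to 1.0 under that assumption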
with less_random():
    cn = NoiseColor.White
    _,axs = plt.subplots(1,4,figsize=(22,4))
    plt.suptitle(f'{cn}')
    for ax,nl in zip(axs.flatten(), [0.02, 0.05, 0.1, 0.2]):
        ps = Noise(p=1., noise_level=nl, color=cn)
        # Recreate the audio each iteration so noise isn't compounded across panels
        audio = TensorAudio.create(TEST_AUDIO)
        ps(audio, split_idx=0).show(ctx=ax, title=f'noise_level: {nl}', hear=False)
with less_random():
    _,axs = plt.subplots(1,4,figsize=(22,4))
    ps = PitchShift(p=1.)
    for ax in axs.flatten():
        audio = TensorAudio.create(TEST_AUDIO)
        # Add a batch dimension before applying, then remove it for display
        ps(audio.unsqueeze(0), split_idx=0).squeeze(0).show(ctx=ax, title=f'shift: {ps.shift}', hear=False)
_,axs = plt.subplots(1,4,figsize=(22,4))
for ax,steps in zip(axs.flatten(), [2, 3, 5, 7]):
    ps = PitchShiftTA(p=1., n_steps=steps)
    audio = TensorAudio.create(TEST_AUDIO)
    ps(audio.unsqueeze(0), split_idx=0).squeeze(0).show(ctx=ax, title=f'n_steps: {steps}', hear=False)
with less_random():
    _,axs = plt.subplots(1,4,figsize=(22,4))
    ts = TimeStretch(p=1.)
    for ax in axs.flatten():
        audio = TensorAudio.create(TEST_AUDIO)
        ts(audio.unsqueeze(0), split_idx=0).squeeze(0).show(ctx=ax, title=f'stretch: {ts.stretch}', hear=False)
with less_random(2998):
    _,axs = plt.subplots(1,4,figsize=(22,4))
    psts = PitchShiftOrTimeStretch(p=1.)
    for ax in axs.flatten():
        audio = TensorAudio.create(TEST_AUDIO)
        psts(audio.unsqueeze(0), split_idx=0).squeeze(0).show(ctx=ax, title='stretch' if sum(psts.stretch_idxs) else 'pitch', hear=False)
mel = MelSpectrogram(audio.sr, hop_length=1024, n_fft=1024, n_mels=112)(audio)
mel.show(to_db=True)
with less_random():
    _,axs = plt.subplots(1,3,figsize=(12,4))
    tm = TimeMasking(p=1., max_mask=0.5)
    for ax in axs:
        mel = MelSpectrogram(audio.sr, hop_length=1024, n_fft=1024, n_mels=112)(audio)
        tm(mel.unsqueeze(0), split_idx=0).squeeze(0).show(ctx=ax, to_db=True)
with less_random():
    _,axs = plt.subplots(1,3,figsize=(12,4))
    fm = FrequencyMasking(p=1., max_mask=0.5)
    for ax in axs:
        mel = MelSpectrogram(audio.sr, hop_length=1024, n_fft=1024, n_mels=112)(audio)
        fm(mel.unsqueeze(0), split_idx=0).squeeze(0).show(ctx=ax, to_db=True)
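The two masking transforms are often combined, SpecAugment-style. A hedged sketch (the composition is illustrative, again assuming fastcore's Pipeline dispatches split_idx as above):

from fastcore.transform import Pipeline

spec_aug = Pipeline([TimeMasking(p=0.5, max_mask=0.2), FrequencyMasking(p=0.5, max_mask=0.2)],
                    split_idx=0)
mel = MelSpectrogram(audio.sr, hop_length=1024, n_fft=1024, n_mels=112)(audio)
# Add and remove the batch dimension as in the cells above
spec_aug(mel.unsqueeze(0)).squeeze(0).show(to_db=True)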