musicmouse/espmusicmouse/host_driver/audio_analysis.py

130 lines
4.7 KiB
Python

from typing import overload
import librosa
from numba import jit
import numpy as np
@jit(nopython=True)
def normalize_feature_sequence(X, norm='2', threshold=0.0001, v=None):
    """Normalize every column of a feature matrix independently.

    Notebook: C3/C3S1_FeatureNormalization.ipynb

    Args:
        X (np.ndarray): Feature sequence of shape (K, N); each of the N columns is one feature vector
        norm (str): Norm to apply: '1', '2', 'max' or 'z' (Default value = '2')
        threshold (float): A column whose norm (standard deviation for 'z') does not exceed this
            value is replaced by ``v`` instead of being divided (Default value = 0.0001)
        v (float or np.ndarray): Replacement written into such columns. If None, a constant vector
            is used: ``1/K`` for '1', ``1/sqrt(K)`` for '2', ones for 'max', zeros for 'z'
            (Default value = None)

    Returns:
        X_norm (np.ndarray): Normalized feature sequence with the same shape as ``X``
    """
    assert norm in ['1', '2', 'max', 'z']

    num_bins, num_frames = X.shape
    X_norm = np.zeros((num_bins, num_frames))

    # The body sticks to explicit loops and basic NumPy calls so that the
    # numba nopython decorator applied to this function can compile it.
    if norm == '1':
        # Sum of absolute values; fallback is the uniform vector with L1 norm 1.
        if v is None:
            v = np.ones(num_bins, dtype=np.float64) / num_bins
        for frame in range(num_frames):
            column = X[:, frame]
            magnitude = np.sum(np.abs(column))
            if magnitude > threshold:
                X_norm[:, frame] = column / magnitude
            else:
                X_norm[:, frame] = v
    elif norm == '2':
        # Euclidean length; fallback is the uniform vector with L2 norm 1.
        if v is None:
            v = np.ones(num_bins, dtype=np.float64) / np.sqrt(num_bins)
        for frame in range(num_frames):
            column = X[:, frame]
            magnitude = np.sqrt(np.sum(column ** 2))
            if magnitude > threshold:
                X_norm[:, frame] = column / magnitude
            else:
                X_norm[:, frame] = v
    elif norm == 'max':
        # Largest absolute entry; fallback is the all-ones vector.
        if v is None:
            v = np.ones(num_bins, dtype=np.float64)
        for frame in range(num_frames):
            column = X[:, frame]
            magnitude = np.max(np.abs(column))
            if magnitude > threshold:
                X_norm[:, frame] = column / magnitude
            else:
                X_norm[:, frame] = v
    elif norm == 'z':
        # Standardization: subtract the column mean, divide by the sample
        # standard deviation (K - 1 in the denominator); fallback is zeros.
        if v is None:
            v = np.zeros(num_bins, dtype=np.float64)
        for frame in range(num_frames):
            column = X[:, frame]
            mean = np.sum(column) / num_bins
            spread = np.sqrt(np.sum((column - mean) ** 2) / (num_bins - 1))
            if spread > threshold:
                X_norm[:, frame] = (column - mean) / spread
            else:
                X_norm[:, frame] = v

    return X_norm
def compute_chromagram_from_filename(fn_wav,
                                     Fs=22050,
                                     N=4096,
                                     H=2048,
                                     gamma=None,
                                     version='STFT',
                                     norm='2'):
    """Compute a chromagram for the audio file specified by filename

    Notebook: C5/C5S2_ChordRec_Templates.ipynb

    Args:
        fn_wav (str): Filename of the audio file
        Fs (scalar): Sampling rate passed to ``librosa.load`` (Default value = 22050)
        N (int): Window size (Default value = 4096)
        H (int): Hop size (Default value = 2048)
        gamma (float): Constant for logarithmic compression; None disables compression.
            Only used by the 'STFT' and 'IIR' versions (Default value = None)
        version (str): Technique used for front-end decomposition ('STFT', 'CQT', 'IIR')
            (Default value = 'STFT')
        norm (str): If not None, chroma vectors are normalized with this norm by
            ``normalize_feature_sequence`` ('1', '2', 'max', 'z') (Default value = '2')

    Returns:
        X (np.ndarray): Chromagram
        Fs_X (scalar): Feature rate of chromagram
        x (np.ndarray): Audio signal
        Fs (scalar): Sampling rate of audio signal
        x_dur (float): Duration (seconds) of audio signal

    Raises:
        ValueError: If ``version`` is not one of 'STFT', 'CQT' or 'IIR'
    """
    # Reject unsupported versions before the (potentially slow) audio load;
    # otherwise no branch below would assign X and the failure would surface
    # later as an unrelated unbound-variable error.
    if version not in ('STFT', 'CQT', 'IIR'):
        raise ValueError("version must be 'STFT', 'CQT' or 'IIR'")

    x, Fs = librosa.load(fn_wav, sr=Fs)
    x_dur = x.shape[0] / Fs

    if version == 'STFT':
        # Compute chroma features with STFT
        X = librosa.stft(x, n_fft=N, hop_length=H, pad_mode='constant', center=True)
        if gamma is not None:
            # Logarithmic compression of the power spectrogram
            X = np.log(1 + gamma * np.abs(X)**2)
        else:
            X = np.abs(X)**2
        X = librosa.feature.chroma_stft(S=X, sr=Fs, tuning=0, norm=None, hop_length=H, n_fft=N)
    elif version == 'CQT':
        # Compute chroma features with CQT decomposition
        X = librosa.feature.chroma_cqt(y=x, sr=Fs, hop_length=H, norm=None)
    else:
        # version == 'IIR'
        # Compute chroma features with filter bank (using IIR elliptic filter)
        X = librosa.iirt(y=x, sr=Fs, win_length=N, hop_length=H, center=True, tuning=0.0)
        if gamma is not None:
            X = np.log(1.0 + gamma * X)
        X = librosa.feature.chroma_cqt(C=X,
                                       bins_per_octave=12,
                                       n_octaves=7,
                                       fmin=librosa.midi_to_hz(24),
                                       norm=None)

    if norm is not None:
        # Honor the requested norm (it was previously hard-coded to '2').
        X = normalize_feature_sequence(X, norm=norm)

    Fs_X = Fs / H
    return X, Fs_X, x, Fs, x_dur
if __name__ == "__main__":
    # Manual smoke test: compute a chromagram for one audio file and print a
    # short summary. The file can be given as the first command-line argument;
    # without one, the original developer's local test file is used.
    import sys

    default_file = "/home/martin/Music/deemix Music/Simone Sommerland - Ki-Ka-Kinderturnen.mp3"
    test_file = sys.argv[1] if len(sys.argv) > 1 else default_file

    # The function returns a 5-tuple; only the chromagram, its feature rate
    # and the signal duration are reported here.
    chroma, Fs_X, _, _, x_dur = compute_chromagram_from_filename(test_file)
    print("chromagram shape: {}, feature rate: {} Hz, duration: {:.2f} s".format(
        chroma.shape, Fs_X, x_dur))