from typing import overload import librosa from numba import jit import numpy as np @jit(nopython=True) def normalize_feature_sequence(X, norm='2', threshold=0.0001, v=None): """Normalizes the columns of a feature sequence Notebook: C3/C3S1_FeatureNormalization.ipynb Args: X (np.ndarray): Feature sequence norm (str): The norm to be applied. '1', '2', 'max' or 'z' (Default value = '2') threshold (float): An threshold below which the vector ``v`` used instead of normalization (Default value = 0.0001) v (float): Used instead of normalization below ``threshold``. If None, uses unit vector for given norm (Default value = None) Returns: X_norm (np.ndarray): Normalized feature sequence """ assert norm in ['1', '2', 'max', 'z'] K, N = X.shape X_norm = np.zeros((K, N)) if norm == '1': if v is None: v = np.ones(K, dtype=np.float64) / K for n in range(N): s = np.sum(np.abs(X[:, n])) if s > threshold: X_norm[:, n] = X[:, n] / s else: X_norm[:, n] = v if norm == '2': if v is None: v = np.ones(K, dtype=np.float64) / np.sqrt(K) for n in range(N): s = np.sqrt(np.sum(X[:, n]**2)) if s > threshold: X_norm[:, n] = X[:, n] / s else: X_norm[:, n] = v if norm == 'max': if v is None: v = np.ones(K, dtype=np.float64) for n in range(N): s = np.max(np.abs(X[:, n])) if s > threshold: X_norm[:, n] = X[:, n] / s else: X_norm[:, n] = v if norm == 'z': if v is None: v = np.zeros(K, dtype=np.float64) for n in range(N): mu = np.sum(X[:, n]) / K sigma = np.sqrt(np.sum((X[:, n] - mu)**2) / (K - 1)) if sigma > threshold: X_norm[:, n] = (X[:, n] - mu) / sigma else: X_norm[:, n] = v return X_norm def compute_chromagram_from_filename(fn_wav, Fs=22050, N=4096, H=2048, gamma=None, version='STFT', norm='2'): """Compute chromagram for WAV file specified by filename Notebook: C5/C5S2_ChordRec_Templates.ipynb Args: fn_wav (str): Filenname of WAV Fs (scalar): Sampling rate (Default value = 22050) N (int): Window size (Default value = 4096) H (int): Hop size (Default value = 2048) gamma (float): Constant for logarithmic compression (Default value = None) version (str): Technique used for front-end decomposition ('STFT', 'IIS', 'CQT') (Default value = 'STFT') norm (str): If not 'None', chroma vectors are normalized by norm as specified ('1', '2', 'max') (Default value = '2') Returns: X (np.ndarray): Chromagram Fs_X (scalar): Feature reate of chromagram x (np.ndarray): Audio signal Fs (scalar): Sampling rate of audio signal x_dur (float): Duration (seconds) of audio signal """ x, Fs = librosa.load(fn_wav, sr=Fs) x_dur = x.shape[0] / Fs if version == 'STFT': # Compute chroma features with STFT X = librosa.stft(x, n_fft=N, hop_length=H, pad_mode='constant', center=True) if gamma is not None: X = np.log(1 + gamma * np.abs(X)**2) else: X = np.abs(X)**2 X = librosa.feature.chroma_stft(S=X, sr=Fs, tuning=0, norm=None, hop_length=H, n_fft=N) if version == 'CQT': # Compute chroma features with CQT decomposition X = librosa.feature.chroma_cqt(y=x, sr=Fs, hop_length=H, norm=None) if version == 'IIR': # Compute chroma features with filter bank (using IIR elliptic filter) X = librosa.iirt(y=x, sr=Fs, win_length=N, hop_length=H, center=True, tuning=0.0) if gamma is not None: X = np.log(1.0 + gamma * X) X = librosa.feature.chroma_cqt(C=X, bins_per_octave=12, n_octaves=7, fmin=librosa.midi_to_hz(24), norm=None) if norm is not None: X = normalize_feature_sequence(X, norm='2') Fs_X = Fs / H return X, Fs_X, x, Fs, x_dur if __name__ == "__main__": test_file = "/home/martin/Music/deemix Music/Simone Sommerland - Ki-Ka-Kinderturnen.mp3" chroma = compute_chromagram_from_filename(test_file)