From a9b9be66b9bd989a2c0c80232cda06ced4372fe4 Mon Sep 17 00:00:00 2001 From: KakaruHayate Date: Sat, 16 May 2026 12:44:16 +0800 Subject: [PATCH] Add mu-law support for voicing/energy Introduce 'mulaw' voicing_domain and mu-law energy handling across configs and preprocessing. Configs are updated to include voicing_domain='mulaw' (acoustic/variance templates and instances). Acoustic/variance binarizers now pass hparams.get('voicing_domain', 'db') to get_voicing as the domain. The parameter adaptor uses 0 as the upper bound of the voicing range (instead of voicing_db_max) when the domain is 'mulaw'. get_energy_librosa gains a mu parameter and a 'mulaw' domain, and get_voicing gains domain/mu parameters that it forwards to get_energy_librosa. The mu-law compressed value (default mu=255) is rescaled to a dB-like range (nominally [-96, 0]) compatible with existing pipelines. These changes add support for mu-law representation of voicing/energy. revert --- configs/acoustic.yaml | 1 + configs/templates/config_acoustic.yaml | 1 + configs/templates/config_variance.yaml | 1 + configs/variance.yaml | 1 + modules/fastspeech/param_adaptor.py | 2 +- preprocessing/acoustic_binarizer.py | 2 +- preprocessing/variance_binarizer.py | 2 +- utils/binarizer_utils.py | 17 +++++++++++++---- 8 files changed, 20 insertions(+), 7 deletions(-) diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml index 55ad4d994..f506da81a 100644 --- a/configs/acoustic.yaml +++ b/configs/acoustic.yaml @@ -45,6 +45,7 @@ mel_base: 'e' energy_smooth_width: 0.12 breathiness_smooth_width: 0.12 voicing_smooth_width: 0.12 +voicing_domain: 'mulaw' tension_smooth_width: 0.12 use_lang_id: false diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml index 1991dc4ac..94ade55ac 100644 --- a/configs/templates/config_acoustic.yaml +++ b/configs/templates/config_acoustic.yaml @@ -49,6 +49,7 @@ use_energy_embed: false use_breathiness_embed: false use_voicing_embed: false use_tension_embed: false +voicing_domain: 'mulaw' use_key_shift_embed: true use_speed_embed: true diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml index 
ca5cd3a3a..a8ca0af3c 100644 --- a/configs/templates/config_variance.yaml +++ b/configs/templates/config_variance.yaml @@ -59,6 +59,7 @@ breathiness_db_max: -20.0 voicing_db_min: -96.0 voicing_db_max: -12.0 +voicing_domain: 'mulaw' tension_logit_min: -10.0 tension_logit_max: 10.0 diff --git a/configs/variance.yaml b/configs/variance.yaml index 10c84e888..8cf40445c 100644 --- a/configs/variance.yaml +++ b/configs/variance.yaml @@ -82,6 +82,7 @@ breathiness_smooth_width: 0.12 voicing_db_min: -96.0 voicing_db_max: -12.0 voicing_smooth_width: 0.12 +voicing_domain: 'mulaw' tension_logit_min: -10.0 tension_logit_max: 10.0 diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py index 77ebb8331..e0af3088c 100644 --- a/modules/fastspeech/param_adaptor.py +++ b/modules/fastspeech/param_adaptor.py @@ -49,7 +49,7 @@ def build_adaptor(self, cls=MultiVarianceDiffusion): if self.predict_voicing: ranges.append(( hparams['voicing_db_min'], - hparams['voicing_db_max'] + 0. if hparams.get('voicing_domain', 'db')=='mulaw' else hparams['voicing_db_max'] )) clamps.append((hparams['voicing_db_min'], 0.)) diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py index 9301f14bc..0a73882aa 100644 --- a/preprocessing/acoustic_binarizer.py +++ b/preprocessing/acoustic_binarizer.py @@ -193,7 +193,7 @@ def process_item(self, item_name, meta_data, binarization_args): if self.need_voicing: # get ground truth voicing voicing = get_voicing( - dec_waveform, None, None, length=length + dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db') ) global voicing_smooth diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py index 3d2990fe4..8d7bec17c 100644 --- a/preprocessing/variance_binarizer.py +++ b/preprocessing/variance_binarizer.py @@ -478,7 +478,7 @@ def process_item(self, item_name, meta_data, binarization_args): ) if voicing is None: voicing = get_voicing( - dec_waveform, None, 
None, length=length + dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db') ) voicing_from_wav = True diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py index df5216429..77fd03978 100644 --- a/utils/binarizer_utils.py +++ b/utils/binarizer_utils.py @@ -79,14 +79,15 @@ def get_pitch_parselmouth( return f0, uv -def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'): +def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db', mu=255.0): """ Definition of energy: RMS of the waveform, in dB representation :param waveform: [T] :param length: Expected number of frames :param hop_size: Frame width, in number of samples :param win_size: Window size, in number of samples - :param domain: db or amplitude + :param domain: 'db', 'amplitude', or 'mulaw' + :param mu: mu parameter for mu-law compression :return: energy """ energy = librosa.feature.rms(y=waveform, frame_length=win_size, hop_length=hop_size)[0] @@ -97,6 +98,10 @@ def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'): energy = librosa.amplitude_to_db(energy) elif domain == 'amplitude': pass + elif domain == 'mulaw': + energy = np.log1p(mu * energy) / np.log1p(mu) + # The downstream API is frozen and expects dB-like values, so rescale the mu-law output (nominally [0, 1]) onto [-96, 0] for compatibility. 
+ energy = energy * 96 -96 else: raise ValueError(f'Invalid domain: {domain}') return energy @@ -134,7 +139,8 @@ def get_breathiness( def get_voicing( waveform: Union[np.ndarray, DecomposedWaveform], samplerate, f0, length, - *, hop_size=None, fft_size=None, win_size=None + *, hop_size=None, fft_size=None, win_size=None, + domain='db', mu=255.0 ): """ Definition of voicing: RMS of the harmonic part, in dB representation @@ -145,6 +151,8 @@ def get_voicing( :param hop_size: Frame width, in number of samples :param fft_size: Number of fft bins :param win_size: Window size, in number of samples + :param domain: 'db', 'amplitude', or 'mulaw' + :param mu: mu parameter for mu-law compression :return: voicing """ if not isinstance(waveform, DecomposedWaveform): @@ -155,7 +163,8 @@ def get_voicing( waveform_sp = waveform.harmonic() voicing = get_energy_librosa( waveform_sp, length=length, - hop_size=waveform.hop_size, win_size=waveform.win_size + hop_size=waveform.hop_size, win_size=waveform.win_size, + domain=domain, mu=mu ) return voicing