From a9b9be66b9bd989a2c0c80232cda06ced4372fe4 Mon Sep 17 00:00:00 2001
From: KakaruHayate <kakaru.hk@hotmail.com>
Date: Sat, 16 May 2026 12:44:16 +0800
Subject: [PATCH] Add mu-law support for voicing/energy

Introduce a 'mulaw' voicing_domain and mu-law energy handling across configs and preprocessing. The acoustic and variance configs (both the defaults and the templates) now set voicing_domain: 'mulaw'. The acoustic and variance binarizers pass hparams.get('voicing_domain', 'db') to get_voicing as its domain argument. The parameter adaptor uses 0 instead of voicing_db_max as the upper bound of the voicing range when the domain is 'mulaw'; the clamp bounds are unchanged. get_energy_librosa gains a mu parameter and a 'mulaw' domain, and get_voicing gains domain and mu parameters that it forwards to get_energy_librosa. The mu-law branch computes log1p(mu * rms) / log1p(mu) (default mu=255) and then rescales the result as x * 96 - 96, giving dB-like values that stay compatible with the existing pipelines.

revert
---
 configs/acoustic.yaml                  |  1 +
 configs/templates/config_acoustic.yaml |  1 +
 configs/templates/config_variance.yaml |  1 +
 configs/variance.yaml                  |  1 +
 modules/fastspeech/param_adaptor.py    |  2 +-
 preprocessing/acoustic_binarizer.py    |  2 +-
 preprocessing/variance_binarizer.py    |  2 +-
 utils/binarizer_utils.py               | 17 +++++++++++++----
 8 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/configs/acoustic.yaml b/configs/acoustic.yaml
index 55ad4d994..f506da81a 100644
--- a/configs/acoustic.yaml
+++ b/configs/acoustic.yaml
@@ -45,6 +45,7 @@ mel_base: 'e'
 energy_smooth_width: 0.12
 breathiness_smooth_width: 0.12
 voicing_smooth_width: 0.12
+voicing_domain: 'mulaw'
 tension_smooth_width: 0.12
 
 use_lang_id: false
diff --git a/configs/templates/config_acoustic.yaml b/configs/templates/config_acoustic.yaml
index 1991dc4ac..94ade55ac 100644
--- a/configs/templates/config_acoustic.yaml
+++ b/configs/templates/config_acoustic.yaml
@@ -49,6 +49,7 @@ use_energy_embed: false
 use_breathiness_embed: false
 use_voicing_embed: false
 use_tension_embed: false
+voicing_domain: 'mulaw'
 
 use_key_shift_embed: true
 use_speed_embed: true
diff --git a/configs/templates/config_variance.yaml b/configs/templates/config_variance.yaml
index ca5cd3a3a..a8ca0af3c 100644
--- a/configs/templates/config_variance.yaml
+++ b/configs/templates/config_variance.yaml
@@ -59,6 +59,7 @@ breathiness_db_max: -20.0
 
 voicing_db_min: -96.0
 voicing_db_max: -12.0
+voicing_domain: 'mulaw'
 
 tension_logit_min: -10.0
 tension_logit_max: 10.0
diff --git a/configs/variance.yaml b/configs/variance.yaml
index 10c84e888..8cf40445c 100644
--- a/configs/variance.yaml
+++ b/configs/variance.yaml
@@ -82,6 +82,7 @@ breathiness_smooth_width: 0.12
 voicing_db_min: -96.0
 voicing_db_max: -12.0
 voicing_smooth_width: 0.12
+voicing_domain: 'mulaw'
 
 tension_logit_min: -10.0
 tension_logit_max: 10.0
diff --git a/modules/fastspeech/param_adaptor.py b/modules/fastspeech/param_adaptor.py
index 77ebb8331..e0af3088c 100644
--- a/modules/fastspeech/param_adaptor.py
+++ b/modules/fastspeech/param_adaptor.py
@@ -49,7 +49,7 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
         if self.predict_voicing:
             ranges.append((
                 hparams['voicing_db_min'],
-                hparams['voicing_db_max']
+                0. if hparams.get('voicing_domain', 'db')=='mulaw' else hparams['voicing_db_max']
             ))
             clamps.append((hparams['voicing_db_min'], 0.))
 
diff --git a/preprocessing/acoustic_binarizer.py b/preprocessing/acoustic_binarizer.py
index 9301f14bc..0a73882aa 100644
--- a/preprocessing/acoustic_binarizer.py
+++ b/preprocessing/acoustic_binarizer.py
@@ -193,7 +193,7 @@ def process_item(self, item_name, meta_data, binarization_args):
         if self.need_voicing:
             # get ground truth voicing
             voicing = get_voicing(
-                dec_waveform, None, None, length=length
+                dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db')
             )
 
             global voicing_smooth
diff --git a/preprocessing/variance_binarizer.py b/preprocessing/variance_binarizer.py
index 3d2990fe4..8d7bec17c 100644
--- a/preprocessing/variance_binarizer.py
+++ b/preprocessing/variance_binarizer.py
@@ -478,7 +478,7 @@ def process_item(self, item_name, meta_data, binarization_args):
                     )
             if voicing is None:
                 voicing = get_voicing(
-                    dec_waveform, None, None, length=length
+                    dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db')
                 )
                 voicing_from_wav = True
 
diff --git a/utils/binarizer_utils.py b/utils/binarizer_utils.py
index df5216429..77fd03978 100644
--- a/utils/binarizer_utils.py
+++ b/utils/binarizer_utils.py
@@ -79,14 +79,15 @@ def get_pitch_parselmouth(
     return f0, uv
 
 
-def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'):
+def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db', mu=255.0):
     """
     Definition of energy: RMS of the waveform, in dB representation
     :param waveform: [T]
     :param length: Expected number of frames
     :param hop_size: Frame width, in number of samples
     :param win_size: Window size, in number of samples
-    :param domain: db or amplitude
+    :param domain: 'db', 'amplitude', or 'mulaw'
+    :param mu: mu parameter for mu-law compression
     :return: energy
     """
     energy = librosa.feature.rms(y=waveform, frame_length=win_size, hop_length=hop_size)[0]
@@ -97,6 +98,10 @@ def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'):
         energy = librosa.amplitude_to_db(energy)
     elif domain == 'amplitude':
         pass
+    elif domain == 'mulaw':
+        energy = np.log1p(mu * energy) / np.log1p(mu)
+        # The API is frozen, so for compatibility rescale the compressed energy to a dB-like range (0 -> -96, 1 -> 0).
+        energy = energy * 96 -96
     else:
         raise ValueError(f'Invalid domain: {domain}')
     return energy
@@ -134,7 +139,8 @@ def get_breathiness(
 def get_voicing(
         waveform: Union[np.ndarray, DecomposedWaveform],
         samplerate, f0, length,
-        *, hop_size=None, fft_size=None, win_size=None
+        *, hop_size=None, fft_size=None, win_size=None, 
+        domain='db', mu=255.0
 ):
     """
     Definition of voicing: RMS of the harmonic part, in dB representation
@@ -145,6 +151,8 @@ def get_voicing(
     :param hop_size: Frame width, in number of samples
     :param fft_size: Number of fft bins
     :param win_size: Window size, in number of samples
+    :param domain: 'db', 'amplitude', or 'mulaw'
+    :param mu: mu parameter for mu-law compression
     :return: voicing
     """
     if not isinstance(waveform, DecomposedWaveform):
@@ -155,7 +163,8 @@ def get_voicing(
     waveform_sp = waveform.harmonic()
     voicing = get_energy_librosa(
         waveform_sp, length=length,
-        hop_size=waveform.hop_size, win_size=waveform.win_size
+        hop_size=waveform.hop_size, win_size=waveform.win_size, 
+        domain=domain, mu=mu
     )
     return voicing