Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions configs/acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ mel_base: 'e'
energy_smooth_width: 0.12
breathiness_smooth_width: 0.12
voicing_smooth_width: 0.12
voicing_domain: 'mulaw'
tension_smooth_width: 0.12

use_lang_id: false
Expand Down
1 change: 1 addition & 0 deletions configs/templates/config_acoustic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ use_energy_embed: false
use_breathiness_embed: false
use_voicing_embed: false
use_tension_embed: false
voicing_domain: 'mulaw'

use_key_shift_embed: true
use_speed_embed: true
Expand Down
1 change: 1 addition & 0 deletions configs/templates/config_variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ breathiness_db_max: -20.0

voicing_db_min: -96.0
voicing_db_max: -12.0
voicing_domain: 'mulaw'

tension_logit_min: -10.0
tension_logit_max: 10.0
Expand Down
1 change: 1 addition & 0 deletions configs/variance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ breathiness_smooth_width: 0.12
voicing_db_min: -96.0
voicing_db_max: -12.0
voicing_smooth_width: 0.12
voicing_domain: 'mulaw'

tension_logit_min: -10.0
tension_logit_max: 10.0
Expand Down
2 changes: 1 addition & 1 deletion modules/fastspeech/param_adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def build_adaptor(self, cls=MultiVarianceDiffusion):
if self.predict_voicing:
ranges.append((
hparams['voicing_db_min'],
hparams['voicing_db_max']
0. if hparams.get('voicing_domain', 'db')=='mulaw' else hparams['voicing_db_max']
))
clamps.append((hparams['voicing_db_min'], 0.))

Expand Down
2 changes: 1 addition & 1 deletion preprocessing/acoustic_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def process_item(self, item_name, meta_data, binarization_args):
if self.need_voicing:
# get ground truth voicing
voicing = get_voicing(
dec_waveform, None, None, length=length
dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db')
)

global voicing_smooth
Expand Down
2 changes: 1 addition & 1 deletion preprocessing/variance_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ def process_item(self, item_name, meta_data, binarization_args):
)
if voicing is None:
voicing = get_voicing(
dec_waveform, None, None, length=length
dec_waveform, None, None, length=length, domain=hparams.get('voicing_domain', 'db')
)
voicing_from_wav = True

Expand Down
17 changes: 13 additions & 4 deletions utils/binarizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,15 @@ def get_pitch_parselmouth(
return f0, uv


def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'):
def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db', mu=255.0):
"""
Definition of energy: RMS of the waveform, in dB representation by default (see `domain`)
:param waveform: [T]
:param length: Expected number of frames
:param hop_size: Frame width, in number of samples
:param win_size: Window size, in number of samples
:param domain: db or amplitude
:param domain: 'db', 'amplitude', or 'mulaw'
:param mu: mu parameter for mu-law compression
:return: energy
"""
energy = librosa.feature.rms(y=waveform, frame_length=win_size, hop_length=hop_size)[0]
Expand All @@ -97,6 +98,10 @@ def get_energy_librosa(waveform, length, *, hop_size, win_size, domain='db'):
energy = librosa.amplitude_to_db(energy)
elif domain == 'amplitude':
pass
elif domain == 'mulaw':
energy = np.log1p(mu * energy) / np.log1p(mu)
# The API is frozen, so for compatibility the mu-law value (within [0, 1] for RMS input in [0, 1])
# is mapped linearly onto [-96, 0], the same range the existing dB-style values occupy.
energy = energy * 96 -96
else:
raise ValueError(f'Invalid domain: {domain}')
return energy
Expand Down Expand Up @@ -134,7 +139,8 @@ def get_breathiness(
def get_voicing(
waveform: Union[np.ndarray, DecomposedWaveform],
samplerate, f0, length,
*, hop_size=None, fft_size=None, win_size=None
*, hop_size=None, fft_size=None, win_size=None,
domain='db', mu=255.0
):
"""
Definition of voicing: RMS of the harmonic part, in dB representation by default (see `domain`)
Expand All @@ -145,6 +151,8 @@ def get_voicing(
:param hop_size: Frame width, in number of samples
:param fft_size: Number of fft bins
:param win_size: Window size, in number of samples
:param domain: 'db', 'amplitude', or 'mulaw'
:param mu: mu parameter for mu-law compression
:return: voicing
"""
if not isinstance(waveform, DecomposedWaveform):
Expand All @@ -155,7 +163,8 @@ def get_voicing(
waveform_sp = waveform.harmonic()
voicing = get_energy_librosa(
waveform_sp, length=length,
hop_size=waveform.hop_size, win_size=waveform.win_size
hop_size=waveform.hop_size, win_size=waveform.win_size,
domain=domain, mu=mu
)
return voicing

Expand Down