diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 3dbe5b138..d8ebf2f4d 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -737,7 +737,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko", "vi"], + choices=["en", "de", "es", "fr", "hu", "sv", "zh", "ar", "it", "hy", "ja", "hi", "ko", "vi", "pt"], default="en", type=str, ) diff --git a/nemo_text_processing/text_normalization/pt/data/electronic/__init__.py b/nemo_text_processing/text_normalization/pt/data/electronic/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/electronic/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/pt/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/pt/data/electronic/domain.tsv new file mode 100644 index 000000000..b9daa19a5 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/electronic/domain.tsv @@ -0,0 +1,7 @@ +.com ponto com +.com.br ponto com ponto br +.gov.br ponto gov ponto br +.org ponto org +.net ponto net +.edu ponto edu +.br ponto br diff --git a/nemo_text_processing/text_normalization/pt/data/electronic/electronic_spoken_unit.tsv b/nemo_text_processing/text_normalization/pt/data/electronic/electronic_spoken_unit.tsv new file mode 100644 index 000000000..698bc2773 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/electronic/electronic_spoken_unit.tsv @@ -0,0 +1,16 @@ +google google +usuario usuario +microsoft microsoft +amazon amazon +facebook facebook +meta meta +netflix netflix +spotify spotify +samsung samsung +apple apple +linkedin linkedin +instagram instagram +whatsapp whatsapp +oracle oracle +adobe adobe +paypal paypal diff --git a/nemo_text_processing/text_normalization/pt/data/electronic/server_name.tsv b/nemo_text_processing/text_normalization/pt/data/electronic/server_name.tsv new file mode 100644 index 000000000..50b4eeb65 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/electronic/server_name.tsv @@ -0,0 +1,6 @@ +gmail +nvidia +outlook +hotmail +yahoo +live diff --git a/nemo_text_processing/text_normalization/pt/data/electronic/symbols.tsv b/nemo_text_processing/text_normalization/pt/data/electronic/symbols.tsv new file mode 100644 index 000000000..85c8a1b10 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/electronic/symbols.tsv @@ -0,0 +1,22 @@ +. ponto +- traço +_ underscore +! exclamação +# cerquilha +$ dólar +% por cento +& e comercial +' apóstrofo +* asterisco ++ mais +/ barra += igual +? 
interrogação +^ acento circunflexo +` crase +{ chave esquerda +| barra vertical +} chave direita +~ til +, vírgula +: dois pontos diff --git a/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv b/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv index c140ca4ba..7efeba5e6 100644 --- a/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv +++ b/nemo_text_processing/text_normalization/pt/data/fractions/specials.tsv @@ -2,3 +2,4 @@ connector e minus menos plural_suffix s avos_suffix avos +avos_between diff --git a/nemo_text_processing/text_normalization/pt/data/measure/__init__.py b/nemo_text_processing/text_normalization/pt/data/measure/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/measure/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/pt/data/measure/measurements_plural.tsv b/nemo_text_processing/text_normalization/pt/data/measure/measurements_plural.tsv new file mode 100755 index 000000000..d9cbfb87f --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/measure/measurements_plural.tsv @@ -0,0 +1,64 @@ +h horas +min minutos +s segundos +ms milissegundos +ns nanossegundos +μs microssegundos +t toneladas +kg quilos +kg quilogramas +g gramas +mg miligramas +μm micrômetros +nm nanômetros +mm milímetros +cm centímetros +cm² centímetros quadrados +cm³ centímetros cúbicos +m metros +m² metros quadrados +m³ metros cúbicos +km quilômetros +km² quilômetros quadrados +ha hectares +kph quilômetros por hora +mph milhas por hora +m/s metros por segundo +l litros +ml mililitros +kgf quilogramas força +% por cento +°F fahrenheit +°F graus fahrenheit +°C graus celsius +Hz hertz +kHz quilo hertz +MHz mega hertz +GHz giga hertz +W watts +kW quilowatts +MW megawatts +GW gigawatts +Wh watts hora +kWh quilowatts hora +MWh megawatts hora +GWh gigawatts hora +kV quilovolts +V volts +mV milivolts +A amperes +mA miliamperes +rpm rotações por minuto +db decibéis +cal calorias +kcal quilocalorias +G gramas +KG quilos +KG quilogramas +KM quilômetros +M metros +L litros +ML mililitros +M2 metros quadrados +M^2 metros quadrados +C graus celsius diff --git a/nemo_text_processing/text_normalization/pt/data/measure/measurements_singular.tsv b/nemo_text_processing/text_normalization/pt/data/measure/measurements_singular.tsv new file mode 100755 index 000000000..242aba14e --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/measure/measurements_singular.tsv @@ -0,0 +1,65 @@ +h hora +min minuto +s segundo +ms milissegundo +ns nanossegundo +μs microssegundo +t tonelada +kg quilo +kg quilograma +g grama +mg miligrama +μm micrômetro +nm nanômetro +mm milímetro +cm centímetro +cm² centímetro quadrado +cm³ centímetro cúbico +m metro +m² metro quadrado +m³ 
metro cúbico +km quilômetro +km² quilômetro quadrado +ha hectare +kph quilômetro por hora +mph milha por hora +m/s metro por segundo +l litro +ml mililitro +kgf quilograma força +% por cento +°F fahrenheit +°C celsius +°F grau fahrenheit +°C grau celsius +Hz hertz +kHz quilo hertz +MHz mega hertz +GHz giga hertz +W watt +kW quilowatt +MW megawatt +GW gigawatt +Wh watt hora +kWh quilowatt hora +MWh megawatt hora +GWh gigawatt hora +kV quilovolt +V volt +mV milivolt +A ampere +mA miliampere +rpm rotação por minuto +db decibel +cal caloria +kcal quilocaloria +G grama +KG quilo +KG quilograma +KM quilômetro +M metro +L litro +ML mililitro +M2 metro quadrado +M^2 metro quadrado +C celsius diff --git a/nemo_text_processing/text_normalization/pt/data/money/__init__.py b/nemo_text_processing/text_normalization/pt/data/money/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/money/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/pt/data/money/currency_major.tsv b/nemo_text_processing/text_normalization/pt/data/money/currency_major.tsv new file mode 100644 index 000000000..b23d1bcce --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/money/currency_major.tsv @@ -0,0 +1,5 @@ +US$ dólar americano +R$ real +€ euro +£ libra esterlina +$ dólar diff --git a/nemo_text_processing/text_normalization/pt/data/money/currency_major_plural.tsv b/nemo_text_processing/text_normalization/pt/data/money/currency_major_plural.tsv new file mode 100644 index 000000000..feca270ff --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/money/currency_major_plural.tsv @@ -0,0 +1,5 @@ +dólar americano dólares americanos +real reais +euro euros +libra esterlina libras esterlinas +dólar dólares diff --git a/nemo_text_processing/text_normalization/pt/data/money/currency_minor.tsv b/nemo_text_processing/text_normalization/pt/data/money/currency_minor.tsv new file mode 100644 index 000000000..4e9a95d66 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/money/currency_minor.tsv @@ -0,0 +1,5 @@ +US$ centavo +R$ centavo +€ centavo +£ centavo +$ centavo diff --git a/nemo_text_processing/text_normalization/pt/data/money/currency_minor_plural.tsv b/nemo_text_processing/text_normalization/pt/data/money/currency_minor_plural.tsv new file mode 100644 index 000000000..2ac61dff7 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/money/currency_minor_plural.tsv @@ -0,0 +1 @@ +centavo centavos diff --git a/nemo_text_processing/text_normalization/pt/data/telephone/__init__.py b/nemo_text_processing/text_normalization/pt/data/telephone/__init__.py new file mode 100644 index 000000000..9e3fb699d --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/telephone/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/pt/data/telephone/ip_prompt.tsv b/nemo_text_processing/text_normalization/pt/data/telephone/ip_prompt.tsv new file mode 100644 index 000000000..7d59e35eb --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/telephone/ip_prompt.tsv @@ -0,0 +1,2 @@ +ip ip +endereço de ip endereço de i p diff --git a/nemo_text_processing/text_normalization/pt/data/telephone/telephone_prompt.tsv b/nemo_text_processing/text_normalization/pt/data/telephone/telephone_prompt.tsv new file mode 100644 index 000000000..62efeccd5 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/telephone/telephone_prompt.tsv @@ -0,0 +1,4 @@ +ligue para +telefone +celular +meu número é diff --git a/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv b/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv index 93a6d9086..a942795f4 100644 --- a/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv +++ b/nemo_text_processing/text_normalization/pt/data/time/day_period_suffix.tsv @@ -1,4 +1,4 @@ -manhã da manhã -tarde da tarde -noite da noite -madrugada da madrugada +manhã da manhã 6 11 +tarde da tarde 12 17 +noite da noite 18 23 +madrugada da madrugada 0 5 diff --git a/nemo_text_processing/text_normalization/pt/data/whitelist/ipa_symbols.tsv b/nemo_text_processing/text_normalization/pt/data/whitelist/ipa_symbols.tsv new 
file mode 100644 index 000000000..f5559c711 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/whitelist/ipa_symbols.tsv @@ -0,0 +1,521 @@ +a +aoj +aəj +aː +aːʲ +aː͡j +aː͡ɨ̯ +aˤ +aˤː +a̠ +a̠ː +a̰ +a͡e +a͡i +a͡iː +a͡i̯ +a͡j +a͡o +a͡u +a͡uː +a͡u̯ +a͡w +a͡ə +a͡ɨ̯ +a͡ɪ +a͡ʊ +b +bʱ +bʲ +bː +b̥ +c +cʰ +cː +ç +d +dʲ +dː +d̥ +d̪ +d̪ʱ +d͡z +d͡zʷ +d͡zː +d͡ʑ +d͡ʒ +d͡ʒʱ +d͡ʒʲ +d͡ʒː +e +eː +eːʲ +eː͡j +ẽː +ẽ͡j̃ +e̞ +e̞ː +e̯ +e͡i +e͡iː +e͡ɨ̯ +f +fʲ +fː +h +hː +i +iəj +iəw +iʲ +iː +iːʲ +ĩː +i̥ +i̯ +i͡u +i͡ə +i͡ɛ +j +jː +j̃ +k +kʰ +kʰː +kʲ +kʲʼ +kʷ +kʷʼ +kʼ +kː +k̚ +k̚ʲ +k̟̚ +k͈ +k͡p̚ +l +lʲ +lː +l̥ +l̩ +m +mʲ +mʲː +mː +m̥ +m̩ +n +nʲ +nː +n̥ +n̩ +o +oʲ +oː +oːʲ +ò +õ͡j̃ +õ͡w̃ +o̝ +o̞ +o̞ː +o̯ +o̰ +o͡u +o͡uː +p +pʰ +pʰː +pʲ +pʷʼ +pʼ +pː +p̚ +p̚ʲ +p͈ +p͜f +p͡f +q +qʷ +qʼ +r +rʲ +rː +r̂ +r̂ː +r̥ +r̩ +s +sʰ +sʲ +sʼ +sː +s͈ +t +tʰ +tʰː +tʲ +tʷʼ +tʼ +tː +t̚ +t̪ +t̪ʰ +t͈ +t͜s +t͡s +t͡sʰ +t͡sʰː +t͡sʲ +t͡sʷ +t͡sʼ +t͡sː +t͡ɕ +t͡ɕʰ +t͡ɕ͈ +t͡ʂ +t͡ʂʼ +t͡ʃ +t͡ʃʰ +t͡ʃʰː +t͡ʃʲ +t͡ʃʷ +t͡ʃʼ +t͡ʃː +u +uəj +uʲ +uː +uːʲ +ũː +ũ͡j̃ +u̯ +u͡e +u͡i +u͡j +u͡ɔ +u͡ə +v +vʲ +vː +w +w̃ +x +xʷ +xː +y +yː +yːʲ +y̯ +z +zʲ +zː +z̥ +à +àː +á +áː +â +âː +ã +ã̠ +æ +æː +æ̀ +æ̀ː +æ̂ +æ̂ː +æ͡ɪ +æ͡ʉ +ç +è +èː +é +éː +ê +êː +ì +ìː +í +íː +î +îː +ï +ð +ò +òː +ó +óː +ô +ôː +õ +õː +õ̞ +ø +øː +øːʲ +ø̯ +ù +ùː +ú +úː +û +ûː +ā +āː +ē +ēː +ĕ +ĕ͡ə +ě +ěː +ħ +ĩ +ĩː +ī +īː +ŋ +ŋʲ +ŋ̊ +ŋ̍ +ŋ̟ +ŋ̩ +ŋ͡m +ō +ŏ +ŏ͡ə +œ +œː +œ̃ +œ͡i +œ͡iː +œ͡ʏ +ř +řː +ũ +ũː +ū +ūː +ŭ +ŭ͡ə +ǎ +ǎː +ǐ +ǐː +ǒ +ǒː +ǔ +ǔː +ǣ +ǣː +ɐ +ɐː +ɐ̃ +ɐ̃͡j̃ +ɐ̃͡w̃ +ɐ̯ +ɐ̯̯ +ɑ +ɑː +ɑ̃ +ɑ̃ː +ɒ +ɒʲ +ɒː +ɓ +ɔ +ɔː +ɔˤː +ɔ̀ +ɔ̀ː +ɔ́ +ɔ́ː +ɔ̃ +ɔ̃ː +ɔ̰ +ɔ͡i̯ +ɔ͡ə +ɔ͡ɨ̯ +ɔ͡ɪ +ɔ͡ʊ +ɕ +ɕʰ +ɕː +ɕ͈ +ɖ +ɖʱ +ɗ +ɘ +ɘː +ə +əː +əˤ +ə̀ +ə́ +ə̃ +ə̯ +ə͡u̯ +ə͡w +ə͡ɨ +ə͡ɨ̯ +ɚ +ɛ +ɛʲ +ɛː +ɛˤː +ɛ̀ +ɛ̀ː +ɛ́ +ɛ́ː +ɛ̂ +ɛ̂ː +ɛ̃ +ɛ̃ː +ɛ̄ +ɛ̄ː +ɛ̰ +ɛ͡i +ɛ͡i̯ +ɛ͡u +ɛ͡u̯ +ɛ͡ɪ +ɛ͡ʊ +ɜ +ɜː +ɝ +ɝː +ɟ +ɟː +ɟ͡ʝ +ɡ +ɡʱ +ɡʲ +ɡʷ +ɡː +ɡ̊ +ɣ +ɤ +ɥ +ɦ +ɨ +ɨəj +ɨː +ɨ̃ᵝ +ɨ̞ +ɨ̥ᵝ +ɨ̯ +ɨ͡u̯ +ɨ͡w +ɨ͡ə +ɨᵝ +ɨᵝː +ɪ +ɪː +ɪ̀ +ɪ́ +ɪ̃ +ɪ̯ +ɪ̰ +ɪ͡u̯ +ɪ͡ʊ +ɫ +ɫː +ɬ +ɬʼ +ɭ +ɮ +ɯ +ɯː +ɯ̟̃ᵝ +ɯ̟̊ᵝ +ɯ̟ᵝ +ɯ̟ᵝː +ɰ +ɰ̃ 
+ɰᵝ +ɱ +ɱ̩ +ɲ +ɲː +ɲ̊ +ɲ̟ +ɳ +ɴ +ɸ +ɸʷ +ɹ +ɻ +ɽ +ɽʱ +ɾ +ɾʲ +ɾː +ɾ̝̊ +ʀ +ʁ +ʁʷ +ʁː +ʂ +ʂʷ +ʃ +ʃʰ +ʃʲ +ʃʷ +ʃʷʼ +ʃʼ +ʃː +ʈ +ʈʰ +ʉ +ʉː +ʊ +ʊ̀ +ʊ́ +ʊ̃ +ʊ̯ +ʊ̯͡i +ʊ̯͡ɨ +ʊ̰ +ʋ +ʌ +ʌ̹ +ʍ +ʎ +ʏ +ʏː +ʏ̯ +ʐ +ʐʷ +ʑ +ʒ +ʒʲ +ʒʷ +ʒː +ʔ +ʔʲ +ʔʷ +ʝ +˦ˀ˥ +˦˥ +˦˧˥ +˦˩ +˧ˀ˨ +˧˦ +˧˧ +˧˨ +˧˩ +˨˩ +˨˩˦ +˨˩˨ +β +θ +χ +χʷ +χː +ḛ +ḭ +ṵ +ẽ +ẽː +ẽ̞ +‿ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/pt/data/whitelist/symbol.tsv b/nemo_text_processing/text_normalization/pt/data/whitelist/symbol.tsv new file mode 100644 index 000000000..7f7b525e3 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/data/whitelist/symbol.tsv @@ -0,0 +1,23 @@ +& e comercial +# cerquilha +@ arroba +§ parágrafo +™ marca comercial +® marca registrada +© direitos autorais +_ sublinhado +% por cento +* asterisco ++ mais +/ barra += igual +^ acento circunflexo +| barra vertical +~ til +$ dólar +£ libra esterlina +€ euro +₩ won +¥ ienes +° grau +º ordinal masculino diff --git a/nemo_text_processing/text_normalization/pt/data/whitelist/tts.tsv b/nemo_text_processing/text_normalization/pt/data/whitelist/tts.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_text_processing/text_normalization/pt/graph_utils.py b/nemo_text_processing/text_normalization/pt/graph_utils.py index 3342c2816..1d548deb5 100644 --- a/nemo_text_processing/text_normalization/pt/graph_utils.py +++ b/nemo_text_processing/text_normalization/pt/graph_utils.py @@ -45,6 +45,11 @@ NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, pynini.accep('"')).optimize() NEMO_SIGMA = pynini.closure(NEMO_CHAR) +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() + +MIN_NEG_WEIGHT = -0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) insert_space = pynutil.insert(" ") diff --git a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py 
b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py index 393aabc0e..0f14f3b46 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/pt/taggers/cardinal.py @@ -72,9 +72,12 @@ def __init__(self, deterministic: bool = True): self.tens = graph_tens.optimize() self.two_digit_non_zero = pynini.union(digit, graph_tens, (pynini.cross("0", NEMO_SPACE) + digit)).optimize() + # After "X00" hundreds (oitocentos, …), suffix "01"-"09" needs leading zero stripped + # (graph_tens has no path for "09"; connector+digit only consumes one digit). graph_hundreds = hundreds + pynini.union( pynutil.delete("00"), (connector_e + graph_tens), + (connector_e + pynutil.delete("0") + digit), (connector_e + digit), ) # "100" -> cem only (cross("1", cento)+delete("00") would also match "100" but @@ -109,6 +112,7 @@ def __init__(self, deterministic: bool = True): hundreds + pynini.union( (connector_e + graph_tens), + (connector_e + pynutil.delete("0") + digit), (connector_e + digit), ), ) diff --git a/nemo_text_processing/text_normalization/pt/taggers/decimal.py b/nemo_text_processing/text_normalization/pt/taggers/decimal.py index 126e89697..d9d5d6094 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/pt/taggers/decimal.py @@ -23,12 +23,14 @@ class DecimalFst(GraphFst): """ Finite state transducer for classifying Portuguese decimal numbers, e.g. - "1,26" -> decimal { integer_part: "um" fractional_part: "vinte e seis" } - "0,01" -> decimal { integer_part: "zero" fractional_part: "um" } (leading zeros stripped) - "1,001" -> decimal { integer_part: "um" fractional_part: "mil e um" } (data: decimal_fractional_specials) + "1,26" -> decimal { integer_part: "um" fractional_part: "dois seis" } + "0,01" -> decimal { integer_part: "zero" fractional_part: "zero um" } "-1,26" -> decimal { negative: "true" ... } "1,33 milhões" / "1 milhão" -> decimal { ... 
quantity: "milhões" / "milhão" } + The fractional mantissa (after the comma) is always read digit-by-digit (0–9), + including leading zeros. Integer part and quantities still use cardinals. + Args: cardinal: CardinalFst instance for integer verbalization in tags. deterministic: if True will provide a single transduction option, @@ -44,56 +46,28 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): quantity_words = _num("quantity_words.tsv") digit = _num("digit.tsv") zero = _num("zero.tsv") - fractional_specials = _num("decimal_fractional_specials.tsv") graph_digit_or_zero = pynini.union(digit, zero) digit_by_digit = (graph_digit_or_zero + pynini.closure(insert_space + graph_digit_or_zero)).optimize() - # Fractional: strip leading zeros → rest @ cardinal; all zeros → "zero" - delete_leading_zero = pynini.cross("0", "") - rest = pynini.difference(NEMO_DIGIT, pynini.accep("0")) + pynini.closure(NEMO_DIGIT, 0) - with_rest = (pynini.closure(delete_leading_zero, 0) + rest) @ (pynini.closure(NEMO_DIGIT, 1) @ cardinal_graph) - only_zeros = pynini.closure(delete_leading_zero, 1) + pynini.cross("0", "zero") - fractional_strip = pynini.union(with_rest, only_zeros).optimize() - # Prefer specials (001→mil e um, 010→mil e dez, 100→mil e cem) over strip when both match - fractional_with_specials = pynini.union( - pynutil.add_weight(fractional_specials, -0.01), - fractional_strip, - ).optimize() + fractional_digits = pynini.closure(NEMO_DIGIT, 1, 15) + graph_fractional = ( + pynutil.insert('fractional_part: "') + (fractional_digits @ digit_by_digit) + pynutil.insert('"') + ) - fractional_short = pynini.closure(NEMO_DIGIT, 1, 9) - fractional_long = pynini.closure(NEMO_DIGIT, 10, 15) non_zero_lead = pynini.difference(NEMO_DIGIT, pynini.accep("0")) - # Integer "0" → fractional strip only (no specials) graph_integer_zero = ( pynutil.insert('integer_part: "') + pynini.cross("0", "zero") + pynutil.insert('"') + insert_space ) - graph_fractional_zero = ( - 
pynutil.insert('fractional_part: "') - + pynini.union( - fractional_short @ fractional_strip, - fractional_long @ digit_by_digit, - ) - + pynutil.insert('"') - ) - decimal_when_zero = graph_integer_zero + comma + insert_space + graph_fractional_zero + decimal_when_zero = graph_integer_zero + comma + insert_space + graph_fractional - # Integer non-zero → fractional: specials | strip + cardinal | digit-by-digit graph_integer_pos = ( pynutil.insert('integer_part: "') + (non_zero_lead + pynini.closure(NEMO_DIGIT, 0, 11)) @ cardinal_graph + pynutil.insert('"') + insert_space ) - graph_fractional_pos = ( - pynutil.insert('fractional_part: "') - + pynini.union( - fractional_short @ fractional_with_specials, - fractional_long @ digit_by_digit, - ) - + pynutil.insert('"') - ) - decimal_when_pos = graph_integer_pos + comma + insert_space + graph_fractional_pos + decimal_when_pos = graph_integer_pos + comma + insert_space + graph_fractional decimal_core = pynini.union(decimal_when_zero, decimal_when_pos) integer_quantity = ( diff --git a/nemo_text_processing/text_normalization/pt/taggers/electronic.py b/nemo_text_processing/text_normalization/pt/taggers/electronic.py new file mode 100644 index 000000000..ddd89f4e0 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/electronic.py @@ -0,0 +1,82 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_ALPHA, NEMO_DIGIT, NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class ElectronicFst(GraphFst): + """ + Finite state transducer for classifying electronic strings in pt-BR: + abc@hotmail.com -> electronic { username: "abc" domain: "hotmail.com" preserve_order: true } + https://www.abc.com -> electronic { protocol: "https://www." domain: "abc.com" preserve_order: true } + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="electronic", kind="classify", deterministic=deterministic) + + full_stop = pynini.accep(".") + at_symbol = "@" + protocol_string = "protocol" + domain_string = "domain" + username_string = "username" + http = "http" + https = "https" + www = "www" + + symbols = [x[0] for x in load_labels(get_abs_path("data/electronic/symbols.tsv"))] + symbols = pynini.union(*symbols) + symbols_no_full_stop = pynini.difference(symbols, full_stop) + accepted_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols_no_full_stop), 1) + all_characters = pynini.closure((NEMO_ALPHA | NEMO_DIGIT | symbols), 1) + + domain_component = full_stop + accepted_characters + domain_graph = ( + pynutil.insert(domain_string + ': "') + + (accepted_characters + pynini.closure(domain_component, 1)) + + pynutil.insert('"') + ) + + username = ( + pynutil.insert(username_string + ': "') + + all_characters + + pynutil.insert('"') + + pynini.cross(at_symbol, NEMO_SPACE) + ) + email = username + domain_graph + + social_tag = ( + pynini.cross(at_symbol, "") + + pynutil.insert(username_string + ': "') + + (accepted_characters | (accepted_characters + pynini.closure(domain_component, 1))) + + pynutil.insert('"') + ) + + protocol_start = pynini.accep(https + "://") | pynini.accep(http + "://") + protocol_end = pynini.accep(www + ".") + if not deterministic: + protocol_end 
|= pynini.cross(www + ".", "dáblio dáblio dáblio.") + + protocol = protocol_start | protocol_end | (protocol_start + protocol_end) + protocol = pynutil.insert(protocol_string + ': "') + protocol + pynutil.insert('"') + url = protocol + pynutil.insert(NEMO_SPACE) + domain_graph + + graph = url | domain_graph | email | social_tag + self.graph = graph + + final_graph = self.add_tokens(self.graph + pynutil.insert(" preserve_order: true")) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/measure.py b/nemo_text_processing/text_normalization/pt/taggers/measure.py new file mode 100644 index 000000000..0906551d1 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/measure.py @@ -0,0 +1,78 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_SIGMA, + NEMO_SPACE, + GraphFst, + convert_space, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path + + +class MeasureFst(GraphFst): + """ + Finite state transducer for classifying measure (pt-BR), e.g. + 200 g -> measure { cardinal { integer: "duzentos" } units: "gramas" } + 1 kg -> measure { cardinal { integer: "um" } units: "quilo" } + 2,4 g -> measure { decimal { ... 
} units: "gramas" } + 1/2 l -> measure { fraction { ... } units: "litros" } + + Args: + cardinal: CardinalFst + decimal: DecimalFst + fraction: FractionFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True): + super().__init__(name="measure", kind="classify", deterministic=deterministic) + + unit_singular = pynini.string_file(get_abs_path("data/measure/measurements_singular.tsv")) + unit_plural = pynini.string_file(get_abs_path("data/measure/measurements_plural.tsv")) + + graph_unit_singular = convert_space(unit_singular) + graph_unit_plural = convert_space(unit_plural) + + optional_graph_negative = pynini.closure(pynini.accep("-"), 0, 1) + + unit_plural = pynutil.insert('units: "') + graph_unit_plural + pynutil.insert('"') + unit_singular_graph = pynutil.insert('units: "') + graph_unit_singular + pynutil.insert('"') + + subgraph_decimal = decimal.fst + insert_space + pynini.closure(NEMO_SPACE, 0, 1) + unit_plural + + subgraph_cardinal = ( + (optional_graph_negative + (NEMO_SIGMA - "1")) @ cardinal.fst + + insert_space + + pynini.closure(delete_space, 0, 1) + + unit_plural + ) + + subgraph_cardinal |= ( + (optional_graph_negative + pynini.accep("1")) @ cardinal.fst + + insert_space + + pynini.closure(delete_space, 0, 1) + + unit_singular_graph + ) + + subgraph_fraction = fraction.fst + insert_space + pynini.closure(delete_space, 0, 1) + unit_plural + + final_graph = subgraph_decimal | subgraph_cardinal | subgraph_fraction + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/taggers/money.py b/nemo_text_processing/text_normalization/pt/taggers/money.py new file mode 100644 index 000000000..e059891f5 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/money.py @@ -0,0 +1,171 @@ 
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_ALPHA, + NEMO_DIGIT, + NEMO_SIGMA, + NEMO_SPACE, + GraphFst, + delete_space, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class MoneyFst(GraphFst): + """ + Finite state transducer for classifying money (pt-BR), e.g. 
def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True):
    """
    Build the pt-BR money classifier and store it in ``self.fst``.

    Args:
        cardinal: cardinal tagger; its ``graph`` and ``two_digit_non_zero`` attributes are used.
        decimal: decimal tagger; its ``final_graph_wo_negative`` attribute is used.
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)

    Raises:
        ValueError: if ``data/money/currency_major.tsv`` yields no currency rows.
    """
    super().__init__(name="money", kind="classify", deterministic=deterministic)

    decimal_separator = pynini.accep(",")
    # At most one ``delete_space`` between the amount and the currency symbol.
    optional_delete_space = pynini.closure(delete_space, 0, 1)

    maj_singular = pynini.string_file(get_abs_path("data/money/currency_major.tsv"))
    maj_plural_map = pynini.string_file(get_abs_path("data/money/currency_major_plural.tsv"))
    maj_plural_graph = maj_singular @ maj_plural_map
    min_singular = pynini.string_file(get_abs_path("data/money/currency_minor.tsv"))
    min_plural_map = pynini.string_file(get_abs_path("data/money/currency_minor_plural.tsv"))
    min_plural_graph = min_singular @ min_plural_map

    cardinal_graph = cardinal.graph
    graph_decimal_final = decimal.final_graph_wo_negative

    graph_maj_singular = pynutil.insert('currency_maj: "') + maj_singular + pynutil.insert('"')
    graph_maj_plural = pynutil.insert('currency_maj: "') + maj_plural_graph + pynutil.insert('"')
    graph_min_singular = pynutil.insert('currency_min: "') + min_singular + pynutil.insert('"')
    graph_min_plural = pynutil.insert('currency_min: "') + min_plural_graph + pynutil.insert('"')

    # Amount "1" pairs with the singular currency name; every other amount with the plural.
    graph_integer_one = (
        pynutil.insert('integer_part: "') + (pynini.accep("1") @ cardinal_graph) + pynutil.insert('"')
    )
    graph_integer = pynutil.insert('integer_part: "') + ((NEMO_SIGMA - "1") @ cardinal_graph) + pynutil.insert('"')

    decimal_with_quantity = (NEMO_SIGMA + NEMO_ALPHA) @ graph_decimal_final

    graph_decimal_plural = pynini.union(
        graph_maj_plural + optional_delete_space + insert_space + graph_decimal_final,
        graph_decimal_final + optional_delete_space + insert_space + graph_maj_plural,
    )
    graph_decimal_plural = ((NEMO_SIGMA - "1") + decimal_separator + NEMO_SIGMA) @ graph_decimal_plural

    graph_decimal_singular = pynini.union(
        graph_maj_singular + optional_delete_space + insert_space + graph_decimal_final,
        graph_decimal_final + optional_delete_space + insert_space + graph_maj_singular,
    )
    graph_decimal_singular = (pynini.accep("1") + decimal_separator + NEMO_SIGMA) @ graph_decimal_singular

    graph_decimal = pynini.union(
        graph_decimal_singular,
        graph_decimal_plural,
        graph_maj_plural + optional_delete_space + insert_space + decimal_with_quantity,
    )

    graph_integer_only = pynini.union(
        graph_maj_singular + optional_delete_space + insert_space + graph_integer_one,
        graph_integer_one + optional_delete_space + insert_space + graph_maj_singular,
    )
    graph_integer_only |= pynini.union(
        graph_maj_plural + optional_delete_space + insert_space + graph_integer,
        graph_integer + optional_delete_space + insert_space + graph_maj_plural,
    )

    graph = graph_integer_only | graph_decimal

    # Normalise the fractional digits to two digits, then drop a leading zero ("05" -> "5").
    two_digits_fractional_part = (
        pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(pynutil.delete("0"))
    ) @ (
        (pynutil.delete("0") + (NEMO_DIGIT - "0"))
        | ((NEMO_DIGIT - "0") + pynutil.insert("0"))
        | ((NEMO_DIGIT - "0") + NEMO_DIGIT)
    )

    # The graphs below do not depend on the currency symbol, so they are built once
    # instead of once per row of the currency table.
    preserve_order = pynutil.insert(" preserve_order: true")
    graph_fractional_one = (
        pynutil.insert('fractional_part: "')
        + (two_digits_fractional_part @ pynini.cross("1", "um"))
        + pynutil.insert('"')
    )
    graph_fractional = (
        two_digits_fractional_part @ (pynini.closure(NEMO_DIGIT, 1, 2) - "1") @ cardinal.two_digit_non_zero
    )
    graph_fractional = pynutil.insert('fractional_part: "') + graph_fractional + pynutil.insert('"')

    maj_singular_labels = load_labels(get_abs_path("data/money/currency_major.tsv"))
    decimal_graph_with_minor = None
    for curr_symbol, _ in maj_singular_labels:
        # The symbol is re-inserted so the currency graphs can map it to the major/minor unit name.
        integer_plus_maj = pynini.union(
            graph_integer + insert_space + (pynutil.insert(curr_symbol) @ graph_maj_plural),
            graph_integer_one + insert_space + (pynutil.insert(curr_symbol) @ graph_maj_singular),
        )
        integer_plus_maj = (pynini.closure(NEMO_DIGIT) - "0") @ integer_plus_maj

        fractional_plus_min = pynini.union(
            graph_fractional + insert_space + (pynutil.insert(curr_symbol) @ graph_min_plural),
            graph_fractional_one + insert_space + (pynutil.insert(curr_symbol) @ graph_min_singular),
        )

        decimal_graph_with_minor_curr = (
            integer_plus_maj + pynini.cross(decimal_separator, NEMO_SPACE) + fractional_plus_min
        )
        if not deterministic:
            # Extra option: fractional part without the minor currency name.
            decimal_graph_with_minor_curr |= pynutil.add_weight(
                integer_plus_maj
                + pynini.cross(decimal_separator, NEMO_SPACE)
                + pynutil.insert('fractional_part: "')
                + two_digits_fractional_part @ cardinal.two_digit_non_zero
                + pynutil.insert('"'),
                weight=0.0001,
            )

        # "0,20": no integer part, only the minor unit.
        decimal_graph_with_minor_curr |= pynutil.delete("0,") + fractional_plus_min
        # The symbol may come before or after the amount.
        decimal_graph_with_minor_curr = pynini.union(
            pynutil.delete(curr_symbol)
            + optional_delete_space
            + decimal_graph_with_minor_curr
            + preserve_order,
            decimal_graph_with_minor_curr
            + preserve_order
            + optional_delete_space
            + pynutil.delete(curr_symbol),
        )

        decimal_graph_with_minor = (
            decimal_graph_with_minor_curr
            if decimal_graph_with_minor is None
            else pynini.union(decimal_graph_with_minor, decimal_graph_with_minor_curr)
        )

    if decimal_graph_with_minor is None:
        # Fail with a clear message instead of passing None into add_weight below.
        raise ValueError("data/money/currency_major.tsv does not define any currency")

    final_graph = graph | pynutil.add_weight(decimal_graph_with_minor, -0.001)

    final_graph = self.add_tokens(final_graph)
    self.fst = final_graph.optimize()
import os
import sys
from unicodedata import category

import pynini
from pynini.examples import plurals
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_NOT_SPACE, NEMO_SIGMA, GraphFst
from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels


class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation (pt-BR pipeline).
    Mirrors the English punctuation tagger; whitelist symbols are excluded via data/whitelist/symbol.tsv.

    Args:
        deterministic: forwarded unchanged to ``GraphFst.__init__``.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="punctuation", kind="classify", deterministic=deterministic)
        ascii_marks = "!#%&\'()*+,-./:;<=>?@^_`{|}~\""

        # Square brackets are skipped in the scan and appended in escaped form afterwards.
        brackets = ("[", "]")
        unicode_marks = []
        for code_point in range(sys.maxunicode):
            char = chr(code_point)
            if char in brackets:
                continue
            if category(char).startswith("P"):
                unicode_marks.append(char)
        unicode_marks += [r"\[", r"\]"]

        # Symbols listed in the whitelist TSV (first column) are not treated as punctuation.
        symbol_path = get_abs_path("data/whitelist/symbol.tsv")
        if os.path.isfile(symbol_path):
            rows = load_labels(symbol_path)
        else:
            rows = []
        whitelisted = [row[0] for row in rows if row]

        candidates = unicode_marks + list(ascii_marks)
        self.punct_marks = [mark for mark in candidates if mark not in whitelisted]

        one_or_more_marks = pynini.closure(pynini.union(*self.punct_marks), 1)

        # "<name>", "<name/>" or "</name>": kept whole rather than split into marks.
        tag_name = pynini.closure(NEMO_NOT_SPACE - pynini.union("<", ">"), 1)
        slash = pynini.accep("/")
        opening_or_self_closing = tag_name + pynini.closure(slash, 0, 1)
        closing = slash + tag_name
        emphasis = pynini.accep("<") + (opening_or_self_closing | closing) + pynini.accep(">")

        self.graph = plurals._priority_union(emphasis, one_or_more_marks, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_SPACE, NEMO_WHITE_SPACE, GraphFst, insert_space
from nemo_text_processing.text_normalization.pt.utils import get_abs_path


class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying pt-BR telephone and IP formats, e.g.
        (11) 99999-8888 -> telephone { number_part: "um um nove nove nove nove nove oito oito oito oito" }
        +55 11 3333-4444 -> telephone { country_code: "mais cinco cinco" number_part: "um um três três três três quatro quatro quatro quatro" }
        192.168.1.1 -> telephone { number_part: "um nove dois ponto um seis oito ponto um ponto um" }

    Args:
        deterministic: forwarded unchanged to ``GraphFst.__init__``.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="telephone", kind="classify", deterministic=deterministic)

        nonzero_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
        zero_digit = pynini.string_file(get_abs_path("data/numbers/zero.tsv")).optimize()
        spoken_digit = (nonzero_digit | zero_digit).optimize()

        # Only grouping punctuation is stripped between digit blocks; spaces are handled
        # separately so spoken digit words are never glued together in the output.
        block_separator = pynini.union("-", ".")
        # Optional separator after country / prompt (still allow +55 11 …).
        drop_optional_separator = pynini.closure(pynutil.delete(block_separator), 0, 1)
        # An explicit separator is required between local digit blocks so plain long
        # cardinals (e.g. 3022110709) are not misclassified as telephone.
        drop_separator = pynutil.delete(block_separator)
        drop_spaces = pynini.closure(pynutil.delete(NEMO_WHITE_SPACE), 0)

        def n_digits(n: int):
            """Exactly ``n`` digits, spoken one by one and separated by inserted spaces."""
            leading = pynini.closure(spoken_digit + insert_space, n - 1, n - 1)
            return leading + spoken_digit

        def local_number(first_block: int):
            """``first_block`` digits, a required separator, then four digits."""
            return n_digits(first_block) + drop_separator + insert_space + n_digits(4)

        country_code = pynini.cross("+", "mais ") + pynini.union(n_digits(1), n_digits(2), n_digits(3))

        ip_prompts = pynini.string_file(get_abs_path("data/telephone/ip_prompt.tsv"))
        telephone_prompts = pynini.string_file(get_abs_path("data/telephone/telephone_prompt.tsv"))
        prompt_then_country = telephone_prompts + NEMO_SPACE + pynini.closure(country_code, 0, 1)

        country_code_graph = (
            pynutil.insert('country_code: "')
            + pynini.union(country_code, ip_prompts, prompt_then_country)
            + drop_optional_separator
            + pynutil.insert('"')
        )

        # Two-digit area code, with or without parentheses.
        area_code = pynini.union(pynutil.delete("(") + n_digits(2) + pynutil.delete(")"), n_digits(2))
        area_prefix = area_code + drop_spaces + insert_space

        # One to three digits per IP group, groups joined by a spoken "ponto".
        octet = spoken_digit + pynini.closure(pynutil.insert(" ") + spoken_digit, 0, 2)
        ip_address = octet + (pynini.cross(".", " ponto ") + octet) ** 3

        number_part = pynini.union(
            area_prefix + local_number(5),  # 11 digits
            area_prefix + local_number(4),  # 10 digits
            local_number(5),  # 9 digits
            local_number(4),  # 8 digits
            local_number(3),  # 7 digits
            pynutil.add_weight(ip_address, 0.01),
        )
        number_part = pynutil.insert('number_part: "') + number_part + pynutil.insert('"')

        # "ramal" -> spoken "ramal …"; "extensão" / "ext." -> spoken "extensão …"
        # (not "ext." letter-by-letter).
        ramal = pynutil.delete("ramal") + drop_spaces + pynutil.insert("ramal ")
        extensao = (
            (pynutil.delete("extensão") | pynutil.delete("ext.")) + drop_spaces + pynutil.insert("extensão ")
        )
        extension_intro = drop_spaces + (ramal | extensao)
        extension_digits = n_digits(1) + pynini.closure(insert_space + n_digits(1), 0, 3)
        ext_graph = pynutil.insert('extension: "') + extension_intro + extension_digits + pynutil.insert('"')

        optional_country = pynini.closure(country_code_graph + drop_spaces + insert_space, 0, 1)
        optional_extension = pynini.closure(drop_spaces + insert_space + ext_graph, 0, 1)
        graph = optional_country + number_part + optional_extension

        self.fst = self.add_tokens(graph).optimize()
14:30 -> time { hours: "catorze" minutes: "trinta" preserve_order: true } 14:30:05 -> time { hours: "catorze" minutes: "trinta" seconds: "cinco" preserve_order: true } + 09:00:31 -> time { hours: "nove" minutes: "zero" seconds: "trinta e um" preserve_order: true } 12:00 -> time { hours: "doze" preserve_order: true } 11:00 da manhã -> time { hours: "onze" suffix: "da manhã" preserve_order: true } + 16:00 da tarde -> time { hours: "quatro" suffix: "da tarde" preserve_order: true } + 23:18 da tarde -> time { hours: "vinte e três" ... suffix: "da tarde" preserve_order: true } Args: - cardinal: cardinal GraphFst + cardinal: CardinalFst deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="time", kind="classify", deterministic=deterministic) - cardinal_graph = cardinal.graph + cardinal_graph = cardinal.graph.optimize() - labels_hour = [str(x) for x in range(0, 24)] - labels_minute_single = [str(x) for x in range(1, 10)] - labels_minute_double = [str(x) for x in range(10, 60)] + hour_words = [] + for h in range(24): + key = str(h) + comp = pynini.compose(pynini.accep(key), cardinal_graph).optimize() + hour_words.append(pynini.shortestpath(comp, nshortest=1, unique=True).string()) - delete_leading_zero_to_double_digit = (NEMO_DIGIT + NEMO_DIGIT) | ( - pynini.closure(pynutil.delete("0"), 0, 1) + NEMO_DIGIT - ) + hour_delete_fsts = [] + for h in range(24): + if h < 10: + hour_delete_fsts.append(pynini.union(pynutil.delete(str(h)), pynutil.delete(f"0{h}")).optimize()) + else: + hour_delete_fsts.append(pynutil.delete(str(h))) - graph_hour = delete_leading_zero_to_double_digit @ pynini.union(*labels_hour) @ cardinal_graph + labels_minute_single = [str(x) for x in range(1, 10)] + labels_minute_double = [str(x) for x in range(10, 60)] graph_minute_single = pynini.union(*labels_minute_single) @ 
cardinal_graph graph_minute_double = pynini.union(*labels_minute_double) @ cardinal_graph @@ -61,7 +70,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert('"') ) - final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"') + # HMS verbalizer always expects ``minutes`` and ``seconds`` tags; bare ``delete("00")`` omits them. + zero_word = hour_words[0] + minutes_zero = ( + pynutil.delete("00") + pynutil.insert('minutes: "') + pynutil.insert(zero_word) + pynutil.insert('"') + ) + seconds_zero = ( + pynutil.delete("00") + pynutil.insert('seconds: "') + pynutil.insert(zero_word) + pynutil.insert('"') + ) delete_h = pynini.union( pynutil.delete(pynini.accep(pynini.escape("h"))), @@ -74,42 +90,91 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) period_rows = load_labels(get_abs_path("data/time/day_period_suffix.tsv")) - period_branches = [] + period_meta = [] for row in period_rows: if len(row) < 2 or not row[0].strip(): continue tail, tag_val = row[0].strip(), row[1].strip() - period_branches.append(pynutil.delete(tail) + pynutil.insert(f'suffix: "{tag_val}"')) - suffix_tail = delete_space + pynutil.delete("da") + delete_space + pynini.union(*period_branches) - optional_suffix = pynini.closure(insert_space + suffix_tail, 0, 1) - - graph_hm = ( - final_graph_hour + if len(row) < 4 or not row[2].strip().isdigit() or not row[3].strip().isdigit(): + raise ValueError( + f"day_period_suffix.tsv row must have 4 columns (tail, tag, hour_min, hour_max): {row!r}" + ) + h0, h1 = int(row[2].strip()), int(row[3].strip()) + allowed = frozenset(range(h0, h1 + 1)) + suf_fst = insert_space + delete_space + pynutil.delete("da") + delete_space + pynutil.delete(tail) + period_meta.append((tag_val, allowed, suf_fst, tail)) + + preserve = pynutil.insert(" preserve_order: true") + + mid_hm = pynutil.delete(time_delim) + (pynutil.delete("00") | insert_space + final_graph_minute) + mid_h_minute = delete_h + 
(pynutil.delete("00") | insert_space + final_graph_minute) + mid_h_only = delete_h + mid_hms = ( + pynutil.delete(time_delim) + + (minutes_zero | insert_space + final_graph_minute) + pynutil.delete(time_delim) - + (pynutil.delete("00") | insert_space + final_graph_minute) - + optional_suffix - + pynutil.insert(" preserve_order: true") + + (seconds_zero | insert_space + final_graph_second) ) - graph_h_minute = ( - final_graph_hour - + delete_h - + (pynutil.delete("00") | insert_space + final_graph_minute) - + optional_suffix - + pynutil.insert(" preserve_order: true") - ) - - graph_h_only = final_graph_hour + delete_h + optional_suffix + pynutil.insert(" preserve_order: true") - - graph_hms = ( - final_graph_hour - + pynutil.delete(time_delim) - + (pynutil.delete("00") | insert_space + final_graph_minute) - + pynutil.delete(time_delim) - + (pynutil.delete("00") | insert_space + final_graph_second) - + optional_suffix - + pynutil.insert(" preserve_order: true") - ) - - final_graph = (graph_hm | graph_h_minute | graph_h_only | graph_hms).optimize() + graph_chunks = [] + for mid_after_hour in (mid_hm, mid_h_minute, mid_h_only, mid_hms): + branches = [] + for h in range(24): + hd = hour_delete_fsts[h] + hw24 = hour_words[h] + hour_tok_24 = pynutil.insert('hours: "') + pynutil.insert(hw24) + pynutil.insert('"') + branches.append(hd + hour_tok_24 + mid_after_hour + preserve) + for tag_val, allowed, suf, tail in period_meta: + keep_suffix, hour_idx = TimeFst._resolve_suffix_hour(h, tail, allowed) + hw_suf = hour_words[hour_idx] + hour_tok_suf = pynutil.insert('hours: "') + pynutil.insert(hw_suf) + pynutil.insert('"') + if keep_suffix: + branches.append( + hd + + hour_tok_suf + + mid_after_hour + + suf + + pynutil.insert(f' suffix: "{tag_val}"') + + preserve + ) + else: + # User wrote a period: always emit ``suffix:`` so TN does not drop it from speech + # (hours stay 24h when the period does not match the clock policy). 
+ branches.append( + hd + + hour_tok_24 + + mid_after_hour + + suf + + pynutil.insert(f' suffix: "{tag_val}"') + + preserve + ) + graph_chunks.append(pynini.union(*branches).optimize()) + + final_graph = pynini.union(*graph_chunks).optimize() self.fst = self.add_tokens(final_graph).optimize() + + @staticmethod + def _resolve_suffix_hour(h: int, period_tail: str, allowed: frozenset) -> tuple[bool, int]: + """Return (keep_suffix, hour_index) for ``hour_words[hour_index]`` when a day-period applies.""" + if period_tail == "manhã": + allowed_m = allowed | frozenset({1, 2, 3, 4, 5}) + if h not in allowed_m: + return False, h + return True, h + if period_tail == "tarde": + if h in allowed: + return True, 12 if h == 12 else h - 12 + if 1 <= h <= 5 and (h + 12) in allowed: + return True, h + return False, h + if period_tail == "noite": + if h in allowed: + return True, h - 12 + if 6 <= h <= 11 and (h + 12) in allowed: + return True, h + return False, h + if period_tail == "madrugada": + if h in allowed: + return True, h + return False, h + return False, h diff --git a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py index bbc7a14b7..61f1ee161 100644 --- a/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/pt/taggers/tokenize_and_classify.py @@ -18,9 +18,6 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst -from nemo_text_processing.text_normalization.en.taggers.whitelist import WhiteListFst -from nemo_text_processing.text_normalization.en.taggers.word import WordFst from nemo_text_processing.text_normalization.pt.graph_utils import ( NEMO_WHITE_SPACE, GraphFst, @@ -31,9 +28,16 @@ from nemo_text_processing.text_normalization.pt.taggers.cardinal import CardinalFst from 
nemo_text_processing.text_normalization.pt.taggers.date import DateFst from nemo_text_processing.text_normalization.pt.taggers.decimal import DecimalFst +from nemo_text_processing.text_normalization.pt.taggers.electronic import ElectronicFst from nemo_text_processing.text_normalization.pt.taggers.fraction import FractionFst +from nemo_text_processing.text_normalization.pt.taggers.measure import MeasureFst +from nemo_text_processing.text_normalization.pt.taggers.money import MoneyFst from nemo_text_processing.text_normalization.pt.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.pt.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.pt.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.pt.taggers.time import TimeFst +from nemo_text_processing.text_normalization.pt.taggers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.pt.taggers.word import WordFst from nemo_text_processing.utils.logging import logger @@ -80,8 +84,12 @@ def __init__( ordinal = OrdinalFst(cardinal, deterministic=deterministic) fraction = FractionFst(cardinal, ordinal, deterministic=deterministic) decimal = DecimalFst(cardinal, deterministic=deterministic) + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) + money = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic) date = DateFst(cardinal, deterministic=deterministic) time = TimeFst(cardinal, deterministic=deterministic) + telephone = TelephoneFst(deterministic=deterministic) + electronic = ElectronicFst(deterministic=deterministic) punctuation = PunctuationFst(deterministic=deterministic) word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst @@ -89,12 +97,16 @@ def __init__( classify = ( pynutil.add_weight(whitelist.fst, 1.01) - | pynutil.add_weight(date.fst, 1.09) + | pynutil.add_weight(date.fst, 1.1) | 
pynutil.add_weight(time.fst, 1.1) + | pynutil.add_weight(measure.fst, 1.1) | pynutil.add_weight(fraction.fst, 1.1) | pynutil.add_weight(decimal.fst, 1.1) | pynutil.add_weight(ordinal.fst, 1.1) | pynutil.add_weight(cardinal.fst, 1.1) + | pynutil.add_weight(money.fst, 1.1) + | pynutil.add_weight(telephone.fst, 1.11) + | pynutil.add_weight(electronic.fst, 1.11) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/text_normalization/pt/taggers/whitelist.py b/nemo_text_processing/text_normalization/pt/taggers/whitelist.py new file mode 100644 index 000000000..784825165 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/taggers/whitelist.py @@ -0,0 +1,111 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.utils import augment_labels_with_punct_at_end
from nemo_text_processing.text_normalization.pt.graph_utils import (
    INPUT_CASED,
    INPUT_LOWER_CASED,
    NEMO_SIGMA,
    NEMO_UPPER,
    GraphFst,
    convert_space,
)
from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels


def _empty_fst() -> "pynini.FstLike":
    """FST that accepts nothing (no whitelist rows)."""
    # The intersection of two different single-string acceptors is the empty language.
    return pynini.intersect(pynini.accep("a"), pynini.accep("b")).optimize()


def _capitalize_first(text: str) -> str:
    """Upper-case the first character of ``text``; an empty string is returned unchanged."""
    return text[:1].upper() + text[1:]


def get_formats(input_f, input_case=INPUT_CASED, is_default=True):
    """
    Abbreviation format variants (same idea as EN whitelist).

    For every ``(written, spoken)`` row of ``input_f`` three extra variants are added: the
    written form with a trailing period, with a capitalised first letter, and with both.

    Args:
        input_f: path of the TSV file to load with ``load_labels``.
        input_case: when equal to ``INPUT_LOWER_CASED`` the written form is lower-cased
            before the variants are derived.
        is_default: when False every spoken form is wrapped in
            ``|raw_start|…|raw_end||norm_start|…|norm_end|`` markers.

    Returns:
        A ``pynini.string_map`` over all variants, or an FST accepting nothing when the
        file has no rows.
    """
    multiple_formats = load_labels(input_f)
    if not multiple_formats:
        return _empty_fst()
    additional_options = []
    for x, y in multiple_formats:
        if input_case == INPUT_LOWER_CASED:
            x = x.lower()
        # Slicing (instead of indexing [0]) keeps an empty TSV field from raising IndexError.
        cap_x, cap_y = _capitalize_first(x), _capitalize_first(y)
        additional_options.append((f"{x}.", y))
        additional_options.append((cap_x, cap_y))
        additional_options.append((f"{cap_x}.", cap_y))
    multiple_formats.extend(additional_options)

    if not is_default:
        multiple_formats = [(x, f"|raw_start|{x}|raw_end||norm_start|{y}|norm_end|") for (x, y) in multiple_formats]

    return pynini.string_map(multiple_formats)


class WhiteListFst(GraphFst):
    """
    Whitelist classifier for pt-BR TN. Data lives under pt/data/whitelist/ (may be empty).

    Args:
        input_case: when equal to ``INPUT_LOWER_CASED`` the written forms are lower-cased.
        deterministic: when False the alternatives files are added as well, and a
            caller-supplied whitelist extends the defaults instead of replacing them.
        input_file: optional path to a caller-supplied whitelist TSV.

    Raises:
        FileNotFoundError: if ``input_file`` is given but does not exist.
    """

    def __init__(self, input_case: str, deterministic: bool = True, input_file: str = None):
        super().__init__(name="whitelist", kind="classify", deterministic=deterministic)

        def _get_whitelist_graph(input_case, file, keep_punct_add_end: bool = False):
            """Map written -> spoken for the rows of ``file``; a missing or empty file accepts nothing."""
            whitelist = load_labels(file) if os.path.isfile(file) else []
            if not whitelist:
                return _empty_fst()
            if input_case == INPUT_LOWER_CASED:
                whitelist = [[x.lower(), y] for x, y in whitelist]
            else:
                whitelist = [[x, y] for x, y in whitelist]

            if keep_punct_add_end:
                whitelist.extend(augment_labels_with_punct_at_end(whitelist))

            return pynini.string_map(whitelist)

        graph = _get_whitelist_graph(input_case, get_abs_path("data/whitelist/tts.tsv"))

        symbol_path = get_abs_path("data/whitelist/symbol.tsv")
        if os.path.isfile(symbol_path) and load_labels(symbol_path):
            # The single-character input "/" is excluded from the symbol whitelist.
            graph |= pynini.compose(
                pynini.difference(NEMO_SIGMA, pynini.accep("/")).optimize(),
                _get_whitelist_graph(input_case, symbol_path),
            ).optimize()

        # Dotted upper-case abbreviations: the separators are deleted, the letters are kept.
        for x in [".", ". "]:
            graph |= (
                NEMO_UPPER
                + pynini.closure(pynutil.delete(x) + NEMO_UPPER, 2)
                + pynini.closure(pynutil.delete("."), 0, 1)
            )

        if not deterministic:
            alt_path = get_abs_path("data/whitelist/alternatives.tsv")
            if os.path.isfile(alt_path) and load_labels(alt_path):
                graph |= _get_whitelist_graph(input_case, alt_path, keep_punct_add_end=True)
            fmt_path = get_abs_path("data/whitelist/alternatives_all_format.tsv")
            if os.path.isfile(fmt_path) and load_labels(fmt_path):
                graph |= get_formats(fmt_path, input_case=input_case)

        if input_file:
            # Bundled data files may legitimately be absent, but a path the caller passed
            # explicitly must exist: otherwise the deterministic branch below would silently
            # replace the whole whitelist with an FST that accepts nothing.
            if not os.path.isfile(input_file):
                raise FileNotFoundError(f"Whitelist file not found: {input_file}")
            whitelist_provided = _get_whitelist_graph(input_case, input_file)
            if not deterministic:
                graph |= whitelist_provided
            else:
                graph = whitelist_provided

        self.graph = convert_space(graph).optimize()
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
import pynini
from pynini.examples import plurals
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.pt.graph_utils import (
    MIN_NEG_WEIGHT,
    NEMO_ALPHA,
    NEMO_DIGIT,
    NEMO_NOT_SPACE,
    NEMO_SIGMA,
    GraphFst,
    convert_space,
)
from nemo_text_processing.text_normalization.pt.utils import get_abs_path


class WordFst(GraphFst):
    """
    Finite state transducer for classifying words (pt-BR pipeline).
    Same structure as the English word tagger; uses PT data paths and the passed PunctuationFst.

    Args:
        punctuation: punctuation tagger; its ``graph`` and ``punct_marks`` attributes are used.
        deterministic: when False the bracketed phoneme form also tolerates one space right
            after "[" and right before "]".
    """

    def __init__(self, punctuation: GraphFst, deterministic: bool = True):
        super().__init__(name="word", kind="classify", deterministic=deterministic)

        # Fallback: any run of non-space characters that are not punctuation.
        punct = punctuation.graph
        non_punct_char = pynini.difference(NEMO_NOT_SPACE, punct.project("input"))
        default_graph = pynini.closure(non_punct_char, 1)

        # Preferred (negatively weighted): runs free of currency symbols, "#", "%" and digits.
        symbols_to_exclude = (pynini.union("$", "€", "₩", "£", "¥", "#", "%") | NEMO_DIGIT).optimize()
        plain_word = pynini.closure(pynini.difference(NEMO_NOT_SPACE, symbols_to_exclude), 1)
        graph = pynutil.add_weight(plain_word, MIN_NEG_WEIGHT) | default_graph

        left_bracket = pynini.accep(pynini.escape("["))
        right_bracket = pynini.accep(pynini.escape("]"))
        space = pynini.accep(" ")

        # Bracketed units of letters followed by optional digits, separated by single spaces.
        phoneme_unit = pynini.closure(NEMO_ALPHA, 1) + pynini.closure(NEMO_DIGIT)
        phoneme_units = pynini.closure(phoneme_unit + space) + phoneme_unit
        if deterministic:
            phoneme = left_bracket + phoneme_units + right_bracket
        else:
            optional_space = pynini.closure(space, 0, 1)
            phoneme = (
                left_bracket + optional_space + phoneme_units + optional_space + right_bracket
            ).optimize()

        # Bracketed IPA: one or more IPA words separated by punctuation marks and/or spaces.
        punct_marks = pynini.union(*punctuation.punct_marks).optimize()
        stress = pynini.union("ˈ", "'", "ˌ")
        ipa_phoneme_unit = pynini.string_file(get_abs_path("data/whitelist/ipa_symbols.tsv"))
        ipa_word = (
            pynini.closure(stress, 0, 1)
            + pynini.closure(ipa_phoneme_unit, 1)
            + pynini.closure(stress | ipa_phoneme_unit)
        )
        delim = (punct_marks | space) ** (1, ...)
        ipa_sequence = ipa_word + pynini.closure(delim + ipa_word) + pynini.closure(delim, 0, 1)
        ipa_phonemes = (left_bracket + ipa_sequence + right_bracket).optimize()
        if not deterministic:
            # NOTE(review): this wraps the already-bracketed IPA graph in a second pair of
            # brackets, so this branch matches "[[...]]" — confirm this is intended.
            ipa_phonemes = (left_bracket + ipa_phonemes + right_bracket).optimize()

        phoneme |= ipa_phonemes
        self.graph = plurals._priority_union(convert_space(phoneme.optimize()), graph, NEMO_SIGMA)
        self.fst = (pynutil.insert("name: \"") + self.graph + pynutil.insert("\"")).optimize()
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.pt.graph_utils import (
    NEMO_NOT_QUOTE,
    NEMO_SIGMA,
    NEMO_SPACE,
    GraphFst,
    delete_preserve_order,
)
from nemo_text_processing.text_normalization.pt.utils import get_abs_path

# Lookup tables, loaded once when the module is imported.
digit_no_zero = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
graph_symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
server_common = pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
domain_common = pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
spoken_unit = pynini.string_file(get_abs_path("data/electronic/electronic_spoken_unit.tsv"))


class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic.

    Three token shapes are accepted: an optional ``protocol`` followed by a ``domain``;
    a ``username`` followed by a ``domain`` (joined by the word "arroba"); and a bare
    ``username`` (prefixed with "arroba").

    Args:
        deterministic: forwarded unchanged to ``GraphFst.__init__``.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="electronic", kind="verbalize", deterministic=deterministic)

        any_digit = digit_no_zero | zero
        # Rewrites that spell out symbols (and digits) wherever they occur.
        spell_symbols = pynini.cdrewrite(graph_symbols, "", "", NEMO_SIGMA)
        spell_symbols_and_digits = pynini.cdrewrite(graph_symbols | any_digit, "", "", NEMO_SIGMA)

        def space_separated(unit):
            """One or more ``unit`` with a space inserted between consecutive ones."""
            return unit + pynini.closure(pynutil.insert(NEMO_SPACE) + unit)

        # Prefer whole tokens (server names, TLDs, company/common words) over letter-by-letter:
        # the single-character path carries a small weight.
        single_char = pynutil.add_weight(NEMO_NOT_QUOTE, weight=0.0001)

        user_segment = single_char | server_common | spoken_unit
        user_name = pynutil.delete('username: "') + space_separated(user_segment) + pynutil.delete('"')
        user_name = user_name @ spell_symbols_and_digits

        domain_segment = single_char | domain_common | server_common | spoken_unit
        domain = space_separated(domain_segment) @ spell_symbols_and_digits
        domain = pynutil.delete('domain: "') + domain + pynutil.delete('"')

        # Protocol: every non-space character followed by an inserted space, symbols spelled out.
        non_space_char = NEMO_NOT_QUOTE - pynini.accep(NEMO_SPACE)
        char_by_char = pynini.closure(non_space_char + pynutil.insert(NEMO_SPACE)) + non_space_char
        protocol = pynutil.delete('protocol: "') + (char_by_char @ spell_symbols) + pynutil.delete('"')

        url = pynini.closure(protocol + NEMO_SPACE, 0, 1) + domain
        email = user_name + NEMO_SPACE + pynutil.insert("arroba" + NEMO_SPACE) + domain
        handle = pynutil.insert("arroba" + NEMO_SPACE) + user_name
        self.graph = url | (email | handle)

        self.fst = self.delete_tokens(self.graph + delete_preserve_order).optimize()
+ * **Avos** (``… "avos"``): all other positive integer denominators — cardinal denominator + the word + ``avos`` (``três onze avos``). Optional ``sobre`` between numerator and denominator is controlled by + ``data/fractions/specials.tsv`` key ``avos_between`` (empty = single space; ``sobre`` = ``… sobre …``). + * **Mixed** numbers use ``connector`` from the same TSV (default `` e ``) after the integer part. + Args: deterministic: if True will provide a single transduction option, for False multiple options (used for audio-based normalization) @@ -40,6 +49,7 @@ def __init__(self, deterministic: bool = True): minus = spec.get("minus", "menos ").rstrip() plural_suffix = spec.get("plural_suffix", "s") avos_word = spec.get("avos_suffix", " avos").strip() + avos_between = spec.get("avos_between", "").strip() numerator_one_val = spec.get("numerator_one", "um") denominator_half_val = spec.get("denominator_half", "meio") @@ -74,9 +84,14 @@ def __init__(self, deterministic: bool = True): fraction_ordinal_plural = numerator_rest + insert_space + denom_ordinal + pynutil.insert(plural_suffix) fraction_ordinal = pynini.union(fraction_ordinal_singular, fraction_ordinal_plural) + if avos_between: + avos_mid = insert_space + pynutil.insert(avos_between) + insert_space + else: + avos_mid = insert_space + fraction_avos = ( pynini.union(numerator_one, numerator_rest) - + insert_space + + avos_mid + denom_avos + insert_space + pynutil.insert(avos_word) diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/measure.py b/nemo_text_processing/text_normalization/pt/verbalizers/measure.py new file mode 100644 index 000000000..bb94ef783 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/measure.py @@ -0,0 +1,98 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_WHITE_SPACE, + GraphFst, + delete_preserve_order, + delete_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure (pt-BR), e.g. + measure { cardinal { integer: "duzentos" } units: "gramas" } -> duzentos gramas + measure { cardinal { integer: "um" } units: "hora" } -> uma hora + + Args: + decimal: DecimalFst verbalizer + cardinal: CardinalFst verbalizer + fraction: FractionFst verbalizer + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True): + super().__init__(name="measure", kind="verbalize", deterministic=deterministic) + + hours_unit = ( + pynutil.delete('units: "') + + pynini.union(pynini.accep("hora"), pynini.accep("horas")) + + pynutil.delete('"') + ) + non_hours_unit = ( + pynutil.delete('units: "') + + pynini.difference(pynini.closure(NEMO_NOT_QUOTE, 1), pynini.union("hora", "horas")) + + pynutil.delete('"') + ) + + graph_cardinal = ( + pynutil.delete("cardinal {") + + delete_space + + cardinal.graph_fem + + delete_space + + pynutil.delete("}") + + NEMO_WHITE_SPACE + + hours_unit + ) + graph_cardinal |= ( + pynutil.delete("cardinal {") + + delete_space + + cardinal.graph_masc + + delete_space + + pynutil.delete("}") + + NEMO_WHITE_SPACE + + 
non_hours_unit + ) + + graph_decimal = ( + pynutil.delete("decimal {") + + delete_space + + decimal.numbers + + delete_space + + pynutil.delete("}") + + NEMO_WHITE_SPACE + + (hours_unit | non_hours_unit) + ) + + graph_fraction = ( + pynutil.delete("fraction {") + + delete_space + + fraction.inner_graph + + delete_space + + pynutil.delete("}") + + NEMO_WHITE_SPACE + + (hours_unit | non_hours_unit) + ) + + graph = graph_cardinal | graph_decimal | graph_fraction + graph += delete_preserve_order + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/money.py b/nemo_text_processing/text_normalization/pt/verbalizers/money.py new file mode 100644 index 000000000..4b247953c --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/money.py @@ -0,0 +1,96 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SIGMA, + NEMO_SPACE, + GraphFst, + delete_preserve_order, + insert_space, +) +from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels + + +class MoneyFst(GraphFst): + """ + Finite state transducer for verbalizing money (pt-BR), e.g. + money { currency_maj: "reais" integer_part: "doze" } -> doze reais + money { ...
fractional_part: "cinco" currency_min: "centavos" ... } -> doze reais e cinco centavos + + Args: + decimal: DecimalFst verbalizer (for decimal amounts embedded in money) + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, decimal: GraphFst, deterministic: bool = True): + super().__init__(name="money", kind="verbalize", deterministic=deterministic) + + scales_data = load_labels(get_abs_path("data/numbers/scales.tsv")) + currency_plural_data = load_labels(get_abs_path("data/money/currency_major_plural.tsv")) + + scale_words = [] + for row in scales_data[1:]: + if len(row) < 2: + continue + one_label = row[0].strip() + plural = row[1].strip() + if not one_label or not plural: + continue + scale_words.extend((one_label.split()[-1], plural)) + + curr_words = [row[1].strip() for row in currency_plural_data if len(row) >= 2 and row[1].strip()] + + scales = pynini.union(*[pynini.accep(w) + NEMO_SPACE for w in scale_words]).optimize() + currencies = pynini.union(*curr_words).optimize() + + maj = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + min_unit = pynutil.delete('currency_min: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + + fractional_part = ( + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + + connector_minor = pynutil.insert("e") + insert_space + if not deterministic: + connector_minor |= pynutil.insert("com") + insert_space + + graph_integer = integer_part + NEMO_SPACE + maj + + graph_integer_with_minor = ( + integer_part + + NEMO_SPACE + + maj + + NEMO_SPACE + + connector_minor + + fractional_part + + NEMO_SPACE + + min_unit + + delete_preserve_order + ) + + graph_decimal = decimal.numbers + NEMO_SPACE + maj + + 
graph_minor = fractional_part + NEMO_SPACE + min_unit + delete_preserve_order + + graph = graph_integer | graph_integer_with_minor | graph_decimal | graph_minor + graph @= pynini.cdrewrite(pynutil.insert("de") + insert_space, scales, currencies, NEMO_SIGMA) + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/telephone.py b/nemo_text_processing/text_normalization/pt/verbalizers/telephone.py new file mode 100644 index 000000000..eae320638 --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/telephone.py @@ -0,0 +1,57 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone. 
+ """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="telephone", kind="verbalize", deterministic=deterministic) + + optional_country_code = pynini.closure( + pynutil.delete('country_code: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + + delete_space + + insert_space, + 0, + 1, + ) + + number_part = ( + pynutil.delete('number_part: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynini.closure(pynutil.add_weight(pynutil.delete(" "), -0.0001), 0, 1) + + pynutil.delete('"') + ) + + optional_extension = pynini.closure( + delete_space + + insert_space + + pynutil.delete('extension: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"'), + 0, + 1, + ) + + graph = optional_country_code + number_part + optional_extension + self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/time.py b/nemo_text_processing/text_normalization/pt/verbalizers/time.py index 072cb5b70..dfebdab3e 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/time.py @@ -22,17 +22,17 @@ delete_space, insert_space, ) -from nemo_text_processing.text_normalization.pt.utils import get_abs_path, load_labels class TimeFst(GraphFst): """ Finite state transducer for verbalizing Portuguese time, e.g. 
time { hours: "catorze" minutes: "trinta" preserve_order: true } -> catorze horas e trinta - time { hours: "catorze" minutes: "trinta" seconds: "cinco" preserve_order: true } - -> catorze horas e trinta minutos e cinco segundos + time { hours: "um" minutes: "trinta" preserve_order: true } -> uma hora e trinta + time { hours: "dois" minutes: "quinze" preserve_order: true } -> duas horas e quinze time { hours: "onze" suffix: "da manhã" preserve_order: true } -> onze horas da manhã - time { hours: "doze" preserve_order: true } -> doze horas + time { hours: "vinte e um" minutes: "dezoito" suffix: "da tarde" preserve_order: true } + -> vinte e uma horas e dezoito da tarde Args: deterministic: if True will provide a single transduction option, @@ -44,7 +44,6 @@ def __init__(self, deterministic: bool = True): quoted = pynini.closure(NEMO_NOT_QUOTE, 1) - hours = pynutil.delete('hours: "') + quoted + pynutil.delete('"') minutes_val = pynutil.delete('minutes: "') + quoted + pynutil.delete('"') seconds_val = pynutil.delete('seconds: "') + quoted + pynutil.delete('"') suffix_val = pynutil.delete('suffix: "') + quoted + pynutil.delete('"') @@ -52,10 +51,48 @@ def __init__(self, deterministic: bool = True): gap = delete_space + insert_space suffix_out = pynini.closure(gap + suffix_val, 0, 1) + hours_default = pynutil.delete('hours: "') + quoted + pynutil.delete('"') + gap + pynutil.insert("horas") + # Match whitespace after the closing quote (same as hours_default's gap) so the path composes + # with minutes/suffix fields; otherwise only the generic "… horas" branch accepts the token. 
+ hours_um = ( + pynutil.delete('hours: "') + + pynutil.delete("um") + + pynutil.delete('"') + + delete_space + + pynutil.insert("uma hora") + ) + hours_dois = ( + pynutil.delete('hours: "') + + pynutil.delete("dois") + + pynutil.delete('"') + + delete_space + + pynutil.insert("duas horas") + ) + hours_vinte_um = ( + pynutil.delete('hours: "') + + pynutil.delete("vinte e um") + + pynutil.delete('"') + + delete_space + + pynutil.insert("vinte e uma horas") + ) + hours_vinte_dois = ( + pynutil.delete('hours: "') + + pynutil.delete("vinte e dois") + + pynutil.delete('"') + + delete_space + + pynutil.insert("vinte e duas horas") + ) + # Prefer feminine hour phrases over the generic ``… horas`` path (tie-break by weight). + hour_phrase = ( + pynutil.add_weight(hours_um, -0.01) + | pynutil.add_weight(hours_dois, -0.01) + | pynutil.add_weight(hours_vinte_um, -0.01) + | pynutil.add_weight(hours_vinte_dois, -0.01) + | hours_default + ).optimize() + graph_hms = ( - hours - + gap - + pynutil.insert("horas") + hour_phrase + insert_space + pynutil.insert("e") + insert_space @@ -73,19 +110,16 @@ def __init__(self, deterministic: bool = True): ) with_minutes = ( - hours - + gap - + pynutil.insert("horas") - + gap + hour_phrase + + insert_space + pynutil.insert("e") + insert_space - + gap + minutes_val + suffix_out + delete_preserve_order ) - hours_only = hours + gap + pynutil.insert("horas") + suffix_out + delete_preserve_order + hours_only = hour_phrase + suffix_out + delete_preserve_order graph = pynini.union(graph_hms, with_minutes, hours_only).optimize() self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py index dbe4e2a17..9a6910d6a 100644 --- a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize.py @@ -17,8 +17,12 @@ from 
nemo_text_processing.text_normalization.pt.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.pt.verbalizers.date import DateFst from nemo_text_processing.text_normalization.pt.verbalizers.decimal import DecimalFst +from nemo_text_processing.text_normalization.pt.verbalizers.electronic import ElectronicFst from nemo_text_processing.text_normalization.pt.verbalizers.fraction import FractionFst +from nemo_text_processing.text_normalization.pt.verbalizers.measure import MeasureFst +from nemo_text_processing.text_normalization.pt.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.pt.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.pt.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.pt.verbalizers.time import TimeFst @@ -40,8 +44,23 @@ def __init__(self, deterministic: bool = True): ordinal = OrdinalFst(deterministic=deterministic) fraction = FractionFst(deterministic=deterministic) decimal = DecimalFst(deterministic=deterministic) + measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic) + money = MoneyFst(decimal=decimal, deterministic=deterministic) date = DateFst(deterministic=deterministic) time = TimeFst(deterministic=deterministic) - graph = fraction.fst | decimal.fst | date.fst | time.fst | ordinal.fst | cardinal.fst + telephone = TelephoneFst(deterministic=deterministic) + electronic = ElectronicFst(deterministic=deterministic) + graph = ( + fraction.fst + | decimal.fst + | date.fst + | time.fst + | measure.fst + | money.fst + | ordinal.fst + | cardinal.fst + | telephone.fst + | electronic.fst + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py index cc2eaae3d..84ed9ac39 100644 --- 
a/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/pt/verbalizers/verbalize_final.py @@ -17,7 +17,6 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.pt.graph_utils import ( GraphFst, delete_extra_space, @@ -25,6 +24,7 @@ generator_main, ) from nemo_text_processing.text_normalization.pt.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.pt.verbalizers.word import WordFst from nemo_text_processing.utils.logging import logger diff --git a/nemo_text_processing/text_normalization/pt/verbalizers/word.py b/nemo_text_processing/text_normalization/pt/verbalizers/word.py new file mode 100644 index 000000000..2b53ad1ff --- /dev/null +++ b/nemo_text_processing/text_normalization/pt/verbalizers/word.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.pt.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space + + +class WordFst(GraphFst): + """ + Verbalizes tokens { name: "..." } for the pt-BR pipeline. 
+ """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="word", kind="verbalize", deterministic=deterministic) + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete("\"") + chars + pynutil.delete("\"") + graph = char @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 26a3fc7b6..3a1964bbd 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko', 'vi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko', 'vi', 'pt'], default="en", type=str, ) diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt index 7ef575c73..be8057d71 100644 --- a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_cardinal.txt @@ -114,4 +114,8 @@ 3004005006007~três trilhões quatro bilhões cinco milhões seis mil e sete 1000001~um milhão e um 1001100~um milhão mil e cem -1001110~um milhão mil cento e dez \ No newline at end of file +1001110~um milhão mil cento e dez +47701~quarenta e sete mil setecentos e um +394506~trezentos e noventa e quatro mil quinhentos e seis +3022110709~três bilhões vinte e dois milhões cento e dez mil setecentos e nove +302210709~trezentos e dois milhões duzentos e dez mil setecentos e nove \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt 
b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt index f74c5e4fc..0bea9be23 100644 --- a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_decimal.txt @@ -2,56 +2,57 @@ 0,2~zero vírgula dois 0,5~zero vírgula cinco 0,9~zero vírgula nove -0,01~zero vírgula um -0,02~zero vírgula dois -0,05~zero vírgula cinco -0,10~zero vírgula dez -0,11~zero vírgula onze -0,15~zero vírgula quinze -0,20~zero vírgula vinte -0,25~zero vírgula vinte e cinco -0,50~zero vírgula cinquenta -0,99~zero vírgula noventa e nove +0,01~zero vírgula zero um +0,02~zero vírgula zero dois +0,05~zero vírgula zero cinco +0,10~zero vírgula um zero +0,11~zero vírgula um um +0,15~zero vírgula um cinco +0,20~zero vírgula dois zero +0,25~zero vírgula dois cinco +0,50~zero vírgula cinco zero +0,99~zero vírgula nove nove 1,1~um vírgula um 1,2~um vírgula dois 1,5~um vírgula cinco -1,10~um vírgula dez -1,15~um vírgula quinze -1,20~um vírgula vinte -1,26~um vírgula vinte e seis -1,33~um vírgula trinta e três -1,50~um vírgula cinquenta -3,141~três vírgula cento e quarenta e um -3,256~três vírgula duzentos e cinquenta e seis -3,999~três vírgula novecentos e noventa e nove -3,1415~três vírgula mil quatrocentos e quinze -3,1001~três vírgula mil e um -3,2003~três vírgula dois mil e três -3,014~três vírgula catorze -3,0141~três vírgula cento e quarenta e um -3,1005~três vírgula mil e cinco -3,1050~três vírgula mil e cinquenta -3,1010~três vírgula mil e dez +1,10~um vírgula um zero +1,15~um vírgula um cinco +1,20~um vírgula dois zero +1,26~um vírgula dois seis +1,33~um vírgula três três +1,50~um vírgula cinco zero +3,141~três vírgula um quatro um +3,256~três vírgula dois cinco seis +3,999~três vírgula nove nove nove +3,1415~três vírgula um quatro um cinco +3,1001~três vírgula um zero zero um +3,014~três vírgula zero um quatro +3,0141~três vírgula zero um quatro um +3,1005~três vírgula um zero 
zero cinco +3,1050~três vírgula um zero cinco zero -1,2~menos um vírgula dois --1,26~menos um vírgula vinte e seis +-1,26~menos um vírgula dois seis -3,5~menos três vírgula cinco -0,5~menos zero vírgula cinco 1,2 milhões~um vírgula dois milhões 1,5 milhões~um vírgula cinco milhões -1,25 milhões~um vírgula vinte e cinco milhões +1,25 milhões~um vírgula dois cinco milhões 2,5 bilhões~dois vírgula cinco bilhões -3,75 bilhões~três vírgula setenta e cinco bilhões -0,001~zero vírgula um -0,0001~zero vírgula um -1,001~um vírgula mil e um -1,010~um vírgula mil e dez -1,100~um vírgula mil e cem -10,01~dez vírgula um -10,001~dez vírgula mil e um +3,75 bilhões~três vírgula sete cinco bilhões +0,001~zero vírgula zero zero um +0,0001~zero vírgula zero zero zero um +1,001~um vírgula zero zero um +1,010~um vírgula zero um zero +1,100~um vírgula um zero zero +10,01~dez vírgula zero um +10,001~dez vírgula zero zero um 100,5~cem vírgula cinco -100,05~cem vírgula cinco -3,14~três vírgula catorze -3,141~três vírgula cento e quarenta e um -3,1415~três vírgula mil quatrocentos e quinze -3,14159~três vírgula catorze mil cento e cinquenta e nove -3,1415926535~três vírgula um quatro um cinco nove dois seis cinco três cinco \ No newline at end of file +100,05~cem vírgula zero cinco +3,14~três vírgula um quatro +3,141~três vírgula um quatro um +3,1415~três vírgula um quatro um cinco +3,14159~três vírgula um quatro um cinco nove +1,1234567~um vírgula um dois três quatro cinco seis sete +3,1415926535~três vírgula um quatro um cinco nove dois seis cinco três cinco +12,27~doze vírgula dois sete +87,69~oitenta e sete vírgula seis nove diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt new file mode 100644 index 000000000..3aee0be70 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_electronic.txt @@ -0,0 +1,13 @@ +test@gmail.com~t e s t 
arroba gmail ponto com +a.bc@gmail.com~a ponto b c arroba gmail ponto com +admin@company.com.br~a d m i n arroba c o m p a n y ponto com ponto br +www.google.com~w w w ponto google ponto com +https://www.nvidia.com~h t t p s dois pontos barra barra w w w ponto nvidia ponto com +http://site.com.br~h t t p dois pontos barra barra s i t e ponto com ponto br +nvidia.com~nvidia ponto com +@usuario~arroba usuario +mail@google.com~m a i l arroba google ponto com +support@microsoft.com~s u p p o r t arroba microsoft ponto com +https://www.amazon.com.br~h t t p s dois pontos barra barra w w w ponto amazon ponto com ponto br +u.s.e.r@facebook.com~u ponto s ponto e ponto r arroba facebook ponto com +api.netflix.com~a p i ponto netflix ponto com diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt index 256fecd86..e59642bac 100644 --- a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_fraction.txt @@ -18,4 +18,5 @@ 1/1000~um milésimo 1 1/2~um e um meio 2 1/4~dois e um quarto -3 2/3~três e dois terços \ No newline at end of file +3 2/3~três e dois terços +47701/913~quarenta e sete mil setecentos e um novecentos e treze avos \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..34a082108 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_measure.txt @@ -0,0 +1,18 @@ +200 g~duzentos gramas +1 kg~um quilo~um quilograma +5 kg~cinco quilos~cinco quilogramas +200 m~duzentos metros +1 km~um quilômetro +5 km~cinco quilômetros +100 m~cem metros +1 l~um litro +2 l~dois litros +500 ml~quinhentos mililitros +1 m²~um metro quadrado +10 m²~dez metros quadrados 
+25°C~vinte e cinco graus celsius +-5°C~menos cinco graus celsius +1 h~uma hora +2 h~duas horas +30 min~trinta minutos +45 s~quarenta e cinco segundos diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt new file mode 100644 index 000000000..ea0680741 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_money.txt @@ -0,0 +1,25 @@ +R$ 1~um real +R$ 12~doze reais +R$ 100~cem reais +R$ 200~duzentos reais +R$ 12,05~doze reais e cinco centavos +R$ 1,01~um real e um centavo +R$ 199,99~cento e noventa e nove reais e noventa e nove centavos +R$ 0,20~vinte centavos +R$ 0,25~vinte e cinco centavos +R$ 0,50~cinquenta centavos +$ 1~um dólar +$ 12~doze dólares +$ 12,05~doze dólares e cinco centavos +$ 29,50~vinte e nove dólares e cinquenta centavos +$ 75,63~setenta e cinco dólares e sessenta e três centavos~setenta e cinco dólares com sessenta e três centavos +€ 1~um euro +€ 12~doze euros +€ 12,05~doze euros e cinco centavos +R$ 1000~mil reais +R$ 1000000~um milhão de reais +$ 1000~mil dólares +$ 1000000~um milhão de dólares +R$ 181809~cento e oitenta e um mil oitocentos e nove reais +R$ 181819~cento e oitenta e um mil oitocentos e dezenove reais +R$ 1811605~um milhão oitocentos e onze mil seiscentos e cinco reais diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..a5b085469 --- /dev/null +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_telephone.txt @@ -0,0 +1,7 @@ +(11) 99999-8888~um um nove nove nove nove nove oito oito oito oito +11 3333-4444~um um três três três três quatro quatro quatro quatro +555-1234~cinco cinco cinco um dois três quatro +99999-1234~nove nove nove nove nove um dois três quatro ++55 (11) 3333-4444~mais cinco cinco um um três três três 
três quatro quatro quatro quatro +192.168.1.1~um nove dois ponto um seis oito ponto um ponto um +(11) 3333-4444 ext. 12~um um três três três três quatro quatro quatro quatro extensão um dois \ No newline at end of file diff --git a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt index deb868ad8..b9ae642e7 100644 --- a/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/pt/data_text_normalization/test_cases_time.txt @@ -17,8 +17,14 @@ 00h~zero horas 12h30~doze horas e trinta 14:30:05~catorze horas e trinta minutos e cinco segundos +09:00:31~nove horas e zero minutos e trinta e um segundos +14:30:00~catorze horas e trinta minutos e zero segundos +09:00:00~nove horas e zero minutos e zero segundos 11:00 da manhã~onze horas da manhã 3:30 da tarde~três horas e trinta da tarde -15h da tarde~quinze horas da tarde -16:00 da tarde~dezesseis horas da tarde -14:30:05 da tarde~catorze horas e trinta minutos e cinco segundos da tarde +15h da tarde~três horas da tarde +16:00 da tarde~quatro horas da tarde +14:30:05 da tarde~duas horas e trinta minutos e cinco segundos da tarde +21:18:14 da manhã~vinte e uma horas e dezoito minutos e catorze segundos da manhã +23:18 da tarde~vinte e três horas e dezoito da tarde +22h09 da tarde~vinte e duas horas e nove da tarde diff --git a/tests/nemo_text_processing/pt/test_electronic.py b/tests/nemo_text_processing/pt/test_electronic.py index bff47d1fe..9f462c5db 100644 --- a/tests/nemo_text_processing/pt/test_electronic.py +++ b/tests/nemo_text_processing/pt/test_electronic.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -29,3 +30,12 @@ class 
TestElectronic: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_electronic.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_measure.py b/tests/nemo_text_processing/pt/test_measure.py index 9dcfc8548..603c967f4 100644 --- a/tests/nemo_text_processing/pt/test_measure.py +++ b/tests/nemo_text_processing/pt/test_measure.py @@ -17,6 +17,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,15 @@ class TestMeasure: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + if isinstance(expected, list): + assert pred in expected + else: + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_money.py b/tests/nemo_text_processing/pt/test_money.py index 632bdb458..d440a93e8 100644 --- a/tests/nemo_text_processing/pt/test_money.py +++ b/tests/nemo_text_processing/pt/test_money.py @@ -17,6 +17,7 @@ from parameterized 
import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,15 @@ class TestMoney: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_money.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + if isinstance(expected, list): + assert pred in expected + else: + assert pred == expected diff --git a/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh index 50140b553..c9f39aaa7 100755 --- a/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/pt/test_sparrowhawk_normalization.sh @@ -10,15 +10,20 @@ runtest () { # read test file while read testcase; do - IFS='~' read written spoken <<< $testcase - norm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + IFS='~' read -a testcase_tokenized <<< $testcase + written=${testcase_tokenized[0]} + # only tests against first possible option when there are multiple shortest paths + spoken=${testcase_tokenized[1]} + + # replace non breaking space with breaking space + denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') # trim white space spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" - norm_pred="$(echo -e "${norm_pred}" | sed -e 's/^[[:space:]]*//' -e 
's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" # input expected actual - assertEquals "$written" "$spoken" "$norm_pred" + assertEquals "$written" "$spoken" "$denorm_pred" done < "$input" } @@ -52,30 +57,30 @@ testTNTime() { runtest $input } -# testTNMeasure() { -# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_measure.txt -# runtest $input -# } +testTNMeasure() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_measure.txt + runtest $input +} -# testTNMoney() { -# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_money.txt -# runtest $input -# } +testTNMoney() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_money.txt + runtest $input +} # testTNWhitelist() { # input=$PROJECT_DIR/pt/data_text_normalization/test_cases_whitelist.txt # runtest $input # } -# testTNTelephone() { -# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_telephone.txt -# runtest $input -# } +testTNTelephone() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_telephone.txt + runtest $input +} -# testTNElectronic() { -# input=$PROJECT_DIR/pt/data_text_normalization/test_cases_electronic.txt -# runtest $input -# } +testTNElectronic() { + input=$PROJECT_DIR/pt/data_text_normalization/test_cases_electronic.txt + runtest $input +} # testTNWord() { # input=$PROJECT_DIR/pt/data_text_normalization/test_cases_word.txt diff --git a/tests/nemo_text_processing/pt/test_telephone.py b/tests/nemo_text_processing/pt/test_telephone.py index e27c47e1c..f69f0b5a3 100644 --- a/tests/nemo_text_processing/pt/test_telephone.py +++ b/tests/nemo_text_processing/pt/test_telephone.py @@ -17,6 +17,7 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -30,3 +31,12 @@ class 
TestTelephone: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(lang='pt', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') + + @parameterized.expand(parse_test_case_file('pt/data_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected