From b773d590b0a1c745a2202125d54062d2248e1f22 Mon Sep 17 00:00:00 2001 From: Shreyas Pawar Date: Thu, 11 Jun 2026 10:24:57 +0000 Subject: [PATCH 1/2] Hi TN Serial: Implement SerialFst tagger for Devanagari-numeric mixtures, number chains, and mathematical powers Signed-off-by: Shreyas Pawar --- Jenkinsfile | 2 +- .../hi/data/serial/chars.tsv | 68 +++++++ .../hi/data/serial/power_special.tsv | 4 + .../hi/data/serial/special_symbols.tsv | 4 + .../text_normalization/hi/taggers/serial.py | 191 ++++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 12 +- .../test_cases_serial.txt | 21 ++ tests/nemo_text_processing/hi/test_serial.py | 33 +++ .../hi/test_sparrowhawk_normalization.sh | 8 +- 9 files changed, 335 insertions(+), 8 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/serial/chars.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv create mode 100644 nemo_text_processing/text_normalization/hi/taggers/serial.py create mode 100644 tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt create mode 100644 tests/nemo_text_processing/hi/test_serial.py diff --git a/Jenkinsfile b/Jenkinsfile index 33b361cf5..b9a92cd30 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-11-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/serial/chars.tsv b/nemo_text_processing/text_normalization/hi/data/serial/chars.tsv new file mode 100644 index 000000000..d5becfcf2 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/serial/chars.tsv @@ -0,0 +1,68 @@ +अ अ +आ आ +इ इ +ई ई +उ उ +ऊ ऊ +ऋ ऋ +ए ए +ऐ ऐ +ओ ओ +औ औ +ऑ ऑ +ा ा +ि ि +ी ी +ु ु +ू ू +ृ ृ +े े +ै ै +ो ो +ौ ौ +ॉ ॉ +ं ं +ः ः +ँ ँ +क क +ख ख +ग ग +घ घ +ङ ङ +च च +छ छ +ज ज +झ झ +ञ ञ +ट ट +ठ ठ +ड ड +ढ ढ +ण ण +त त +थ थ +द द +ध ध +न न +प प +फ फ +ब ब +भ भ +म म +य य +र र +ल ल +व व +श श +ष ष +स स +ह ह +क़ क़ +ख़ ख़ +ग़ ग़ +ज़ ज़ +ड़ ड़ +ढ़ ढ़ +फ़ फ़ +य़ य़ +् ् \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv b/nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv new file mode 100644 index 000000000..64583f947 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/serial/power_special.tsv @@ -0,0 +1,4 @@ +^2 स्क्वेर्ड +^२ स्क्वेर्ड +^3 क्यूब +^३ क्यूब \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv b/nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv new file mode 100644 index 000000000..c96a15bd6 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/serial/special_symbols.tsv @@ -0,0 +1,4 @@ +# हैशटैग +% प्रतिशत +& एंड +@ एट \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/serial.py b/nemo_text_processing/text_normalization/hi/taggers/serial.py new file mode 100644 index 000000000..1a6cb794d --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/taggers/serial.py @@ -0,0 +1,191 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import ( + NEMO_ALPHA, + NEMO_DIGIT, + NEMO_NOT_SPACE, + NEMO_SIGMA, + GraphFst, + convert_space, +) +from nemo_text_processing.text_normalization.hi.utils import get_abs_path + + +class SerialFst(GraphFst): + """ + Finite state transducer for classifying serial strings in Hindi. + Handles Devanagari-numeric mixtures, complex delimited number chains, + symbols, and powers. Supports both ASCII (0-9) and Devanagari (०-९) digits. + + e.g. कोविड-19 -> tokens { name: "कोविड-उन्नीस" } + e.g. 5जी -> tokens { name: "पाँच जी" } + e.g. ३जी -> tokens { name: "तीन जी" } + e.g. 2^2 -> tokens { name: "दो स्क्वेर्ड" } + e.g. 2^4 -> tokens { name: "दो टु द पावर चार" } + e.g. 1-800-555 -> tokens { name: "एक-आठ सौ-पाँच सौ पचपन" } + + Note: Pure Latin-alpha + digit patterns (A12, B-60) are intentionally + excluded here so they fall through to the electronic classifier. + """ + + def __init__( + self, + cardinal: GraphFst, + deterministic: bool = True, + ): + super().__init__(name="serial", kind="classify", deterministic=deterministic) + + digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + zero_graph = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) + + devanagari_digits = pynini.project( + pynini.union(digit_graph, zero_graph), + "input", + ).optimize() + + any_digit = pynini.union(NEMO_DIGIT, devanagari_digits).optimize() + + not_quote = pynini.closure(pynini.difference(NEMO_SIGMA, pynini.accep('"')), 1) + strip_cardinal_tags = ( + pynutil.delete('cardinal { integer: "') + + not_quote + + pynutil.delete('" }') + ) + + pure_cardinal_words = pynini.compose(cardinal.fst, strip_cardinal_tags).optimize() + + length_filter = pynini.closure(any_digit, 1, 3) + limited_cardinal = pynini.compose(length_filter, pure_cardinal_words).optimize() + + num_graph = limited_cardinal + + symbols_graph = pynini.string_file( + get_abs_path("data/serial/special_symbols.tsv") + ).optimize() + + devanagari_chars = pynini.project( + pynini.string_file(get_abs_path("data/serial/chars.tsv")), + "input", + ).optimize() + + letter_graph = pynini.string_file(get_abs_path("data/address/letters.tsv")) + latin_letters = letter_graph + pynini.closure(pynutil.insert(" ") + letter_graph) + latin_letters = latin_letters.optimize() + + devanagari_word = pynini.closure(devanagari_chars, 2).optimize() + + delimiter = (pynini.accep("-") | pynini.accep("/") | pynini.accep(" ")).optimize() + + alphas = (latin_letters | devanagari_word).optimize() + segment = (alphas | num_graph | symbols_graph).optimize() + + serial_core = segment + pynini.closure(delimiter + segment, 1) + serial_core = serial_core.optimize() + + serial_graph = serial_core + + all_alphas = pynini.union(NEMO_ALPHA, devanagari_chars).optimize() + + insert_space_alpha_digit = pynini.cdrewrite( + pynutil.insert(" "), all_alphas, any_digit, NEMO_SIGMA + ) + insert_space_digit_alpha = pynini.cdrewrite( + pynutil.insert(" "), any_digit, all_alphas, NEMO_SIGMA + ) + space_inserter = pynini.compose(insert_space_alpha_digit, insert_space_digit_alpha).optimize() + + glued_serial = pynini.compose(space_inserter, serial_core).optimize() + serial_graph = pynini.union(serial_graph, glued_serial).optimize() + + power_special = pynutil.add_weight( + pynini.string_file(get_abs_path("data/serial/power_special.tsv")), + -1.0 + ).optimize() + + power_generic = pynutil.add_weight( + ( + pynutil.delete("^") + + pynutil.insert(" टु द पावर ") + + num_graph + ), + 1.0 + ).optimize() + + power_suffix = pynini.union(power_special, power_generic).optimize() + power_graph = num_graph + power_suffix + serial_graph = pynini.union(serial_graph, power_graph).optimize() + + serial_graph = pynini.compose( + pynini.closure(NEMO_NOT_SPACE, 2), serial_graph + ).optimize() + + pure_word_slash = ( + pynini.closure(NEMO_ALPHA, 1) + + pynini.accep("/") + + pynini.closure(NEMO_ALPHA, 1) + ) + + dimension_pattern = ( + pynini.closure(any_digit, 1) + + (pynini.accep("x") | pynini.accep("X")) + + pynini.closure(any_digit, 1) + ) + + _opt_delim = pynini.closure(pynini.accep("-") | pynini.accep(" "), 0, 1) + latin_alphanum = ( + pynini.closure(NEMO_ALPHA, 1) + _opt_delim + pynini.closure(any_digit, 1) + ) | ( + pynini.closure(any_digit, 1) + _opt_delim + pynini.closure(NEMO_ALPHA, 1) + ) + + ordinal_suffixes = pynini.project( + pynini.union( + pynini.string_file(get_abs_path("data/ordinal/suffixes.tsv")), + pynini.string_file(get_abs_path("data/ordinal/suffixes_map.tsv")), + ), + "input", + ).optimize() + ordinal_pattern = pynini.closure(any_digit, 1) + ordinal_suffixes + + date_year_suffix = pynini.project( + pynini.string_file(get_abs_path("data/date/year_suffix.tsv")), + "input", + ).optimize() + date_suffixes = pynini.project( + pynini.string_file(get_abs_path("data/date/suffixes.tsv")), + "input", + ).optimize() + date_pattern = ( + pynini.closure(any_digit, 1) + + pynini.closure(pynini.accep("-") + pynini.closure(any_digit, 1), 0) + + pynini.accep(" ") + + pynini.union(date_year_suffix, date_suffixes) + ) + + exclusions = pure_word_slash | dimension_pattern | latin_alphanum | ordinal_pattern | date_pattern + accepted_inputs = pynini.difference(NEMO_SIGMA, exclusions).optimize() + + serial_graph = pynini.compose(accepted_inputs, serial_graph).optimize() + + self.graph = serial_graph.optimize() + graph = ( + pynutil.insert('name: "') + + convert_space(self.graph).optimize() + + pynutil.insert('"') + ) + self.fst = graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 88cb04727..94292b6ac 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -35,6 +35,7 @@ from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.hi.taggers.serial import SerialFst from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.text_normalization.hi.taggers.whitelist import WhiteListFst @@ -111,12 +112,18 @@ def __init__( punctuation = PunctuationFst(deterministic=deterministic) punct_graph = punctuation.fst + word = WordFst(punctuation=punctuation, deterministic=deterministic) + word_graph = word.fst + telephone = TelephoneFst() telephone_graph = telephone.fst electronic = ElectronicFst(deterministic=deterministic) electronic_graph = electronic.fst + serial = SerialFst(cardinal=cardinal, deterministic=deterministic) + serial_graph = serial.fst + classify = ( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) @@ -129,10 +136,9 @@ def __init__( | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(ordinal_graph, 1.1) | pynutil.add_weight(electronic_graph, 1.1) + | pynutil.add_weight(serial_graph, 1.11) ) - word_graph = WordFst(punctuation=punctuation, deterministic=deterministic).fst - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") punct = pynini.closure( pynini.union( @@ -165,4 +171,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt new file mode 100644 index 000000000..4c3880fb9 --- /dev/null +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_serial.txt @@ -0,0 +1,21 @@ +कोविड-19~कोविड-उन्नीस +कोविड-१९~कोविड-उन्नीस +5जी~पाँच जी +५जी~पाँच जी +2^2~दो स्क्वेर्ड +२^२~दो स्क्वेर्ड +1-800-555~एक-आठ सौ-पाँच सौ पचपन +3जी~तीन जी +4जी~चार जी +कोरोना-2~कोरोना-दो +अग्नि-5~अग्नि-पाँच +ओमिक्रॉन-2~ओमिक्रॉन-दो +3^2~तीन स्क्वेर्ड +2^3~दो क्यूब +5^3~पाँच क्यूब +४^५~चार टु द पावर पाँच +99-1~निन्यानबे-एक +10-20-30~दस-बीस-तीस +1-800-999~एक-आठ सौ-नौ सौ निन्यानबे +पृथ्वी-4~पृथ्वी-चार +ब्रह्मोस-1~ब्रह्मोस-एक \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_serial.py b/tests/nemo_text_processing/hi/test_serial.py new file mode 100644 index 000000000..67b763069 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_serial.py @@ -0,0 +1,33 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestSerial: + normalizer = Normalizer( + input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_serial.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) + assert pred == expected \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index e8057a126..974dac331 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -52,10 +52,10 @@ testTNDecimal() { # runtest $input #} -#testTNSerial() { -# input=$PROJECT_DIR/hi/data_text_normalization/test_cases_serial.txt -# runtest $input -#} +testTNSerial() { + input=$PROJECT_DIR/hi/data_text_normalization/test_cases_serial.txt + runtest $input +} #testTNRoman() { # input=$PROJECT_DIR/en/data_text_normalization/test_cases_roman.txt From b60d1b8b445e52d25a6e98b728d2dbe92d8e7097 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:36:17 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/serial.py | 56 +++++-------------- .../hi/taggers/tokenize_and_classify.py | 2 +- tests/nemo_text_processing/hi/test_serial.py | 2 +- 3 files changed, 15 insertions(+), 45 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/serial.py b/nemo_text_processing/text_normalization/hi/taggers/serial.py index 1a6cb794d..0264c4db2 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/serial.py +++ b/nemo_text_processing/text_normalization/hi/taggers/serial.py @@ -61,12 +61,8 @@ def __init__( any_digit = pynini.union(NEMO_DIGIT, devanagari_digits).optimize() not_quote = pynini.closure(pynini.difference(NEMO_SIGMA, pynini.accep('"')), 1) - strip_cardinal_tags = ( - pynutil.delete('cardinal { integer: "') - + not_quote - + pynutil.delete('" }') - ) - + strip_cardinal_tags = pynutil.delete('cardinal { integer: "') + not_quote + pynutil.delete('" }') + pure_cardinal_words = pynini.compose(cardinal.fst, strip_cardinal_tags).optimize() length_filter = pynini.closure(any_digit, 1, 3) @@ -74,9 +70,7 @@ def __init__( num_graph = limited_cardinal - symbols_graph = pynini.string_file( - get_abs_path("data/serial/special_symbols.tsv") - ).optimize() + symbols_graph = pynini.string_file(get_abs_path("data/serial/special_symbols.tsv")).optimize() devanagari_chars = pynini.project( pynini.string_file(get_abs_path("data/serial/chars.tsv")), @@ -101,55 +95,35 @@ def __init__( all_alphas = pynini.union(NEMO_ALPHA, devanagari_chars).optimize() - insert_space_alpha_digit = pynini.cdrewrite( - pynutil.insert(" "), all_alphas, any_digit, NEMO_SIGMA - ) - insert_space_digit_alpha = pynini.cdrewrite( - pynutil.insert(" "), any_digit, all_alphas, NEMO_SIGMA - ) + insert_space_alpha_digit = pynini.cdrewrite(pynutil.insert(" "), all_alphas, any_digit, NEMO_SIGMA) + insert_space_digit_alpha = pynini.cdrewrite(pynutil.insert(" "), any_digit, all_alphas, NEMO_SIGMA) space_inserter = pynini.compose(insert_space_alpha_digit, insert_space_digit_alpha).optimize() glued_serial = pynini.compose(space_inserter, serial_core).optimize() serial_graph = pynini.union(serial_graph, glued_serial).optimize() power_special = pynutil.add_weight( - pynini.string_file(get_abs_path("data/serial/power_special.tsv")), - -1.0 + pynini.string_file(get_abs_path("data/serial/power_special.tsv")), -1.0 ).optimize() power_generic = pynutil.add_weight( - ( - pynutil.delete("^") - + pynutil.insert(" टु द पावर ") - + num_graph - ), - 1.0 + (pynutil.delete("^") + pynutil.insert(" टु द पावर ") + num_graph), 1.0 ).optimize() power_suffix = pynini.union(power_special, power_generic).optimize() power_graph = num_graph + power_suffix serial_graph = pynini.union(serial_graph, power_graph).optimize() - serial_graph = pynini.compose( - pynini.closure(NEMO_NOT_SPACE, 2), serial_graph - ).optimize() + serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize() - pure_word_slash = ( - pynini.closure(NEMO_ALPHA, 1) - + pynini.accep("/") - + pynini.closure(NEMO_ALPHA, 1) - ) + pure_word_slash = pynini.closure(NEMO_ALPHA, 1) + pynini.accep("/") + pynini.closure(NEMO_ALPHA, 1) dimension_pattern = ( - pynini.closure(any_digit, 1) - + (pynini.accep("x") | pynini.accep("X")) - + pynini.closure(any_digit, 1) + pynini.closure(any_digit, 1) + (pynini.accep("x") | pynini.accep("X")) + pynini.closure(any_digit, 1) ) _opt_delim = pynini.closure(pynini.accep("-") | pynini.accep(" "), 0, 1) - latin_alphanum = ( - pynini.closure(NEMO_ALPHA, 1) + _opt_delim + pynini.closure(any_digit, 1) - ) | ( + latin_alphanum = (pynini.closure(NEMO_ALPHA, 1) + _opt_delim + pynini.closure(any_digit, 1)) | ( pynini.closure(any_digit, 1) + _opt_delim + pynini.closure(NEMO_ALPHA, 1) ) @@ -183,9 +157,5 @@ def __init__( serial_graph = pynini.compose(accepted_inputs, serial_graph).optimize() self.graph = serial_graph.optimize() - graph = ( - pynutil.insert('name: "') - + convert_space(self.graph).optimize() - + pynutil.insert('"') - ) - self.fst = graph.optimize() \ No newline at end of file + graph = pynutil.insert('name: "') + convert_space(self.graph).optimize() + pynutil.insert('"') + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 94292b6ac..75663ca24 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -171,4 +171,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/tests/nemo_text_processing/hi/test_serial.py b/tests/nemo_text_processing/hi/test_serial.py index 67b763069..43da54b17 100644 --- a/tests/nemo_text_processing/hi/test_serial.py +++ b/tests/nemo_text_processing/hi/test_serial.py @@ -30,4 +30,4 @@ class TestSerial: @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) - assert pred == expected \ No newline at end of file + assert pred == expected