Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pipeline {
environment {
AR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-24-24-0'
DE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-23-24-0'
EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-3'
EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-11-26-1'
ES_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-25-24-0'
ES_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-30-24-0'
HI_EN_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-04-26-4'
Expand Down
157 changes: 105 additions & 52 deletions nemo_text_processing/text_normalization/en/taggers/serial.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import (
MIN_NEG_WEIGHT,
MIN_POS_WEIGHT,
NEMO_ALPHA,
NEMO_DIGIT,
NEMO_NOT_SPACE,
Expand All @@ -28,16 +30,65 @@
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels


def _leading_zero_graph(cardinal: GraphFst) -> "pynini.FstLike":
    """
    Builds the graph for digit strings that start with "0".

    The input side accepts a literal "0" followed by any number of ``NEMO_DIGIT`` symbols;
    the output is produced by ``cardinal.single_digits_graph`` (presumably a digit-by-digit
    reading -- confirm against the cardinal tagger).

    Args:
        cardinal: cardinal tagger that provides ``single_digits_graph``

    Returns:
        the optimized composition of the zero-prefixed acceptor with ``cardinal.single_digits_graph``
    """
    zero_prefixed_digits = pynini.accep("0") + pynini.closure(NEMO_DIGIT)
    composed = pynini.compose(zero_prefixed_digits, cardinal.single_digits_graph)
    return composed.optimize()


def _build_serial_graph(
    num_graph: "pynini.FstLike",
    delimiter: "pynini.FstLike",
    alphas: "pynini.FstLike",
    ordinal: GraphFst,
) -> "pynini.FstLike":
    """
    Assembles the serial-number graph from number, delimiter and letter graphs.

    Args:
        num_graph: graph used for the numeric segments of a serial
        delimiter: graph placed between adjacent segments
        alphas: graph used for the letter segments
        ordinal: ordinal tagger; inputs accepted by ``ordinal.graph`` are excluded from the serial options

    Returns:
        the optimized serial graph: at least one serial, optionally surrounded by
        delimiter-separated serial/number/letter segments
    """
    # Space between a number and the letters that follow it.
    # NOTE(review): presumably an existing space wins over an inserted one -- confirm in plurals._priority_union.
    number_letter_space = plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()

    # Optional continuation after the first letter/number pair.
    continuation = pynini.closure(delimiter + (alphas | num_graph))
    continuation |= pynini.closure(delimiter + num_graph + number_letter_space + alphas)

    letters_then_number = alphas + delimiter + num_graph
    numbers_then_letters = pynini.closure(num_graph + delimiter, 1) + alphas
    # numbers only, with two or more delimiters
    numbers_only = num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph)

    # two numeric segments, restricted to inputs that contain a whitelisted symbol
    symbol_labels = [row[0] for row in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
    contains_symbol = NEMO_SIGMA + pynini.union(*symbol_labels) + NEMO_SIGMA
    symbol_pair = pynini.compose(contains_symbol, num_graph + delimiter + num_graph)

    core = letters_then_number + continuation
    core |= numbers_then_letters + continuation
    core |= numbers_only
    core |= symbol_pair

    # exclude ordinal numbers from serial options
    non_ordinal_input = pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input"))
    core = pynini.compose(non_ordinal_input, core).optimize()
    core = pynutil.add_weight(core, MIN_POS_WEIGHT)

    # "^2" / "^3" suffixes after any non-space run; added after the weighting above, so they carry no extra weight
    power_suffix = (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
    core |= pynini.closure(NEMO_NOT_SPACE, 1) + power_suffix

    # at least one serial, with optional additional serial/number/letter segments on either side
    any_segment = core | num_graph | alphas
    leading_segments = pynini.closure(any_segment + delimiter)
    trailing_segments = pynini.closure(delimiter + any_segment)
    return (leading_segments + core + trailing_segments).optimize()


class SerialFst(GraphFst):
"""
This class is a composite class of two other class instances
Finite state transducer for classifying serial numbers without conventional delimiters.

Digit normalization within letter-digit tokens follows:
1. 1-2 digits, or single digits followed by zeros -> cardinal
2. 3 digits not ending in 00, or 4+ digits -> single-digit reading
3. Digit-only tokens separated by ``/`` -> cardinal per segment (5+ digits stay single-digit)
Comment thread
folivoramanh marked this conversation as resolved.

Args:
time: composed tagger and verbalizer
date: composed tagger and verbalizer
cardinal: tagger
cardinal: cardinal tagger
ordinal: ordinal tagger (used to exclude ordinal readings)
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
for False multiple transduction are generated (used for audio-based normalization)
lm: whether to use for hybrid LM
"""

Expand All @@ -48,31 +99,56 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool =
Finite state transducer for classifying serial (handles only cases without delimiters,
values with delimiters are handled by default).
The serial is a combination of digits, letters and dashes, e.g.:
c325b -> tokens { cardinal { integer: "c three two five b" } }
"H800" -> tokens { name: "H eight hundred" }
"a320b" -> tokens { name: "a three two zero b" }
"12/345/67890" -> tokens { name: "twelve/three hundred forty five/six seven eight nine zero" }

"""
if deterministic:
num_graph = pynini.compose(NEMO_DIGIT ** (6, ...), cardinal.single_digits_graph).optimize()
num_graph |= pynini.compose(NEMO_DIGIT ** (1, 5), cardinal.graph).optimize()
# to handle numbers starting with zero
num_graph |= pynini.compose(
pynini.accep("0") + pynini.closure(NEMO_DIGIT), cardinal.single_digits_graph
num_graph_pure = (
pynini.compose(NEMO_DIGIT ** (1, 3), cardinal.graph)
| pynini.compose(NEMO_DIGIT ** (4, ...), cardinal.single_digits_graph)
| _leading_zero_graph(cardinal)
).optimize()

num_graph_alnum = (
pynini.compose(NEMO_DIGIT, cardinal.graph)
| pynini.compose(NEMO_DIGIT**2, cardinal.graph)
| pynutil.add_weight(
pynini.compose(NEMO_DIGIT + pynini.closure("0", 1), cardinal.graph), MIN_NEG_WEIGHT
)
| pynini.compose(
pynini.difference(NEMO_DIGIT**3, NEMO_DIGIT + NEMO_DIGIT + "00"), cardinal.single_digits_graph
)
| pynini.compose(NEMO_DIGIT ** (4, ...), cardinal.single_digits_graph)
| _leading_zero_graph(cardinal)
).optimize()

num_graph_slash = (
pynini.compose(NEMO_DIGIT ** (1, 4), cardinal.graph)
| pynini.compose(NEMO_DIGIT ** (5, ...), cardinal.single_digits_graph)
| _leading_zero_graph(cardinal)
).optimize()

else:
num_graph = cardinal.final_graph
num_graph_pure = cardinal.final_graph
num_graph_alnum = cardinal.final_graph
num_graph_slash = cardinal.final_graph

# TODO: "#" doesn't work from the file
symbols_graph = pynini.string_file(get_abs_path("data/whitelist/symbol.tsv")).optimize() | pynini.cross(
"#", "hash"
)
num_graph |= symbols_graph
num_graph_pure |= symbols_graph
num_graph_alnum |= symbols_graph

if not self.deterministic and not lm:
num_graph |= cardinal.single_digits_graph
num_graph |= pynini.compose(num_graph, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA)
# also allow double digits to be pronounced as integer in serial number
num_graph |= pynutil.add_weight(
NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=0.0001
num_graph_pure |= cardinal.single_digits_graph
num_graph_pure |= pynini.compose(num_graph_pure, NEMO_SIGMA + pynutil.delete("hundred ") + NEMO_SIGMA)
num_graph_pure |= pynutil.add_weight(
NEMO_DIGIT**2 @ cardinal.graph_hundred_component_at_least_one_none_zero_digit, weight=MIN_POS_WEIGHT
)
num_graph_alnum = num_graph_pure

# add space between letter and digit/symbol
symbols = [x[0] for x in load_labels(get_abs_path("data/whitelist/symbol.tsv"))]
Expand All @@ -90,44 +166,21 @@ def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool =
delimiter |= pynini.cross("-", " dash ") | pynini.cross("/", " slash ")

alphas = pynini.closure(NEMO_ALPHA, 1)
letter_num = alphas + delimiter + num_graph
num_letter = pynini.closure(num_graph + delimiter, 1) + alphas
next_alpha_or_num = pynini.closure(delimiter + (alphas | num_graph))
next_alpha_or_num |= pynini.closure(
delimiter
+ num_graph
+ plurals._priority_union(pynini.accep(" "), pynutil.insert(" "), NEMO_SIGMA).optimize()
+ alphas
)

serial_graph = letter_num + next_alpha_or_num
serial_graph |= num_letter + next_alpha_or_num
# numbers only with 2+ delimiters
serial_graph |= (
num_graph + delimiter + num_graph + delimiter + num_graph + pynini.closure(delimiter + num_graph)
)
# 2+ symbols
serial_graph |= pynini.compose(NEMO_SIGMA + symbols + NEMO_SIGMA, num_graph + delimiter + num_graph)

# exclude ordinal numbers from serial options
serial_graph = pynini.compose(
pynini.difference(NEMO_SIGMA, pynini.project(ordinal.graph, "input")), serial_graph
).optimize()

serial_graph = pynutil.add_weight(serial_graph, 0.0001)
serial_graph |= (
pynini.closure(NEMO_NOT_SPACE, 1)
+ (pynini.cross("^2", " squared") | pynini.cross("^3", " cubed")).optimize()
)
serial_graph = _build_serial_graph(num_graph_pure, delimiter, alphas, ordinal)
serial_graph_alnum = _build_serial_graph(num_graph_alnum, delimiter, alphas, ordinal)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If num_graph_alnum = num_graph_pure, why do we need two different serial graphs?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are only the same when we run with deterministic==False; otherwise the distinction is made in lines 108 and 114.


# at least one serial graph with alpha numeric value and optional additional serial/num/alpha values
serial_graph = (
pynini.closure((serial_graph | num_graph | alphas) + delimiter)
+ serial_graph
+ pynini.closure(delimiter + (serial_graph | num_graph | alphas))
# Rule 3: tokens that contain only digits and slashes (e.g. 31/31/100, 123/261788/2021).
slash_digit_token = (
pynini.closure(NEMO_DIGIT, 1) + pynini.accep("/") + pynini.closure(NEMO_DIGIT | pynini.accep("/"), 0)
)
slash_serial = pynini.compose(
slash_digit_token,
pynini.closure(num_graph_slash + pynini.accep("/"), 1) + num_graph_slash,
).optimize()
serial_graph |= pynutil.add_weight(slash_serial, MIN_NEG_WEIGHT)

serial_graph |= pynini.compose(graph_with_space, serial_graph.optimize()).optimize()
serial_graph |= pynini.compose(graph_with_space, serial_graph_alnum.optimize()).optimize()
serial_graph = pynini.compose(pynini.closure(NEMO_NOT_SPACE, 2), serial_graph).optimize()

# this is to avoid verbalizing "/" as "slash" in cases like "import/export"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@
21th~twenty one th
121st~one hundred twenty first
111th~one hundred eleventh
111st~one hundred eleven st
111st~one one one st
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@ a 4-kilogram bag~a four-kilogram bag
100-car~one hundred-car
123/261788/2021~one hundred twenty three/two six one seven eight eight/two thousand twenty one
2*8~two asterisk eight
my pnr is t2000~my pnr is t two thousand
your otp is ab9453~your otp is ab nine four five three
Loading