From 42725cfbeabb775d3e33a776b6296ec928a00c07 Mon Sep 17 00:00:00 2001 From: joshop Date: Fri, 14 Jul 2023 16:28:34 -0400 Subject: [PATCH 1/5] Added initial fake dataset generator and test --- pipit/util/fake.py | 83 +++++++++++++++++++++++++++ pipit/util/faketest.py | 124 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 pipit/util/fake.py create mode 100644 pipit/util/faketest.py diff --git a/pipit/util/fake.py b/pipit/util/fake.py new file mode 100644 index 00000000..7fb609b7 --- /dev/null +++ b/pipit/util/fake.py @@ -0,0 +1,83 @@ +from pipit import Trace +import numpy as np +from faketest import gen_fake_tree, emit_tree_file +import pandas as pd + + +def test_with_fake_data(): + """ + Generate a fake test file and ground truth file, read the test file + with Pipit, and check it against the ground truth. Tests inclusive and + exclusive metrics, and uses time_profile_test_generic. + """ + num_processes = 8 + # generate one fake tree per process, 2000 functions in the tree + trees = [gen_fake_tree(2000) for n in range(num_processes)] + test_file = open("fake.csv", "w") + ground_truth = open("fake_ground.csv", "w") + emit_tree_file(trees, test_file, ground_truth) + test_file.close() + ground_truth.close() + trace = Trace.from_csv("fake.csv") + # gt_dataframe should hold identical values to the columns of trace.events + gt_dataframe = pd.read_csv("fake_ground.csv") + trace.calc_exc_metrics() + pipit_dataframe = trace.events[["time.inc", "time.exc"]] + # adjust for nanoseconds + gt_dataframe["time.inc"] *= 1e9 + gt_dataframe["time.exc"] *= 1e9 + # NaN values for time won't compare equal, so check ourselves + assert ( + np.isclose(pipit_dataframe["time.inc"], gt_dataframe["time.inc"]) + | (np.isnan(gt_dataframe["time.inc"]) & np.isnan(pipit_dataframe["time.inc"])) + ).all() + # likewise, check exclusive metrics + assert ( + np.isclose(pipit_dataframe["time.exc"], gt_dataframe["time.exc"]) + | (np.isnan(gt_dataframe["time.exc"]) & np.isnan(pipit_dataframe["time.exc"])) + ).all() + time_profile_test_generic(trace, num_processes) + + +def time_profile_test_generic(trace, num_processes): + """ + Tests universal properties of time_profile, regardless of the trace. + Most asserts were taken from pipit/tests/trace.py, except those specific + to the ping-pong trace. 
+ """ + trace.calc_exc_metrics(["Timestamp (ns)"]) + + time_profile = trace.time_profile(num_bins=62) + + # check length + assert len(time_profile) == 62 + + # check bin sizes + exp_duration = ( + trace.events["Timestamp (ns)"].max() - trace.events["Timestamp (ns)"].min() + ) + exp_bin_size = exp_duration / 62 + bin_sizes = time_profile["bin_end"] - time_profile["bin_start"] + + assert np.isclose(bin_sizes, exp_bin_size).all() + + # check that sum of function contributions per bin equals bin duration + exp_bin_total_duration = exp_bin_size * num_processes + time_profile.drop(columns=["bin_start", "bin_end"], inplace=True) + + assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all() + + # check for each function that sum of exc time per bin equals total exc time + total_exc_times = trace.events.groupby("Name")["time.exc"].sum() + + for column in time_profile: + if column == "idle_time": + continue + + assert np.isclose(time_profile[column].sum(), total_exc_times[column]) + + # check normalization + norm = trace.time_profile(num_bins=62, normalized=True) + norm.drop(columns=["bin_start", "bin_end"], inplace=True) + + assert (time_profile / exp_bin_total_duration).equals(norm) diff --git a/pipit/util/faketest.py b/pipit/util/faketest.py new file mode 100644 index 00000000..bf43670f --- /dev/null +++ b/pipit/util/faketest.py @@ -0,0 +1,124 @@ +import random +import textwrap +import pandas as pd + + +class FakeNode: + """ + A single node of the calling tree used to produce fake traces. + Represents a single function call. + """ + + def __init__(self, name, exc_time): + self.name = name + self.exc_time = exc_time + self.children = {} # run_time -> child node + + def add_child(self, child, run_time): + """ + This function adds a child that executes after run_time + exclusive time within the function represented by the current node. + """ + self.children[run_time] = child + + def calc_inc_time(self): + """ + Similar to the calc_*_metrics functions in trace.py, + computes inclusive execution time for this tree. + """ + self.inc_time = self.exc_time + for run_time, child in self.children.items(): + child.calc_inc_time() + self.inc_time += child.inc_time + + def __str__(self) -> str: + return "{} ({})\n".format(self.name, self.exc_time) + "\n".join( + [ + textwrap.indent(str(run_time) + ": " + str(child), "\t") + for run_time, child in sorted(self.children.items()) + ] + ) + + def to_events(self, begin_time, process, data): + """ + Returns event data for this tree, with time starting at begin_time. + Inclusive time must have already been computed. + data is an array that is built up and then converted to a DataFrame + once the entire tree has been processed. + """ + data.append( + [ + begin_time, + "Enter", + self.name + "()", + process, + self.inc_time, + self.exc_time, + ] + ) + # total_time accumulates durations of already processed children + total_time = begin_time + for run_time, child in sorted(self.children.items()): + # children will add their own lines to data + child.to_events(total_time + run_time, process, data) + total_time += child.inc_time + + # time.inc and time.exc are both NaN for Leave events + data.append( + [ + begin_time + self.inc_time, + "Leave", + self.name + "()", + process, + float("nan"), + float("nan"), + ] + ) + + +def gen_fake_node(): + """ + Generates a node with a random numeric name and execution time. 
+ """ + return FakeNode("func_" + str(random.randint(0, 1000000)), random.random() * 10) + + +def gen_fake_tree(num_nodes): + """ + Generates a whole tree of FakeNodes by randomly appending children. + """ + nodes = [gen_fake_node() for n in range(num_nodes)] + root = nodes[0] + for index, node in enumerate(nodes[1:]): + # choose a node that's currently in the graph to add child to + parent = random.choice(nodes[: index + 1]) + # select a random point for that child to run + run_time = random.random() * parent.exc_time + parent.add_child(node, run_time) + return root + + +def emit_tree_file(trees, test_file, ground_truth_file): + """ + Writes trees (one per process) as a CSV to the File object test_file. + At the same time, write ground truth function call information + to the File object ground_truth_file. + ground_truth_file will contain columns corresponding to Pipit's + time.inc, time.exc. + """ + data = [] + for process, tree in enumerate(trees): + tree.calc_inc_time() + # add small random fudge factor, so that we don't have many times of exactly 0 + # which would lead to undefined sorting order and rows not matching + tree.to_events(random.random(), process, data) + + dataframe = pd.DataFrame( + data, + None, + ["Timestamp (s)", "Event Type", "Name", "Process", "time.inc", "time.exc"], + ).sort_values("Timestamp (s)") + dataframe[["Timestamp (s)", "Event Type", "Name", "Process"]].to_csv( + test_file, index=False + ) + dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) From 17afcd6bf6a3adbc0b35b9d3dd136d136ec5ba00 Mon Sep 17 00:00:00 2001 From: joshop Date: Wed, 19 Jul 2023 14:35:33 -0400 Subject: [PATCH 2/5] MPI events, and some general realism changes --- pipit/trace.py | 8 ++ pipit/util/fake.py | 8 +- pipit/util/faketest.py | 265 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 260 insertions(+), 21 deletions(-) diff --git a/pipit/trace.py b/pipit/trace.py index 6f3838b6..f400fdc0 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import ast class Trace: @@ -72,6 +73,13 @@ def from_csv(filename): # ensure that ranks are ints events_dataframe = events_dataframe.astype({"Process": "int32"}) + # ensure that the attributes are a dict, not a string + if "Attributes" in events_dataframe.columns: + # use literal_eval so we're not running a security risk + events_dataframe["Attributes"] = events_dataframe["Attributes"].apply( + ast.literal_eval + ) + # make certain columns categorical events_dataframe = events_dataframe.astype( { diff --git a/pipit/util/fake.py b/pipit/util/fake.py index 7fb609b7..40b02ff3 100644 --- a/pipit/util/fake.py +++ b/pipit/util/fake.py @@ -1,9 +1,12 @@ from pipit import Trace import numpy as np -from faketest import gen_fake_tree, emit_tree_file +from faketest import gen_fake_tree, emit_tree_file, gen_forest import pandas as pd +function_names = ["foo", "bar", "baz", "quux", "grault", "garply", "waldo"] + + def test_with_fake_data(): """ Generate a fake test file and ground truth file, read the test file @@ -12,7 +15,8 @@ def test_with_fake_data(): """ num_processes = 8 # generate one fake tree per process, 2000 functions in the tree - trees = [gen_fake_tree(2000) for n in range(num_processes)] + seed_tree = gen_fake_tree(200, function_names) + trees = gen_forest(seed_tree, num_processes) test_file = open("fake.csv", "w") ground_truth = open("fake_ground.csv", "w") emit_tree_file(trees, test_file, ground_truth) diff --git a/pipit/util/faketest.py b/pipit/util/faketest.py 
index bf43670f..bfdb8811 100644 --- a/pipit/util/faketest.py +++ b/pipit/util/faketest.py @@ -1,6 +1,7 @@ import random import textwrap import pandas as pd +import numpy as np class FakeNode: @@ -9,27 +10,75 @@ class FakeNode: Represents a single function call. """ - def __init__(self, name, exc_time): + def __init__( + self, name, exc_time, mpi_type="", mpi_tgt=0, mpi_volume=0, mpi_time=0 + ): self.name = name self.exc_time = exc_time + self.inc_time = exc_time self.children = {} # run_time -> child node + self.is_mpi = mpi_type != "" + self.mpi_type = mpi_type + self.mpi_tgt = mpi_tgt + self.mpi_volume = mpi_volume + self.mpi_time = mpi_time + self.total_nodes = 1 + self.parent = None + + def grow_inc_time(self, time): + """ + This function adjusts inclusive time metric when a new child is added, + adding the time to each parent's inc_time, up to the root. + """ + self.inc_time += time + if self.parent is not None: + self.parent.grow_inc_time(time) + + def grow_total_nodes(self, amt): + """ + This function adjusts the total count of nodes when a new node is added, + adding the new count to each parent's total_nodes, up to the root. + """ + self.total_nodes += amt + if self.parent is not None: + self.parent.grow_total_nodes(amt) def add_child(self, child, run_time): """ This function adds a child that executes after run_time exclusive time within the function represented by the current node. """ + assert run_time not in self.children self.children[run_time] = child + child.parent = self + self.grow_total_nodes(child.total_nodes) + self.grow_inc_time(child.inc_time) - def calc_inc_time(self): + def choose_random_node(self): """ - Similar to the calc_*_metrics functions in trace.py, - computes inclusive execution time for this tree. + This function selects a random node, with all descendants + of the current node being weighted equally, and returns it. """ - self.inc_time = self.exc_time - for run_time, child in self.children.items(): - child.calc_inc_time() - self.inc_time += child.inc_time + if not self.children: + return self + rng = random.random() + total = 0 + for child in self.children.values(): + weight = child.total_nodes / self.total_nodes + if rng < weight: + return child.choose_random_node() + else: + rng -= weight + return self + + def pick_by_name(self, name): + """ + Returns all nodes in this tree that have the given name. + """ + valid = [self] if self.name == name else [] + for child in self.children.values(): + valid += child.pick_by_name(name) + return valid def __str__(self) -> str: return "{} ({})\n".format(self.name, self.exc_time) + "\n".join( @@ -39,10 +88,23 @@ def __str__(self) -> str: ] ) + def mpi_attributes(self): + """ + Returns the Attributes dictionary for this node, including + (if it is an MPI event) receiver/sender and msg_length. + """ + if not self.is_mpi: + return {} + attr = {"msg_length": self.mpi_volume} + if self.mpi_type == "MpiSend": + attr["receiver"] = self.mpi_tgt + else: + attr["sender"] = self.mpi_tgt + return attr + def to_events(self, begin_time, process, data): """ Returns event data for this tree, with time starting at begin_time. - Inclusive time must have already been computed. data is an array that is built up and then converted to a DataFrame once the entire tree has been processed. 
""" @@ -52,10 +114,23 @@ def to_events(self, begin_time, process, data): "Enter", self.name + "()", process, + {}, self.inc_time, self.exc_time, ] ) + if self.is_mpi: + data.append( + [ + begin_time + self.mpi_time, + "Instant", + self.mpi_type, + process, + self.mpi_attributes(), + float("nan"), + float("nan"), + ] + ) # total_time accumulates durations of already processed children total_time = begin_time for run_time, child in sorted(self.children.items()): @@ -70,34 +145,179 @@ def to_events(self, begin_time, process, data): "Leave", self.name + "()", process, + {}, float("nan"), float("nan"), ] ) + def tweak_tree(self): + """ + Adds small exclusive time perturbations to + function length to generate a "similar" tree, for other processes + to use, that is not identical to the original. + """ + exc_time = self.exc_time + factor = (0.7) + random.random() * 0.6 + exc_time_new = exc_time * factor + # preserve inclusive time relations + self.grow_inc_time(exc_time_new - exc_time) + self.exc_time = exc_time_new + for run_time, child in self.children.items(): + child.tweak_tree() + # also scale back child run times + self.children = { + run_time * factor: child for run_time, child in self.children.items() + } + + def deepcopy(self): + """ + Returns a deep copy of the tree. + """ + mycopy = FakeNode( + self.name, + self.exc_time, + self.mpi_type, + self.mpi_tgt, + self.mpi_volume, + self.mpi_time, + ) + mycopy.inc_time = self.inc_time + mycopy.total_nodes = self.total_nodes + for run_time, child in self.children.items(): + mycopy.children[run_time] = child.deepcopy() + mycopy.children[run_time].parent = mycopy + return mycopy + + def node_at_time(self, target_time): + """ + Returns the node that contains the given target_time, + as well as the offset into that node that the time occurs. + """ + total_time = 0 + # TODO: may be more efficient to store self.children sorted already + for run_time, child in sorted(self.children.items()): + if target_time < total_time + run_time: + # this time occurs in the current node! + return self, target_time - total_time + elif target_time < total_time + run_time + child.inc_time: + # this time occurs in the given child node + return child.node_at_time(target_time - total_time - run_time) + else: + # this time occurs after this child + total_time += child.inc_time + # if no children find it, it must be the current node + return self, target_time - total_time + + def insert_at_time(self, child, target_time): + """ + Adds the child to the proper node such that it executes + at target_time. + """ + node, offset = self.node_at_time(target_time) + node.add_child(child, offset) -def gen_fake_node(): + +def gen_fake_node(function_names): """ Generates a node with a random numeric name and execution time. """ - return FakeNode("func_" + str(random.randint(0, 1000000)), random.random() * 10) + return FakeNode(random.choice(function_names), random.random() * 10) -def gen_fake_tree(num_nodes): +def gen_fake_tree(num_nodes, function_names, copy_subtrees=True): """ Generates a whole tree of FakeNodes by randomly appending children. 
""" - nodes = [gen_fake_node() for n in range(num_nodes)] + nodes = [gen_fake_node(function_names) for n in range(num_nodes)] root = nodes[0] for index, node in enumerate(nodes[1:]): # choose a node that's currently in the graph to add child to - parent = random.choice(nodes[: index + 1]) + parent = root.choose_random_node() # select a random point for that child to run run_time = random.random() * parent.exc_time - parent.add_child(node, run_time) + # find nodes with the same name to copy off of + same_name = root.pick_by_name(node.name) + if not same_name or not copy_subtrees: + parent.add_child(node, run_time) + else: + subtree = random.choice(same_name) + # larger subtrees are less likely to be copied + if random.random() > 0.7 / (subtree.total_nodes**0.5): + parent.add_child(node, run_time) + else: + subtree = subtree.deepcopy() + subtree.tweak_tree() + parent.add_child(subtree, run_time) + return root +def gen_forest(seed_tree, num_trees): + """ + Generates num_trees new trees by tweaking seed_tree. + """ + forest = [seed_tree.deepcopy() for n in range(num_trees)] + for tree in forest: + tree.tweak_tree() + return forest + + +def add_fake_mpi_events(trees, num_pairs): + """ + Adds fake MPIevents to a set of trees (one per process). In total, + num_pairs pairs of Send/Recv events are generated and inserted. + Each event is a function with is_mpi=True. + """ + planned_evts = [] + # choose times for events to happen + last_proc = -1 + maxtime = min([t.inc_time for t in trees]) + for i in range(2 * num_pairs): + planned_evts.append(random.random() * maxtime) + # sort from last to first events in timeline + # iterate from first to last to avoid + # dependencies among the events' times + planned_evts.sort(reverse=True) + while planned_evts: + # pair two first events + first_evt = planned_evts.pop() + second_evt = planned_evts.pop() + # time that the first one has to idle + idle_time = second_evt - first_evt + # pick two different processes + first_proc, second_proc = random.sample(range(len(trees)), 2) + first_tree = trees[first_proc] + second_tree = trees[second_proc] + # either first process sends (1) or receives (0) + send_first = random.randint(0, 1) + volume = random.randint(1, 1000000) + # give both a small, random, extra time ("latency", etc) + if send_first: + # mpi sends don't need to block + idle_time = 0 + first_dur = random.random() + idle_time + second_dur = random.random() + first_node = FakeNode( + "MPI_Send" if send_first else "MPI_Recv", + first_dur, + "MpiSend" if send_first else "MpiRecv", + second_proc, + volume, + random.random() * first_dur, + ) + second_node = FakeNode( + "MPI_Recv" if send_first else "MPI_Send", + second_dur, + "MpiRecv" if send_first else "MpiSend", + first_proc, + volume, + random.random() * second_dur, + ) + first_tree.insert_at_time(first_node, first_evt) + second_tree.insert_at_time(second_node, second_evt) + + def emit_tree_file(trees, test_file, ground_truth_file): """ Writes trees (one per process) as a CSV to the File object test_file. 
@@ -108,17 +328,24 @@ def emit_tree_file(trees, test_file, ground_truth_file): """ data = [] for process, tree in enumerate(trees): - tree.calc_inc_time() # add small random fudge factor, so that we don't have many times of exactly 0 # which would lead to undefined sorting order and rows not matching - tree.to_events(random.random(), process, data) + tree.to_events(random.random() * 0.01, process, data) dataframe = pd.DataFrame( data, None, - ["Timestamp (s)", "Event Type", "Name", "Process", "time.inc", "time.exc"], + [ + "Timestamp (s)", + "Event Type", + "Name", + "Process", + "Attributes", + "time.inc", + "time.exc", + ], ).sort_values("Timestamp (s)") - dataframe[["Timestamp (s)", "Event Type", "Name", "Process"]].to_csv( + dataframe[["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"]].to_csv( test_file, index=False ) dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) From f1cc393b5713ec6dc9dddb9fc13042e0d0354fde Mon Sep 17 00:00:00 2001 From: joshop Date: Thu, 27 Jul 2023 10:57:08 -0400 Subject: [PATCH 3/5] Cleaned up code, removed pytest test and added top-level function --- pipit/trace.py | 1 + pipit/util/fake.py | 87 ------------------- pipit/util/{faketest.py => test_generator.py} | 45 +++++++--- 3 files changed, 33 insertions(+), 100 deletions(-) delete mode 100644 pipit/util/fake.py rename pipit/util/{faketest.py => test_generator.py} (89%) diff --git a/pipit/trace.py b/pipit/trace.py index f400fdc0..4e320f12 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -73,6 +73,7 @@ def from_csv(filename): # ensure that ranks are ints events_dataframe = events_dataframe.astype({"Process": "int32"}) + # this next part is needed for fake test reading # ensure that the attributes are a dict, not a string if "Attributes" in events_dataframe.columns: # use literal_eval so we're not running a security risk diff --git a/pipit/util/fake.py b/pipit/util/fake.py deleted file mode 100644 index 40b02ff3..00000000 --- a/pipit/util/fake.py +++ /dev/null @@ -1,87 +0,0 @@ -from pipit import Trace -import numpy as np -from faketest import gen_fake_tree, emit_tree_file, gen_forest -import pandas as pd - - -function_names = ["foo", "bar", "baz", "quux", "grault", "garply", "waldo"] - - -def test_with_fake_data(): - """ - Generate a fake test file and ground truth file, read the test file - with Pipit, and check it against the ground truth. Tests inclusive and - exclusive metrics, and uses time_profile_test_generic. 
- """ - num_processes = 8 - # generate one fake tree per process, 2000 functions in the tree - seed_tree = gen_fake_tree(200, function_names) - trees = gen_forest(seed_tree, num_processes) - test_file = open("fake.csv", "w") - ground_truth = open("fake_ground.csv", "w") - emit_tree_file(trees, test_file, ground_truth) - test_file.close() - ground_truth.close() - trace = Trace.from_csv("fake.csv") - # gt_dataframe should hold identical values to the columns of trace.events - gt_dataframe = pd.read_csv("fake_ground.csv") - trace.calc_exc_metrics() - pipit_dataframe = trace.events[["time.inc", "time.exc"]] - # adjust for nanoseconds - gt_dataframe["time.inc"] *= 1e9 - gt_dataframe["time.exc"] *= 1e9 - # NaN values for time won't compare equal, so check ourselves - assert ( - np.isclose(pipit_dataframe["time.inc"], gt_dataframe["time.inc"]) - | (np.isnan(gt_dataframe["time.inc"]) & np.isnan(pipit_dataframe["time.inc"])) - ).all() - # likewise, check exclusive metrics - assert ( - np.isclose(pipit_dataframe["time.exc"], gt_dataframe["time.exc"]) - | (np.isnan(gt_dataframe["time.exc"]) & np.isnan(pipit_dataframe["time.exc"])) - ).all() - time_profile_test_generic(trace, num_processes) - - -def time_profile_test_generic(trace, num_processes): - """ - Tests universal properties of time_profile, regardless of the trace. - Most asserts were taken from pipit/tests/trace.py, except those specific - to the ping-pong trace. - """ - trace.calc_exc_metrics(["Timestamp (ns)"]) - - time_profile = trace.time_profile(num_bins=62) - - # check length - assert len(time_profile) == 62 - - # check bin sizes - exp_duration = ( - trace.events["Timestamp (ns)"].max() - trace.events["Timestamp (ns)"].min() - ) - exp_bin_size = exp_duration / 62 - bin_sizes = time_profile["bin_end"] - time_profile["bin_start"] - - assert np.isclose(bin_sizes, exp_bin_size).all() - - # check that sum of function contributions per bin equals bin duration - exp_bin_total_duration = exp_bin_size * num_processes - time_profile.drop(columns=["bin_start", "bin_end"], inplace=True) - - assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all() - - # check for each function that sum of exc time per bin equals total exc time - total_exc_times = trace.events.groupby("Name")["time.exc"].sum() - - for column in time_profile: - if column == "idle_time": - continue - - assert np.isclose(time_profile[column].sum(), total_exc_times[column]) - - # check normalization - norm = trace.time_profile(num_bins=62, normalized=True) - norm.drop(columns=["bin_start", "bin_end"], inplace=True) - - assert (time_profile / exp_bin_total_duration).equals(norm) diff --git a/pipit/util/faketest.py b/pipit/util/test_generator.py similarity index 89% rename from pipit/util/faketest.py rename to pipit/util/test_generator.py index bfdb8811..be12c455 100644 --- a/pipit/util/faketest.py +++ b/pipit/util/test_generator.py @@ -229,9 +229,10 @@ def gen_fake_tree(num_nodes, function_names, copy_subtrees=True): """ Generates a whole tree of FakeNodes by randomly appending children. 
""" - nodes = [gen_fake_node(function_names) for n in range(num_nodes)] - root = nodes[0] - for index, node in enumerate(nodes[1:]): + root = gen_fake_node(function_names) + # continue to add nodes until we've reached the target + while root.total_nodes < num_nodes: + node = gen_fake_node(function_names) # choose a node that's currently in the graph to add child to parent = root.choose_random_node() # select a random point for that child to run @@ -243,7 +244,7 @@ def gen_fake_tree(num_nodes, function_names, copy_subtrees=True): else: subtree = random.choice(same_name) # larger subtrees are less likely to be copied - if random.random() > 0.7 / (subtree.total_nodes**0.5): + if random.random() > 4 / (subtree.total_nodes**0.5): parent.add_child(node, run_time) else: subtree = subtree.deepcopy() @@ -318,12 +319,11 @@ def add_fake_mpi_events(trees, num_pairs): second_tree.insert_at_time(second_node, second_evt) -def emit_tree_file(trees, test_file, ground_truth_file): +def emit_tree_data(trees): """ - Writes trees (one per process) as a CSV to the File object test_file. - At the same time, write ground truth function call information - to the File object ground_truth_file. - ground_truth_file will contain columns corresponding to Pipit's + Writes trees (one per process) as a CSV and returns them. + At the same time, return ground truth function call information. + The ground truth data will contain columns corresponding to Pipit's time.inc, time.exc. """ data = [] @@ -345,7 +345,26 @@ def emit_tree_file(trees, test_file, ground_truth_file): "time.exc", ], ).sort_values("Timestamp (s)") - dataframe[["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"]].to_csv( - test_file, index=False - ) - dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) + data_csv = dataframe[ + ["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"] + ].to_csv(index=False) + ground_csv = dataframe[["time.inc", "time.exc"]].to_csv(index=False) + return data_csv, ground_csv + + +def generate_fake_test( + num_events, + num_processes, + function_names=["foo", "bar", "baz", "quux", "grault", "garply", "waldo"], + num_mpi_events=0, +): + """ + Top level test generation function. Generates test and ground truth datasets with a + minimum of num_events Enter/Leave events per process, of which there are + num_processes. Optionally, MPI events can be added. 
+ """ + seed_tree = gen_fake_tree(num_events // 2, function_names) + print(num_events // 2, seed_tree.total_nodes) + forest = gen_forest(seed_tree, num_processes) + add_fake_mpi_events(forest, num_mpi_events) + return emit_tree_data(forest) From 62ca889cb698e3b34fb00d2caee12c5fbdb43fda Mon Sep 17 00:00:00 2001 From: joshop Date: Wed, 9 Aug 2023 10:52:40 -0400 Subject: [PATCH 4/5] Clean up, CSV reader can be passed CSVs as strings --- pipit/trace.py | 9 +++++++-- pipit/util/test_generator.py | 7 +++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pipit/trace.py b/pipit/trace.py index 4e320f12..f0f483fb 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -5,7 +5,8 @@ import numpy as np import pandas as pd -import ast +from ast import literal_eval +from io import StringIO class Trace: @@ -61,6 +62,10 @@ def from_nsight(filename): @staticmethod def from_csv(filename): + # detect if the input is a CSV as a string + if "," in filename: + # wrapping with StringIO allows pandas to read it + filename = StringIO(filename) events_dataframe = pd.read_csv(filename, skipinitialspace=True) # if timestamps are in seconds, convert them to nanoseconds @@ -78,7 +83,7 @@ def from_csv(filename): if "Attributes" in events_dataframe.columns: # use literal_eval so we're not running a security risk events_dataframe["Attributes"] = events_dataframe["Attributes"].apply( - ast.literal_eval + literal_eval ) # make certain columns categorical diff --git a/pipit/util/test_generator.py b/pipit/util/test_generator.py index be12c455..c6343256 100644 --- a/pipit/util/test_generator.py +++ b/pipit/util/test_generator.py @@ -352,11 +352,11 @@ def emit_tree_data(trees): return data_csv, ground_csv -def generate_fake_test( +def generate_trace( num_events, num_processes, function_names=["foo", "bar", "baz", "quux", "grault", "garply", "waldo"], - num_mpi_events=0, + num_mpi_pairs=0, ): """ Top level test generation function. Generates test and ground truth datasets with a @@ -364,7 +364,6 @@ def generate_fake_test( num_processes. Optionally, MPI events can be added. """ seed_tree = gen_fake_tree(num_events // 2, function_names) - print(num_events // 2, seed_tree.total_nodes) forest = gen_forest(seed_tree, num_processes) - add_fake_mpi_events(forest, num_mpi_events) + add_fake_mpi_events(forest, num_mpi_pairs) return emit_tree_data(forest) From 4fada8387b7e245a87b452fb90f5f93b97091828 Mon Sep 17 00:00:00 2001 From: joshop Date: Tue, 29 Aug 2023 09:37:29 -0400 Subject: [PATCH 5/5] Slight cleanup --- pipit/util/test_generator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pipit/util/test_generator.py b/pipit/util/test_generator.py index c6343256..9c395218 100644 --- a/pipit/util/test_generator.py +++ b/pipit/util/test_generator.py @@ -1,7 +1,6 @@ import random import textwrap import pandas as pd -import numpy as np class FakeNode: @@ -62,7 +61,6 @@ def choose_random_node(self): if not self.children: return self rng = random.random() - total = 0 for child in self.children.values(): weight = child.total_nodes / self.total_nodes if rng < weight: @@ -272,7 +270,6 @@ def add_fake_mpi_events(trees, num_pairs): """ planned_evts = [] # choose times for events to happen - last_proc = -1 maxtime = min([t.inc_time for t in trees]) for i in range(2 * num_pairs): planned_evts.append(random.random() * maxtime)