From 42725cfbeabb775d3e33a776b6296ec928a00c07 Mon Sep 17 00:00:00 2001 From: joshop Date: Fri, 14 Jul 2023 16:28:34 -0400 Subject: [PATCH 1/5] Added initial fake dataset generator and test --- pipit/util/fake.py | 83 +++++++++++++++++++++++++++ pipit/util/faketest.py | 124 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 pipit/util/fake.py create mode 100644 pipit/util/faketest.py diff --git a/pipit/util/fake.py b/pipit/util/fake.py new file mode 100644 index 00000000..7fb609b7 --- /dev/null +++ b/pipit/util/fake.py @@ -0,0 +1,83 @@ +from pipit import Trace +import numpy as np +from faketest import gen_fake_tree, emit_tree_file +import pandas as pd + + +def test_with_fake_data(): + """ + Generate a fake test file and ground truth file, read the test file + with Pipit, and check it against the ground truth. Tests inclusive and + exclusive metrics, and uses time_profile_test_generic. + """ + num_processes = 8 + # generate one fake tree per process, 2000 functions in the tree + trees = [gen_fake_tree(2000) for n in range(num_processes)] + test_file = open("fake.csv", "w") + ground_truth = open("fake_ground.csv", "w") + emit_tree_file(trees, test_file, ground_truth) + test_file.close() + ground_truth.close() + trace = Trace.from_csv("fake.csv") + # gt_dataframe should hold identical values to the columns of trace.events + gt_dataframe = pd.read_csv("fake_ground.csv") + trace.calc_exc_metrics() + pipit_dataframe = trace.events[["time.inc", "time.exc"]] + # adjust for nanoseconds + gt_dataframe["time.inc"] *= 1e9 + gt_dataframe["time.exc"] *= 1e9 + # NaN values for time won't compare equal, so check ourselves + assert ( + np.isclose(pipit_dataframe["time.inc"], gt_dataframe["time.inc"]) + | (np.isnan(gt_dataframe["time.inc"]) & np.isnan(pipit_dataframe["time.inc"])) + ).all() + # likewise, check exclusive metrics + assert ( + np.isclose(pipit_dataframe["time.exc"], gt_dataframe["time.exc"]) + | (np.isnan(gt_dataframe["time.exc"]) & np.isnan(pipit_dataframe["time.exc"])) + ).all() + time_profile_test_generic(trace, num_processes) + + +def time_profile_test_generic(trace, num_processes): + """ + Tests universal properties of time_profile, regardless of the trace. + Most asserts were taken from pipit/tests/trace.py, except those specific + to the ping-pong trace. 
+ """ + trace.calc_exc_metrics(["Timestamp (ns)"]) + + time_profile = trace.time_profile(num_bins=62) + + # check length + assert len(time_profile) == 62 + + # check bin sizes + exp_duration = ( + trace.events["Timestamp (ns)"].max() - trace.events["Timestamp (ns)"].min() + ) + exp_bin_size = exp_duration / 62 + bin_sizes = time_profile["bin_end"] - time_profile["bin_start"] + + assert np.isclose(bin_sizes, exp_bin_size).all() + + # check that sum of function contributions per bin equals bin duration + exp_bin_total_duration = exp_bin_size * num_processes + time_profile.drop(columns=["bin_start", "bin_end"], inplace=True) + + assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all() + + # check for each function that sum of exc time per bin equals total exc time + total_exc_times = trace.events.groupby("Name")["time.exc"].sum() + + for column in time_profile: + if column == "idle_time": + continue + + assert np.isclose(time_profile[column].sum(), total_exc_times[column]) + + # check normalization + norm = trace.time_profile(num_bins=62, normalized=True) + norm.drop(columns=["bin_start", "bin_end"], inplace=True) + + assert (time_profile / exp_bin_total_duration).equals(norm) diff --git a/pipit/util/faketest.py b/pipit/util/faketest.py new file mode 100644 index 00000000..bf43670f --- /dev/null +++ b/pipit/util/faketest.py @@ -0,0 +1,124 @@ +import random +import textwrap +import pandas as pd + + +class FakeNode: + """ + A single node of the calling tree used to produce fake traces. + Represents a single function call. + """ + + def __init__(self, name, exc_time): + self.name = name + self.exc_time = exc_time + self.children = {} # run_time -> child node + + def add_child(self, child, run_time): + """ + This function adds a child that executes after run_time + exclusive time within the function represented by the current node. + """ + self.children[run_time] = child + + def calc_inc_time(self): + """ + Similar to the calc_*_metrics functions in trace.py, + computes inclusive execution time for this tree. + """ + self.inc_time = self.exc_time + for run_time, child in self.children.items(): + child.calc_inc_time() + self.inc_time += child.inc_time + + def __str__(self) -> str: + return "{} ({})\n".format(self.name, self.exc_time) + "\n".join( + [ + textwrap.indent(str(run_time) + ": " + str(child), "\t") + for run_time, child in sorted(self.children.items()) + ] + ) + + def to_events(self, begin_time, process, data): + """ + Returns event data for this tree, with time starting at begin_time. + Inclusive time must have already been computed. + data is an array that is built up and then converted to a DataFrame + once the entire tree has been processed. + """ + data.append( + [ + begin_time, + "Enter", + self.name + "()", + process, + self.inc_time, + self.exc_time, + ] + ) + # total_time accumulates durations of already processed children + total_time = begin_time + for run_time, child in sorted(self.children.items()): + # children will add their own lines to data + child.to_events(total_time + run_time, process, data) + total_time += child.inc_time + + # time.inc and time.exc are both NaN for Leave events + data.append( + [ + begin_time + self.inc_time, + "Leave", + self.name + "()", + process, + float("nan"), + float("nan"), + ] + ) + + +def gen_fake_node(): + """ + Generates a node with a random numeric name and execution time. 
+ """ + return FakeNode("func_" + str(random.randint(0, 1000000)), random.random() * 10) + + +def gen_fake_tree(num_nodes): + """ + Generates a whole tree of FakeNodes by randomly appending children. + """ + nodes = [gen_fake_node() for n in range(num_nodes)] + root = nodes[0] + for index, node in enumerate(nodes[1:]): + # choose a node that's currently in the graph to add child to + parent = random.choice(nodes[: index + 1]) + # select a random point for that child to run + run_time = random.random() * parent.exc_time + parent.add_child(node, run_time) + return root + + +def emit_tree_file(trees, test_file, ground_truth_file): + """ + Writes trees (one per process) as a CSV to the File object test_file. + At the same time, write ground truth function call information + to the File object ground_truth_file. + ground_truth_file will contain columns corresponding to Pipit's + time.inc, time.exc. + """ + data = [] + for process, tree in enumerate(trees): + tree.calc_inc_time() + # add small random fudge factor, so that we don't have many times of exactly 0 + # which would lead to undefined sorting order and rows not matching + tree.to_events(random.random(), process, data) + + dataframe = pd.DataFrame( + data, + None, + ["Timestamp (s)", "Event Type", "Name", "Process", "time.inc", "time.exc"], + ).sort_values("Timestamp (s)") + dataframe[["Timestamp (s)", "Event Type", "Name", "Process"]].to_csv( + test_file, index=False + ) + dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) From 17afcd6bf6a3adbc0b35b9d3dd136d136ec5ba00 Mon Sep 17 00:00:00 2001 From: joshop Date: Wed, 19 Jul 2023 14:35:33 -0400 Subject: [PATCH 2/5] MPI events, and some general realism changes --- pipit/trace.py | 8 ++ pipit/util/fake.py | 8 +- pipit/util/faketest.py | 265 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 260 insertions(+), 21 deletions(-) diff --git a/pipit/trace.py b/pipit/trace.py index 6f3838b6..f400fdc0 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import ast class Trace: @@ -72,6 +73,13 @@ def from_csv(filename): # ensure that ranks are ints events_dataframe = events_dataframe.astype({"Process": "int32"}) + # ensure that the attributes are a dict, not a string + if "Attributes" in events_dataframe.columns: + # use literal_eval so we're not running a security risk + events_dataframe["Attributes"] = events_dataframe["Attributes"].apply( + ast.literal_eval + ) + # make certain columns categorical events_dataframe = events_dataframe.astype( { diff --git a/pipit/util/fake.py b/pipit/util/fake.py index 7fb609b7..40b02ff3 100644 --- a/pipit/util/fake.py +++ b/pipit/util/fake.py @@ -1,9 +1,12 @@ from pipit import Trace import numpy as np -from faketest import gen_fake_tree, emit_tree_file +from faketest import gen_fake_tree, emit_tree_file, gen_forest import pandas as pd +function_names = ["foo", "bar", "baz", "quux", "grault", "garply", "waldo"] + + def test_with_fake_data(): """ Generate a fake test file and ground truth file, read the test file @@ -12,7 +15,8 @@ def test_with_fake_data(): """ num_processes = 8 # generate one fake tree per process, 2000 functions in the tree - trees = [gen_fake_tree(2000) for n in range(num_processes)] + seed_tree = gen_fake_tree(200, function_names) + trees = gen_forest(seed_tree, num_processes) test_file = open("fake.csv", "w") ground_truth = open("fake_ground.csv", "w") emit_tree_file(trees, test_file, ground_truth) diff --git a/pipit/util/faketest.py b/pipit/util/faketest.py 
index bf43670f..bfdb8811 100644 --- a/pipit/util/faketest.py +++ b/pipit/util/faketest.py @@ -1,6 +1,7 @@ import random import textwrap import pandas as pd +import numpy as np class FakeNode: @@ -9,27 +10,75 @@ class FakeNode: Represents a single function call. """ - def __init__(self, name, exc_time): + def __init__( + self, name, exc_time, mpi_type="", mpi_tgt=0, mpi_volume=0, mpi_time=0 + ): self.name = name self.exc_time = exc_time + self.inc_time = exc_time self.children = {} # run_time -> child node + self.is_mpi = mpi_type != "" + self.mpi_type = mpi_type + self.mpi_tgt = mpi_tgt + self.mpi_volume = mpi_volume + self.mpi_time = mpi_time + self.total_nodes = 1 + self.parent = None + + def grow_inc_time(self, time): + """ + This function adjusts inclusive time metric when a new child is added, + adding the time to each parent's inc_time, up to the root. + """ + self.inc_time += time + if self.parent is not None: + self.parent.grow_inc_time(time) + + def grow_total_nodes(self, amt): + """ + This function adjusts the total count of nodes when a new node is added, + adding the new count to each parent's total_nodes, up to the root. + """ + self.total_nodes += amt + if self.parent is not None: + self.parent.grow_total_nodes(amt) def add_child(self, child, run_time): """ This function adds a child that executes after run_time exclusive time within the function represented by the current node. """ + assert run_time not in self.children self.children[run_time] = child + child.parent = self + self.grow_total_nodes(child.total_nodes) + self.grow_inc_time(child.inc_time) - def calc_inc_time(self): + def choose_random_node(self): """ - Similar to the calc_*_metrics functions in trace.py, - computes inclusive execution time for this tree. + This function selects a random node, with all descendants + of the current node being weighted equally, and returns it. """ - self.inc_time = self.exc_time - for run_time, child in self.children.items(): - child.calc_inc_time() - self.inc_time += child.inc_time + if not self.children: + return self + rng = random.random() + total = 0 + for child in self.children.values(): + weight = child.total_nodes / self.total_nodes + if rng < weight: + return child.choose_random_node() + else: + rng -= weight + return self + + def pick_by_name(self, name): + """ + Returns all nodes in this tree that have the given name. + """ + valid = [self] if self.name == name else [] + for child in self.children.values(): + valid += child.pick_by_name(name) + return valid def __str__(self) -> str: return "{} ({})\n".format(self.name, self.exc_time) + "\n".join( @@ -39,10 +88,23 @@ def __str__(self) -> str: ] ) + def mpi_attributes(self): + """ + Returns the Attributes dictionary for this node, including + (if it is an MPI event) receiver/sender and msg_length. + """ + if not self.is_mpi: + return {} + attr = {"msg_length": self.mpi_volume} + if self.mpi_type == "MpiSend": + attr["receiver"] = self.mpi_tgt + else: + attr["sender"] = self.mpi_tgt + return attr + def to_events(self, begin_time, process, data): """ Returns event data for this tree, with time starting at begin_time. - Inclusive time must have already been computed. data is an array that is built up and then converted to a DataFrame once the entire tree has been processed. 
""" @@ -52,10 +114,23 @@ def to_events(self, begin_time, process, data): "Enter", self.name + "()", process, + {}, self.inc_time, self.exc_time, ] ) + if self.is_mpi: + data.append( + [ + begin_time + self.mpi_time, + "Instant", + self.mpi_type, + process, + self.mpi_attributes(), + float("nan"), + float("nan"), + ] + ) # total_time accumulates durations of already processed children total_time = begin_time for run_time, child in sorted(self.children.items()): @@ -70,34 +145,179 @@ def to_events(self, begin_time, process, data): "Leave", self.name + "()", process, + {}, float("nan"), float("nan"), ] ) + def tweak_tree(self): + """ + Adds small exclusive time perturbations to + function length to generate a "similar" tree, for other processes + to use, that is not identical to the original. + """ + exc_time = self.exc_time + factor = (0.7) + random.random() * 0.6 + exc_time_new = exc_time * factor + # preserve inclusive time relations + self.grow_inc_time(exc_time_new - exc_time) + self.exc_time = exc_time_new + for run_time, child in self.children.items(): + child.tweak_tree() + # also scale back child run times + self.children = { + run_time * factor: child for run_time, child in self.children.items() + } + + def deepcopy(self): + """ + Returns a deep copy of the tree. + """ + mycopy = FakeNode( + self.name, + self.exc_time, + self.mpi_type, + self.mpi_tgt, + self.mpi_volume, + self.mpi_time, + ) + mycopy.inc_time = self.inc_time + mycopy.total_nodes = self.total_nodes + for run_time, child in self.children.items(): + mycopy.children[run_time] = child.deepcopy() + mycopy.children[run_time].parent = mycopy + return mycopy + + def node_at_time(self, target_time): + """ + Returns the node that contains the given target_time, + as well as the offset into that node that the time occurs. + """ + total_time = 0 + # TODO: may be more efficient to store self.children sorted already + for run_time, child in sorted(self.children.items()): + if target_time < total_time + run_time: + # this time occurs in the current node! + return self, target_time - total_time + elif target_time < total_time + run_time + child.inc_time: + # this time occurs in the given child node + return child.node_at_time(target_time - total_time - run_time) + else: + # this time occurs after this child + total_time += child.inc_time + # if no children find it, it must be the current node + return self, target_time - total_time + + def insert_at_time(self, child, target_time): + """ + Adds the child to the proper node such that it executes + at target_time. + """ + node, offset = self.node_at_time(target_time) + node.add_child(child, offset) -def gen_fake_node(): + +def gen_fake_node(function_names): """ Generates a node with a random numeric name and execution time. """ - return FakeNode("func_" + str(random.randint(0, 1000000)), random.random() * 10) + return FakeNode(random.choice(function_names), random.random() * 10) -def gen_fake_tree(num_nodes): +def gen_fake_tree(num_nodes, function_names, copy_subtrees=True): """ Generates a whole tree of FakeNodes by randomly appending children. 
""" - nodes = [gen_fake_node() for n in range(num_nodes)] + nodes = [gen_fake_node(function_names) for n in range(num_nodes)] root = nodes[0] for index, node in enumerate(nodes[1:]): # choose a node that's currently in the graph to add child to - parent = random.choice(nodes[: index + 1]) + parent = root.choose_random_node() # select a random point for that child to run run_time = random.random() * parent.exc_time - parent.add_child(node, run_time) + # find nodes with the same name to copy off of + same_name = root.pick_by_name(node.name) + if not same_name or not copy_subtrees: + parent.add_child(node, run_time) + else: + subtree = random.choice(same_name) + # larger subtrees are less likely to be copied + if random.random() > 0.7 / (subtree.total_nodes**0.5): + parent.add_child(node, run_time) + else: + subtree = subtree.deepcopy() + subtree.tweak_tree() + parent.add_child(subtree, run_time) + return root +def gen_forest(seed_tree, num_trees): + """ + Generates num_trees new trees by tweaking seed_tree. + """ + forest = [seed_tree.deepcopy() for n in range(num_trees)] + for tree in forest: + tree.tweak_tree() + return forest + + +def add_fake_mpi_events(trees, num_pairs): + """ + Adds fake MPIevents to a set of trees (one per process). In total, + num_pairs pairs of Send/Recv events are generated and inserted. + Each event is a function with is_mpi=True. + """ + planned_evts = [] + # choose times for events to happen + last_proc = -1 + maxtime = min([t.inc_time for t in trees]) + for i in range(2 * num_pairs): + planned_evts.append(random.random() * maxtime) + # sort from last to first events in timeline + # iterate from first to last to avoid + # dependencies among the events' times + planned_evts.sort(reverse=True) + while planned_evts: + # pair two first events + first_evt = planned_evts.pop() + second_evt = planned_evts.pop() + # time that the first one has to idle + idle_time = second_evt - first_evt + # pick two different processes + first_proc, second_proc = random.sample(range(len(trees)), 2) + first_tree = trees[first_proc] + second_tree = trees[second_proc] + # either first process sends (1) or receives (0) + send_first = random.randint(0, 1) + volume = random.randint(1, 1000000) + # give both a small, random, extra time ("latency", etc) + if send_first: + # mpi sends don't need to block + idle_time = 0 + first_dur = random.random() + idle_time + second_dur = random.random() + first_node = FakeNode( + "MPI_Send" if send_first else "MPI_Recv", + first_dur, + "MpiSend" if send_first else "MpiRecv", + second_proc, + volume, + random.random() * first_dur, + ) + second_node = FakeNode( + "MPI_Recv" if send_first else "MPI_Send", + second_dur, + "MpiRecv" if send_first else "MpiSend", + first_proc, + volume, + random.random() * second_dur, + ) + first_tree.insert_at_time(first_node, first_evt) + second_tree.insert_at_time(second_node, second_evt) + + def emit_tree_file(trees, test_file, ground_truth_file): """ Writes trees (one per process) as a CSV to the File object test_file. 
@@ -108,17 +328,24 @@ def emit_tree_file(trees, test_file, ground_truth_file): """ data = [] for process, tree in enumerate(trees): - tree.calc_inc_time() # add small random fudge factor, so that we don't have many times of exactly 0 # which would lead to undefined sorting order and rows not matching - tree.to_events(random.random(), process, data) + tree.to_events(random.random() * 0.01, process, data) dataframe = pd.DataFrame( data, None, - ["Timestamp (s)", "Event Type", "Name", "Process", "time.inc", "time.exc"], + [ + "Timestamp (s)", + "Event Type", + "Name", + "Process", + "Attributes", + "time.inc", + "time.exc", + ], ).sort_values("Timestamp (s)") - dataframe[["Timestamp (s)", "Event Type", "Name", "Process"]].to_csv( + dataframe[["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"]].to_csv( test_file, index=False ) dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) From f1cc393b5713ec6dc9dddb9fc13042e0d0354fde Mon Sep 17 00:00:00 2001 From: joshop Date: Thu, 27 Jul 2023 10:57:08 -0400 Subject: [PATCH 3/5] Cleaned up code, removed pytest test and added top-level function --- pipit/trace.py | 1 + pipit/util/fake.py | 87 ------------------- pipit/util/{faketest.py => test_generator.py} | 45 +++++++--- 3 files changed, 33 insertions(+), 100 deletions(-) delete mode 100644 pipit/util/fake.py rename pipit/util/{faketest.py => test_generator.py} (89%) diff --git a/pipit/trace.py b/pipit/trace.py index f400fdc0..4e320f12 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -73,6 +73,7 @@ def from_csv(filename): # ensure that ranks are ints events_dataframe = events_dataframe.astype({"Process": "int32"}) + # this next part is needed for fake test reading # ensure that the attributes are a dict, not a string if "Attributes" in events_dataframe.columns: # use literal_eval so we're not running a security risk diff --git a/pipit/util/fake.py b/pipit/util/fake.py deleted file mode 100644 index 40b02ff3..00000000 --- a/pipit/util/fake.py +++ /dev/null @@ -1,87 +0,0 @@ -from pipit import Trace -import numpy as np -from faketest import gen_fake_tree, emit_tree_file, gen_forest -import pandas as pd - - -function_names = ["foo", "bar", "baz", "quux", "grault", "garply", "waldo"] - - -def test_with_fake_data(): - """ - Generate a fake test file and ground truth file, read the test file - with Pipit, and check it against the ground truth. Tests inclusive and - exclusive metrics, and uses time_profile_test_generic. 
- """ - num_processes = 8 - # generate one fake tree per process, 2000 functions in the tree - seed_tree = gen_fake_tree(200, function_names) - trees = gen_forest(seed_tree, num_processes) - test_file = open("fake.csv", "w") - ground_truth = open("fake_ground.csv", "w") - emit_tree_file(trees, test_file, ground_truth) - test_file.close() - ground_truth.close() - trace = Trace.from_csv("fake.csv") - # gt_dataframe should hold identical values to the columns of trace.events - gt_dataframe = pd.read_csv("fake_ground.csv") - trace.calc_exc_metrics() - pipit_dataframe = trace.events[["time.inc", "time.exc"]] - # adjust for nanoseconds - gt_dataframe["time.inc"] *= 1e9 - gt_dataframe["time.exc"] *= 1e9 - # NaN values for time won't compare equal, so check ourselves - assert ( - np.isclose(pipit_dataframe["time.inc"], gt_dataframe["time.inc"]) - | (np.isnan(gt_dataframe["time.inc"]) & np.isnan(pipit_dataframe["time.inc"])) - ).all() - # likewise, check exclusive metrics - assert ( - np.isclose(pipit_dataframe["time.exc"], gt_dataframe["time.exc"]) - | (np.isnan(gt_dataframe["time.exc"]) & np.isnan(pipit_dataframe["time.exc"])) - ).all() - time_profile_test_generic(trace, num_processes) - - -def time_profile_test_generic(trace, num_processes): - """ - Tests universal properties of time_profile, regardless of the trace. - Most asserts were taken from pipit/tests/trace.py, except those specific - to the ping-pong trace. - """ - trace.calc_exc_metrics(["Timestamp (ns)"]) - - time_profile = trace.time_profile(num_bins=62) - - # check length - assert len(time_profile) == 62 - - # check bin sizes - exp_duration = ( - trace.events["Timestamp (ns)"].max() - trace.events["Timestamp (ns)"].min() - ) - exp_bin_size = exp_duration / 62 - bin_sizes = time_profile["bin_end"] - time_profile["bin_start"] - - assert np.isclose(bin_sizes, exp_bin_size).all() - - # check that sum of function contributions per bin equals bin duration - exp_bin_total_duration = exp_bin_size * num_processes - time_profile.drop(columns=["bin_start", "bin_end"], inplace=True) - - assert np.isclose(time_profile.sum(axis=1), exp_bin_total_duration).all() - - # check for each function that sum of exc time per bin equals total exc time - total_exc_times = trace.events.groupby("Name")["time.exc"].sum() - - for column in time_profile: - if column == "idle_time": - continue - - assert np.isclose(time_profile[column].sum(), total_exc_times[column]) - - # check normalization - norm = trace.time_profile(num_bins=62, normalized=True) - norm.drop(columns=["bin_start", "bin_end"], inplace=True) - - assert (time_profile / exp_bin_total_duration).equals(norm) diff --git a/pipit/util/faketest.py b/pipit/util/test_generator.py similarity index 89% rename from pipit/util/faketest.py rename to pipit/util/test_generator.py index bfdb8811..be12c455 100644 --- a/pipit/util/faketest.py +++ b/pipit/util/test_generator.py @@ -229,9 +229,10 @@ def gen_fake_tree(num_nodes, function_names, copy_subtrees=True): """ Generates a whole tree of FakeNodes by randomly appending children. 
""" - nodes = [gen_fake_node(function_names) for n in range(num_nodes)] - root = nodes[0] - for index, node in enumerate(nodes[1:]): + root = gen_fake_node(function_names) + # continue to add nodes until we've reached the target + while root.total_nodes < num_nodes: + node = gen_fake_node(function_names) # choose a node that's currently in the graph to add child to parent = root.choose_random_node() # select a random point for that child to run @@ -243,7 +244,7 @@ def gen_fake_tree(num_nodes, function_names, copy_subtrees=True): else: subtree = random.choice(same_name) # larger subtrees are less likely to be copied - if random.random() > 0.7 / (subtree.total_nodes**0.5): + if random.random() > 4 / (subtree.total_nodes**0.5): parent.add_child(node, run_time) else: subtree = subtree.deepcopy() @@ -318,12 +319,11 @@ def add_fake_mpi_events(trees, num_pairs): second_tree.insert_at_time(second_node, second_evt) -def emit_tree_file(trees, test_file, ground_truth_file): +def emit_tree_data(trees): """ - Writes trees (one per process) as a CSV to the File object test_file. - At the same time, write ground truth function call information - to the File object ground_truth_file. - ground_truth_file will contain columns corresponding to Pipit's + Writes trees (one per process) as a CSV and returns them. + At the same time, return ground truth function call information. + The ground truth data will contain columns corresponding to Pipit's time.inc, time.exc. """ data = [] @@ -345,7 +345,26 @@ def emit_tree_file(trees, test_file, ground_truth_file): "time.exc", ], ).sort_values("Timestamp (s)") - dataframe[["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"]].to_csv( - test_file, index=False - ) - dataframe[["time.inc", "time.exc"]].to_csv(ground_truth_file, index=False) + data_csv = dataframe[ + ["Timestamp (s)", "Event Type", "Name", "Process", "Attributes"] + ].to_csv(index=False) + ground_csv = dataframe[["time.inc", "time.exc"]].to_csv(index=False) + return data_csv, ground_csv + + +def generate_fake_test( + num_events, + num_processes, + function_names=["foo", "bar", "baz", "quux", "grault", "garply", "waldo"], + num_mpi_events=0, +): + """ + Top level test generation function. Generates test and ground truth datasets with a + minimum of num_events Enter/Leave events per process, of which there are + num_processes. Optionally, MPI events can be added. 
+ """ + seed_tree = gen_fake_tree(num_events // 2, function_names) + print(num_events // 2, seed_tree.total_nodes) + forest = gen_forest(seed_tree, num_processes) + add_fake_mpi_events(forest, num_mpi_events) + return emit_tree_data(forest) From 62ca889cb698e3b34fb00d2caee12c5fbdb43fda Mon Sep 17 00:00:00 2001 From: joshop Date: Wed, 9 Aug 2023 10:52:40 -0400 Subject: [PATCH 4/5] Clean up, CSV reader can be passed CSVs as strings --- pipit/trace.py | 9 +++++++-- pipit/util/test_generator.py | 7 +++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pipit/trace.py b/pipit/trace.py index 4e320f12..f0f483fb 100644 --- a/pipit/trace.py +++ b/pipit/trace.py @@ -5,7 +5,8 @@ import numpy as np import pandas as pd -import ast +from ast import literal_eval +from io import StringIO class Trace: @@ -61,6 +62,10 @@ def from_nsight(filename): @staticmethod def from_csv(filename): + # detect if the input is a CSV as a string + if "," in filename: + # wrapping with StringIO allows pandas to read it + filename = StringIO(filename) events_dataframe = pd.read_csv(filename, skipinitialspace=True) # if timestamps are in seconds, convert them to nanoseconds @@ -78,7 +83,7 @@ def from_csv(filename): if "Attributes" in events_dataframe.columns: # use literal_eval so we're not running a security risk events_dataframe["Attributes"] = events_dataframe["Attributes"].apply( - ast.literal_eval + literal_eval ) # make certain columns categorical diff --git a/pipit/util/test_generator.py b/pipit/util/test_generator.py index be12c455..c6343256 100644 --- a/pipit/util/test_generator.py +++ b/pipit/util/test_generator.py @@ -352,11 +352,11 @@ def emit_tree_data(trees): return data_csv, ground_csv -def generate_fake_test( +def generate_trace( num_events, num_processes, function_names=["foo", "bar", "baz", "quux", "grault", "garply", "waldo"], - num_mpi_events=0, + num_mpi_pairs=0, ): """ Top level test generation function. Generates test and ground truth datasets with a @@ -364,7 +364,6 @@ def generate_fake_test( num_processes. Optionally, MPI events can be added. """ seed_tree = gen_fake_tree(num_events // 2, function_names) - print(num_events // 2, seed_tree.total_nodes) forest = gen_forest(seed_tree, num_processes) - add_fake_mpi_events(forest, num_mpi_events) + add_fake_mpi_events(forest, num_mpi_pairs) return emit_tree_data(forest) From 4fada8387b7e245a87b452fb90f5f93b97091828 Mon Sep 17 00:00:00 2001 From: joshop Date: Tue, 29 Aug 2023 09:37:29 -0400 Subject: [PATCH 5/5] Slight cleanup --- pipit/util/test_generator.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pipit/util/test_generator.py b/pipit/util/test_generator.py index c6343256..9c395218 100644 --- a/pipit/util/test_generator.py +++ b/pipit/util/test_generator.py @@ -1,7 +1,6 @@ import random import textwrap import pandas as pd -import numpy as np class FakeNode: @@ -62,7 +61,6 @@ def choose_random_node(self): if not self.children: return self rng = random.random() - total = 0 for child in self.children.values(): weight = child.total_nodes / self.total_nodes if rng < weight: @@ -272,7 +270,6 @@ def add_fake_mpi_events(trees, num_pairs): """ planned_evts = [] # choose times for events to happen - last_proc = -1 maxtime = min([t.inc_time for t in trees]) for i in range(2 * num_pairs): planned_evts.append(random.random() * maxtime)