diff --git a/bin/run-model/src/run-model/main.cc b/bin/run-model/src/run-model/main.cc index a6f1d026ef..b77354dcdc 100644 --- a/bin/run-model/src/run-model/main.cc +++ b/bin/run-model/src/run-model/main.cc @@ -99,7 +99,7 @@ int main(int argc, char **argv) { /*optimizer=*/optimizer_attrs, /*loss=*/std::nullopt, /*input_tensors=*/input_tensors, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/device_handle, /*device_type=*/DeviceType::GPU); @@ -108,7 +108,7 @@ int main(int argc, char **argv) { for (int i = 0; i < num_epochs; i++) { perform_all_passes_for_pcg_instance( /*instance=*/pcg_instance, - /*profiling_settings=*/ProfilingSettings{0, 1}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/device_handle); } }); diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 6b79f40359..d11a945d88 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -17,8 +17,8 @@ std::optional profiling_wrapper(F const &f, Ts &&...ts) { if (enable_profiling) { ProfilingSettings settings = ProfilingSettings{ - /*warmup_iters=*/0, - /*measure_iters=*/1, + /*warmup_iters=*/0_n, + /*measure_iters=*/1_p, }; return profiling_wrapper(f, settings, std::forward(ts)...); } else { @@ -33,10 +33,6 @@ std::optional ProfilingSettings const &settings, DeviceType device_type, Ts &&...ts) { - if (settings.measure_iters <= 0) { - return std::nullopt; - } - if (device_type == DeviceType::GPU) { return gpu_profiling_wrapper(f, settings, std::forward(ts)...); } else { @@ -49,8 +45,6 @@ template milliseconds_t cpu_profiling_wrapper(F const &f, ProfilingSettings const &settings, Ts &&...ts) { - ASSERT(settings.measure_iters > 0); - device_stream_t stream = get_cpu_device_stream(); using TimePoint = std::chrono::time_point; @@ -58,8 +52,10 @@ milliseconds_t cpu_profiling_wrapper(F const &f, std::optional start = std::nullopt; std::optional end = std::nullopt; - for (int i = 0; i < settings.warmup_iters + settings.measure_iters; i++) { - if (i == settings.warmup_iters) { + for (int i = 0; i < settings.warmup_iters.int_from_nonnegative_int() + + settings.measure_iters.int_from_positive_int(); + i++) { + if (i == settings.warmup_iters.int_from_nonnegative_int()) { start = std::chrono::steady_clock::now(); } f(stream, std::forward(ts)...); @@ -67,7 +63,8 @@ milliseconds_t cpu_profiling_wrapper(F const &f, end = std::chrono::steady_clock::now(); std::chrono::duration avg_duration = - (end.value() - start.value()) / settings.measure_iters; + (end.value() - start.value()) / + settings.measure_iters.int_from_positive_int(); return milliseconds_t{ static_cast(avg_duration.count()), @@ -78,16 +75,16 @@ template milliseconds_t gpu_profiling_wrapper(F const &f, ProfilingSettings const &settings, Ts &&...ts) { - ASSERT(settings.measure_iters > 0); - device_stream_t stream = get_gpu_device_stream(); ffEvent_t t_start, t_end; checkCUDA(ffEventCreate(&t_start)); checkCUDA(ffEventCreate(&t_end)); - for (int i = 0; i < settings.warmup_iters + settings.measure_iters; i++) { - if (i == settings.warmup_iters) { + for (int i = 0; i < settings.warmup_iters.int_from_nonnegative_int() + + settings.measure_iters.int_from_positive_int(); + i++) { + if (i == settings.warmup_iters.int_from_nonnegative_int()) { checkCUDA(ffEventRecord(t_start, stream.require_gpu())); } f(stream, std::forward(ts)...); @@ -100,7 +97,7 @@ milliseconds_t gpu_profiling_wrapper(F const &f, checkCUDA(ffEventDestroy(t_start)); checkCUDA(ffEventDestroy(t_end)); return milliseconds_t{ - elapsed / settings.measure_iters, + elapsed / settings.measure_iters.int_from_positive_int(), }; } diff --git a/lib/kernels/include/kernels/profiling_settings.dtg.toml b/lib/kernels/include/kernels/profiling_settings.dtg.toml index c9f19c3a50..434b3713b5 100644 --- a/lib/kernels/include/kernels/profiling_settings.dtg.toml +++ b/lib/kernels/include/kernels/profiling_settings.dtg.toml @@ -10,10 +10,16 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", +] + [[fields]] name = "warmup_iters" -type = "int" +type = "::FlexFlow::nonnegative_int" + [[fields]] name = "measure_iters" -type = "int" +type = "::FlexFlow::positive_int" diff --git a/lib/local-execution/test/src/local-execution/computation_graph_instance.cc b/lib/local-execution/test/src/local-execution/computation_graph_instance.cc index ae8365b127..1ed6b88841 100644 --- a/lib/local-execution/test/src/local-execution/computation_graph_instance.cc +++ b/lib/local-execution/test/src/local-execution/computation_graph_instance.cc @@ -161,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, /*input_tensors=*/input_tensors, /*allocator=*/allocator, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/ff_handle, /*global_device_id=*/global_device_id); @@ -172,7 +172,7 @@ TEST_SUITE(FF_TEST_SUITE) { for (int i = 0; i < num_epochs; i++) { perform_all_passes_for_computation_graph_instance( /*instance=*/computation_graph_instance, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*ff_handle=*/ff_handle, /*global_device_id=*/global_device_id); loss_values.push_back(copy_tensor_accessor_r( @@ -335,7 +335,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }, /*input_tensors=*/input_tensors, /*allocator=*/allocator, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/ff_handle, /*device_idx=*/device_idx); @@ -348,7 +348,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { for (int i = 0; i < num_epochs; i++) { perform_all_passes_for_computation_graph_instance( /*instance=*/computation_graph_instance, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*ff_handle=*/ff_handle, /*device_idx=*/device_idx); loss_values.push_back(copy_tensor_accessor_r( @@ -459,13 +459,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }, /*input_tensors=*/input_tensors, /*allocator=*/allocator, - /*profiling_settings=*/ProfilingSettings{0, 1}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/ff_handle, /*device_idx=*/device_idx); perform_all_passes_for_computation_graph_instance( /*instance=*/computation_graph_instance, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*ff_handle=*/ff_handle, /*device_idx=*/device_idx); assert_unwrap(computation_graph_instance.get_loss_tensor_accessor()); diff --git a/lib/local-execution/test/src/local-execution/cost_estimator/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/cost_estimator/local_cost_estimator.cc index 3d1b691a08..928a52a007 100644 --- a/lib/local-execution/test/src/local-execution/cost_estimator/local_cost_estimator.cc +++ b/lib/local-execution/test/src/local-execution/cost_estimator/local_cost_estimator.cc @@ -44,8 +44,8 @@ TEST_SUITE(FF_TEST_SUITE) { /*interconnect_specification=*/interconnect_specification, /*allocator=*/allocator, /*profiling_settings=*/ - ProfilingSettings{/*warmup_iters=*/0, - /*measure_iters=*/1}, + ProfilingSettings{/*warmup_iters=*/0_n, + /*measure_iters=*/1_p}, /*device_handle=*/ff_handle, /*device_idx=*/device_idx); @@ -121,8 +121,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*interconnect_specification=*/interconnect_specification, /*allocator=*/allocator, /*profiling_settings=*/ - ProfilingSettings{/*warmup_iters=*/0, - /*measure_iters=*/1}, + ProfilingSettings{/*warmup_iters=*/0_n, + /*measure_iters=*/1_p}, /*device_handle=*/ff_handle, /*device_idx=*/device_idx); diff --git a/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc index 8800711fef..696ae4b4cd 100644 --- a/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc +++ b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { LocalTaskArgumentAccessor acc = LocalTaskArgumentAccessor{ /*allocator=*/allocator, /*tensor_slots_backing=*/tensor_slots_backing, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*ff_handle=*/cpu_make_device_handle_t(), /*op_attrs=*/PCGOperatorAttrs{InputAttrs{input_tensor_shape}}, /*loss_attrs=*/std::nullopt, diff --git a/lib/realm-execution/test/src/realm-execution/test_e2e.cc b/lib/realm-execution/test/src/realm-execution/test_e2e.cc index 57871fb148..5469c59ce5 100644 --- a/lib/realm-execution/test/src/realm-execution/test_e2e.cc +++ b/lib/realm-execution/test/src/realm-execution/test_e2e.cc @@ -253,7 +253,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*loss_mapping=*/cfg.loss_mapping, }, /*input_tensors=*/input_tensors, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/device_handle, /*device_type=*/DeviceType::CPU); @@ -264,7 +264,7 @@ TEST_SUITE(FF_TEST_SUITE) { for (int i = 0; i < num_epochs; i++) { perform_all_passes_for_pcg_instance( /*instance=*/pcg_instance, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/device_handle); loss_values.push_back(copy_tensor_accessor_r( dynamic_tensor_accessor_from_instance( @@ -332,7 +332,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*loss_mapping=*/cfg.loss_mapping, }, /*input_tensors=*/input_tensors, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/device_handle, /*device_type=*/DeviceType::GPU); @@ -343,7 +343,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { for (int i = 0; i < num_epochs; i++) { perform_all_passes_for_pcg_instance( /*instance=*/pcg_instance, - /*profiling_settings=*/ProfilingSettings{0, 0}, + /*profiling_settings=*/ProfilingSettings{0_n, 1_p}, /*device_handle=*/device_handle); loss_values.push_back(copy_tensor_accessor_r(