pytorch · kirklandsign · Apr 13, 2026 · lucylq · Apr 13, 2026 · kirklandsign
@@ -23,32 +23,32 @@ struct ET_EXPERIMENTAL Stats {
   const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
   // Time stamps for the different stages of the execution
   // model_load_start_ms: Start of model loading.
-  long model_load_start_ms;
+  long model_load_start_ms = 0;
   // model_load_end_ms: End of model loading.
-  long model_load_end_ms;
+  long model_load_end_ms = 0;
   // inference_start_ms: Immediately after the model is loaded (or we check
   // for model load), measure the inference time.
   // NOTE: It's actually the tokenizer encode + model execution time.
-  long inference_start_ms;
+  long inference_start_ms = 0;
   // End of the tokenizer encode time.
-  long token_encode_end_ms;
+  long token_encode_end_ms = 0;
   // Start of the model execution (forward function) time.
-  long model_execution_start_ms;
+  long model_execution_start_ms = 0;
   // End of the model execution (forward function) time.
-  long model_execution_end_ms;
+  long model_execution_end_ms = 0;
   // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
   // before the inference loop starts
-  long prompt_eval_end_ms;
+  long prompt_eval_end_ms = 0;
   // first_token: Timestamp when the first generated token is emitted
-  long first_token_ms;
+  long first_token_ms = 0;
   // inference_end_ms: End of inference/generation.
-  long inference_end_ms;
+  long inference_end_ms = 0;
   // Keep a running total of the time spent in sampling.
   long aggregate_sampling_time_ms = 0;
   // Token count from prompt
-  int64_t num_prompt_tokens;
+  int64_t num_prompt_tokens = 0;
   // Token count from generated (total - prompt)
-  int64_t num_generated_tokens;
+  int64_t num_generated_tokens = 0;
   // GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate
   // "not available".
   // NOTE(review): older wording said "may be zero if not available"; confirm.
@@ -171,18 +171,18 @@ inline void print_report(const Stats& stats) {
       Info,
       "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
       inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,
-
-      (stats.num_generated_tokens) /
-          (double)(stats.inference_end_ms - stats.inference_start_ms) *
-          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+      inference_time_ms > 0 ? (stats.num_generated_tokens) / inference_time_ms *
+              stats.SCALING_FACTOR_UNITS_PER_SECOND
+                            : 0.0);
   double prompt_eval_time =
       (double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
   ET_LOG(
       Info,
       "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
       prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
-      (stats.num_prompt_tokens) / prompt_eval_time *
-          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+      prompt_eval_time > 0 ? (stats.num_prompt_tokens) / prompt_eval_time *
+              stats.SCALING_FACTOR_UNITS_PER_SECOND
+                           : 0.0);
 
   double eval_time =
       (double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
@@ -192,8 +192,9 @@ inline void print_report(const Stats& stats) {
       " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
       stats.num_generated_tokens,
       eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
-      stats.num_generated_tokens / eval_time *
-          stats.SCALING_FACTOR_UNITS_PER_SECOND);
+      eval_time > 0 ? stats.num_generated_tokens / eval_time *
+              stats.SCALING_FACTOR_UNITS_PER_SECOND
+                    : 0.0);
 
   // Time to first token is measured from the start of inference, excluding
   // model load time.

@@ -233,7 +233,7 @@ Error TextLLMRunner::generate(
       "RSS after finishing text generation: %f MiB (0 if unsupported)",
       get_rss_bytes() / 1024.0 / 1024.0);
 
-  if (num_generated_tokens == max_new_tokens) {
+  if (num_generated_tokens == max_new_tokens - 1) {
     RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
   }