diff --git a/extension/llm/runner/stats.h b/extension/llm/runner/stats.h index 04eb7a3f765..3c84511de3a 100644 --- a/extension/llm/runner/stats.h +++ b/extension/llm/runner/stats.h @@ -23,32 +23,32 @@ struct ET_EXPERIMENTAL Stats { const long SCALING_FACTOR_UNITS_PER_SECOND = 1000; // Time stamps for the different stages of the execution // model_load_start_ms: Start of model loading. - long model_load_start_ms; + long model_load_start_ms = 0; // model_load_end_ms: End of model loading. - long model_load_end_ms; + long model_load_end_ms = 0; // inference_start_ms: Immediately after the model is loaded (or we check // for model load), measure the inference time. // NOTE: It's actually the tokenizer encode + model execution time. - long inference_start_ms; + long inference_start_ms = 0; // End of the tokenizer encode time. - long token_encode_end_ms; + long token_encode_end_ms = 0; // Start of the model execution (forward function) time. - long model_execution_start_ms; + long model_execution_start_ms = 0; // End of the model execution (forward function) time. - long model_execution_end_ms; + long model_execution_end_ms = 0; // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right // before the inference loop starts - long prompt_eval_end_ms; + long prompt_eval_end_ms = 0; // first_token: Timestamp when the first generated token is emitted - long first_token_ms; + long first_token_ms = 0; // inference_end_ms: End of inference/generation. - long inference_end_ms; + long inference_end_ms = 0; // Keep a running total of the time spent in sampling. long aggregate_sampling_time_ms = 0; // Token count from prompt - int64_t num_prompt_tokens; + int64_t num_prompt_tokens = 0; // Token count from generated (total - prompt) - int64_t num_generated_tokens; + int64_t num_generated_tokens = 0; // GPU memory stats (optional; may be zero if not available) // GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate // "not available". @@ -171,18 +171,18 @@ inline void print_report(const Stats& stats) { Info, "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND, - - (stats.num_generated_tokens) / - (double)(stats.inference_end_ms - stats.inference_start_ms) * - stats.SCALING_FACTOR_UNITS_PER_SECOND); + inference_time_ms > 0 ? (stats.num_generated_tokens) / inference_time_ms * + stats.SCALING_FACTOR_UNITS_PER_SECOND + : 0.0); double prompt_eval_time = (double)(stats.prompt_eval_end_ms - stats.inference_start_ms); ET_LOG( Info, "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - (stats.num_prompt_tokens) / prompt_eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); + prompt_eval_time > 0 ? (stats.num_prompt_tokens) / prompt_eval_time * + stats.SCALING_FACTOR_UNITS_PER_SECOND + : 0.0); double eval_time = (double)(stats.inference_end_ms - stats.prompt_eval_end_ms); @@ -192,8 +192,9 @@ inline void print_report(const Stats& stats) { " tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", stats.num_generated_tokens, eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND, - stats.num_generated_tokens / eval_time * - stats.SCALING_FACTOR_UNITS_PER_SECOND); + eval_time > 0 ? stats.num_generated_tokens / eval_time * + stats.SCALING_FACTOR_UNITS_PER_SECOND + : 0.0); // Time to first token is measured from the start of inference, excluding // model load time. diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp index e8b37ba8863..7d503fc1c08 100644 --- a/extension/llm/runner/text_llm_runner.cpp +++ b/extension/llm/runner/text_llm_runner.cpp @@ -233,7 +233,7 @@ Error TextLLMRunner::generate( "RSS after finishing text generation: %f MiB (0 if unsupported)", get_rss_bytes() / 1024.0 / 1024.0); - if (num_generated_tokens == max_new_tokens) { + if (num_generated_tokens == max_new_tokens - 1) { RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens); }