Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 20 additions & 19 deletions extension/llm/runner/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,32 @@ struct ET_EXPERIMENTAL Stats {
const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
// Time stamps for the different stages of the execution
// model_load_start_ms: Start of model loading.
long model_load_start_ms;
long model_load_start_ms = 0;
// model_load_end_ms: End of model loading.
long model_load_end_ms;
long model_load_end_ms = 0;
// inference_start_ms: Immediately after the model is loaded (or we check
// for model load), measure the inference time.
// NOTE: It's actually the tokenizer encode + model execution time.
long inference_start_ms;
long inference_start_ms = 0;
// End of the tokenizer encode time.
long token_encode_end_ms;
long token_encode_end_ms = 0;
// Start of the model execution (forward function) time.
long model_execution_start_ms;
long model_execution_start_ms = 0;
// End of the model execution (forward function) time.
long model_execution_end_ms;
long model_execution_end_ms = 0;
// prompt_eval_end_ms: End of prompt array allocation and tokenization;
// recorded right before the inference loop starts.
long prompt_eval_end_ms;
long prompt_eval_end_ms = 0;
// first_token_ms: Timestamp when the first generated token is emitted
long first_token_ms;
long first_token_ms = 0;
// inference_end_ms: End of inference/generation.
long inference_end_ms;
long inference_end_ms = 0;
// Keep a running total of the time spent in sampling.
long aggregate_sampling_time_ms = 0;
// Token count from prompt
int64_t num_prompt_tokens;
int64_t num_prompt_tokens = 0;
// Token count from generated (total - prompt)
int64_t num_generated_tokens;
int64_t num_generated_tokens = 0;
// GPU memory stats (optional; may be zero if not available)
// GPU memory stats (optional). Use sentinel UINT64_MAX / -1.0 to indicate
// "not available".
Expand Down Expand Up @@ -171,18 +171,18 @@ inline void print_report(const Stats& stats) {
Info,
"\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
inference_time_ms / stats.SCALING_FACTOR_UNITS_PER_SECOND,

(stats.num_generated_tokens) /
(double)(stats.inference_end_ms - stats.inference_start_ms) *
stats.SCALING_FACTOR_UNITS_PER_SECOND);
inference_time_ms > 0 ? (stats.num_generated_tokens) / inference_time_ms *
stats.SCALING_FACTOR_UNITS_PER_SECOND
: 0.0);
double prompt_eval_time =
(double)(stats.prompt_eval_end_ms - stats.inference_start_ms);
ET_LOG(
Info,
"\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
prompt_eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
(stats.num_prompt_tokens) / prompt_eval_time *
stats.SCALING_FACTOR_UNITS_PER_SECOND);
prompt_eval_time > 0 ? (stats.num_prompt_tokens) / prompt_eval_time *
stats.SCALING_FACTOR_UNITS_PER_SECOND
: 0.0);

double eval_time =
(double)(stats.inference_end_ms - stats.prompt_eval_end_ms);
Expand All @@ -192,8 +192,9 @@ inline void print_report(const Stats& stats) {
" tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
stats.num_generated_tokens,
eval_time / stats.SCALING_FACTOR_UNITS_PER_SECOND,
stats.num_generated_tokens / eval_time *
stats.SCALING_FACTOR_UNITS_PER_SECOND);
eval_time > 0 ? stats.num_generated_tokens / eval_time *
stats.SCALING_FACTOR_UNITS_PER_SECOND
: 0.0);

// Time to first token is measured from the start of inference, excluding
// model load time.
Expand Down
2 changes: 1 addition & 1 deletion extension/llm/runner/text_llm_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ Error TextLLMRunner::generate(
"RSS after finishing text generation: %f MiB (0 if unsupported)",
get_rss_bytes() / 1024.0 / 1024.0);

if (num_generated_tokens == max_new_tokens) {
if (num_generated_tokens == max_new_tokens - 1) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intended?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
}

Expand Down
Loading