From 4a72647ce2b7b17238f1a5edb5431b71d3668db5 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 17 Apr 2026 14:36:52 -0700 Subject: [PATCH] diff: fix out-of-bounds reads and NULL deref in diffstat UTF-8 truncation f85b49f3d4a (diff: improve scaling of filenames in diffstat to handle UTF-8 chars, 2026-01-16) introduced a loop in show_stats() that calls utf8_width() repeatedly to skip leading characters until the displayed width fits. However, utf8_width() can return problematic values: - For invalid UTF-8 sequences, pick_one_utf8_char() sets the name pointer to NULL and utf8_width() returns 0. Since name_len does not change, the loop iterates once more and pick_one_utf8_char() dereferences the NULL pointer, crashing. - For control characters, utf8_width() returns -1, so name_len grows when it is expected to shrink. This can cause the loop to consume more characters than the string contains, reading past the trailing NUL. By default, fill_print_name() will C-quotes filenames which escapes control characters and invalid bytes to printable text. That avoids this bug from being triggered; however, with core.quotePath=false, raw bytes can reach this code. Add tests exercising both failure modes with core.quotePath=false and a narrow --stat-name-width to force truncation: one with a bare 0xC0 byte (invalid UTF-8 lead byte, triggers NULL deref) and one with a 0x01 byte (control character, causes the loop to read past the end of the string). Fix both issues by introducing utf8_ish_width(), a thin wrapper around utf8_width() that guarantees the pointer always advances and the returned width is never negative: - On invalid UTF-8 it restores the pointer, advances by one byte, and returns width 1 (matching the strlen()-based fallback used by utf8_strwidth()). - On a control character it returns 0 (matching utf8_strnwidth() which skips them). Also add a "&& *name" guard to the while-loop condition so it terminates at end-of-string even when utf8_strwidth()'s strlen() fallback causes name_len to exceed the sum of per-character widths. Signed-off-by: Elijah Newren --- diff.c | 26 ++++++++++++++++++++++++-- t/t4052-stat-output.sh | 25 +++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/diff.c b/diff.c index 397e38b41cc6fa..1a3b19f71f0e30 100644 --- a/diff.c +++ b/diff.c @@ -2927,6 +2927,28 @@ void print_stat_summary(FILE *fp, int files, print_stat_summary_inserts_deletes(&o, files, insertions, deletions); } +/* + * Like utf8_width(), but guaranteed safe for use in loops that subtract + * per-character widths: + * + * - utf8_width() sets *start to NULL on invalid UTF-8 and returns 0; + * we restore the pointer and advance by one byte, returning width 1 + * (matching the strlen()-based fallback in utf8_strwidth()). + * + * - utf8_width() returns -1 for control characters; we return 0 + * (matching utf8_strnwidth() which skips them). + */ +static int utf8_ish_width(const char **start) +{ + const char *old = *start; + int w = utf8_width(start, NULL); + if (!*start) { + *start = old + 1; + return 1; + } + return (w < 0) ? 0 : w; +} + static void show_stats(struct diffstat_t *data, struct diff_options *options) { int i, len, add, del, adds = 0, dels = 0; @@ -3093,8 +3115,8 @@ static void show_stats(struct diffstat_t *data, struct diff_options *options) if (len < 0) len = 0; - while (name_len > len) - name_len -= utf8_width((const char**)&name, NULL); + while (name_len > len && *name) + name_len -= utf8_ish_width((const char**)&name); slash = strchr(name, '/'); if (slash) diff --git a/t/t4052-stat-output.sh b/t/t4052-stat-output.sh index 7c749062e2b8d1..84c53c1a511906 100755 --- a/t/t4052-stat-output.sh +++ b/t/t4052-stat-output.sh @@ -445,4 +445,29 @@ test_expect_success 'diffstat where line_prefix contains ANSI escape codes is co test_grep "| ${FILENAME_TRIMMED} | 0" out ' +test_expect_success 'diffstat truncation with invalid UTF-8 does not crash' ' + empty_blob=$(git hash-object -w --stdin tree_file && + tree=$(cat tree_file) && + empty_tree=$(git mktree output && + test_grep "| 0" output +' + +test_expect_success FUNNYNAMES 'diffstat truncation with control chars does not crash' ' + FNAME=$(printf "aaa-\x01-aaa") && + git commit --allow-empty -m setup && + >$FNAME && + git add -- $FNAME && + git commit -m "add file with control char name" && + git -c core.quotepath=false diff --stat --stat-name-width=5 HEAD~1..HEAD >output && + test_grep "| 0" output && + rm -- $FNAME && + git rm -- $FNAME && + git commit -m "remove test file" +' + test_done