From eb11772acd8d9afeae85e72543a4f2a1b10aeae1 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Tue, 14 Apr 2026 01:30:54 +0200
Subject: [PATCH] _hashindex.c: more fixes

- avoid buckets_length integer overflow on 32bit systems via huge num_buckets
- always initialize index-> min_empty and num_empty
- correctly free memory when header validation fails.
  this is a minor issue, because borg will terminate in that case anyway.
- make it possible to lookup in compacted hashtables
- deal safely with empty index: we must use num_buckets = 1 to avoid division
  by zero and sanity check in hashindex_read.
- reinitialize upper/lower limit and min_empty after compact
- fix size_idx / fit_size / grow_size / shrink_size (mind array bounds)
- deal with growing when already at max capacity
- hashindex_resize: replace num_entries assertion, rather return error
---
 src/borg/_hashindex.c           | 100 ++++++++++++++++++++++----------
 src/borg/testsuite/hashindex.py |   3 +-
 2 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/src/borg/_hashindex.c b/src/borg/_hashindex.c
index 11052aec29..2eb4f385a1 100644
--- a/src/borg/_hashindex.c
+++ b/src/borg/_hashindex.c
@@ -178,14 +178,9 @@ hashindex_lookup(HashIndex *index, const unsigned char *key, int *start_idx)
         if (idx >= index->num_buckets) {  /* triggers at == already */
             idx = 0;
         }
-        /* When idx == start, we have done a full pass over all buckets.
-         * - We did not find a bucket with the key we searched for.
-         * - We did not find an empty bucket either.
-         * So all buckets are either full or deleted/tombstones.
-         * This is an invalid state we never should get into, see
-         * upper_limit and min_empty.
-         */
-        assert(idx != start);
+        if (idx == start) {
+            break;
+        }
     }
     /* we get here if we did not find a bucket with the key we searched for. */
     if (start_idx != NULL) {
@@ -215,7 +210,12 @@ hashindex_resize(HashIndex *index, int capacity)
             return 0;
         }
     }
-    assert(index->num_entries == new->num_entries);
+
+    if(index->num_entries != new->num_entries) {
+        /* This detects structurally corrupt indices maliciously supplied with wrong num_entries */
+        hashindex_free(new);
+        return 0;
+    }
 
     hashindex_free_buckets(index);
     index->buckets = new->buckets;
@@ -251,9 +251,15 @@ int get_min_empty(int num_buckets){
 
 int size_idx(int size){
     /* find the smallest hash_sizes index with entry >= size */
-    int i = NELEMS(hash_sizes) - 1;
+    int i, max_idx = NELEMS(hash_sizes) - 1;
+    i = max_idx;
     while(i >= 0 && hash_sizes[i] >= size) i--;
-    return i + 1;
+    i++;
+    /* if size is larger than any hash_sizes entry, return the largest one */
+    if (i > max_idx) {
+        return max_idx;
+    }
+    return i;
 }
 
 int fit_size(int current){
@@ -262,17 +268,17 @@ int fit_size(int current){
 }
 
 int grow_size(int current){
+    int max_idx = NELEMS(hash_sizes) - 1;
     int i = size_idx(current) + 1;
-    int elems = NELEMS(hash_sizes);
-    if (i >= elems)
-        return hash_sizes[elems - 1];
+    if (i > max_idx)
+        i = max_idx;
     return hash_sizes[i];
 }
 
 int shrink_size(int current){
     int i = size_idx(current) - 1;
     if (i < 0)
-        return hash_sizes[0];
+        i = 0;
     return hash_sizes[i];
 }
 
@@ -377,30 +383,35 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
     if (expected_key_size != -1 && header->key_size != expected_key_size) {
         PyErr_Format(PyExc_ValueError, "Expected key size %d, got %d.",
                      expected_key_size, header->key_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
     }
     if (expected_value_size != -1 && header->value_size != expected_value_size) {
         PyErr_Format(PyExc_ValueError, "Expected value size %d, got %d.",
                      expected_value_size, header->value_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
     }
     // in any case, the key and value sizes must be both >= 4, the code assumes this.
     if (header->key_size < 4) {
         PyErr_Format(PyExc_ValueError, "Expected key size >= 4, got %d.", header->key_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
     }
     if (header->value_size < 4) {
         PyErr_Format(PyExc_ValueError, "Expected value size >= 4, got %d.", header->value_size);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
     }
     // sanity check for num_buckets and num_entries.
     if (header_num_buckets < 1) {
         PyErr_Format(PyExc_ValueError, "Expected num_buckets >= 1, got %d.", header_num_buckets);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
     }
     if ((header_num_entries < 0) || (header_num_entries > header_num_buckets)) {
         PyErr_Format(PyExc_ValueError, "Expected 0 <= num_entries <= num_buckets, got %d.", header_num_entries);
-        goto fail_decref_header;
+        goto fail_release_header_buffer;
+    }
+
+    if ((Py_ssize_t)header_num_buckets > (PY_SSIZE_T_MAX - (Py_ssize_t)sizeof(HashHeader)) / (header->key_size + header->value_size)) {
+        PyErr_Format(PyExc_ValueError, "Hash index size overflows Py_ssize_t.");
+        goto fail_release_header_buffer;
     }
 
     buckets_length = (Py_ssize_t)header_num_buckets * (header->key_size + header->value_size);
@@ -448,8 +459,9 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
     }
     index->buckets = index->buckets_buffer.buf;
 
+    index->min_empty = get_min_empty(index->num_buckets);
+
     if(!permit_compact) {
-        index->min_empty = get_min_empty(index->num_buckets);
         index->num_empty = count_empty(index);
 
         if(index->num_empty < index->min_empty) {
@@ -459,6 +471,8 @@ hashindex_read(PyObject *file_py, int permit_compact, int expected_key_size, int
                 goto fail_free_buckets;
             }
         }
+    } else {
+        index->num_empty = index->num_buckets - index->num_entries; /* upper bound */
     }
 
     /*
@@ -612,16 +626,22 @@ hashindex_set(HashIndex *index, const unsigned char *key, const void *value)
     if(idx < 0)
     {
         if(index->num_entries > index->upper_limit) {
-            /* hashtable too full, grow it! */
-            if(!hashindex_resize(index, grow_size(index->num_buckets))) {
+            int next_size = grow_size(index->num_buckets);
+            if(next_size != index->num_buckets) {
+                /* hashtable too full, grow it! */
+                if(!hashindex_resize(index, next_size)) {
+                    return 0;
+                }
+                /* we have just built a fresh hashtable and removed all tombstones,
+                 * so we only have EMPTY or USED buckets, but no DELETED ones any more.
+                 */
+                idx = hashindex_lookup(index, key, &start_idx);
+                assert(idx < 0);
+                assert(BUCKET_IS_EMPTY(index, start_idx));
+            } else if (index->num_entries >= index->num_buckets) {
+                /* Table is maximally sized and 100% full, cannot insert cleanly. */
                 return 0;
             }
-            /* we have just built a fresh hashtable and removed all tombstones,
-             * so we only have EMPTY or USED buckets, but no DELETED ones any more.
-             */
-            idx = hashindex_lookup(index, key, &start_idx);
-            assert(idx < 0);
-            assert(BUCKET_IS_EMPTY(index, start_idx));
         }
         idx = start_idx;
         if(BUCKET_IS_EMPTY(index, idx)){
@@ -699,13 +719,25 @@ hashindex_compact(HashIndex *index)
     int begin_used_idx;
     int empty_slot_count, count, buckets_to_copy;
     int compact_tail_idx = 0;
-    uint64_t saved_size = (index->num_buckets - index->num_entries) * (uint64_t)index->bucket_size;
+    int new_num_buckets = index->num_entries == 0 ? 1 : index->num_entries;
+    uint64_t saved_size = (index->num_buckets - new_num_buckets) * (uint64_t)index->bucket_size;
 
-    if(index->num_buckets - index->num_entries == 0) {
+    if(index->num_buckets == new_num_buckets) {
         /* already compact */
         return 0;
     }
 
+    if (index->num_entries == 0) {
+        index->num_buckets = 1;
+        memset(BUCKET_ADDR(index, 0), 0, index->bucket_size);
+        BUCKET_MARK_EMPTY(index, 0);
+        index->lower_limit = get_lower_limit(index->num_buckets);
+        index->upper_limit = get_upper_limit(index->num_buckets);
+        index->min_empty = get_min_empty(index->num_buckets);
+        index->num_empty = 1;
+        return saved_size;
+    }
+
     while(idx < index->num_buckets) {
         /* Phase 1: Find some empty slots */
         start_idx = idx;
@@ -744,6 +776,10 @@ hashindex_compact(HashIndex *index)
     }
 
     index->num_buckets = index->num_entries;
+    index->lower_limit = get_lower_limit(index->num_buckets);
+    index->upper_limit = get_upper_limit(index->num_buckets);
+    index->min_empty = get_min_empty(index->num_buckets);
+    index->num_empty = 0;
     return saved_size;
 }
 
diff --git a/src/borg/testsuite/hashindex.py b/src/borg/testsuite/hashindex.py
index b75dfff4f8..2921803cc5 100644
--- a/src/borg/testsuite/hashindex.py
+++ b/src/borg/testsuite/hashindex.py
@@ -493,7 +493,8 @@ def test_empty(self):
 
         compact_index = self.index_from_data_compact_to_data()
 
-        self.index(num_entries=0, num_buckets=0)
+        self.index(num_entries=0, num_buckets=1)
+        self.write_empty(b'\0' * 32)
         assert compact_index == self.index_data.getvalue()
 
     def test_merge(self):