diff --git a/include/proxy/hdrs/HuffmanCodec.h b/include/proxy/hdrs/HuffmanCodec.h index e979bdc3151..9d1aa7316f4 100644 --- a/include/proxy/hdrs/HuffmanCodec.h +++ b/include/proxy/hdrs/HuffmanCodec.h @@ -25,5 +25,15 @@ #include +/** Decode a Huffman-encoded string per RFC 7541 section 5.2. + + @return The decoded length, or a negative value on invalid input or + insufficient destination space. + + @note dst_len must be strictly greater than the decoded length; with an + exactly-sized destination the decoder may report insufficient space. + Huffman expands to at most 8/5 of the encoded length, so sizing dst at + 2x src_len always suffices (see xpack_decode_string). + */ int64_t huffman_decode(char *dst, uint32_t dst_len, uint8_t const *src, uint32_t src_len); int64_t huffman_encode(uint8_t *dst, uint32_t dst_len, uint8_t const *src, uint32_t src_len); diff --git a/lib/ls-hpack/README.md b/lib/ls-hpack/README.md index b6ba2c769f5..2b7ccb1948e 100644 --- a/lib/ls-hpack/README.md +++ b/lib/ls-hpack/README.md @@ -23,4 +23,18 @@ Huffman decoding acceleration. The current implementation pulled into ATS is based upon what is currently the latest version, -[v2.3.4](https://github.com/litespeedtech/ls-hpack/releases/tag/v2.3.4). +[v2.3.5](https://github.com/litespeedtech/ls-hpack/releases/tag/v2.3.5). + +# ATS Modifications + +The code is kept as close to upstream as practical, with one deliberate +behavioral divergence that any future re-sync must preserve: + +- `lshpack_dec_huff_decode` (the fast decoder) rejects Huffman padding of 8 or + more bits, as required by RFC 7541 section 5.2. Upstream's fast decoder + accepts such padding when it follows the final symbol near the end of the + input; the 4-bit FSM decoder (`lshpack_dec_huff_decode_full`) has always + rejected it. See the commented tail check in `lshpack.cc` and the + `decode_overlong_padding` test in + `src/proxy/hdrs/unit_tests/test_Huffmancode.cc`, which fails if the check is + dropped. diff --git a/lib/ls-hpack/lshpack.cc b/lib/ls-hpack/lshpack.cc index 6e7f0ee0e6f..b1d4167c654 100644 --- a/lib/ls-hpack/lshpack.cc +++ b/lib/ls-hpack/lshpack.cc @@ -52,6 +52,9 @@ SOFTWARE. */ #include "huff-tables.h" +#include "lshpack.h" +#include +#include #include #include @@ -61,7 +64,7 @@ namespace litespeed { namespace { -constexpr int64_t LSHPACK_ERR_MORE_BUF = -3; +constexpr unsigned SHORTEST_CODE = 5; struct decode_status { uint8_t state; @@ -243,4 +246,189 @@ lshpack_dec_huff_decode_full(uint8_t const *src, uint32_t src_len, char *dst, ui return p_dst - dst; } +// Implementation taken from LiteSpeed: +// lshpack_dec_huff_decode +// +// The decoder is optimized for the common case. Most of the time, we decode +// data whose encoding is 16 bits or shorter. This lets us use a 64 KB table +// indexed by two bytes of input and outputs 1, 2, or 3 bytes at a time. +// +// In the case a longer code is encountered, we fall back to the original +// Huffman decoder that supports all code lengths. +int64_t +lshpack_dec_huff_decode(uint8_t const *src, uint32_t src_len, char *dst, uint32_t dst_len) +{ + char *const orig_dst = dst; + uint8_t const *const src_end = src + src_len; + char *const dst_end = dst + dst_len; + uintptr_t buf = 0; + unsigned avail_bits, len; + struct hdec hdec = {0, {0, 0, 0}}; + uint16_t idx; + int64_t r; + + avail_bits = 0; + while (true) { + if (src + sizeof(buf) <= src_end) { + len = (sizeof(buf) * 8 - avail_bits) >> 3; + avail_bits += len << 3; + switch (len) { +#if UINTPTR_MAX == 18446744073709551615ull + case 8: + buf <<= 8; + buf |= static_cast(*src++); + [[fallthrough]]; + case 7: + buf <<= 8; + buf |= static_cast(*src++); + [[fallthrough]]; + default: + buf <<= 48; + buf |= static_cast(*src++) << 40; + buf |= static_cast(*src++) << 32; + buf |= static_cast(*src++) << 24; + buf |= static_cast(*src++) << 16; +#else + [[fallthrough]]; + case 4: + buf <<= 8; + buf |= static_cast(*src++); + [[fallthrough]]; + case 3: + buf <<= 8; + buf |= static_cast(*src++); + [[fallthrough]]; + default: + buf <<= 16; +#endif + buf |= static_cast(*src++) << 8; + buf |= static_cast(*src++) << 0; + } + } else if (src < src_end) { + do { + buf <<= 8; + buf |= static_cast(*src++); + avail_bits += 8; + } while (src < src_end && avail_bits <= sizeof(buf) * 8 - 8); + } else { + break; // Normal case terminating condition: out of input + } + + if (dst_end - dst >= static_cast(8 * sizeof(buf) / SHORTEST_CODE) && avail_bits >= 16) { + // Fast path: don't check destination bounds + do { + idx = static_cast(buf >> (avail_bits - 16)); + hdec = hdecs[idx]; + dst[0] = static_cast(hdec.out[0]); + dst[1] = static_cast(hdec.out[1]); + dst[2] = static_cast(hdec.out[2]); + dst += hdec.lens & 3; + avail_bits -= hdec.lens >> 2; + } while (avail_bits >= 16 && hdec.lens); + if (avail_bits < 16) { + continue; + } + goto slow_path; + } else { + while (avail_bits >= 16) { + idx = static_cast(buf >> (avail_bits - 16)); + hdec = hdecs[idx]; + len = hdec.lens & 3; + if (len && dst + len <= dst_end) { + switch (len) { + case 3: + *dst++ = static_cast(hdec.out[0]); + *dst++ = static_cast(hdec.out[1]); + *dst++ = static_cast(hdec.out[2]); + break; + case 2: + *dst++ = static_cast(hdec.out[0]); + *dst++ = static_cast(hdec.out[1]); + break; + default: + *dst++ = static_cast(hdec.out[0]); + break; + } + avail_bits -= hdec.lens >> 2; + } else if (dst + len > dst_end) { + r = dst_end - dst - static_cast(len); + if (r > LSHPACK_ERR_MORE_BUF) { + r = LSHPACK_ERR_MORE_BUF; + } + return r; + } else { + goto slow_path; + } + } + } + } + + if (avail_bits >= SHORTEST_CODE) { + idx = static_cast(buf << (16 - avail_bits)); + idx |= (1 << (16 - avail_bits)) - 1; // EOF + if (idx == 0xFFFF && avail_bits < 8) { + goto end; + } + // If a byte or more of input is left, this means there is a valid + // encoding, not just EOF. + hdec = hdecs[idx]; + len = hdec.lens & 3; + if ((static_cast(hdec.lens) >> 2) > avail_bits) { + return -1; + } + if (len && dst + len <= dst_end) { + switch (len) { + case 3: + *dst++ = static_cast(hdec.out[0]); + *dst++ = static_cast(hdec.out[1]); + *dst++ = static_cast(hdec.out[2]); + break; + case 2: + *dst++ = static_cast(hdec.out[0]); + *dst++ = static_cast(hdec.out[1]); + break; + default: + *dst++ = static_cast(hdec.out[0]); + break; + } + avail_bits -= hdec.lens >> 2; + } else if (dst + len > dst_end) { + r = dst_end - dst - static_cast(len); + if (r > LSHPACK_ERR_MORE_BUF) { + r = LSHPACK_ERR_MORE_BUF; + } + return r; + } else { + // This must be an invalid code, otherwise it would have fit + return -1; + } + } + + if (avail_bits > 0) { + // ATS: unlike upstream ls-hpack, also reject padding of 8 or more bits + // (possible here after the final symbol consumed only part of the tail). + // RFC 7541 5.2: "A padding strictly longer than 7 bits MUST be treated as + // a decoding error." This keeps the strictness of the 4-bit FSM decoder. + if (avail_bits >= 8 || ((1u << avail_bits) - 1) != (buf & ((1u << avail_bits) - 1))) { + return -1; // Not EOF as expected + } + } + +end: + return dst - orig_dst; + +slow_path: + // Find previous byte boundary and finish decoding thence. + while ((avail_bits & 7) && dst > orig_dst) { + avail_bits += encode_table[static_cast(*--dst)].bits; + } + assert((avail_bits & 7) == 0); + src -= avail_bits >> 3; + r = lshpack_dec_huff_decode_full(src, static_cast(src_end - src), dst, static_cast(dst_end - dst)); + if (r >= 0) { + return dst - orig_dst + r; + } + return r; +} + } // namespace litespeed diff --git a/lib/ls-hpack/lshpack.h b/lib/ls-hpack/lshpack.h index 7069317b28a..c64c1f19cc9 100644 --- a/lib/ls-hpack/lshpack.h +++ b/lib/ls-hpack/lshpack.h @@ -59,10 +59,16 @@ SOFTWARE. namespace litespeed { +// Matches upstream lshpack.h's LSHPACK_ERR_MORE_BUF. +constexpr int64_t LSHPACK_ERR_MORE_BUF = -3; + int64_t lshpack_enc_huff_encode(uint8_t const *src, uint8_t const *const src_end, uint8_t *dst, uint32_t dst_len); int64_t lshpack_dec_huff_decode_full(uint8_t const *src, uint32_t src_len, char *dst, uint32_t dst_len); +int64_t lshpack_dec_huff_decode(uint8_t const *src, uint32_t src_len, + char *dst, uint32_t dst_len); + } // namespace litespeed diff --git a/src/proxy/hdrs/CMakeLists.txt b/src/proxy/hdrs/CMakeLists.txt index 45a084dd6ee..4d036a654d8 100644 --- a/src/proxy/hdrs/CMakeLists.txt +++ b/src/proxy/hdrs/CMakeLists.txt @@ -64,6 +64,8 @@ if(BUILD_TESTING) lshpack configmanager ) + # For the ls-hpack decoder parity tests. + target_include_directories(test_proxy_hdrs PRIVATE ${CMAKE_SOURCE_DIR}/lib) add_catch2_test(NAME test_proxy_hdrs COMMAND test_proxy_hdrs) add_executable(test_proxy_hdrs_xpack unit_tests/test_XPACK.cc) diff --git a/src/proxy/hdrs/HuffmanCodec.cc b/src/proxy/hdrs/HuffmanCodec.cc index e8a14275841..5f144d7294e 100644 --- a/src/proxy/hdrs/HuffmanCodec.cc +++ b/src/proxy/hdrs/HuffmanCodec.cc @@ -26,11 +26,11 @@ #include // Implementation taken from LiteSpeed: -// lshpack_dec_huff_decode_full +// lshpack_dec_huff_decode int64_t huffman_decode(char *dst, uint32_t dst_len, const uint8_t *src, uint32_t src_len) { - return litespeed::lshpack_dec_huff_decode_full(src, src_len, dst, dst_len); + return litespeed::lshpack_dec_huff_decode(src, src_len, dst, dst_len); } // Implementation taken from LiteSpeed: diff --git a/src/proxy/hdrs/unit_tests/test_Huffmancode.cc b/src/proxy/hdrs/unit_tests/test_Huffmancode.cc index 4b942d1f889..6a7c2cb22a3 100644 --- a/src/proxy/hdrs/unit_tests/test_Huffmancode.cc +++ b/src/proxy/hdrs/unit_tests/test_Huffmancode.cc @@ -22,10 +22,14 @@ */ #include "proxy/hdrs/HuffmanCodec.h" +#include "ls-hpack/lshpack.h" +#include #include #include #include #include +#include +#include #include using namespace std; @@ -207,3 +211,136 @@ TEST_CASE("decode_errors", "[proxy][huffman]") free(dst); } } + +TEST_CASE("decode_known_vectors", "[proxy][huffman]") +{ + for (const auto &i : huffman_encode_test_data) { + char dst[64]; + int64_t len = huffman_decode(dst, sizeof(dst), i.expect, i.expect_len); + + REQUIRE(len == i.src_len); + REQUIRE(memcmp(dst, i.src, len) == 0); + } +} + +// The fast table-based decoder must agree with the original 4-bit FSM decoder +// on input validity and decoded content. The full decoder run with a roomy +// destination serves as the oracle. The one permitted difference is an +// exactly-sized destination (dst_len == decoded length): depending on how the +// trailing padding falls on nibble boundaries, either decoder may report +// LSHPACK_ERR_MORE_BUF where the other succeeds. One byte of headroom +// guarantees success for both; ATS callers always allocate twice the encoded +// length, which is strictly more than any decoded length. Sentinel bytes +// verify nothing is written past dst_len. +static void +require_decoder_parity(const uint8_t *src, uint32_t src_len, uint32_t dst_len) +{ + constexpr size_t SENTINEL_LEN = 64; + std::vector oracle_buf(2 * src_len + 8); + int64_t oracle = litespeed::lshpack_dec_huff_decode_full(src, src_len, oracle_buf.data(), oracle_buf.size()); + + std::vector fast_buf(dst_len + SENTINEL_LEN, '\xa5'); + int64_t fast = litespeed::lshpack_dec_huff_decode(src, src_len, fast_buf.data(), dst_len); + + if (oracle < 0 || static_cast(oracle) > dst_len) { + REQUIRE(fast < 0); + } else if (static_cast(oracle) == dst_len && fast < 0) { + REQUIRE(fast == litespeed::LSHPACK_ERR_MORE_BUF); // exact-fit destination + } else { + REQUIRE(fast == oracle); + REQUIRE(memcmp(fast_buf.data(), oracle_buf.data(), oracle) == 0); + } + REQUIRE(std::all_of(fast_buf.begin() + dst_len, fast_buf.end(), [](char c) { return c == '\xa5'; })); +} + +TEST_CASE("decoder_parity_exhaustive_short", "[proxy][huffman]") +{ + uint8_t src[2]; + + for (uint32_t hi = 0; hi < 256; ++hi) { + src[0] = static_cast(hi); + for (uint32_t dst_len = 0; dst_len <= 4; ++dst_len) { + require_decoder_parity(src, 1, dst_len); + } + for (uint32_t lo = 0; lo < 256; ++lo) { + src[1] = static_cast(lo); + require_decoder_parity(src, 2, 8); + } + } +} + +TEST_CASE("decoder_parity_fuzz", "[proxy][huffman]") +{ + std::mt19937 rng(0x5eed); + std::uniform_int_distribution len_dist(0, 64); + std::uniform_int_distribution byte_dist(0, 255); + uint8_t src[64]; + + for (int i = 0; i < 100000; ++i) { + int src_len = len_dist(rng); + for (int j = 0; j < src_len; ++j) { + src[j] = static_cast(byte_dist(rng)); + } + // Bias some iterations toward trailing 0xff runs to exercise the + // EOS-prefix and padding paths. + if (i % 4 == 0 && src_len > 0) { + for (int j = byte_dist(rng) % src_len; j < src_len; ++j) { + src[j] = 0xff; + } + } + // Mostly roomy destination, sometimes a tight one to exercise the + // bounds-checked path and LSHPACK_ERR_MORE_BUF. + uint32_t dst_len = (i % 8 == 0) ? static_cast(byte_dist(rng) % 16) : static_cast(2 * src_len + 8); + require_decoder_parity(src, src_len, dst_len); + } +} + +// Streams whose final symbol is followed by 8 or more bits of all-ones +// padding. RFC 7541 5.2 requires treating padding longer than 7 bits as a +// decoding error. Upstream ls-hpack's fast decoder accepts these; the ATS +// copy deliberately rejects them (see the tail check in +// lib/ls-hpack/lshpack.cc), and this test pins that divergence against a +// future re-sync. The exhaustive parity test cannot cover this branch: it is +// only reachable with inputs of four or more bytes. +TEST_CASE("decode_overlong_padding", "[proxy][huffman]") +{ + const static struct { + const uint8_t *input; + uint32_t input_len; + } test_cases[] = { + {reinterpret_cast("\xbf\xd0\x37\xd9\xff"), 5 }, + {reinterpret_cast("\xd9\x68\x63\x84\x4c\xfe\x77\xa0\xbc\xc9\xff"), 11}, + }; + + for (const auto &tc : test_cases) { + char dst[64]; + REQUIRE(litespeed::lshpack_dec_huff_decode(tc.input, tc.input_len, dst, sizeof(dst)) == -1); + REQUIRE(litespeed::lshpack_dec_huff_decode_full(tc.input, tc.input_len, dst, sizeof(dst)) == -1); + } +} + +TEST_CASE("decoder_roundtrip_fuzz", "[proxy][huffman]") +{ + std::mt19937 rng(0xc0ffee); + std::uniform_int_distribution len_dist(0, 300); + std::uniform_int_distribution byte_dist(0, 255); + + for (int i = 0; i < 20000; ++i) { + uint32_t src_len = static_cast(len_dist(rng)); + std::vector src(src_len); + for (auto &c : src) { + c = static_cast(byte_dist(rng)); + } + + // Worst-case Huffman expansion is 30 bits per input byte. + std::vector encoded(src_len * 4 + 8); + int64_t enc_len = huffman_encode(encoded.data(), encoded.size(), src.data(), src_len); + REQUIRE(enc_len >= 0); + + // One byte of headroom guarantees success (see require_decoder_parity). + std::vector decoded(src_len + 1); + int64_t dec_len = huffman_decode(decoded.data(), src_len + 1, encoded.data(), enc_len); + REQUIRE(dec_len == static_cast(src_len)); + REQUIRE(memcmp(decoded.data(), src.data(), src_len) == 0); + } +} diff --git a/tools/benchmark/CMakeLists.txt b/tools/benchmark/CMakeLists.txt index 49f25fad1c1..e58e8dcf26b 100644 --- a/tools/benchmark/CMakeLists.txt +++ b/tools/benchmark/CMakeLists.txt @@ -47,3 +47,7 @@ target_link_libraries( ts::inkcache ts::inkhostdb ) + +add_executable(benchmark_HuffmanDecode benchmark_HuffmanDecode.cc) +target_link_libraries(benchmark_HuffmanDecode PRIVATE Catch2::Catch2WithMain lshpack) +target_include_directories(benchmark_HuffmanDecode PRIVATE ${CMAKE_SOURCE_DIR}/lib) diff --git a/tools/benchmark/benchmark_HuffmanDecode.cc b/tools/benchmark/benchmark_HuffmanDecode.cc new file mode 100644 index 00000000000..188501a5f07 --- /dev/null +++ b/tools/benchmark/benchmark_HuffmanDecode.cc @@ -0,0 +1,142 @@ +/** @file + + Benchmark comparing the two ls-hpack Huffman decoders: the original 4-bit + FSM decoder (lshpack_dec_huff_decode_full) and the 16-bit table decoder + (lshpack_dec_huff_decode). + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +#include "ls-hpack/lshpack.h" + +#include +#include +#include +#include + +#define CATCH_CONFIG_ENABLE_BENCHMARKING +#include +#include + +namespace +{ + +// Representative header field values seen on the wire. +constexpr std::string_view SHORT_VALUE = "text/css"; +constexpr std::string_view MEDIUM_VALUE = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"; +constexpr std::string_view LONG_VALUE = + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"; +constexpr std::string_view CORPUS[] = { + SHORT_VALUE, + MEDIUM_VALUE, + LONG_VALUE, + "/", + "gzip, deflate, br, zstd", + "Mon, 21 Oct 2013 20:13:21 GMT", + "https://www.example.com/images/2026/06/some-article/hero.webp?w=1200&q=75", + "max-age=31536000, public, immutable", + "session=8f14e45fceea167a5a36dedd4bea2543; theme=dark; region=us-west-2; ab_bucket=37", + "ATS/10.1.0", +}; + +std::vector +encode(std::string_view text) +{ + std::vector encoded(text.size() * 4 + 8); + int64_t len = litespeed::lshpack_enc_huff_encode(reinterpret_cast(text.data()), // + reinterpret_cast(text.data()) + text.size(), encoded.data(), + static_cast(encoded.size())); + REQUIRE(len > 0); + encoded.resize(len); + return encoded; +} + +} // namespace + +TEST_CASE("huffman decode: full vs fast", "[bench][huffman]") +{ + char dst[4096]; + + // Sanity: both decoders agree on the corpus. + for (auto text : CORPUS) { + auto encoded = encode(text); + int64_t full = litespeed::lshpack_dec_huff_decode_full(encoded.data(), encoded.size(), dst, sizeof(dst)); + REQUIRE(full == static_cast(text.size())); + REQUIRE(memcmp(dst, text.data(), text.size()) == 0); + int64_t fast = litespeed::lshpack_dec_huff_decode(encoded.data(), encoded.size(), dst, sizeof(dst)); + REQUIRE(fast == full); + REQUIRE(memcmp(dst, text.data(), text.size()) == 0); + } + + auto short_enc = encode(SHORT_VALUE); + auto medium_enc = encode(MEDIUM_VALUE); + auto long_enc = encode(LONG_VALUE); + + BENCHMARK("full: short (8B)") + { + return litespeed::lshpack_dec_huff_decode_full(short_enc.data(), short_enc.size(), dst, sizeof(dst)); + }; + BENCHMARK("fast: short (8B)") + { + return litespeed::lshpack_dec_huff_decode(short_enc.data(), short_enc.size(), dst, sizeof(dst)); + }; + + BENCHMARK("full: medium (86B)") + { + return litespeed::lshpack_dec_huff_decode_full(medium_enc.data(), medium_enc.size(), dst, sizeof(dst)); + }; + BENCHMARK("fast: medium (86B)") + { + return litespeed::lshpack_dec_huff_decode(medium_enc.data(), medium_enc.size(), dst, sizeof(dst)); + }; + + BENCHMARK("full: long (113B)") + { + return litespeed::lshpack_dec_huff_decode_full(long_enc.data(), long_enc.size(), dst, sizeof(dst)); + }; + BENCHMARK("fast: long (113B)") + { + return litespeed::lshpack_dec_huff_decode(long_enc.data(), long_enc.size(), dst, sizeof(dst)); + }; + + std::vector> corpus_enc; + size_t corpus_bytes = 0; + for (auto text : CORPUS) { + corpus_enc.push_back(encode(text)); + corpus_bytes += text.size(); + } + WARN("corpus: " << corpus_bytes << " decoded bytes per iteration"); + + BENCHMARK("full: corpus") + { + int64_t acc = 0; + for (auto const &encoded : corpus_enc) { + acc += litespeed::lshpack_dec_huff_decode_full(encoded.data(), encoded.size(), dst, sizeof(dst)); + } + return acc; + }; + BENCHMARK("fast: corpus") + { + int64_t acc = 0; + for (auto const &encoded : corpus_enc) { + acc += litespeed::lshpack_dec_huff_decode(encoded.data(), encoded.size(), dst, sizeof(dst)); + } + return acc; + }; +}