From 8ff0648fd0768b9229853f933056855420ad82ee Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Tue, 16 May 2023 14:43:18 +0200 Subject: [PATCH] Fix ubsan failure in gorilla decompression Also add more tests --- tsl/src/compression/compression.c | 60 ++--------- tsl/src/compression/decompress_test_impl.c | 111 +++++++++++++++++++++ tsl/src/compression/gorilla.c | 12 +++ tsl/test/expected/compression_algos.out | 4 +- 4 files changed, 136 insertions(+), 51 deletions(-) create mode 100644 tsl/src/compression/decompress_test_impl.c diff --git a/tsl/src/compression/compression.c b/tsl/src/compression/compression.c index cb588854a..6194fe0dd 100644 --- a/tsl/src/compression/compression.c +++ b/tsl/src/compression/compression.c @@ -2063,68 +2063,28 @@ get_compression_algorithm(char *name) return _INVALID_COMPRESSION_ALGORITHM; } -#define FUNCTION_NAME_HELPER(X, Y) decompress_##X##_##Y -#define FUNCTION_NAME(X, Y) FUNCTION_NAME_HELPER(X, Y) - -#define TOSTRING_HELPER(x) #x -#define TOSTRING(x) TOSTRING_HELPER(x) - -/* Try to decompress the given compressed data. Used for fuzzing. */ -#define DECOMPRESS_FN \ - static int FUNCTION_NAME(ALGO, CTYPE)(const uint8 *Data, size_t Size) \ - { \ - StringInfoData si = { .data = (char *) Data, .len = Size }; \ - \ - int algo = pq_getmsgbyte(&si); \ - \ - CheckCompressedData(algo > 0 && algo < _END_COMPRESSION_ALGORITHMS); \ - \ - if (algo != get_compression_algorithm(TOSTRING(ALGO))) \ - { \ - /* \ - * It's convenient to fuzz only one algorithm at a time. We specialize \ - * the fuzz target for one algorithm, so that the fuzzer doesn't waste \ - * time discovering others from scratch. \ - */ \ - return -1; \ - } \ - \ - Datum compressed_data = definitions[algo].compressed_data_recv(&si); \ - DecompressionIterator *iter = \ - definitions[algo].iterator_init_forward(compressed_data, PGTYPE); \ - \ - int n = 0; \ - for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter)) \ - { \ - n++; \ - } \ - \ - return n; \ - } - #define ALGO gorilla #define CTYPE float8 #define PGTYPE FLOAT8OID -DECOMPRESS_FN +#define DATUM_TO_CTYPE DatumGetFloat8 +#include "decompress_test_impl.c" #undef ALGO #undef CTYPE #undef PGTYPE +#undef DATUM_TO_CTYPE #define ALGO deltadelta #define CTYPE int64 #define PGTYPE INT8OID -DECOMPRESS_FN +#define DATUM_TO_CTYPE DatumGetInt64 +#include "decompress_test_impl.c" #undef ALGO #undef CTYPE #undef PGTYPE +#undef DATUM_TO_CTYPE -#undef TOSTRING -#undef TOSTRING_HELPER - -#undef FUNCTION_NAME -#undef FUNCTION_NAME_HELPER - -static int (*get_decompress_fn(int algo, Oid type))(const uint8 *Data, size_t Size) +static int (*get_decompress_fn(int algo, Oid type))(const uint8 *Data, size_t Size, + bool check_compression) { if (algo == COMPRESSION_ALGORITHM_GORILLA && type == FLOAT8OID) { @@ -2185,7 +2145,9 @@ ts_read_compressed_data_file(PG_FUNCTION_ARGS) Oid type = PG_GETARG_OID(1); - int res = get_decompress_fn(algo, type)((const uint8 *) string, fsize); + int res = + get_decompress_fn(algo, + type)((const uint8 *) string, fsize, /* check_compression = */ true); PG_RETURN_INT32(res); } diff --git a/tsl/src/compression/decompress_test_impl.c b/tsl/src/compression/decompress_test_impl.c new file mode 100644 index 000000000..d2dc1f30a --- /dev/null +++ b/tsl/src/compression/decompress_test_impl.c @@ -0,0 +1,111 @@ +/* + * This file and its contents are licensed under the Timescale License. + * Please see the included NOTICE for copyright information and + * LICENSE-TIMESCALE for a copy of the license. + */ + +#define FUNCTION_NAME_HELPER(X, Y) decompress_##X##_##Y +#define FUNCTION_NAME(X, Y) FUNCTION_NAME_HELPER(X, Y) + +#define TOSTRING_HELPER(x) #x +#define TOSTRING(x) TOSTRING_HELPER(x) + +/* + * Try to decompress the given compressed data. Used for fuzzing and for checking + * the examples found by fuzzing. For fuzzing we don't check that the + * recompression result is the same. + */ +static int +FUNCTION_NAME(ALGO, CTYPE)(const uint8 *Data, size_t Size, bool check_compression) +{ + StringInfoData si = { .data = (char *) Data, .len = Size }; + + int algo = pq_getmsgbyte(&si); + + CheckCompressedData(algo > 0 && algo < _END_COMPRESSION_ALGORITHMS); + + if (algo != get_compression_algorithm(TOSTRING(ALGO))) + { + /* + * It's convenient to fuzz only one algorithm at a time. We specialize + * the fuzz target for one algorithm, so that the fuzzer doesn't waste + * time discovering others from scratch. + */ + return -1; + } + + Compressor *compressor = NULL; + if (check_compression) + { + compressor = definitions[algo].compressor_for_type(PGTYPE); + } + + Datum compressed_data = definitions[algo].compressed_data_recv(&si); + DecompressionIterator *iter = definitions[algo].iterator_init_forward(compressed_data, PGTYPE); + + DecompressResult results[GLOBAL_MAX_ROWS_PER_COMPRESSION]; + int n = 0; + for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter)) + { + if (check_compression) + { + if (r.is_null) + { + compressor->append_null(compressor); + } + else + { + compressor->append_val(compressor, r.val); + } + results[n] = r; + } + + n++; + } + + if (!check_compression || n == 0) + { + return n; + } + + compressed_data = (Datum) compressor->finish(compressor); + if (compressed_data == 0) + { + /* The gorilla compressor returns NULL for all-null input sets. */ + return n; + }; + + /* + * Check that the result is still the same after we compress and decompress + * back. + */ + iter = definitions[algo].iterator_init_forward(compressed_data, PGTYPE); + int nn = 0; + for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter)) + { + if (r.is_null != results[nn].is_null) + { + elog(ERROR, "the decompression result doesn't match"); + } + + if (!r.is_null && (DATUM_TO_CTYPE(r.val) != DATUM_TO_CTYPE(results[nn].val))) + { + elog(ERROR, "the decompression result doesn't match"); + } + + nn++; + + if (nn > n) + { + elog(ERROR, "the recompression result doesn't match"); + } + } + + return n; +} + +#undef TOSTRING +#undef TOSTRING_HELPER + +#undef FUNCTION_NAME +#undef FUNCTION_NAME_HELPER diff --git a/tsl/src/compression/gorilla.c b/tsl/src/compression/gorilla.c index e26bc2ec2..45f8850ea 100644 --- a/tsl/src/compression/gorilla.c +++ b/tsl/src/compression/gorilla.c @@ -647,9 +647,21 @@ gorilla_decompression_iterator_try_next_forward_internal(GorillaDecompressionIte iter->prev_xor_bits_used = num_xor_bits.val; CheckCompressedData(iter->prev_xor_bits_used <= 64); + /* + * More than 64 significant bits don't make sense. Exactly 64 we get for + * the first encoded number. + */ CheckCompressedData(iter->prev_xor_bits_used + iter->prev_leading_zeroes <= 64); } + /* + * Zero significant bits would mean that the previous number is repeated, + * but this should have been encoded with tag0 = 0. + * This also might fail if we haven't seen the tag1 = 1 for the first number + * and didn't initialize the bit widths. + */ + CheckCompressedData(iter->prev_xor_bits_used + iter->prev_leading_zeroes > 0); + xor = bit_array_iter_next(&iter->xors, iter->prev_xor_bits_used); if (iter->prev_leading_zeroes + iter->prev_xor_bits_used < 64) xor <<= 64 - (iter->prev_leading_zeroes + iter->prev_xor_bits_used); diff --git a/tsl/test/expected/compression_algos.out b/tsl/test/expected/compression_algos.out index 28d135864..d152f3534 100644 --- a/tsl/test/expected/compression_algos.out +++ b/tsl/test/expected/compression_algos.out @@ -1550,8 +1550,8 @@ from ts_read_compressed_data_directory('gorilla', 'float8', (:'TEST_INPUT_DIR' | group by 2 order by 1 desc; count | result -------+-------- - 521 | XX001 - 214 | true + 657 | XX001 + 78 | true 72 | 08P01 2 | false (4 rows)