Fix ubsan failure in gorilla decompression

Also add more tests
This commit is contained in:
Alexander Kuzmenkov 2023-05-16 14:43:18 +02:00
parent 936d751037
commit 8ff0648fd0
4 changed files with 136 additions and 51 deletions

View File

@ -2063,68 +2063,28 @@ get_compression_algorithm(char *name)
return _INVALID_COMPRESSION_ALGORITHM;
}
#define FUNCTION_NAME_HELPER(X, Y) decompress_##X##_##Y
#define FUNCTION_NAME(X, Y) FUNCTION_NAME_HELPER(X, Y)
#define TOSTRING_HELPER(x) #x
#define TOSTRING(x) TOSTRING_HELPER(x)
/* Try to decompress the given compressed data. Used for fuzzing. */
#define DECOMPRESS_FN \
static int FUNCTION_NAME(ALGO, CTYPE)(const uint8 *Data, size_t Size) \
{ \
StringInfoData si = { .data = (char *) Data, .len = Size }; \
\
int algo = pq_getmsgbyte(&si); \
\
CheckCompressedData(algo > 0 && algo < _END_COMPRESSION_ALGORITHMS); \
\
if (algo != get_compression_algorithm(TOSTRING(ALGO))) \
{ \
/* \
* It's convenient to fuzz only one algorithm at a time. We specialize \
* the fuzz target for one algorithm, so that the fuzzer doesn't waste \
* time discovering others from scratch. \
*/ \
return -1; \
} \
\
Datum compressed_data = definitions[algo].compressed_data_recv(&si); \
DecompressionIterator *iter = \
definitions[algo].iterator_init_forward(compressed_data, PGTYPE); \
\
int n = 0; \
for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter)) \
{ \
n++; \
} \
\
return n; \
}
#define ALGO gorilla
#define CTYPE float8
#define PGTYPE FLOAT8OID
DECOMPRESS_FN
#define DATUM_TO_CTYPE DatumGetFloat8
#include "decompress_test_impl.c"
#undef ALGO
#undef CTYPE
#undef PGTYPE
#undef DATUM_TO_CTYPE
#define ALGO deltadelta
#define CTYPE int64
#define PGTYPE INT8OID
DECOMPRESS_FN
#define DATUM_TO_CTYPE DatumGetInt64
#include "decompress_test_impl.c"
#undef ALGO
#undef CTYPE
#undef PGTYPE
#undef DATUM_TO_CTYPE
#undef TOSTRING
#undef TOSTRING_HELPER
#undef FUNCTION_NAME
#undef FUNCTION_NAME_HELPER
static int (*get_decompress_fn(int algo, Oid type))(const uint8 *Data, size_t Size)
static int (*get_decompress_fn(int algo, Oid type))(const uint8 *Data, size_t Size,
bool check_compression)
{
if (algo == COMPRESSION_ALGORITHM_GORILLA && type == FLOAT8OID)
{
@ -2185,7 +2145,9 @@ ts_read_compressed_data_file(PG_FUNCTION_ARGS)
Oid type = PG_GETARG_OID(1);
int res = get_decompress_fn(algo, type)((const uint8 *) string, fsize);
int res =
get_decompress_fn(algo,
type)((const uint8 *) string, fsize, /* check_compression = */ true);
PG_RETURN_INT32(res);
}

View File

@ -0,0 +1,111 @@
/*
* This file and its contents are licensed under the Timescale License.
* Please see the included NOTICE for copyright information and
* LICENSE-TIMESCALE for a copy of the license.
*/
#define FUNCTION_NAME_HELPER(X, Y) decompress_##X##_##Y
#define FUNCTION_NAME(X, Y) FUNCTION_NAME_HELPER(X, Y)
#define TOSTRING_HELPER(x) #x
#define TOSTRING(x) TOSTRING_HELPER(x)
/*
* Try to decompress the given compressed data. Used for fuzzing and for checking
* the examples found by fuzzing. For fuzzing we don't check that the
* recompression result is the same.
*/
static int
FUNCTION_NAME(ALGO, CTYPE)(const uint8 *Data, size_t Size, bool check_compression)
{
StringInfoData si = { .data = (char *) Data, .len = Size };
int algo = pq_getmsgbyte(&si);
CheckCompressedData(algo > 0 && algo < _END_COMPRESSION_ALGORITHMS);
if (algo != get_compression_algorithm(TOSTRING(ALGO)))
{
/*
* It's convenient to fuzz only one algorithm at a time. We specialize
* the fuzz target for one algorithm, so that the fuzzer doesn't waste
* time discovering others from scratch.
*/
return -1;
}
Compressor *compressor = NULL;
if (check_compression)
{
compressor = definitions[algo].compressor_for_type(PGTYPE);
}
Datum compressed_data = definitions[algo].compressed_data_recv(&si);
DecompressionIterator *iter = definitions[algo].iterator_init_forward(compressed_data, PGTYPE);
DecompressResult results[GLOBAL_MAX_ROWS_PER_COMPRESSION];
int n = 0;
for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter))
{
if (check_compression)
{
if (r.is_null)
{
compressor->append_null(compressor);
}
else
{
compressor->append_val(compressor, r.val);
}
results[n] = r;
}
n++;
}
if (!check_compression || n == 0)
{
return n;
}
compressed_data = (Datum) compressor->finish(compressor);
if (compressed_data == 0)
{
/* The gorilla compressor returns NULL for all-null input sets. */
return n;
};
/*
* Check that the result is still the same after we compress and decompress
* back.
*/
iter = definitions[algo].iterator_init_forward(compressed_data, PGTYPE);
int nn = 0;
for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter))
{
if (r.is_null != results[nn].is_null)
{
elog(ERROR, "the decompression result doesn't match");
}
if (!r.is_null && (DATUM_TO_CTYPE(r.val) != DATUM_TO_CTYPE(results[nn].val)))
{
elog(ERROR, "the decompression result doesn't match");
}
nn++;
if (nn > n)
{
elog(ERROR, "the recompression result doesn't match");
}
}
return n;
}
#undef TOSTRING
#undef TOSTRING_HELPER
#undef FUNCTION_NAME
#undef FUNCTION_NAME_HELPER

View File

@ -647,9 +647,21 @@ gorilla_decompression_iterator_try_next_forward_internal(GorillaDecompressionIte
iter->prev_xor_bits_used = num_xor_bits.val;
CheckCompressedData(iter->prev_xor_bits_used <= 64);
/*
* More than 64 significant bits don't make sense. Exactly 64 we get for
* the first encoded number.
*/
CheckCompressedData(iter->prev_xor_bits_used + iter->prev_leading_zeroes <= 64);
}
/*
* Zero significant bits would mean that the previous number is repeated,
* but this should have been encoded with tag0 = 0.
* This also might fail if we haven't seen the tag1 = 1 for the first number
* and didn't initialize the bit widths.
*/
CheckCompressedData(iter->prev_xor_bits_used + iter->prev_leading_zeroes > 0);
xor = bit_array_iter_next(&iter->xors, iter->prev_xor_bits_used);
if (iter->prev_leading_zeroes + iter->prev_xor_bits_used < 64)
xor <<= 64 - (iter->prev_leading_zeroes + iter->prev_xor_bits_used);

View File

@ -1550,8 +1550,8 @@ from ts_read_compressed_data_directory('gorilla', 'float8', (:'TEST_INPUT_DIR' |
group by 2 order by 1 desc;
count | result
-------+--------
521 | XX001
214 | true
657 | XX001
78 | true
72 | 08P01
2 | false
(4 rows)