Fix ubsan failure in gorilla decompression

Also add more tests
2025-05-15 10:11:29 +08:00 · 2023-05-16 14:43:18 +02:00 · 2023-05-16 14:43:18 +02:00 · 8ff0648fd0
commit 8ff0648fd0
parent 936d751037
4 changed files with 136 additions and 51 deletions
--- a/tsl/src/compression/compression.c
+++ b/tsl/src/compression/compression.c
@ -2063,68 +2063,28 @@ get_compression_algorithm(char *name)
 	return _INVALID_COMPRESSION_ALGORITHM;
 }

-#define FUNCTION_NAME_HELPER(X, Y) decompress_##X##_##Y
-#define FUNCTION_NAME(X, Y) FUNCTION_NAME_HELPER(X, Y)
-
-#define TOSTRING_HELPER(x) #x
-#define TOSTRING(x) TOSTRING_HELPER(x)
-
-/* Try to decompress the given compressed data. Used for fuzzing. */
-#define DECOMPRESS_FN                                                                              \
-	static int FUNCTION_NAME(ALGO, CTYPE)(const uint8 *Data, size_t Size)                          \
-	{                                                                                              \
-		StringInfoData si = { .data = (char *) Data, .len = Size };                                \
-                                                                                                   \
-		int algo = pq_getmsgbyte(&si);                                                             \
-                                                                                                   \
-		CheckCompressedData(algo > 0 && algo < _END_COMPRESSION_ALGORITHMS);                       \
-                                                                                                   \
-		if (algo != get_compression_algorithm(TOSTRING(ALGO)))                                     \
-		{                                                                                          \
-			/*                                                                                     \
-			 * It's convenient to fuzz only one algorithm at a time. We specialize                 \
-			 * the fuzz target for one algorithm, so that the fuzzer doesn't waste                 \
-			 * time discovering others from scratch.                                               \
-			 */                                                                                    \
-			return -1;                                                                             \
-		}                                                                                          \
-                                                                                                   \
-		Datum compressed_data = definitions[algo].compressed_data_recv(&si);                       \
-		DecompressionIterator *iter =                                                              \
-			definitions[algo].iterator_init_forward(compressed_data, PGTYPE);                      \
-                                                                                                   \
-		int n = 0;                                                                                 \
-		for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter))      \
-		{                                                                                          \
-			n++;                                                                                   \
-		}                                                                                          \
-                                                                                                   \
-		return n;                                                                                  \
-	}
-
 #define ALGO gorilla
 #define CTYPE float8
 #define PGTYPE FLOAT8OID
-DECOMPRESS_FN
+#define DATUM_TO_CTYPE DatumGetFloat8
+#include "decompress_test_impl.c"
 #undef ALGO
 #undef CTYPE
 #undef PGTYPE
+#undef DATUM_TO_CTYPE

 #define ALGO deltadelta
 #define CTYPE int64
 #define PGTYPE INT8OID
-DECOMPRESS_FN
+#define DATUM_TO_CTYPE DatumGetInt64
+#include "decompress_test_impl.c"
 #undef ALGO
 #undef CTYPE
 #undef PGTYPE
+#undef DATUM_TO_CTYPE

-#undef TOSTRING
-#undef TOSTRING_HELPER
-
-#undef FUNCTION_NAME
-#undef FUNCTION_NAME_HELPER
-
-static int (*get_decompress_fn(int algo, Oid type))(const uint8 *Data, size_t Size)
+static int (*get_decompress_fn(int algo, Oid type))(const uint8 *Data, size_t Size,
+													bool check_compression)
 {
 	if (algo == COMPRESSION_ALGORITHM_GORILLA && type == FLOAT8OID)
 	{
@ -2185,7 +2145,9 @@ ts_read_compressed_data_file(PG_FUNCTION_ARGS)

 	Oid type = PG_GETARG_OID(1);

-	int res = get_decompress_fn(algo, type)((const uint8 *) string, fsize);
+	int res =
+		get_decompress_fn(algo,
+						  type)((const uint8 *) string, fsize, /* check_compression = */ true);

 	PG_RETURN_INT32(res);
 }
--- a/tsl/src/compression/decompress_test_impl.c
+++ b/tsl/src/compression/decompress_test_impl.c
@ -0,0 +1,111 @@
+/*
+ * This file and its contents are licensed under the Timescale License.
+ * Please see the included NOTICE for copyright information and
+ * LICENSE-TIMESCALE for a copy of the license.
+ */
+
+#define FUNCTION_NAME_HELPER(X, Y) decompress_##X##_##Y
+#define FUNCTION_NAME(X, Y) FUNCTION_NAME_HELPER(X, Y)
+
+#define TOSTRING_HELPER(x) #x
+#define TOSTRING(x) TOSTRING_HELPER(x)
+
+/*
+ * Try to decompress the given compressed data. Used for fuzzing and for checking
+ * the examples found by fuzzing. For fuzzing we don't check that the
+ * recompression result is the same.
+ */
+static int
+FUNCTION_NAME(ALGO, CTYPE)(const uint8 *Data, size_t Size, bool check_compression)
+{
+	StringInfoData si = { .data = (char *) Data, .len = Size };
+
+	int algo = pq_getmsgbyte(&si);
+
+	CheckCompressedData(algo > 0 && algo < _END_COMPRESSION_ALGORITHMS);
+
+	if (algo != get_compression_algorithm(TOSTRING(ALGO)))
+	{
+		/*
+		 * It's convenient to fuzz only one algorithm at a time. We specialize
+		 * the fuzz target for one algorithm, so that the fuzzer doesn't waste
+		 * time discovering others from scratch.
+		 */
+		return -1;
+	}
+
+	Compressor *compressor = NULL;
+	if (check_compression)
+	{
+		compressor = definitions[algo].compressor_for_type(PGTYPE);
+	}
+
+	Datum compressed_data = definitions[algo].compressed_data_recv(&si);
+	DecompressionIterator *iter = definitions[algo].iterator_init_forward(compressed_data, PGTYPE);
+
+	DecompressResult results[GLOBAL_MAX_ROWS_PER_COMPRESSION];
+	int n = 0;
+	for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter))
+	{
+		if (check_compression)
+		{
+			if (r.is_null)
+			{
+				compressor->append_null(compressor);
+			}
+			else
+			{
+				compressor->append_val(compressor, r.val);
+			}
+			results[n] = r;
+		}
+
+		n++;
+	}
+
+	if (!check_compression || n == 0)
+	{
+		return n;
+	}
+
+	compressed_data = (Datum) compressor->finish(compressor);
+	if (compressed_data == 0)
+	{
+		/* The gorilla compressor returns NULL for all-null input sets. */
+		return n;
+	};
+
+	/*
+	 * Check that the result is still the same after we compress and decompress
+	 * back.
+	 */
+	iter = definitions[algo].iterator_init_forward(compressed_data, PGTYPE);
+	int nn = 0;
+	for (DecompressResult r = iter->try_next(iter); !r.is_done; r = iter->try_next(iter))
+	{
+		if (r.is_null != results[nn].is_null)
+		{
+			elog(ERROR, "the decompression result doesn't match");
+		}
+
+		if (!r.is_null && (DATUM_TO_CTYPE(r.val) != DATUM_TO_CTYPE(results[nn].val)))
+		{
+			elog(ERROR, "the decompression result doesn't match");
+		}
+
+		nn++;
+
+		if (nn > n)
+		{
+			elog(ERROR, "the recompression result doesn't match");
+		}
+	}
+
+	return n;
+}
+
+#undef TOSTRING
+#undef TOSTRING_HELPER
+
+#undef FUNCTION_NAME
+#undef FUNCTION_NAME_HELPER
--- a/tsl/src/compression/gorilla.c
+++ b/tsl/src/compression/gorilla.c
@ -647,9 +647,21 @@ gorilla_decompression_iterator_try_next_forward_internal(GorillaDecompressionIte
 		iter->prev_xor_bits_used = num_xor_bits.val;
 		CheckCompressedData(iter->prev_xor_bits_used <= 64);

+		/*
+		 * More than 64 significant bits don't make sense. Exactly 64 we get for
+		 * the first encoded number.
+		 */
 		CheckCompressedData(iter->prev_xor_bits_used + iter->prev_leading_zeroes <= 64);
 	}

+	/*
+	 * Zero significant bits would mean that the previous number is repeated,
+	 * but this should have been encoded with tag0 = 0.
+	 * This also might fail if we haven't seen the tag1 = 1 for the first number
+	 * and didn't initialize the bit widths.
+	 */
+	CheckCompressedData(iter->prev_xor_bits_used + iter->prev_leading_zeroes > 0);
+
 	xor = bit_array_iter_next(&iter->xors, iter->prev_xor_bits_used);
 	if (iter->prev_leading_zeroes + iter->prev_xor_bits_used < 64)
 		xor <<= 64 - (iter->prev_leading_zeroes + iter->prev_xor_bits_used);
--- a/tsl/test/expected/compression_algos.out
+++ b/tsl/test/expected/compression_algos.out
@ -1550,8 +1550,8 @@ from ts_read_compressed_data_directory('gorilla', 'float8', (:'TEST_INPUT_DIR' |
 group by 2 order by 1 desc;
 count | result 
 -------+--------
-   521 | XX001
-   214 | true
+   657 | XX001
+    78 | true
    72 | 08P01
     2 | false
 (4 rows)