Fix overflow in gapfill's interpolate

All integer types must use numeric-based interpolation calculations since they are multiplied by int64 and this could cause an overflow. Numerics also interpolate better because the answer is rounded and not truncated. We can't use float8 because that doesn't handle really big ints exactly. We can't use the Postgres INT128 implementation because it doesn't support division. In the future we can optimize this for cases where overflow doesn't occur. Fixes #1491.
2025-05-16 02:23:49 +08:00 · 2019-10-27 14:07:30 -04:00 · 2019-10-27 14:07:30 -04:00 · bf9eea0595
commit bf9eea0595
parent f82d83783e
3 changed files with 99 additions and 9 deletions
--- a/tsl/src/nodes/gapfill/interpolate.c
+++ b/tsl/src/nodes/gapfill/interpolate.c
@ -10,6 +10,7 @@
 #include <utils/builtins.h>
 #include <utils/datum.h>
 #include <utils/typcache.h>
+#include <utils/numeric.h>

 #include "compat.h"
 #include "nodes/gapfill/interpolate.h"
@ -144,6 +145,25 @@ gapfill_fetch_sample(GapFillState *state, GapFillInterpolateColumnState *column,
 	DecrTupleDescRefCount(tupdesc);
 }

+/* Linear interpolation at x_i between the points (x0_i, y0) and (x1_i, y1), carried out
+ * entirely in numeric arithmetic. y0 and y1 are handed straight to numeric_mul, so they
+ * must already be numeric datums. The numeric_div result is returned as a numeric datum. */
+static Datum
+interpolate_numeric(int64 x_i, int64 x0_i, int64 x1_i, Datum y0, Datum y1)
+{
+	Datum left = DirectFunctionCall1(int8_numeric, Int64GetDatum(x0_i));
+	Datum right = DirectFunctionCall1(int8_numeric, Int64GetDatum(x1_i));
+	Datum at = DirectFunctionCall1(int8_numeric, Int64GetDatum(x_i));
+
+	/* each endpoint value is weighted by the distance from x to the opposite endpoint */
+	Datum y0_term = DirectFunctionCall2(numeric_mul, y0, DirectFunctionCall2(numeric_sub, right, at));
+	Datum y1_term = DirectFunctionCall2(numeric_mul, y1, DirectFunctionCall2(numeric_sub, at, left));
+	Datum weighted_sum = DirectFunctionCall2(numeric_add, y0_term, y1_term);
+	Datum span = DirectFunctionCall2(numeric_sub, right, left);
+
+	return DirectFunctionCall2(numeric_div, weighted_sum, span);
+}
+
 /*
 * gapfill_interpolate_calculate gets called for every gapfilled tuple to calculate values
 *
@ -178,14 +198,43 @@ gapfill_interpolate_calculate(GapFillInterpolateColumnState *column, GapFillStat

 	switch (column->base.typid)
 	{
+		/* All integer types must use numeric-based interpolation calculations since they are
+		 * multiplied by int64 and this could cause an overflow. Numerics also interpolate better
+		 * because the answer is rounded and not truncated. We can't use float8 because that
+		 * doesn't handle really big ints exactly. We can't use the Postgres INT128 implementation
+		 * because it doesn't support division. */
 		case INT2OID:
-			*value = Int16GetDatum(INTERPOLATE(x, x0, x1, DatumGetInt16(y0), DatumGetInt16(y1)));
+			*value =
+				DirectFunctionCall1(numeric_int2,
+									interpolate_numeric(x,
+														x0,
+														x1,
+														DirectFunctionCall1(int2_numeric,
+																			DatumGetInt16(y0)),
+														DirectFunctionCall1(int2_numeric,
+																			DatumGetInt16(y1))));
 			break;
 		case INT4OID:
-			*value = Int32GetDatum(INTERPOLATE(x, x0, x1, DatumGetInt32(y0), DatumGetInt32(y1)));
+			*value =
+				DirectFunctionCall1(numeric_int4,
+									interpolate_numeric(x,
+														x0,
+														x1,
+														DirectFunctionCall1(int4_numeric,
+																			DatumGetInt32(y0)),
+														DirectFunctionCall1(int4_numeric,
+																			DatumGetInt32(y1))));
 			break;
 		case INT8OID:
-			*value = Int64GetDatum(INTERPOLATE(x, x0, x1, DatumGetInt64(y0), DatumGetInt64(y1)));
+			*value =
+				DirectFunctionCall1(numeric_int8,
+									interpolate_numeric(x,
+														x0,
+														x1,
+														DirectFunctionCall1(int8_numeric,
+																			DatumGetInt64(y0)),
+														DirectFunctionCall1(int8_numeric,
+																			DatumGetInt64(y1))));
 			break;
 		case FLOAT4OID:
 			*value = Float4GetDatum(INTERPOLATE(x, x0, x1, DatumGetFloat4(y0), DatumGetFloat4(y1)));
--- a/tsl/test/expected/gapfill.out
+++ b/tsl/test/expected/gapfill.out
@ -1083,10 +1083,10 @@ GROUP BY 1 ORDER BY 1;
 time | smallint | int | bigint | float4 | float8 
 ------+----------+-----+--------+--------+--------
    0 |       -3 |  -3 |     -3 |     -3 |     -3
-   10 |       -1 |  -1 |     -1 |   -1.8 |   -1.8
-   20 |        0 |   0 |      0 |   -0.6 |   -0.6
-   30 |        0 |   0 |      0 |    0.6 |    0.6
-   40 |        1 |   1 |      1 |    1.8 |    1.8
+   10 |       -2 |  -2 |     -2 |   -1.8 |   -1.8
+   20 |       -1 |  -1 |     -1 |   -0.6 |   -0.6
+   30 |        1 |   1 |      1 |    0.6 |    0.6
+   40 |        2 |   2 |      2 |    1.8 |    1.8
   50 |        3 |   3 |      3 |      3 |      3
 (6 rows)

@ -2683,3 +2683,26 @@ GROUP BY 1,device_id;
    4 | Device 2
 (10 rows)

+--test interpolation with big diifferences in values (test overflows in calculations)
+--we use the biggest possible difference in time(x) and the value(y).
+--For bigints we also test values of smaller than bigintmax/min to avoid
+--the symmetry where x=y (which catches more errors)
+SELECT  9223372036854775807 as big_int_max \gset
+SELECT -9223372036854775808	 as big_int_min \gset
+SELECT
+  time_bucket_gapfill(1,time,0,1) AS time,
+  interpolate(min(s)) AS "smallint",
+  interpolate(min(i)) AS "int",
+  interpolate(min(b)) AS "bigint",
+  interpolate(min(b2)) AS "bigint2",
+  interpolate(min(d)) AS "double"
+FROM (values (:big_int_min,(-32768)::smallint,(-2147483648)::int,:big_int_min,-2147483648::bigint, '-Infinity'::double precision),
+             (:big_int_max, 32767::smallint, 2147483647::int,:big_int_max, 2147483647::bigint, 'Infinity'::double precision)) v(time,s,i,b,b2,d)
+GROUP BY 1 ORDER BY 1;
+         time         | smallint |     int     |        bigint        |   bigint2   |  double   
+----------------------+----------+-------------+----------------------+-------------+-----------
+ -9223372036854775808 |   -32768 | -2147483648 | -9223372036854775808 | -2147483648 | -Infinity
+                    0 |        0 |           0 |                    0 |           0 |  Infinity
+  9223372036854775807 |    32767 |  2147483647 |  9223372036854775807 |  2147483647 |  Infinity
+(3 rows)
+
--- a/tsl/test/sql/gapfill.sql
+++ b/tsl/test/sql/gapfill.sql
@ -1456,3 +1456,21 @@ GROUP BY 1,device_id;



+
+--test interpolation with big diifferences in values (test overflows in calculations)
+--we use the biggest possible difference in time(x) and the value(y).
+--For bigints we also test values of smaller than bigintmax/min to avoid
+--the symmetry where x=y (which catches more errors)
+SELECT  9223372036854775807 as big_int_max \gset
+SELECT -9223372036854775808	 as big_int_min \gset
+
+SELECT
+  time_bucket_gapfill(1,time,0,1) AS time,
+  interpolate(min(s)) AS "smallint",
+  interpolate(min(i)) AS "int",
+  interpolate(min(b)) AS "bigint",
+  interpolate(min(b2)) AS "bigint2",
+  interpolate(min(d)) AS "double"
+FROM (values (:big_int_min,(-32768)::smallint,(-2147483648)::int,:big_int_min,-2147483648::bigint, '-Infinity'::double precision),
+             (:big_int_max, 32767::smallint, 2147483647::int,:big_int_max, 2147483647::bigint, 'Infinity'::double precision)) v(time,s,i,b,b2,d)
+GROUP BY 1 ORDER BY 1;