Align gapfill bucket generation with time_bucket

In certain scenarios, when generating monthly
buckets with different timezones, gapfill
would create timestamps which don't align with
time_bucket, potentially producing multiple
rows for an individual month. Instead of relying on
the previous timestamp to generate the next one, we
now always generate timestamps from the start point,
which keeps them aligned with time_bucket buckets.
This commit is contained in:
Ante Kresic 2023-10-05 14:13:41 +02:00 committed by Ante Kresic
parent a409065285
commit 65ecda93e6
8 changed files with 200 additions and 57 deletions

1
.unreleased/PR_6155 Normal file
View File

@ -0,0 +1 @@
Fixes: #6155 Align gapfill bucket generation with time_bucket

View File

@ -640,15 +640,15 @@ gapfill_advance_timestamp(GapFillState *state)
{
case DATEOID:
next = DirectFunctionCall2(date_pl_interval,
DateADTGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
DateADTGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
next = DirectFunctionCall1(timestamp_date, next);
state->next_timestamp = DatumGetDateADT(next);
break;
case TIMESTAMPOID:
next = DirectFunctionCall2(timestamp_pl_interval,
TimestampGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
TimestampGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
state->next_timestamp = DatumGetTimestamp(next);
break;
case TIMESTAMPTZOID:
@ -658,14 +658,22 @@ gapfill_advance_timestamp(GapFillState *state)
*/
next = DirectFunctionCall2(state->have_timezone ? timestamptz_pl_interval :
timestamp_pl_interval,
TimestampTzGetDatum(state->next_timestamp),
IntervalPGetDatum(state->gapfill_interval));
TimestampTzGetDatum(state->gapfill_start),
IntervalPGetDatum(state->next_offset));
state->next_timestamp = DatumGetTimestampTz(next);
break;
default:
state->next_timestamp += state->gapfill_period;
break;
}
/* Advance the interval offset if necessary */
if (state->gapfill_interval)
{
Datum tspan = DirectFunctionCall2(interval_pl,
IntervalPGetDatum(state->gapfill_interval),
IntervalPGetDatum(state->next_offset));
state->next_offset = DatumGetIntervalP(tspan);
}
}
/*
@ -742,6 +750,7 @@ gapfill_begin(CustomScanState *node, EState *estate, int eflags)
state->gapfill_start = align_with_time_bucket(state, get_start_arg(state));
}
state->next_timestamp = state->gapfill_start;
state->next_offset = state->gapfill_interval;
/* gap fill end */
if (is_const_null(get_finish_arg(state)))
@ -938,6 +947,7 @@ gapfill_state_reset_group(GapFillState *state, TupleTableSlot *slot)
break;
}
}
state->next_offset = state->gapfill_interval;
}
/*

View File

@ -104,6 +104,8 @@ typedef struct GapFillState
Interval *gapfill_interval;
int64 next_timestamp;
/* interval offset for next_timestamp from gapfill_start */
Interval *next_offset;
int64 subslot_time; /* time of tuple in subslot */
int time_index; /* position of time column */

View File

@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)
DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)
DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)
SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)
SET timezone TO 'Europe/Berlin';
@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)
RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)
-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)
INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)
DROP TABLE month_timezone;

View File

@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)
DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)
DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)
SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)
SET timezone TO 'Europe/Berlin';
@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)
RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)
-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)
INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)
DROP TABLE month_timezone;

View File

@ -313,6 +313,7 @@ QUERY PLAN
-> Seq Scan on _hyper_X_X_chunk
(8 rows)
DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
@ -1579,6 +1580,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)
DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
@ -3292,11 +3294,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)
SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
@ -3313,11 +3315,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)
SET timezone TO 'Europe/Berlin';
@ -3332,14 +3334,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)
RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
@ -3364,3 +3365,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)
-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)
INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)
DROP TABLE month_timezone;

View File

@ -315,6 +315,7 @@ QUERY PLAN
-> Index Only Scan using _hyper_X_X_chunk_gapfill_plan_test_indx on _hyper_X_X_chunk
(10 rows)
DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
\set ON_ERROR_STOP 0
@ -1581,6 +1582,7 @@ SELECT * FROM gapfill_insert_test;
4
(4 rows)
DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
@ -3294,11 +3296,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
time_bucket_gapfill
Fri Dec 31 15:00:00 1999 PST
Tue Feb 29 15:00:00 2000 PST
Sat Apr 29 15:00:00 2000 PDT
Thu Jun 29 15:00:00 2000 PDT
Tue Aug 29 15:00:00 2000 PDT
Sun Oct 29 15:00:00 2000 PST
Fri Dec 29 15:00:00 2000 PST
Sun Apr 30 15:00:00 2000 PDT
Fri Jun 30 15:00:00 2000 PDT
Thu Aug 31 15:00:00 2000 PDT
Tue Oct 31 15:00:00 2000 PST
Sun Dec 31 15:00:00 2000 PST
(7 rows)
SELECT time_bucket_gapfill('2 month'::interval, ts, current_setting('timezone'), '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
@ -3315,11 +3317,11 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'UTC', '2000-01-01','2001-01
time_bucket_gapfill
Fri Dec 31 16:00:00 1999 PST
Tue Feb 29 16:00:00 2000 PST
Sat Apr 29 16:00:00 2000 PDT
Thu Jun 29 16:00:00 2000 PDT
Tue Aug 29 16:00:00 2000 PDT
Sun Oct 29 16:00:00 2000 PST
Fri Dec 29 16:00:00 2000 PST
Sun Apr 30 16:00:00 2000 PDT
Fri Jun 30 16:00:00 2000 PDT
Thu Aug 31 16:00:00 2000 PDT
Tue Oct 31 16:00:00 2000 PST
Sun Dec 31 16:00:00 2000 PST
(7 rows)
SET timezone TO 'Europe/Berlin';
@ -3334,14 +3336,13 @@ SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01
(6 rows)
RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
ts | int_arr | locf
----+-----------+--------
0 | {1,2,3,4} | 500001
@ -3366,3 +3367,30 @@ GROUP BY 1, 2
95 | {1,2,3,4} | 500001
(20 rows)
-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
table_name
month_timezone
(1 row)
INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
time | sum
------------------------------+-------
Sat Dec 31 15:00:00 2022 PST |
Tue Jan 31 15:00:00 2023 PST |
Tue Feb 28 15:00:00 2023 PST | 3.123
Fri Mar 31 15:00:00 2023 PDT | 4.123
Sun Apr 30 15:00:00 2023 PDT | 5.123
Wed May 31 15:00:00 2023 PDT |
Fri Jun 30 15:00:00 2023 PDT |
(7 rows)
DROP TABLE month_timezone;

View File

@ -139,6 +139,8 @@ ORDER BY 1,2;
FROM gapfill_plan_test
ORDER BY 2,1;
DROP TABLE gapfill_plan_test;
\set METRICS metrics_int
-- All test against table :METRICS first
@ -623,6 +625,8 @@ CREATE TABLE gapfill_insert_test(id INT);
INSERT INTO gapfill_insert_test SELECT time_bucket_gapfill(1,time,1,5) FROM (VALUES (1),(2)) v(time) GROUP BY 1 ORDER BY 1;
SELECT * FROM gapfill_insert_test;
DROP TABLE gapfill_insert_test;
-- test join
SELECT t1.*,t2.m FROM
(
@ -1510,12 +1514,26 @@ SET timezone TO 'Europe/Berlin';
SELECT time_bucket_gapfill('2 month'::interval, ts, 'Europe/Berlin', '2000-01-01','2001-01-01') FROM (VALUES ('2000-03-01'::timestamptz)) v(ts) GROUP BY 1;
RESET timezone;
DROP INDEX gapfill_plan_test_indx;
-- Test gapfill with arrays (#5981)
SELECT time_bucket_gapfill(5, ts, 1, 100) as ts, int_arr, locf(last(value, ts))
FROM (
SELECT ARRAY[1,2,3,4]::int[] as int_arr, x as ts, x+500000 as value
FROM generate_series(1, 10, 100) as x
) t
GROUP BY 1, 2
GROUP BY 1, 2;
-- Test gapfill is aligned with non-gapfill time_bucket
-- when using different timezones and month bucketing
CREATE TABLE month_timezone(time timestamptz NOT NULL, value float);
SELECT table_name FROM create_hypertable('month_timezone','time');
INSERT INTO month_timezone VALUES ('2023-03-01 14:05:00+01', 3.123), ('2023-04-01 14:05:00+01',4.123), ('2023-05-01 14:05:00+01', 5.123);
SELECT
time_bucket_gapfill('1 month'::interval, time, 'Europe/Berlin', '2023-01-01', '2023-07-01') AS time,
sum(value)
FROM
month_timezone
GROUP BY 1;
DROP TABLE month_timezone;