Fix vectorized hash agg in GroupAgg mode in reverse order (#7686)

The hash grouping strategies do not preserve the input order when the
compressed batches need to be reversed. Disable them in this case.
Alexander Kuzmenkov 2025-02-11 17:06:57 +01:00 committed by GitHub
parent d496a186d0
commit 6ce2fc0df4
6 changed files with 359 additions and 1 deletion
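
For reference, the affected query shape, taken from the regression test added in this commit: the groupagg table is compressed with compress_orderby = 's', so asking for the descending order of s in GroupAggregate mode requires reversing the decompressed batches.

-- GroupAggregate in the reverse of the compression order: the vectorized
-- hash grouping strategies must not be used here.
set enable_hashagg to off;
select s, sum(value) from groupagg group by s order by s desc limit 10;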


@@ -0,0 +1 @@
Fixes: #7686 Potential wrong aggregation result when using vectorized aggregation with hash grouping in reverse order


@@ -43,7 +43,9 @@ typedef struct GroupingPolicyHash GroupingPolicyHash;
  * results are emitted into the output slot. This is done in the order of unique
  * grouping key indexes, thereby preserving the incoming key order. This
  * guarantees that this policy works correctly even in a Partial GroupAggregate
- * node, even though it's not optimal performance-wise.
+ * node, even though it's not optimal performance-wise. We only support the
+ * direct order of records in batch though, not reverse. This is checked at
+ * planning time.
  */
 typedef struct GroupingPolicyHash
 {


@@ -505,6 +505,8 @@ get_vectorized_grouping_type(Agg *agg, CustomScan *custom, List *resolved_target
 	/*
 	 * We support hashed vectorized grouping by one fixed-size by-value
 	 * compressed column.
+	 * We can use our hash table for GroupAggregate as well, because it preserves
+	 * the input order of the keys, but only for the direct order, not reverse.
 	 */
 	if (num_grouping_columns == 1)
 	{
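
As a sketch of what this distinction means in practice (assuming the groupagg and text_table definitions from the new test file below): grouping by the single fixed-size by-value column s is eligible for the vectorized hash grouping, while grouping by the text column a is not vectorized at the moment.

-- Eligible: a single fixed-size by-value grouping column (int "s").
select s, sum(value) from groupagg group by s order by s limit 10;
-- Not vectorized for now: grouping by a text column.
select a, count(*) from text_table group by a order by a limit 10;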
@@ -729,6 +731,21 @@ try_insert_vector_agg_node(Plan *plan)
 		return plan;
 	}
 
+	/*
+	 * The hash grouping strategies do not preserve the input key order when the
+	 * reverse ordering is requested, so in this case they cannot work in
+	 * GroupAggregate mode.
+	 */
+	if (grouping_type != VAGT_Batch && agg->aggstrategy != AGG_HASHED)
+	{
+		List *settings = linitial(custom->custom_private);
+		const bool reverse = list_nth_int(settings, DCS_Reverse);
+		if (reverse)
+		{
+			return plan;
+		}
+	}
+
 	/*
 	 * Build supplementary info to determine whether we can vectorize the
 	 * aggregate FILTER clauses.
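
Judging by its use in the new test, the timescaledb.debug_require_vector_agg debug GUC enforces this check at test time: 'require' makes a query fail unless vectorized aggregation is chosen, while 'forbid' makes it fail if it is. The reverse-order case therefore runs under 'forbid':

-- The 'forbid' setting fails the query if a vectorized aggregation plan is
-- chosen; with the reverse order it must not be.
set timescaledb.debug_require_vector_agg to 'forbid';
select s, sum(value) from groupagg group by s order by s desc limit 10;
reset timescaledb.debug_require_vector_agg;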


@@ -0,0 +1,236 @@
-- This file and its contents are licensed under the Timescale License.
-- Please see the included NOTICE for copyright information and
-- LICENSE-TIMESCALE for a copy of the license.
-- Check that the vectorized aggregation works properly in the GroupAggregate
-- mode.
\pset null $
set max_parallel_workers_per_gather = 0;
set enable_hashagg to off;
--set timescaledb.enable_vectorized_aggregation to off;
--set timescaledb.debug_require_vector_agg to 'forbid';
create table groupagg(t int, s int, value int);
select create_hypertable('groupagg', 't', chunk_time_interval => 10000);
NOTICE: adding not-null constraint to column "t"
create_hypertable
-----------------------
(1,public,groupagg,t)
(1 row)
insert into groupagg
select
xslow * 10,
case when xfast = 13 then null else xfast end,
xfast * 7 + xslow * 3
from generate_series(10, 99) xfast,
generate_series(1, 1487) xslow
;
alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '',
timescaledb.compress_orderby = 's');
select count(compress_chunk(x)) from show_chunks('groupagg') x;
count
-------
2
(1 row)
vacuum analyze groupagg;
set timescaledb.debug_require_vector_agg to 'require';
select s, sum(value) from groupagg group by s order by s limit 10;
s | sum
----+---------
10 | 3423074
11 | 3433483
12 | 3443892
14 | 3464710
15 | 3475119
16 | 3485528
17 | 3495937
18 | 3506346
19 | 3516755
20 | 3527164
(10 rows)
-- The hash grouping policies do not support the GroupAggregate mode in the
-- reverse order.
set timescaledb.debug_require_vector_agg to 'forbid';
select s, sum(value) from groupagg group by s order by s desc limit 10;
s | sum
----+---------
$ | 3454301
99 | 4349475
98 | 4339066
97 | 4328657
96 | 4318248
95 | 4307839
94 | 4297430
93 | 4287021
92 | 4276612
91 | 4266203
(10 rows)
reset timescaledb.debug_require_vector_agg;
-- Also test the NULLS FIRST order.
select count(decompress_chunk(x)) from show_chunks('groupagg') x;
count
-------
2
(1 row)
alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '',
timescaledb.compress_orderby = 's nulls first');
select count(compress_chunk(x)) from show_chunks('groupagg') x;
count
-------
2
(1 row)
vacuum analyze groupagg;
set timescaledb.debug_require_vector_agg to 'require';
select s, sum(value) from groupagg group by s order by s nulls first limit 10;
s | sum
----+---------
$ | 3454301
10 | 3423074
11 | 3433483
12 | 3443892
14 | 3464710
15 | 3475119
16 | 3485528
17 | 3495937
18 | 3506346
19 | 3516755
(10 rows)
reset timescaledb.debug_require_vector_agg;
-- More tests for dictionary encoding. This is not vectorized at the moment.
set timescaledb.debug_require_vector_agg to 'forbid';
create table text_table(ts int);
select create_hypertable('text_table', 'ts', chunk_time_interval => 3);
NOTICE: adding not-null constraint to column "ts"
create_hypertable
-------------------------
(3,public,text_table,t)
(1 row)
alter table text_table set (timescaledb.compress);
WARNING: there was some uncertainty picking the default segment by for the hypertable: You do not have any indexes on columns that can be used for segment_by and thus we are not using segment_by for compression. Please make sure you are not missing any indexes
NOTICE: default segment by for hypertable "text_table" is set to ""
NOTICE: default order by for hypertable "text_table" is set to "ts DESC"
insert into text_table select 0 /*, default */ from generate_series(1, 1000) x;
select count(compress_chunk(x)) from show_chunks('text_table') x;
count
-------
1
(1 row)
alter table text_table add column a text default 'default';
alter table text_table set (timescaledb.compress,
timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a');
insert into text_table select 1, '' from generate_series(1, 1000) x;
insert into text_table select 2, 'same' from generate_series(1, 1000) x;
insert into text_table select 3, 'different' || x from generate_series(1, 1000) x;
insert into text_table select 4, case when x % 2 = 0 then null else 'same-with-nulls' end from generate_series(1, 1000) x;
insert into text_table select 5, case when x % 2 = 0 then null else 'different-with-nulls' || x end from generate_series(1, 1000) x;
select count(compress_chunk(x)) from show_chunks('text_table') x;
count
-------
2
(1 row)
vacuum analyze text_table;
-- Test the GroupAggregate by a text column. Not vectorized for now.
select a, count(*) from text_table group by a order by a limit 10;
a | count
-------------------------+-------
| 1000
default | 1000
different-with-nulls1 | 1
different-with-nulls101 | 1
different-with-nulls103 | 1
different-with-nulls105 | 1
different-with-nulls107 | 1
different-with-nulls109 | 1
different-with-nulls11 | 1
different-with-nulls111 | 1
(10 rows)
-- The hash grouping policies do not support the GroupAggregate mode in the
-- reverse order.
select a, count(*) from text_table group by a order by a desc limit 10;
a | count
-----------------+-------
$ | 1000
same-with-nulls | 500
same | 1000
different999 | 1
different998 | 1
different997 | 1
different996 | 1
different995 | 1
different994 | 1
different993 | 1
(10 rows)
-- with NULLS FIRST
select count(decompress_chunk(x)) from show_chunks('text_table') x;
count
-------
2
(1 row)
alter table text_table set (timescaledb.compress,
timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a nulls first');
select count(compress_chunk(x)) from show_chunks('text_table') x;
count
-------
2
(1 row)
select a, count(*) from text_table group by a order by a nulls first limit 10;
a | count
-------------------------+-------
$ | 1000
| 1000
default | 1000
different-with-nulls1 | 1
different-with-nulls101 | 1
different-with-nulls103 | 1
different-with-nulls105 | 1
different-with-nulls107 | 1
different-with-nulls109 | 1
different-with-nulls11 | 1
(10 rows)
-- TODO verify that this works with the serialized hash grouping strategy
select ts, a, count(*) from text_table group by ts, a order by ts, a limit 10;
ts | a | count
----+---------------+-------
0 | default | 1000
1 | | 1000
2 | same | 1000
3 | different1 | 1
3 | different10 | 1
3 | different100 | 1
3 | different1000 | 1
3 | different101 | 1
3 | different102 | 1
3 | different103 | 1
(10 rows)
select a, ts, count(*) from text_table group by a, ts order by a desc, ts desc limit 10;
a | ts | count
-----------------+----+-------
$ | 5 | 500
$ | 4 | 500
same-with-nulls | 4 | 500
same | 2 | 1000
different999 | 3 | 1
different998 | 3 | 1
different997 | 3 | 1
different996 | 3 | 1
different995 | 3 | 1
different994 | 3 | 1
(10 rows)
reset max_parallel_workers_per_gather;
reset timescaledb.debug_require_vector_agg;
reset enable_hashagg;


@@ -45,6 +45,7 @@ set(TEST_FILES
     skip_scan.sql
     transparent_decompression_join_index.sql
     vector_agg_functions.sql
+    vector_agg_groupagg.sql
     vector_agg_param.sql
     vectorized_aggregation.sql)


@@ -0,0 +1,101 @@
-- This file and its contents are licensed under the Timescale License.
-- Please see the included NOTICE for copyright information and
-- LICENSE-TIMESCALE for a copy of the license.
-- Check that the vectorized aggregation works properly in the GroupAggregate
-- mode.
\pset null $
set max_parallel_workers_per_gather = 0;
set enable_hashagg to off;
--set timescaledb.enable_vectorized_aggregation to off;
--set timescaledb.debug_require_vector_agg to 'forbid';
create table groupagg(t int, s int, value int);
select create_hypertable('groupagg', 't', chunk_time_interval => 10000);
insert into groupagg
select
xslow * 10,
case when xfast = 13 then null else xfast end,
xfast * 7 + xslow * 3
from generate_series(10, 99) xfast,
generate_series(1, 1487) xslow
;
alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '',
timescaledb.compress_orderby = 's');
select count(compress_chunk(x)) from show_chunks('groupagg') x;
vacuum analyze groupagg;
set timescaledb.debug_require_vector_agg to 'require';
select s, sum(value) from groupagg group by s order by s limit 10;
-- The hash grouping policies do not support the GroupAggregate mode in the
-- reverse order.
set timescaledb.debug_require_vector_agg to 'forbid';
select s, sum(value) from groupagg group by s order by s desc limit 10;
reset timescaledb.debug_require_vector_agg;
-- Also test the NULLS FIRST order.
select count(decompress_chunk(x)) from show_chunks('groupagg') x;
alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '',
timescaledb.compress_orderby = 's nulls first');
select count(compress_chunk(x)) from show_chunks('groupagg') x;
vacuum analyze groupagg;
set timescaledb.debug_require_vector_agg to 'require';
select s, sum(value) from groupagg group by s order by s nulls first limit 10;
reset timescaledb.debug_require_vector_agg;
-- More tests for dictionary encoding. This is not vectorized at the moment.
set timescaledb.debug_require_vector_agg to 'forbid';
create table text_table(ts int);
select create_hypertable('text_table', 'ts', chunk_time_interval => 3);
alter table text_table set (timescaledb.compress);
insert into text_table select 0 /*, default */ from generate_series(1, 1000) x;
select count(compress_chunk(x)) from show_chunks('text_table') x;
alter table text_table add column a text default 'default';
alter table text_table set (timescaledb.compress,
timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a');
insert into text_table select 1, '' from generate_series(1, 1000) x;
insert into text_table select 2, 'same' from generate_series(1, 1000) x;
insert into text_table select 3, 'different' || x from generate_series(1, 1000) x;
insert into text_table select 4, case when x % 2 = 0 then null else 'same-with-nulls' end from generate_series(1, 1000) x;
insert into text_table select 5, case when x % 2 = 0 then null else 'different-with-nulls' || x end from generate_series(1, 1000) x;
select count(compress_chunk(x)) from show_chunks('text_table') x;
vacuum analyze text_table;
-- Test the GroupAggregate by a text column. Not vectorized for now.
select a, count(*) from text_table group by a order by a limit 10;
-- The hash grouping policies do not support the GroupAggregate mode in the
-- reverse order.
select a, count(*) from text_table group by a order by a desc limit 10;
-- with NULLS FIRST
select count(decompress_chunk(x)) from show_chunks('text_table') x;
alter table text_table set (timescaledb.compress,
timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a nulls first');
select count(compress_chunk(x)) from show_chunks('text_table') x;
select a, count(*) from text_table group by a order by a nulls first limit 10;
-- TODO verify that this works with the serialized hash grouping strategy
select ts, a, count(*) from text_table group by ts, a order by ts, a limit 10;
select a, ts, count(*) from text_table group by a, ts order by a desc, ts desc limit 10;
reset max_parallel_workers_per_gather;
reset timescaledb.debug_require_vector_agg;
reset enable_hashagg;