From 6ce2fc0df492892e915003b30239f5ab25a2e49e Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Tue, 11 Feb 2025 17:06:57 +0100 Subject: [PATCH] Fix vectorized hash agg in GroupAgg mode in reverse order (#7686) The hash grouping strategies do not preserve the input order when the compressed batches need to be reversed. Disable them in this case. --- .unreleased/hash-groupagg-bug | 1 + .../nodes/vector_agg/grouping_policy_hash.h | 4 +- tsl/src/nodes/vector_agg/plan.c | 17 ++ tsl/test/expected/vector_agg_groupagg.out | 236 ++++++++++++++++++ tsl/test/sql/CMakeLists.txt | 1 + tsl/test/sql/vector_agg_groupagg.sql | 101 ++++++++ 6 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 .unreleased/hash-groupagg-bug create mode 100644 tsl/test/expected/vector_agg_groupagg.out create mode 100644 tsl/test/sql/vector_agg_groupagg.sql diff --git a/.unreleased/hash-groupagg-bug b/.unreleased/hash-groupagg-bug new file mode 100644 index 000000000..6d01a5ff3 --- /dev/null +++ b/.unreleased/hash-groupagg-bug @@ -0,0 +1 @@ +Fixes: #7686 Potential wrong aggregation result when using vectorized aggregation with hash grouping in reverse order diff --git a/tsl/src/nodes/vector_agg/grouping_policy_hash.h b/tsl/src/nodes/vector_agg/grouping_policy_hash.h index 2d8c35381..3516efb88 100644 --- a/tsl/src/nodes/vector_agg/grouping_policy_hash.h +++ b/tsl/src/nodes/vector_agg/grouping_policy_hash.h @@ -43,7 +43,9 @@ typedef struct GroupingPolicyHash GroupingPolicyHash; * results are emitted into the output slot. This is done in the order of unique * grouping key indexes, thereby preserving the incoming key order. This * guarantees that this policy works correctly even in a Partial GroupAggregate - * node, even though it's not optimal performance-wise. + * node, even though it's not optimal performance-wise. We only support the + * direct order of records in batch though, not reverse. This is checked at + * planning time. */ typedef struct GroupingPolicyHash { diff --git a/tsl/src/nodes/vector_agg/plan.c b/tsl/src/nodes/vector_agg/plan.c index e705d23f3..fc6a98348 100644 --- a/tsl/src/nodes/vector_agg/plan.c +++ b/tsl/src/nodes/vector_agg/plan.c @@ -505,6 +505,8 @@ get_vectorized_grouping_type(Agg *agg, CustomScan *custom, List *resolved_target /* * We support hashed vectorized grouping by one fixed-size by-value * compressed column. + * We can use our hash table for GroupAggregate as well, because it preserves + * the input order of the keys, but only for the direct order, not reverse. */ if (num_grouping_columns == 1) { @@ -729,6 +731,21 @@ try_insert_vector_agg_node(Plan *plan) return plan; } + /* + * The hash grouping strategies do not preserve the input key order when the + * reverse ordering is requested, so in this case they cannot work in + * GroupAggregate mode. + */ + if (grouping_type != VAGT_Batch && agg->aggstrategy != AGG_HASHED) + { + List *settings = linitial(custom->custom_private); + const bool reverse = list_nth_int(settings, DCS_Reverse); + if (reverse) + { + return plan; + } + } + /* * Build supplementary info to determine whether we can vectorize the * aggregate FILTER clauses. diff --git a/tsl/test/expected/vector_agg_groupagg.out b/tsl/test/expected/vector_agg_groupagg.out new file mode 100644 index 000000000..85a4a1126 --- /dev/null +++ b/tsl/test/expected/vector_agg_groupagg.out @@ -0,0 +1,236 @@ +-- This file and its contents are licensed under the Timescale License. 
+-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +-- Check that the vectorized aggregation works properly in the GroupAggregate +-- mode. +\pset null $ +set max_parallel_workers_per_gather = 0; +set enable_hashagg to off; +--set timescaledb.enable_vectorized_aggregation to off; +--set timescaledb.debug_require_vector_agg to 'forbid'; +create table groupagg(t int, s int, value int); +select create_hypertable('groupagg', 't', chunk_time_interval => 10000); +NOTICE: adding not-null constraint to column "t" + create_hypertable +----------------------- + (1,public,groupagg,t) +(1 row) + +insert into groupagg +select + xslow * 10, + case when xfast = 13 then null else xfast end, + xfast * 7 + xslow * 3 +from generate_series(10, 99) xfast, + generate_series(1, 1487) xslow +; +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; + count +------- + 2 +(1 row) + +vacuum analyze groupagg; +set timescaledb.debug_require_vector_agg to 'require'; +select s, sum(value) from groupagg group by s order by s limit 10; + s | sum +----+--------- + 10 | 3423074 + 11 | 3433483 + 12 | 3443892 + 14 | 3464710 + 15 | 3475119 + 16 | 3485528 + 17 | 3495937 + 18 | 3506346 + 19 | 3516755 + 20 | 3527164 +(10 rows) + +-- The hash grouping policies do not support the GroupAggregate mode in the +-- reverse order. +set timescaledb.debug_require_vector_agg to 'forbid'; +select s, sum(value) from groupagg group by s order by s desc limit 10; + s | sum +----+--------- + $ | 3454301 + 99 | 4349475 + 98 | 4339066 + 97 | 4328657 + 96 | 4318248 + 95 | 4307839 + 94 | 4297430 + 93 | 4287021 + 92 | 4276612 + 91 | 4266203 +(10 rows) + +reset timescaledb.debug_require_vector_agg; +-- Also test the NULLS FIRST order. +select count(decompress_chunk(x)) from show_chunks('groupagg') x; + count +------- + 2 +(1 row) + +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's nulls first'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; + count +------- + 2 +(1 row) + +vacuum analyze groupagg; +set timescaledb.debug_require_vector_agg to 'require'; +select s, sum(value) from groupagg group by s order by s nulls first limit 10; + s | sum +----+--------- + $ | 3454301 + 10 | 3423074 + 11 | 3433483 + 12 | 3443892 + 14 | 3464710 + 15 | 3475119 + 16 | 3485528 + 17 | 3495937 + 18 | 3506346 + 19 | 3516755 +(10 rows) + +reset timescaledb.debug_require_vector_agg; +-- More tests for dictionary encoding. This is not vectorized at the moment. +set timescaledb.debug_require_vector_agg to 'forbid'; +create table text_table(ts int); +select create_hypertable('text_table', 'ts', chunk_time_interval => 3); +NOTICE: adding not-null constraint to column "ts" + create_hypertable +------------------------- + (3,public,text_table,t) +(1 row) + +alter table text_table set (timescaledb.compress); +WARNING: there was some uncertainty picking the default segment by for the hypertable: You do not have any indexes on columns that can be used for segment_by and thus we are not using segment_by for compression. 
Please make sure you are not missing any indexes +NOTICE: default segment by for hypertable "text_table" is set to "" +NOTICE: default order by for hypertable "text_table" is set to "ts DESC" +insert into text_table select 0 /*, default */ from generate_series(1, 1000) x; +select count(compress_chunk(x)) from show_chunks('text_table') x; + count +------- + 1 +(1 row) + +alter table text_table add column a text default 'default'; +alter table text_table set (timescaledb.compress, + timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a'); +insert into text_table select 1, '' from generate_series(1, 1000) x; +insert into text_table select 2, 'same' from generate_series(1, 1000) x; +insert into text_table select 3, 'different' || x from generate_series(1, 1000) x; +insert into text_table select 4, case when x % 2 = 0 then null else 'same-with-nulls' end from generate_series(1, 1000) x; +insert into text_table select 5, case when x % 2 = 0 then null else 'different-with-nulls' || x end from generate_series(1, 1000) x; +select count(compress_chunk(x)) from show_chunks('text_table') x; + count +------- + 2 +(1 row) + +vacuum analyze text_table; +-- Test the GroupAggregate by a text column. Not vectorized for now. +select a, count(*) from text_table group by a order by a limit 10; + a | count +-------------------------+------- + | 1000 + default | 1000 + different-with-nulls1 | 1 + different-with-nulls101 | 1 + different-with-nulls103 | 1 + different-with-nulls105 | 1 + different-with-nulls107 | 1 + different-with-nulls109 | 1 + different-with-nulls11 | 1 + different-with-nulls111 | 1 +(10 rows) + +-- The hash grouping policies do not support the GroupAggregate mode in the +-- reverse order. +select a, count(*) from text_table group by a order by a desc limit 10; + a | count +-----------------+------- + $ | 1000 + same-with-nulls | 500 + same | 1000 + different999 | 1 + different998 | 1 + different997 | 1 + different996 | 1 + different995 | 1 + different994 | 1 + different993 | 1 +(10 rows) + +-- with NULLS FIRST +select count(decompress_chunk(x)) from show_chunks('text_table') x; + count +------- + 2 +(1 row) + +alter table text_table set (timescaledb.compress, + timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a nulls first'); +select count(compress_chunk(x)) from show_chunks('text_table') x; + count +------- + 2 +(1 row) + +select a, count(*) from text_table group by a order by a nulls first limit 10; + a | count +-------------------------+------- + $ | 1000 + | 1000 + default | 1000 + different-with-nulls1 | 1 + different-with-nulls101 | 1 + different-with-nulls103 | 1 + different-with-nulls105 | 1 + different-with-nulls107 | 1 + different-with-nulls109 | 1 + different-with-nulls11 | 1 +(10 rows) + +-- TODO verify that this works with the serialized hash grouping strategy +select ts, a, count(*) from text_table group by ts, a order by ts, a limit 10; + ts | a | count +----+---------------+------- + 0 | default | 1000 + 1 | | 1000 + 2 | same | 1000 + 3 | different1 | 1 + 3 | different10 | 1 + 3 | different100 | 1 + 3 | different1000 | 1 + 3 | different101 | 1 + 3 | different102 | 1 + 3 | different103 | 1 +(10 rows) + +select a, ts, count(*) from text_table group by a, ts order by a desc, ts desc limit 10; + a | ts | count +-----------------+----+------- + $ | 5 | 500 + $ | 4 | 500 + same-with-nulls | 4 | 500 + same | 2 | 1000 + different999 | 3 | 1 + different998 | 3 | 1 + different997 | 3 | 1 + different996 | 3 | 1 + different995 | 3 | 1 + different994 | 3 
| 1 +(10 rows) + +reset max_parallel_workers_per_gather; +reset timescaledb.debug_require_vector_agg; +reset enable_hashagg; diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 243aab81a..9bdebf28a 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -45,6 +45,7 @@ set(TEST_FILES skip_scan.sql transparent_decompression_join_index.sql vector_agg_functions.sql + vector_agg_groupagg.sql vector_agg_param.sql vectorized_aggregation.sql) diff --git a/tsl/test/sql/vector_agg_groupagg.sql b/tsl/test/sql/vector_agg_groupagg.sql new file mode 100644 index 000000000..6c04205d3 --- /dev/null +++ b/tsl/test/sql/vector_agg_groupagg.sql @@ -0,0 +1,101 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +-- Check that the vectorized aggregation works properly in the GroupAggregate +-- mode. + +\pset null $ + +set max_parallel_workers_per_gather = 0; +set enable_hashagg to off; + +--set timescaledb.enable_vectorized_aggregation to off; +--set timescaledb.debug_require_vector_agg to 'forbid'; + +create table groupagg(t int, s int, value int); +select create_hypertable('groupagg', 't', chunk_time_interval => 10000); + +insert into groupagg +select + xslow * 10, + case when xfast = 13 then null else xfast end, + xfast * 7 + xslow * 3 +from generate_series(10, 99) xfast, + generate_series(1, 1487) xslow +; + +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; +vacuum analyze groupagg; + + +set timescaledb.debug_require_vector_agg to 'require'; +select s, sum(value) from groupagg group by s order by s limit 10; + +-- The hash grouping policies do not support the GroupAggregate mode in the +-- reverse order. +set timescaledb.debug_require_vector_agg to 'forbid'; +select s, sum(value) from groupagg group by s order by s desc limit 10; +reset timescaledb.debug_require_vector_agg; + +-- Also test the NULLS FIRST order. +select count(decompress_chunk(x)) from show_chunks('groupagg') x; +alter table groupagg set (timescaledb.compress, timescaledb.compress_segmentby = '', + timescaledb.compress_orderby = 's nulls first'); +select count(compress_chunk(x)) from show_chunks('groupagg') x; +vacuum analyze groupagg; + +set timescaledb.debug_require_vector_agg to 'require'; +select s, sum(value) from groupagg group by s order by s nulls first limit 10; +reset timescaledb.debug_require_vector_agg; + + +-- More tests for dictionary encoding. This is not vectorized at the moment. 
+set timescaledb.debug_require_vector_agg to 'forbid'; +create table text_table(ts int); +select create_hypertable('text_table', 'ts', chunk_time_interval => 3); +alter table text_table set (timescaledb.compress); + +insert into text_table select 0 /*, default */ from generate_series(1, 1000) x; +select count(compress_chunk(x)) from show_chunks('text_table') x; + +alter table text_table add column a text default 'default'; +alter table text_table set (timescaledb.compress, + timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a'); + +insert into text_table select 1, '' from generate_series(1, 1000) x; +insert into text_table select 2, 'same' from generate_series(1, 1000) x; +insert into text_table select 3, 'different' || x from generate_series(1, 1000) x; +insert into text_table select 4, case when x % 2 = 0 then null else 'same-with-nulls' end from generate_series(1, 1000) x; +insert into text_table select 5, case when x % 2 = 0 then null else 'different-with-nulls' || x end from generate_series(1, 1000) x; + +select count(compress_chunk(x)) from show_chunks('text_table') x; +vacuum analyze text_table; + + +-- Test the GroupAggregate by a text column. Not vectorized for now. +select a, count(*) from text_table group by a order by a limit 10; + +-- The hash grouping policies do not support the GroupAggregate mode in the +-- reverse order. +select a, count(*) from text_table group by a order by a desc limit 10; + + +-- with NULLS FIRST +select count(decompress_chunk(x)) from show_chunks('text_table') x; +alter table text_table set (timescaledb.compress, + timescaledb.compress_segmentby = '', timescaledb.compress_orderby = 'a nulls first'); +select count(compress_chunk(x)) from show_chunks('text_table') x; + +select a, count(*) from text_table group by a order by a nulls first limit 10; + + +-- TODO verify that this works with the serialized hash grouping strategy +select ts, a, count(*) from text_table group by ts, a order by ts, a limit 10; +select a, ts, count(*) from text_table group by a, ts order by a desc, ts desc limit 10; + +reset max_parallel_workers_per_gather; +reset timescaledb.debug_require_vector_agg; +reset enable_hashagg;
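
As a quick way to see the two plan shapes this patch distinguishes, the queries below can be run against the groupagg table from the test file above. This is only a sketch, not part of the patch: it assumes the table is still compressed with timescaledb.compress_orderby = 's' as in the first test block. The ascending query lets the hash grouping policy preserve the incoming key order, so vectorized aggregation is expected to remain in the plan; the descending query forces the DecompressChunk child to read batches in reverse, which after this fix makes the planner fall back to a regular GroupAggregate.

-- Ascending order: the hash grouping policy preserves the incoming key
-- order, so the vectorized aggregation node is expected to stay in the plan.
explain (costs off)
select s, sum(value) from groupagg group by s order by s limit 10;

-- Descending order: the compressed batches have to be read in reverse,
-- which the hash grouping policies do not support in GroupAggregate mode,
-- so the planner is expected to skip vectorized aggregation here.
explain (costs off)
select s, sum(value) from groupagg group by s order by s desc limit 10;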