timescaledb/tsl/test/sql/vector_agg_functions.sql
Erik Nordström 3170014030 Fix NaN-handling for vectorized aggregation
The vector agg functions didn't handle NaN-floats properly for min/max
functions, which produced the wrong `min()` output when NaN values
were present (e.g., picking NaN over -Inf). NaN-checks are different
for min and max so the checks are moved to the predicate macro instead
of being defined in the template function.

The previously erroneous handling of NaN is evident by some of the
changes in the test output. However, some queries didn't run any
actual vectorized agg plans when they should have, thus "accidentally"
producing the correct min result in the test file. In those cases,
instead of the vectorized plan, the test ran an init plan doing a sort
with a limit of 1 to find the min. Disabling sort in the test ensures
the plan is vectorized, and thus produces the erroneous result when
the fix is not present.
2025-01-10 09:16:37 +01:00

185 lines
6.6 KiB
PL/PgSQL

-- This file and its contents are licensed under the Timescale License.
-- Please see the included NOTICE for copyright information and
-- LICENSE-TIMESCALE for a copy of the license.
\c :TEST_DBNAME :ROLE_SUPERUSER
-- helper function: float -> pseudorandom float [-0.5..0.5]
-- Deterministic: hashfloat8() hashes the input to a 32-bit integer,
-- and dividing by 2^32 scales that into the [-0.5..0.5] range, so the
-- generated test data is reproducible across runs.
CREATE OR REPLACE FUNCTION mix(x anyelement) RETURNS float8 AS $$
SELECT hashfloat8(x::float8) / pow(2, 32)
$$ LANGUAGE SQL;
-- Test data dimensions: total row count is CHUNK_ROWS * CHUNKS, spread
-- over GROUPING_CARDINALITY distinct values of the space column 's'.
\set CHUNKS 2::int
\set CHUNK_ROWS 100000::int
\set GROUPING_CARDINALITY 10::int
-- Main test table: a hypertable partitioned on the low-cardinality
-- column 's', which is also used as the compression segmentby column
-- below, so each 's' value maps to its own compressed segment.
create table aggfns(t int, s int,
cint2 int2, cint4 int4, cint8 int8,
cfloat4 float4,
cts timestamp, ctstz timestamptz,
cdate date);
select create_hypertable('aggfns', 's', chunk_time_interval => :GROUPING_CARDINALITY / :CHUNKS);
-- Data generator for aggfns. mix() yields deterministic pseudorandom
-- values; specific (s, t) combinations inject NULLs and the special
-- float values NaN/+Inf/-Inf to exercise vectorized min()/max()
-- handling of non-finite floats.
create view source as
select s * 10000 + t as t,
s,
-- NULL for some rows, to test NULL handling in vectorized aggregates
case when t % 1051 = 0 then null
else (mix(s + t * 1019) * 32767)::int2 end as cint2,
(mix(s + t * 1021) * 32767)::int4 as cint4,
(mix(s + t * 1031) * 32767)::int8 as cint8,
-- Special float values: min()/max() must not pick NaN over -Inf/+Inf
case when s = 1 and t = 1061 then 'nan'::float4
when s = 2 and t = 1061 then '+inf'::float4
when s = 3 and t = 1061 then '-inf'::float4
else (mix(s + t * 1033) * 100::int)::float4 end as cfloat4,
'2021-01-01 01:01:01'::timestamp + interval '1 second' * (s * 10000) as cts,
'2021-01-01 01:01:01'::timestamptz + interval '1 second' * (s * 10000) as ctstz,
'2021-01-01'::date + interval '1 day' * (s * 10000) as cdate
from
generate_series(1::int, :CHUNK_ROWS * :CHUNKS / :GROUPING_CARDINALITY) t,
generate_series(0::int, :GROUPING_CARDINALITY - 1::int) s(s)
;
-- Insert one segment and compress it BEFORE adding the ss/cfloat8/x
-- columns, so the data compressed here has no physical values for
-- those columns and must fall back to the column defaults.
insert into aggfns select * from source where s = 1;
alter table aggfns set (timescaledb.compress, timescaledb.compress_orderby = 't',
timescaledb.compress_segmentby = 's');
select count(compress_chunk(x)) from show_chunks('aggfns') x;
-- Columns added after the first compression: already-compressed data
-- only sees these defaults.
alter table aggfns add column ss int default 11;
alter table aggfns add column cfloat8 float8 default '13';
alter table aggfns add column x text default '11';
-- Second round of data with explicit values for the new columns,
-- constructed so that 'ss' covers the all-NULL, some-NULL,
-- same-as-default, and all-value batch shapes.
insert into aggfns
select *, ss::text as x from (
select *
, case
-- null in entire batch
when s = 2 then null
-- null for some rows
when s = 3 and t % 1053 = 0 then null
-- for some rows same as default
when s = 4 and t % 1057 = 0 then 11
-- not null for entire batch
else s
end as ss
, (mix(s + t * 1039) * 100)::float8 as cfloat8
from source where s != 1
) t
;
select count(compress_chunk(x)) from show_chunks('aggfns') x;
-- Freeze and analyze for stable statistics and plan choice.
vacuum freeze analyze aggfns;
-- Table for edge cases around compressed-batch sizes: each segment 's'
-- gets a different number of rows (61 + s, plus jumps of 64 and 2048
-- at s = 5 and s = 10), so filters can match exactly at, just before,
-- or just after a batch boundary.
create table edges(t int, s int, ss int, f1 int);
select create_hypertable('edges', 't', chunk_time_interval => 100000);
alter table edges set (timescaledb.compress, timescaledb.compress_segmentby='s');
insert into edges select
s * 1000 + f1 as t,
s,
s,
f1
from generate_series(0, 12) s,
lateral generate_series(0, 60 + s + (s / 5::int) * 64 + (s / 10::int) * 2048) f1
;
-- A single row in a separate chunk/segment (size-1 batch edge case).
insert into edges select 200000 t, 111 s, 111 ss, 1 f1;
select count(compress_chunk(x)) from show_chunks('edges') x;
vacuum freeze analyze edges;
-- We can't vectorize some aggregate functions on platforms without int128
-- support. Just relax the test requirements for them. I don't want to disable
-- this test in release builds, and don't want to have the guc in release builds,
-- so we'll assume we have int128 in all release builds.
-- The union all + limit 1 trick makes 'require' the fallback when the
-- debug GUC does not exist (i.e., in release builds): the pg_settings
-- lookup then returns no row and only the constant branch survives.
select case when setting::bool then 'require' else 'allow' end guc_value
from pg_settings where name = 'timescaledb.debug_have_int128'
union all select 'require' guc_value
limit 1
\gset
set timescaledb.debug_require_vector_agg = :'guc_value';
---- Uncomment to generate reference. Note that there are minor discrepancies
---- on float4 due to different numeric stability in our and PG implementations.
--set timescaledb.enable_chunkwise_aggregation to off; set timescaledb.enable_vectorized_aggregation to off; set timescaledb.debug_require_vector_agg = 'forbid';
-- Parallel plans would bypass the vectorized-agg requirement checks.
set max_parallel_workers_per_gather = 0;
-- Disable sorting to force vectorized agg plans for min and max,
-- which otherwise can produce a non-vectorized init-plan that does a
-- sort with limit 1.
set enable_sort = false;
-- Generate the cross product of (explain | execute) x variable x
-- aggregate function x where-condition x grouping, then run every
-- generated statement via \gexec. Concatenating NULL with || yields
-- NULL, and format('%s', NULL) renders as an empty string, so the
-- optional where/group by/order by clauses simply vanish when their
-- component is NULL. The trailing "function, variable" arguments are
-- not consumed by the seven %s placeholders (extra format() arguments
-- are ignored).
select
format('%sselect %s%s(%s) from aggfns%s%s%s;',
explain,
grouping || ', ',
function, variable,
' where ' || condition,
' group by ' || grouping,
format(' order by %s(%s), ', function, variable) || grouping || ' limit 10',
function, variable)
from
unnest(array[
'explain (costs off) ',
null]) explain,
unnest(array[
't',
's',
'ss',
'cint2',
'cint4',
'cint8',
'cfloat4',
'cfloat8',
'cts',
'ctstz',
'cdate',
'*']) variable,
unnest(array[
'min',
'max',
'sum',
'avg',
'stddev',
'count']) function,
unnest(array[
null,
'cfloat8 > 0',
'cfloat8 <= 0',
'cfloat8 < 1000' /* vectorized qual is true for all rows */,
'cfloat8 > 1000' /* vectorized qual is false for all rows */,
'cint2 is null']) with ordinality as condition(condition, n),
unnest(array[
null,
's',
'ss']) with ordinality as grouping(grouping, n)
where
true
-- Run only the queries themselves, not their EXPLAIN variants.
and (explain is null /* or condition is null and grouping = 's' */)
-- count(*) is the only aggregate applicable to '*'.
and (variable != '*' or function = 'count')
-- Non-numeric columns only support min/max.
and (variable not in ('t', 'cts', 'ctstz', 'cdate') or function in ('min', 'max'))
-- This is not vectorized yet
and (variable != 'cint8' or function != 'stddev')
and (function != 'count' or variable in ('cint2', 's', '*'))
-- The is-null condition is only interesting for the nullable column.
and (condition is distinct from 'cint2 is null' or variable = 'cint2')
order by explain, condition.n, variable, function, grouping.n
\gexec
-- Test multiple aggregate functions as well.
select count(*), count(cint2), min(cfloat4), cint2 from aggfns group by cint2
order by count(*) desc, cint2 limit 10
;
-- Test edge cases for various batch sizes and the filter matching around batch
-- end.
select count(*) from edges;
select s, count(*) from edges group by 1 order by 1;
-- f1 values 63/64/65 lie near the per-segment maximum (60 + s + ...),
-- so the filter matches at or just past the end of the shorter
-- segments' batches and not at all in others.
select s, count(*), min(f1) from edges where f1 = 63 group by 1 order by 1;
select s, count(*), min(f1) from edges where f1 = 64 group by 1 order by 1;
select s, count(*), min(f1) from edges where f1 = 65 group by 1 order by 1;
select ss, count(*), min(f1) from edges where f1 = 63 group by 1 order by 1;
select ss, count(*), min(f1) from edges where f1 = 64 group by 1 order by 1;
select ss, count(*), min(f1) from edges where f1 = 65 group by 1 order by 1;
reset max_parallel_workers_per_gather;