timescaledb/tsl/test/sql/vectorized_aggregation.sql
Alexander Kuzmenkov 11e866e378
Vectorized aggregation with grouping by one fixed-size column (#7341)
The implementation uses the Postgres simplehash hash table for by-value
fixed-size compressed columns.

The biggest improvement on a "sensible" query is about 90%, and a couple
of queries show bigger improvements but these are very synthetic cases
that don't make much sense:

https://grafana.ops.savannah-dev.timescale.com/d/fasYic_4z/compare-akuzm?orgId=1&var-branch=All&var-run1=3815&var-run2=3816&var-threshold=0.02&var-use_historical_thresholds=true&var-threshold_expression=2%20%2A%20percentile_cont%280.90%29&var-exact_suite_version=false&from=now-2d&to=now

---------

Signed-off-by: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com>
Co-authored-by: Erik Nordström <819732+erimatnor@users.noreply.github.com>
2025-01-02 19:46:15 +00:00

418 lines
12 KiB
SQL

-- This file and its contents are licensed under the Timescale License.
-- Please see the included NOTICE for copyright information and
-- LICENSE-TIMESCALE for a copy of the license.
\set EXPLAIN 'EXPLAIN (VERBOSE, COSTS OFF)'
-- Fixture: a hypertable partitioned on "time" with compression enabled and
-- segment_by_value as the only segmentby column. int_value and float_value
-- are therefore regular (non-segmentby) columns.
CREATE TABLE testtable (
    time             timestamptz NOT NULL,
    segment_by_value integer,
    int_value        integer,
    float_value      double precision
);
SELECT FROM create_hypertable(relation => 'testtable', time_column_name => 'time');
ALTER TABLE testtable SET (
    timescaledb.compress,
    timescaledb.compress_segmentby = 'segment_by_value'
);
-- One row for every (day, value) combination: daily timestamps from
-- 1980-01-01 to 1980-03-01 crossed with the integers -10 .. 100. All three
-- data columns receive the same value.
INSERT INTO testtable (time, segment_by_value, int_value, float_value)
SELECT
    time AS time,
    value AS segment_by_value,
    value AS int_value,
    value AS float_value
FROM generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS days(time)
CROSS JOIN generate_series(-10, 100, 1) AS vals(value)
ORDER BY time;
-- Reference sums computed before any chunk is compressed, i.e. without any
-- vectorization.
SELECT
    sum(segment_by_value),
    sum(int_value),
    sum(float_value)
FROM testtable;
---
-- Tests with some chunks compressed
---
-- Compress exactly three chunks and leave the rest uncompressed. LIMIT
-- without ORDER BY may return any three rows, so order the chunks explicitly
-- (by their regclass value) to make the selection deterministic.
SELECT compress_chunk(ch) FROM show_chunks('testtable') ch ORDER BY ch LIMIT 3;
-- Queries against a hypertable where only some chunks are compressed.
-- :EXPLAIN expands to the psql variable defined at the top of the file
-- (EXPLAIN (VERBOSE, COSTS OFF)), so a ":EXPLAIN" line followed by a SELECT
-- prints the plan of that SELECT instead of running it.
--
-- Vectorized aggregation possible
SELECT sum(segment_by_value) FROM testtable;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable;
-- Vectorization possible - filter on the segmentby column
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable WHERE segment_by_value > 0;
-- Vectorization with filters on compressed (non-segmentby) columns, alone
-- and combined with a segmentby filter
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable WHERE segment_by_value > 0 AND int_value > 0;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable WHERE int_value > 0;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable WHERE float_value > 0;
-- Vectorization possible with grouping by one fixed-size column
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable GROUP BY float_value;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable GROUP BY int_value;
-- Vectorization not possible with grouping by multiple columns
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable GROUP BY int_value, float_value;
-- Vectorization possible with grouping by a segmentby column.
:EXPLAIN
SELECT sum(int_value) FROM testtable GROUP BY segment_by_value;
-- Same grouping with the grouping column in the target list, in both column
-- orders; ORDER BY 1, 2 keeps the result rows in a stable order.
:EXPLAIN
SELECT sum(segment_by_value), segment_by_value FROM testtable GROUP BY segment_by_value ORDER BY 1, 2;
SELECT sum(segment_by_value), segment_by_value FROM testtable GROUP BY segment_by_value ORDER BY 1, 2;
:EXPLAIN
SELECT segment_by_value, sum(segment_by_value) FROM testtable GROUP BY segment_by_value ORDER BY 1, 2;
SELECT segment_by_value, sum(segment_by_value) FROM testtable GROUP BY segment_by_value ORDER BY 1, 2;
-- Vectorized aggregation possible on the integer non-segmentby column
SELECT sum(int_value) FROM testtable;
:EXPLAIN
SELECT sum(int_value) FROM testtable;
-- Vectorized aggregation possible on the double precision column
SELECT sum(float_value) FROM testtable;
:EXPLAIN
SELECT sum(float_value) FROM testtable;
---
-- Tests with all chunks compressed
---
-- Compress every chunk of the hypertable; if_not_compressed => true is passed
-- because some chunks were already compressed earlier in this file.
SELECT compress_chunk(ch, if_not_compressed => true) FROM show_chunks('testtable') ch;
-- Vectorized aggregation possible (segmentby column): result, then plan
SELECT sum(segment_by_value) FROM testtable;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable;
-- Vectorized aggregation possible (non-segmentby column): result, then plan
SELECT sum(int_value) FROM testtable;
:EXPLAIN
SELECT sum(int_value) FROM testtable;
---
-- Tests with some chunks partially compressed
---
-- Insert one extra row after compression so that the chunk covering
-- 1980-01-02 holds both compressed and uncompressed data.
INSERT INTO testtable (time, segment_by_value, int_value, float_value)
VALUES ('1980-01-02 01:00:00-00', 0, 0, 0);
-- Vectorized aggregation possible
SELECT sum(segment_by_value) FROM testtable;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable;
-- Vectorized aggregation possible
SELECT sum(int_value) FROM testtable;
:EXPLAIN
SELECT sum(int_value) FROM testtable;
-- Vectorized aggregation not possible when the aggregate argument is an
-- expression (abs(int_value)) rather than a plain column
SELECT sum(abs(int_value)) FROM testtable;
:EXPLAIN
SELECT sum(abs(int_value)) FROM testtable;
-- Vectorized aggregation NOT possible when it is switched off via the GUC
SET timescaledb.enable_vectorized_aggregation = OFF;
:EXPLAIN
SELECT sum(int_value) FROM testtable;
RESET timescaledb.enable_vectorized_aggregation;
-- Vectorized aggregation without bulk decompression only possible for
-- segmentby columns.
SET timescaledb.enable_bulk_decompression = OFF;
:EXPLAIN
SELECT sum(int_value) FROM testtable;
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable;
SELECT sum(segment_by_value) FROM testtable;
RESET timescaledb.enable_bulk_decompression;
-- Using the same sum function multiple times is supported by vectorization
-- (non-segmentby column)
:EXPLAIN
SELECT sum(int_value), sum(int_value) FROM testtable;
-- Using the same sum function multiple times is supported by vectorization
-- (segmentby column)
:EXPLAIN
SELECT sum(segment_by_value), sum(segment_by_value) FROM testtable;
-- Performing a sum on multiple columns is supported.
:EXPLAIN
SELECT sum(int_value), sum(segment_by_value) FROM testtable;
SELECT sum(int_value), sum(segment_by_value) FROM testtable;
-- Using the sum function together with another non-vector capable aggregate
-- (bit_or) is not supported (non-segmentby column)
:EXPLAIN
SELECT sum(int_value), bit_or(int_value) FROM testtable;
-- Using the sum function together with another non-vector capable aggregate
-- (bit_or) is not supported (segmentby column)
:EXPLAIN
SELECT sum(segment_by_value), bit_or(segment_by_value) FROM testtable;
---
-- Tests with only non-positive values (-10 .. 0)
---
-- Start from an empty hypertable and reload it with daily rows whose three
-- data columns all carry the same value from -10 .. 0.
TRUNCATE testtable;
INSERT INTO testtable (time, segment_by_value, int_value, float_value)
SELECT
    time AS time,
    value AS segment_by_value,
    value AS int_value,
    value AS float_value
FROM generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS days(time)
CROSS JOIN generate_series(-10, 0, 1) AS vals(value)
ORDER BY time;
-- Reference sums on uncompressed data (no vectorization involved)
SELECT
    sum(segment_by_value),
    sum(int_value),
    sum(float_value)
FROM testtable;
SELECT compress_chunk(ch)
FROM show_chunks('testtable') ch;
-- Same sums on compressed data, where vectorized aggregation can be used
SELECT sum(segment_by_value)
FROM testtable;
SELECT sum(int_value)
FROM testtable;
---
-- Tests with only non-negative values (0 .. 10)
---
-- Reload the hypertable with daily rows whose three data columns all carry
-- the same value from 0 .. 10.
TRUNCATE testtable;
INSERT INTO testtable
SELECT time AS time,
value AS segment_by_value,
value AS int_value,
value AS float_value
FROM
generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS g1(time),
generate_series(0, 10, 1) AS g2(value)
ORDER BY time;
-- Aggregation result without any vectorization
SELECT sum(segment_by_value), sum(int_value), sum(float_value) FROM testtable;
SELECT compress_chunk(ch) FROM show_chunks('testtable') ch;
-- Aggregation with vectorization
SELECT sum(segment_by_value) FROM testtable;
SELECT sum(int_value) FROM testtable;
-- Vectorization possible - filter on segment_by
:EXPLAIN
SELECT sum(int_value) FROM testtable WHERE segment_by_value > 5;
-- Run the filtered query with vectorized aggregation enabled and then
-- disabled so the two results can be compared.
SELECT sum(int_value) FROM testtable WHERE segment_by_value > 5;
SET timescaledb.enable_vectorized_aggregation = OFF;
SELECT sum(int_value) FROM testtable WHERE segment_by_value > 5;
RESET timescaledb.enable_vectorized_aggregation;
-- Same comparison for a filter that matches no rows: the inserted values
-- never exceed 10.
SELECT sum(int_value) FROM testtable WHERE segment_by_value > 10;
SET timescaledb.enable_vectorized_aggregation = OFF;
SELECT sum(int_value) FROM testtable WHERE segment_by_value > 10;
RESET timescaledb.enable_vectorized_aggregation;
---
-- Tests with parallel plans
---
-- Planner settings for this section: the leader does not execute the
-- parallel part of the plan, there is no minimum table size for a parallel
-- scan, and the parallel setup and per-tuple costs are zero.
-- NOTE(review): presumably chosen so the planner picks a parallel plan on
-- this small data set - confirm against the expected output.
SET parallel_leader_participation = off;
SET min_parallel_table_scan_size = 0;
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
-- Plan first, then the actual result
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable;
SELECT sum(segment_by_value) FROM testtable;
-- Restore the defaults so later sections are not affected
RESET parallel_leader_participation;
RESET min_parallel_table_scan_size;
RESET parallel_setup_cost;
RESET parallel_tuple_cost;
---
-- Tests with only zero values
---
-- Reload the hypertable with one row per day, every data column set to 0.
TRUNCATE testtable;
INSERT INTO testtable (time, segment_by_value, int_value, float_value)
SELECT
    time AS time,
    0 AS segment_by_value,
    0 AS int_value,
    0 AS float_value
FROM generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS days(time)
ORDER BY time;
-- Reference sums on uncompressed data (no vectorization involved)
SELECT
    sum(segment_by_value),
    sum(int_value),
    sum(float_value)
FROM testtable;
SELECT compress_chunk(ch)
FROM show_chunks('testtable') ch;
-- Same sums on compressed data, where vectorized aggregation can be used
SELECT sum(segment_by_value)
FROM testtable;
SELECT sum(int_value)
FROM testtable;
---
-- Tests with null values
---
-- Reload the hypertable with three batches of daily rows: fully populated
-- rows, rows with NULL in the non-segmentby columns, and rows with NULL in
-- the segmentby column.
TRUNCATE testtable;
-- Fully populated rows, values 0 .. 10
INSERT INTO testtable
SELECT time AS time,
value AS segment_by_value,
value AS int_value,
value AS float_value
FROM
generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS g1(time),
generate_series(0, 10, 1) AS g2(value)
ORDER BY time;
-- NULL values for compressed data (int_value and float_value), segmentby
-- values 0 .. 5
INSERT INTO testtable
SELECT time AS time,
value AS segment_by_value,
NULL AS int_value,
NULL AS float_value
FROM
generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS g1(time),
generate_series(0, 5, 1) AS g2(value)
ORDER BY time;
-- NULL values for segment_by, data values 0 .. 2
INSERT INTO testtable
SELECT time AS time,
NULL AS segment_by_value,
value AS int_value,
value AS float_value
FROM
generate_series('1980-01-01 00:00:00-00', '1980-03-01 00:00:00-00', INTERVAL '1 day') AS g1(time),
generate_series(0, 2, 1) AS g2(value)
ORDER BY time;
-- Aggregation result without any vectorization
SELECT sum(segment_by_value), sum(int_value), sum(float_value) FROM testtable;
SELECT compress_chunk(ch) FROM show_chunks('testtable') ch;
-- Aggregation with vectorization: plans first, then results
:EXPLAIN
SELECT sum(segment_by_value) FROM testtable;
:EXPLAIN
SELECT sum(int_value) FROM testtable;
SELECT sum(segment_by_value) FROM testtable;
SELECT sum(int_value) FROM testtable;
-- Vectorizable aggregation filters are supported.
-- No inserted segment_by_value exceeds 10, so the FILTER condition below
-- matches no rows. The query is run with vectorized aggregation disabled and
-- then enabled so the two results can be compared.
:EXPLAIN
SELECT sum(segment_by_value) FILTER (WHERE segment_by_value > 99999) FROM testtable;
SET timescaledb.enable_vectorized_aggregation = OFF;
SELECT sum(segment_by_value) FILTER (WHERE segment_by_value > 99999) FROM testtable;
RESET timescaledb.enable_vectorized_aggregation;
SELECT sum(segment_by_value) FILTER (WHERE segment_by_value > 99999) FROM testtable;
---
-- Tests with multiple segment by values
---
-- Second fixture: a hypertable with two segmentby columns and all columns
-- declared NOT NULL.
CREATE TABLE testtable2 (
time timestamptz NOT NULL,
segment_by_value1 integer NOT NULL,
segment_by_value2 integer NOT NULL,
int_value integer NOT NULL,
float_value double precision NOT NULL);
SELECT FROM create_hypertable(relation=>'testtable2', time_column_name=> 'time');
ALTER TABLE testtable2 SET (timescaledb.compress, timescaledb.compress_segmentby='segment_by_value1, segment_by_value2');
-- One row per (day, value1, value2) combination; int_value and float_value
-- both copy value1.
INSERT INTO testtable2
SELECT time AS time,
value1 AS segment_by_value1,
value2 AS segment_by_value2,
value1 AS int_value,
value1 AS float_value
FROM
generate_series('1980-01-03 00:00:00-00', '1980-03-04 00:00:00-00', INTERVAL '1 day') AS g1(time),
generate_series(-10, 25, 1) AS g2(value1),
generate_series(-30, 20, 1) AS g3(value2)
ORDER BY time;
-- Aggregation result without any vectorization
SELECT sum(segment_by_value1), sum(segment_by_value2) FROM testtable2;
SELECT compress_chunk(ch) FROM show_chunks('testtable2') ch;
-- Refresh planner statistics after compression, before the plans below are
-- captured
ANALYZE testtable2;
-- Sum over each of the two segmentby columns: plan, then result
:EXPLAIN
SELECT sum(segment_by_value1) FROM testtable2;
SELECT sum(segment_by_value1) FROM testtable2;
:EXPLAIN
SELECT sum(segment_by_value2) FROM testtable2;
SELECT sum(segment_by_value2) FROM testtable2;
-- Vectorization possible - filter on segment_by: one segmentby column, both
-- segmentby columns, and both plus a constant-true condition (2>1)
:EXPLAIN
SELECT sum(segment_by_value1) FROM testtable2 WHERE segment_by_value1 > 0;
:EXPLAIN
SELECT sum(segment_by_value1) FROM testtable2 WHERE segment_by_value1 > 0 AND segment_by_value2 > 0;
:EXPLAIN
SELECT sum(segment_by_value1) FROM testtable2 WHERE segment_by_value1 > 0 AND segment_by_value2 > 0 AND 2>1;
-- Vectorization not possible with a filter on both a segment_by column and
-- a compressed value
-- Disable parallel workers to get deterministic query plans on i386
SET max_parallel_workers_per_gather = 0;
:EXPLAIN
SELECT sum(segment_by_value1) FROM testtable2 WHERE segment_by_value1 > 1000 AND int_value > 1000;
RESET max_parallel_workers_per_gather;
-- Can't group by a system column (tableoid here). ORDER BY 1 LIMIT 1 keeps
-- only the smallest per-group sum.
SELECT sum(float_value) FROM testtable2 GROUP BY tableoid ORDER BY 1 LIMIT 1;
-- Postgres versions starting with 16 remove the grouping columns that are
-- equated to a constant. Check that our planning code handles this well.
SELECT sum(float_value), int_value FROM testtable2 WHERE int_value = 1 GROUP BY int_value;