1
0
mirror of https://github.com/timescale/timescaledb.git synced 2025-05-20 20:54:29 +08:00
Alexander Kuzmenkov 063a9dae29 Improve cost model for data node scans
1) Simplify the path generation for the parameterized data node scans.
2) Adjust the data node scan cost if it's an index scan, instead of always
   treating it as a sequential scan.
3) Hard-code the grouping estimation for distributed hypertable, instead
   of using the totally bogus per-column ndistinct value.
4) Add the GUC to disable parameterized data node scan.
5) Add more tests.
2023-02-08 16:12:01 +04:00

298 lines
9.3 KiB
PL/PgSQL

-- This file and its contents are licensed under the Timescale License.
-- Please see the included NOTICE for copyright information and
-- LICENSE-TIMESCALE for a copy of the license.
-- Test parameterized data node scan.
\c :TEST_DBNAME :ROLE_CLUSTER_SUPERUSER;
-- The data node database name is the test database name with a "_1" suffix.
\set DN_DBNAME_1 :TEST_DBNAME _1
-- pg_regress doesn't drop these databases for repeated invocation such as in
-- the flaky check.
set client_min_messages to ERROR;
drop database if exists :"DN_DBNAME_1";
-- Set up a single data node for the distributed hypertable used below.
select 1 from add_data_node('data_node_1', host => 'localhost',
database => :'DN_DBNAME_1');
grant usage on foreign server data_node_1 to public;
grant create on schema public to :ROLE_1;
-- Run the rest of the test as a non-superuser role.
set role :ROLE_1;
reset client_min_messages;
-- helper function: float -> pseudorandom float [0..1].
create or replace function mix(x float4) returns float4 as $$ select ((hashfloat4(x) / (pow(2., 31) - 1) + 1) / 2)::float4 $$ language sql;
-- distributed hypertable
create table metric_dist(ts timestamptz, id int, value float);
select create_distributed_hypertable('metric_dist', 'ts', 'id');
-- 1M rows spread over one year; ids in [0, 20), values in [0, 50),
-- generated deterministically via the mix() hash so runs are reproducible.
insert into metric_dist
select '2022-02-02 02:02:02+03'::timestamptz + interval '1 year' * mix(x),
mix(x + 1.) * 20,
mix(x + 2.) * 50
from generate_series(1, 1000000) x(x)
;
analyze metric_dist;
-- Sanity check the chunking of the distributed hypertable.
select count(*) from show_chunks('metric_dist');
-- dictionary
-- Local (non-distributed) lookup table joined against the hypertable below.
-- Only a few 'cpu*' names match the test queries; the 'other*' rows pad the
-- table so the planner doesn't treat it as trivially small.
create table metric_name(id int, name text collate "C",
constraint metric_name_name unique (name),
constraint metric_name_id primary key (id));
insert into metric_name values (1, 'cpu1'), (3, 'cpu3'), (7, 'cpu7');
insert into metric_name select x, 'other' || x
from generate_series(1000, 10000) x
;
analyze metric_name;
-- for predictable plans
set enable_hashagg to off;
set enable_material to off;
set enable_mergejoin to off;
-- not present on PG 12
-- Suppress echoing so output stays identical whether or not the GUC exists.
\set ECHO errors
select 'set enable_memoize to off' from pg_settings where name = 'enable_memoize' \gexec
\set ECHO all
-- Subquery + IN
-- The data node scan should be parameterized by the id values produced by the
-- local metric_name lookup.
select id, max(value), count(*)
from metric_dist
where id in (select id from metric_name where name like 'cpu%')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
order by id
;
explain (costs off, verbose)
select id, max(value), count(*)
from metric_dist
where id in (select id from metric_name where name like 'cpu%')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
order by id
;
-- Check that the GUC to disable these plans works. Our cost model is very
-- heuristic and may be often wrong, so there needs to be a way to disable them.
set timescaledb.enable_parameterized_data_node_scan to false;
-- Same query as above: the result must not change, only the plan.
select id, max(value), count(*)
from metric_dist
where id in (select id from metric_name where name like 'cpu%')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
order by id
;
explain (costs off, verbose)
select id, max(value), count(*)
from metric_dist
where id in (select id from metric_name where name like 'cpu%')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
order by id
;
reset timescaledb.enable_parameterized_data_node_scan;
-- Shippable EC join
-- The join key (id) comes from an equivalence class, so the condition can be
-- shipped to the data node as a scan parameter.
select name, max(value), count(*)
from metric_dist join metric_name using (id)
where name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist join metric_name using (id)
where name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- Shipping still might make sense if the local table is outer.
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist right join metric_name using (id)
where name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- Shipping doesn't make sense if the distributed table is outer.
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist
left join (select * from metric_name where name like 'cpu%') t using (id)
where ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- Non-shippable EC join
-- concat() prevents the join condition from being shipped to the data node.
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist join metric_name on name = concat('cpu', metric_dist.id)
where metric_name.name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- Shippable non-EC join. The weird condition is to only use immutable functions
-- that can be shipped to the remote node. `id::text` does CoerceViaIO which is
-- not generally shippable. And `int4out` returns cstring, not text, that's why
-- the `textin` is needed.
select name, max(value), count(*)
from metric_dist join metric_name
on texteq('cpu' || textin(int4out(metric_dist.id)), name)
where metric_name.name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist join metric_name
on texteq('cpu' || textin(int4out(metric_dist.id)), name)
where metric_name.name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- Non-shippable non-EC join.
-- Same as above but with concat(), which again blocks shipping.
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist join metric_name
on texteq(concat('cpu', textin(int4out(metric_dist.id))), name)
where metric_name.name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- distinct on, order by, limit 1, with subquery
select distinct on (id)
id, ts, value
from metric_dist
where id in (select id from metric_name where name like 'cpu%')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
order by id, ts, value
limit 1
;
explain (costs off, verbose)
select distinct on (id)
id, ts, value
from metric_dist
where id in (select id from metric_name where name like 'cpu%')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
order by id, ts, value
limit 1
;
-- distinct on, order by, limit 1, with explicit join
-- Same shape as above but the dictionary is joined instead of used in IN().
select distinct on (name)
name, ts, value
from metric_dist join metric_name using (id)
where name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
order by name, ts, value
limit 1
;
explain (costs off, verbose)
select distinct on (name)
name, ts, value
from metric_dist join metric_name using (id)
where name like 'cpu%'
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
order by name, ts, value
limit 1
;
-- If there are a lot of rows chosen from the local table, the parameterized
-- nested loop might download the entire dist table or even more than that (in
-- case of not equi-join or duplicate join keys).
-- Check that the parameterized plan is not chosen in this case.
-- Note: no filter on name here, so all ~9k metric_name rows qualify.
explain (costs off, verbose)
select name, max(value), count(*)
from metric_dist
join metric_name using (id)
where ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by name
order by name
;
-- An interesting special case is when the remote SQL has a parameter, but it is
-- the result of an initplan. It's not "parameterized" in the join sense, because
-- there is only one param value. This is the most efficient plan for querying a
-- small number of ids.
explain (costs off, verbose)
select id, max(value)
from metric_dist
where id = any((select array_agg(id) from metric_name where name like 'cpu%')::int[])
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
order by id
;
-- Multiple joins. Test both EC and non-EC (texteq) join in one query.
create table metric_location(id int, location text);
insert into metric_location values (1, 'Yerevan'), (3, 'Dilijan'), (7, 'Stepanakert');
analyze metric_location;
select id, max(value)
from metric_dist natural join metric_location natural join metric_name
where name like 'cpu%' and texteq(location, 'Yerevan')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
;
explain (costs off, verbose)
select id, max(value)
from metric_dist natural join metric_location natural join metric_name
where name like 'cpu%' and texteq(location, 'Yerevan')
and ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
group by id
;
-- Multiple joins on different variables. Use a table instead of a CTE for saner
-- stats.
-- distinct on (id) with "order by id, value desc" picks, per id, the timestamp
-- of the row with the highest value.
create table max_value_times as
select distinct on (id) id, ts from metric_dist
where ts between '2022-02-02 02:02:02+03' and '2022-03-03 02:02:02+03'
order by id, value desc
;
analyze max_value_times;
explain (costs off, verbose)
select id, value
from metric_dist natural join max_value_times natural join metric_name
where name like 'cpu%'
order by 1
;
-- Two distributed hypertables, each joined to reference and all joined together.
-- The query finds the percentage of time points where one metric is higher than
-- another, and also tweaked not to use initplans. Requires hash join.
-- explain options chosen (costs/timing/summary off) keep the output stable
-- across runs for the regression comparison.
explain (analyze, verbose, costs off, timing off, summary off)
select count(*) filter (where m1.value > m2.value) / count(*)
from metric_dist m1
join metric_dist m2 using (ts)
where m1.id in (select id from metric_name where name = 'cpu1')
and m2.id in (select id from metric_name where name = 'cpu3')
;