Don't analyze the uncompressed chunk before compressing it

We don't use the statistics on the uncompressed chunk anyway. This
significantly improves the compression performance.
Alexander Kuzmenkov 2024-01-11 12:02:05 +01:00
parent 1cd77b3a24
commit 4e7edf370a
17 changed files with 65 additions and 74 deletions
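
For context, the removed helper ran a plain ANALYZE on the chunk via ExecVacuum() before compressing it. With that gone, the tests (and any workload that relied on the implicit analyze) refresh statistics explicitly after compression. A minimal sketch of the new pattern, using a hypothetical hypertable named "metrics" (the tests below use testtable, bugtab, etc.):

    -- Compress all chunks; compress_chunk() no longer analyzes the
    -- uncompressed chunk as a side effect.
    SELECT compress_chunk(c) FROM show_chunks('metrics') c;
    -- Refresh planner statistics explicitly, as the updated tests do.
    ANALYZE metrics;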

View File

@@ -150,7 +150,6 @@ static int create_segment_filter_scankey(RowDecompressor *decompressor,
     char *segment_filter_col_name, StrategyNumber strategy,
     ScanKeyData *scankeys, int num_scankeys,
     Bitmapset **null_columns, Datum value, bool isnull);
-static void run_analyze_on_chunk(Oid chunk_relid);
 static void create_per_compressed_column(RowDecompressor *decompressor);
 /********************
@@ -425,7 +424,6 @@ compress_chunk(Oid in_table, Oid out_table, int insert_options)
         row_compressor_process_ordered_slot(&row_compressor, slot, mycid);
     }
-    run_analyze_on_chunk(in_rel->rd_id);
     if (row_compressor.rows_compressed_into_current_value > 0)
         row_compressor_flush(&row_compressor, mycid, true);
@@ -530,12 +528,6 @@ compress_chunk_sort_relation(CompressionSettings *settings, Relation in_rel)
     table_endscan(scan);
-    /* Perform an analyze on the chunk to get up-to-date stats before compressing.
-     * We do it at this point because we've just read out the entire chunk into
-     * tuplesort, so its pages are likely to be cached and we can save on I/O.
-     */
-    run_analyze_on_chunk(in_rel->rd_id);
     ExecDropSingleTupleTableSlot(slot);
     tuplesort_performsort(tuplesortstate);
@@ -591,26 +583,8 @@ compress_chunk_populate_sort_info_for_column(CompressionSettings *settings, Oid
     ReleaseSysCache(tp);
 }
-static void
-run_analyze_on_chunk(Oid chunk_relid)
-{
-    VacuumRelation vr = {
-        .type = T_VacuumRelation,
-        .relation = NULL,
-        .oid = chunk_relid,
-        .va_cols = NIL,
-    };
-    VacuumStmt vs = {
-        .type = T_VacuumStmt,
-        .rels = list_make1(&vr),
-        .is_vacuumcmd = false,
-        .options = NIL,
-    };
-    ExecVacuum(NULL, &vs, true);
-}
-/* Find segment by index for setting the correct sequence number if
+/*
+ * Find segment by index for setting the correct sequence number if
  * we are trying to roll up chunks while compressing
  */
 static Oid

View File

@@ -23,6 +23,7 @@ SELECT compress_chunk(c) FROM show_chunks('testtable') c;
  _timescaledb_internal._hyper_1_2_chunk
 (2 rows)
+ANALYZE testtable;
 -- Pushdown aggregation to the chunk level
 SELECT count(*), sum(v0), sum(v1), sum(v2), sum(v3) FROM testtable WHERE time >= '2000-01-01 00:00:00+0' AND time <= '2000-02-01 00:00:00+0';
  count | sum | sum | sum | sum
@@ -61,6 +62,7 @@ SELECT count(*), sum(v0), sum(v1), sum(v2), sum(v3) FROM testtable WHERE time >=
 INSERT INTO testtable(time,device_id,v0,v1,v2,v3)
 SELECT time, device_id, device_id+1, device_id + 2, device_id + 0.5, NULL
 FROM generate_series('2000-01-01 0:00:00+0'::timestamptz,'2000-01-10 23:55:00+0','1day') gtime(time), generate_series(1,5,1) gdevice(device_id);
+ANALYZE testtable;
 -- Pushdown aggregation to the chunk level
 SELECT count(*), sum(v0), sum(v1), sum(v2), sum(v3) FROM testtable WHERE time >= '2000-01-01 00:00:00+0' AND time <= '2000-02-01 00:00:00+0';
  count | sum | sum | sum | sum
@@ -177,10 +179,11 @@ SELECT count(*), sum(v0), sum(v1), sum(v2), sum(v3) FROM testtable WHERE time >=
 Filter: ((compress_hyper_2_4_chunk._ts_meta_max_1 >= ('2000-01-09 00:00:00+0'::cstring)::timestamp with time zone) AND (compress_hyper_2_4_chunk._ts_meta_min_1 <= ('2000-02-01 00:00:00+0'::cstring)::timestamp with time zone))
 -> Partial Aggregate (actual rows=1 loops=1)
 Output: PARTIAL count(*), PARTIAL sum(_hyper_1_2_chunk.v0), PARTIAL sum(_hyper_1_2_chunk.v1), PARTIAL sum(_hyper_1_2_chunk.v2), PARTIAL sum(_hyper_1_2_chunk.v3)
--> Index Scan using _hyper_1_2_chunk_testtable_time_idx on _timescaledb_internal._hyper_1_2_chunk (actual rows=10 loops=1)
+-> Seq Scan on _timescaledb_internal._hyper_1_2_chunk (actual rows=10 loops=1)
 Output: _hyper_1_2_chunk.v0, _hyper_1_2_chunk.v1, _hyper_1_2_chunk.v2, _hyper_1_2_chunk.v3
-Index Cond: ((_hyper_1_2_chunk."time" >= ('2000-01-09 00:00:00+0'::cstring)::timestamp with time zone) AND (_hyper_1_2_chunk."time" <= ('2000-02-01 00:00:00+0'::cstring)::timestamp with time zone))
-(22 rows)
+Filter: ((_hyper_1_2_chunk."time" >= ('2000-01-09 00:00:00+0'::cstring)::timestamp with time zone) AND (_hyper_1_2_chunk."time" <= ('2000-02-01 00:00:00+0'::cstring)::timestamp with time zone))
+Rows Removed by Filter: 15
+(23 rows)
 -- Force plain / sorted aggregation
 SET enable_hashagg = OFF;

View File

@@ -1282,12 +1282,6 @@ SELECT relpages, CASE WHEN reltuples > 0 THEN reltuples ELSE 0 END as reltuples
  0 | 0
 (1 row)
-SELECT histogram_bounds FROM pg_stats WHERE tablename = :statchunk AND attname = 'c1';
-histogram_bounds
--------------------------------------------------------------------------------------------------------------------------------
-{0,250,500,750,1000,1250,1500,1750,2000,2250,2500,2750,3000,3250,3500,3750,4000,4250,4500,4750,5000,5250,5500,5750,6000,6250}
-(1 row)
 SELECT compch.table_name as "STAT_COMP_CHUNK_NAME"
 FROM _timescaledb_catalog.hypertable ht, _timescaledb_catalog.chunk ch
 , _timescaledb_catalog.chunk compch
@@ -1300,15 +1294,8 @@ SELECT relpages, CASE WHEN reltuples > 0 THEN reltuples ELSE 0 END as reltuples
  0 | 0
 (1 row)
 -- Now verify stats are updated on compressed chunk table when we analyze the hypertable.
-ANALYZE stattest;
-SELECT histogram_bounds FROM pg_stats WHERE tablename = :statchunk AND attname = 'c1';
-histogram_bounds
--------------------------------------------------------------------------------------------------------------------------------
-{0,250,500,750,1000,1250,1500,1750,2000,2250,2500,2750,3000,3250,3500,3750,4000,4250,4500,4750,5000,5250,5500,5750,6000,6250}
-(1 row)
 -- Unfortunately, the stats on the hypertable won't find any rows to sample from the chunk
 ANALYZE stattest;
 SELECT histogram_bounds FROM pg_stats WHERE tablename = 'stattest' AND attname = 'c1';
 histogram_bounds
 ------------------

View File

@@ -1481,6 +1481,7 @@ SELECT compress_chunk(i) FROM show_chunks('bugtab') i;
  _timescaledb_internal._hyper_11_23_chunk
 (1 row)
+ANALYZE bugtab;
 :PREFIX
 SELECT "time","hin"::text,"model"::text,"block"::text,"message_name"::text,"signal_name"::text,"signal_numeric_value","signal_string_value"::text FROM :chunk_table_bugtab ORDER BY "time" DESC;
 QUERY PLAN

View File

@@ -1481,6 +1481,7 @@ SELECT compress_chunk(i) FROM show_chunks('bugtab') i;
  _timescaledb_internal._hyper_11_23_chunk
 (1 row)
+ANALYZE bugtab;
 :PREFIX
 SELECT "time","hin"::text,"model"::text,"block"::text,"message_name"::text,"signal_name"::text,"signal_numeric_value","signal_string_value"::text FROM :chunk_table_bugtab ORDER BY "time" DESC;
 QUERY PLAN

View File

@@ -1481,6 +1481,7 @@ SELECT compress_chunk(i) FROM show_chunks('bugtab') i;
  _timescaledb_internal._hyper_11_23_chunk
 (1 row)
+ANALYZE bugtab;
 :PREFIX
 SELECT "time","hin"::text,"model"::text,"block"::text,"message_name"::text,"signal_name"::text,"signal_numeric_value","signal_string_value"::text FROM :chunk_table_bugtab ORDER BY "time" DESC;
 QUERY PLAN

View File

@@ -1481,6 +1481,7 @@ SELECT compress_chunk(i) FROM show_chunks('bugtab') i;
  _timescaledb_internal._hyper_11_23_chunk
 (1 row)
+ANALYZE bugtab;
 :PREFIX
 SELECT "time","hin"::text,"model"::text,"block"::text,"message_name"::text,"signal_name"::text,"signal_numeric_value","signal_string_value"::text FROM :chunk_table_bugtab ORDER BY "time" DESC;
 QUERY PLAN

View File

@@ -1,7 +1,6 @@
 -- This file and its contents are licensed under the Timescale License.
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
-\c :TEST_DBNAME :ROLE_SUPERUSER
 -- github issue 5585
 create table test (
 time timestamptz not null,
@@ -33,8 +32,11 @@ select compress_chunk(show_chunks('test'));
 -- force an index scan
 set enable_seqscan = 'off';
--- disable jit to avoid test flakiness
+-- make some tweaks to avoid flakiness
+analyze test;
+analyze test_copy;
 set jit = off;
+set max_parallel_workers_per_gather = 0;
 explain (costs off) with query_params as (
 select distinct a, b
 from test_copy

View File

@@ -1,7 +1,6 @@
 -- This file and its contents are licensed under the Timescale License.
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
-\c :TEST_DBNAME :ROLE_SUPERUSER
 -- github issue 5585
 create table test (
 time timestamptz not null,
@@ -33,8 +32,11 @@ select compress_chunk(show_chunks('test'));
 -- force an index scan
 set enable_seqscan = 'off';
--- disable jit to avoid test flakiness
+-- make some tweaks to avoid flakiness
+analyze test;
+analyze test_copy;
 set jit = off;
+set max_parallel_workers_per_gather = 0;
 explain (costs off) with query_params as (
 select distinct a, b
 from test_copy

View File

@@ -1,7 +1,6 @@
 -- This file and its contents are licensed under the Timescale License.
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
-\c :TEST_DBNAME :ROLE_SUPERUSER
 -- github issue 5585
 create table test (
 time timestamptz not null,
@@ -33,8 +32,11 @@ select compress_chunk(show_chunks('test'));
 -- force an index scan
 set enable_seqscan = 'off';
--- disable jit to avoid test flakiness
+-- make some tweaks to avoid flakiness
+analyze test;
+analyze test_copy;
 set jit = off;
+set max_parallel_workers_per_gather = 0;
 explain (costs off) with query_params as (
 select distinct a, b
 from test_copy

View File

@@ -1,7 +1,6 @@
 -- This file and its contents are licensed under the Timescale License.
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
-\c :TEST_DBNAME :ROLE_SUPERUSER
 -- github issue 5585
 create table test (
 time timestamptz not null,
@@ -33,8 +32,11 @@ select compress_chunk(show_chunks('test'));
 -- force an index scan
 set enable_seqscan = 'off';
--- disable jit to avoid test flakiness
+-- make some tweaks to avoid flakiness
+analyze test;
+analyze test_copy;
 set jit = off;
+set max_parallel_workers_per_gather = 0;
 explain (costs off) with query_params as (
 select distinct a, b
 from test_copy

View File

@@ -18,6 +18,7 @@ SELECT count(compress_chunk(ch)) FROM show_chunks('decompress_tracking') ch;
  2
 (1 row)
+ANALYZE decompress_tracking;
 -- no tracking without analyze
 :EXPLAIN UPDATE decompress_tracking SET value = value + 3;
 QUERY PLAN
@@ -131,31 +132,35 @@ QUERY PLAN
 -- test prepared statements EXPLAIN still works after execution
 SET plan_cache_mode TO force_generic_plan;
 PREPARE p1 AS UPDATE decompress_tracking SET value = value + 3 WHERE device = 'd1';
-BEGIN; EXPLAIN EXECUTE p1; EXECUTE p1; EXPLAIN EXECUTE p1; ROLLBACK;
+BEGIN;
+EXPLAIN (COSTS OFF) EXECUTE p1;
 QUERY PLAN
-Custom Scan (HypertableModify) (cost=0.00..70.83 rows=433 width=18)
--> Update on decompress_tracking (cost=0.00..70.83 rows=433 width=18)
+Custom Scan (HypertableModify)
+-> Update on decompress_tracking
 Update on _hyper_X_X_chunk decompress_tracking_1
 Update on _hyper_X_X_chunk decompress_tracking_2
--> Result (cost=0.00..70.83 rows=433 width=18)
--> Append (cost=0.00..65.42 rows=433 width=18)
--> Seq Scan on _hyper_X_X_chunk decompress_tracking_1 (cost=0.00..31.62 rows=432 width=18)
+-> Result
+-> Append
+-> Seq Scan on _hyper_X_X_chunk decompress_tracking_1
 Filter: (device = 'd1'::text)
--> Seq Scan on _hyper_X_X_chunk decompress_tracking_2 (cost=0.00..31.62 rows=1 width=18)
+-> Seq Scan on _hyper_X_X_chunk decompress_tracking_2
 Filter: (device = 'd1'::text)
 (10 rows)
+EXECUTE p1;
+EXPLAIN (COSTS OFF) EXECUTE p1;
 QUERY PLAN
-Custom Scan (HypertableModify) (cost=0.00..70.83 rows=433 width=18)
--> Update on decompress_tracking (cost=0.00..70.83 rows=433 width=18)
+Custom Scan (HypertableModify)
+-> Update on decompress_tracking
 Update on _hyper_X_X_chunk decompress_tracking_1
 Update on _hyper_X_X_chunk decompress_tracking_2
--> Result (cost=0.00..70.83 rows=433 width=18)
--> Append (cost=0.00..65.42 rows=433 width=18)
--> Seq Scan on _hyper_X_X_chunk decompress_tracking_1 (cost=0.00..31.62 rows=432 width=18)
+-> Result
+-> Append
+-> Seq Scan on _hyper_X_X_chunk decompress_tracking_1
 Filter: (device = 'd1'::text)
--> Seq Scan on _hyper_X_X_chunk decompress_tracking_2 (cost=0.00..31.62 rows=1 width=18)
+-> Seq Scan on _hyper_X_X_chunk decompress_tracking_2
 Filter: (device = 'd1'::text)
 (10 rows)
+ROLLBACK;
 DROP TABLE decompress_tracking;

View File

@@ -15,6 +15,8 @@ INSERT INTO decompress_tracking SELECT '2020-01-01'::timestamptz + format('%s ho
 SELECT count(compress_chunk(ch)) FROM show_chunks('decompress_tracking') ch;
+ANALYZE decompress_tracking;
 -- no tracking without analyze
 :EXPLAIN UPDATE decompress_tracking SET value = value + 3;
@@ -30,6 +32,10 @@ BEGIN; :EXPLAIN_ANALYZE INSERT INTO decompress_tracking (VALUES ('2020-01-01 1:3
 -- test prepared statements EXPLAIN still works after execution
 SET plan_cache_mode TO force_generic_plan;
 PREPARE p1 AS UPDATE decompress_tracking SET value = value + 3 WHERE device = 'd1';
-BEGIN; EXPLAIN EXECUTE p1; EXECUTE p1; EXPLAIN EXECUTE p1; ROLLBACK;
+BEGIN;
+EXPLAIN (COSTS OFF) EXECUTE p1;
+EXECUTE p1;
+EXPLAIN (COSTS OFF) EXECUTE p1;
+ROLLBACK;
 DROP TABLE decompress_tracking;

View File

@@ -18,6 +18,8 @@ FROM generate_series('2000-01-01 0:00:00+0'::timestamptz,'2000-01-10 23:55:00+0'
 SELECT compress_chunk(c) FROM show_chunks('testtable') c;
+ANALYZE testtable;
 -- Pushdown aggregation to the chunk level
 SELECT count(*), sum(v0), sum(v1), sum(v2), sum(v3) FROM testtable WHERE time >= '2000-01-01 00:00:00+0' AND time <= '2000-02-01 00:00:00+0';
@@ -29,6 +31,8 @@ INSERT INTO testtable(time,device_id,v0,v1,v2,v3)
 SELECT time, device_id, device_id+1, device_id + 2, device_id + 0.5, NULL
 FROM generate_series('2000-01-01 0:00:00+0'::timestamptz,'2000-01-10 23:55:00+0','1day') gtime(time), generate_series(1,5,1) gdevice(device_id);
+ANALYZE testtable;
 -- Pushdown aggregation to the chunk level
 SELECT count(*), sum(v0), sum(v1), sum(v2), sum(v3) FROM testtable WHERE time >= '2000-01-01 00:00:00+0' AND time <= '2000-02-01 00:00:00+0';

View File

@@ -542,7 +542,6 @@ SELECT count(*) from stattest;
 -- Uncompressed chunk table is empty since we just compressed the chunk and moved everything to compressed chunk table.
 -- reltuples is initially -1 on PG14 before VACUUM/ANALYZE was run
 SELECT relpages, CASE WHEN reltuples > 0 THEN reltuples ELSE 0 END as reltuples FROM pg_class WHERE relname = :statchunk;
-SELECT histogram_bounds FROM pg_stats WHERE tablename = :statchunk AND attname = 'c1';
 SELECT compch.table_name as "STAT_COMP_CHUNK_NAME"
 FROM _timescaledb_catalog.hypertable ht, _timescaledb_catalog.chunk ch
@@ -553,10 +552,8 @@ FROM _timescaledb_catalog.hypertable ht, _timescaledb_catalog.chunk ch
 -- reltuples is initially -1 on PG14 before VACUUM/ANALYZE was run
 SELECT relpages, CASE WHEN reltuples > 0 THEN reltuples ELSE 0 END as reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';
 -- Now verify stats are updated on compressed chunk table when we analyze the hypertable.
-ANALYZE stattest;
-SELECT histogram_bounds FROM pg_stats WHERE tablename = :statchunk AND attname = 'c1';
 -- Unfortunately, the stats on the hypertable won't find any rows to sample from the chunk
 ANALYZE stattest;
 SELECT histogram_bounds FROM pg_stats WHERE tablename = 'stattest' AND attname = 'c1';
 SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;

View File

@@ -524,6 +524,8 @@ SELECT chunk_schema || '.' || chunk_name AS "chunk_table_bugtab"
 SELECT compress_chunk(i) FROM show_chunks('bugtab') i;
+ANALYZE bugtab;
 :PREFIX
 SELECT "time","hin"::text,"model"::text,"block"::text,"message_name"::text,"signal_name"::text,"signal_numeric_value","signal_string_value"::text FROM :chunk_table_bugtab ORDER BY "time" DESC;

View File

@@ -2,8 +2,6 @@
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
-\c :TEST_DBNAME :ROLE_SUPERUSER
 -- github issue 5585
 create table test (
 time timestamptz not null,
@@ -28,8 +26,11 @@ alter table test set (timescaledb.compress, timescaledb.compress_segmentby='a, b
 select compress_chunk(show_chunks('test'));
 -- force an index scan
 set enable_seqscan = 'off';
--- disable jit to avoid test flakiness
+-- make some tweaks to avoid flakiness
+analyze test;
+analyze test_copy;
 set jit = off;
+set max_parallel_workers_per_gather = 0;
 explain (costs off) with query_params as (
 select distinct a, b