mirror of
https://github.com/timescale/timescaledb.git
synced 2025-05-14 17:43:34 +08:00
Use numrows_pre_compression in approx row count
The approximate_row_count function was using the reltuples from compressed chunks and multiplying that with 1000 which is the default batch size. This was leading to a huge skew between the actual row count and the approximate one. We now use the numrows_pre_compression value from the timescaledb catalog which accurately represents the number of rows before the actual compression.
This commit is contained in:
parent
ef030d29c0
commit
293104add2
1
.unreleased/fix_6365
Normal file
1
.unreleased/fix_6365
Normal file
@ -0,0 +1 @@
|
||||
Fixes: #6365 Use numrows_pre_compression in approximate row count
|
@ -453,14 +453,15 @@ BEGIN
|
||||
IF local_compressed_hypertable_id IS NOT NULL THEN
|
||||
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
|
||||
|
||||
WITH compressed_hypertable AS (SELECT table_name, schema_name FROM _timescaledb_catalog.hypertable ht
|
||||
WHERE ht.id = local_compressed_hypertable_id)
|
||||
SELECT c.oid INTO compressed_hypertable_oid FROM pg_class c
|
||||
INNER JOIN compressed_hypertable h ON (c.relname = h.table_name)
|
||||
INNER JOIN pg_namespace n ON (n.nspname = h.schema_name);
|
||||
-- use the compression_chunk_size stats to fetch precompressed num rows
|
||||
SELECT COALESCE(SUM(numrows_pre_compression), 0) FROM _timescaledb_catalog.chunk srcch,
|
||||
_timescaledb_catalog.compression_chunk_size map, _timescaledb_catalog.hypertable srcht
|
||||
INTO compressed_row_count
|
||||
WHERE map.chunk_id = srcch.id
|
||||
AND srcht.id = srcch.hypertable_id AND srcht.table_name = local_table_name
|
||||
AND srcht.schema_name = local_schema_name;
|
||||
|
||||
compressed_row_count = _timescaledb_functions.get_approx_row_count(compressed_hypertable_oid);
|
||||
RETURN (uncompressed_row_count + (compressed_row_count * max_compressed_row_count));
|
||||
RETURN (uncompressed_row_count + compressed_row_count);
|
||||
ELSE
|
||||
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
|
||||
RETURN uncompressed_row_count;
|
||||
@ -477,22 +478,24 @@ BEGIN
|
||||
-- 'input is chunk #1';
|
||||
IF is_compressed_chunk IS NULL AND local_compressed_chunk_id IS NOT NULL THEN
|
||||
-- 'Include both uncompressed and compressed chunk #2';
|
||||
WITH compressed_ns_oid AS ( SELECT table_name, oid FROM _timescaledb_catalog.chunk ch INNER JOIN pg_namespace ns ON
|
||||
(ch.id = local_compressed_chunk_id and ch.schema_name = ns.nspname))
|
||||
SELECT c.oid FROM pg_class c INNER JOIN compressed_ns_oid
|
||||
ON ( c.relnamespace = compressed_ns_oid.oid AND c.relname = compressed_ns_oid.table_name)
|
||||
INTO local_compressed_chunk_oid;
|
||||
-- use the compression_chunk_size stats to fetch precompressed num rows
|
||||
SELECT COALESCE(numrows_pre_compression, 0) FROM _timescaledb_catalog.compression_chunk_size
|
||||
INTO compressed_row_count
|
||||
WHERE compressed_chunk_id = local_compressed_chunk_id;
|
||||
|
||||
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
|
||||
compressed_row_count = _timescaledb_functions.get_approx_row_count(local_compressed_chunk_oid);
|
||||
RETURN uncompressed_row_count + (compressed_row_count * max_compressed_row_count);
|
||||
RETURN (uncompressed_row_count + compressed_row_count);
|
||||
ELSIF is_compressed_chunk IS NULL AND local_compressed_chunk_id IS NULL THEN
|
||||
-- 'input relation is uncompressed chunk #3';
|
||||
uncompressed_row_count = _timescaledb_functions.get_approx_row_count(relation);
|
||||
RETURN uncompressed_row_count;
|
||||
ELSE
|
||||
-- 'compressed chunk only #4';
|
||||
compressed_row_count = _timescaledb_functions.get_approx_row_count(relation) * max_compressed_row_count;
|
||||
-- use the compression_chunk_size stats to fetch precompressed num rows
|
||||
SELECT COALESCE(SUM(numrows_pre_compression), 0) FROM _timescaledb_catalog.chunk srcch,
|
||||
_timescaledb_catalog.compression_chunk_size map INTO compressed_row_count
|
||||
WHERE map.compressed_chunk_id = srcch.id
|
||||
AND srcch.table_name = local_table_name AND srcch.schema_name = local_schema_name;
|
||||
RETURN compressed_row_count;
|
||||
END IF;
|
||||
END IF;
|
||||
|
@ -39,6 +39,20 @@ insert into foo values( 3 , 16 , 20, NULL);
|
||||
insert into foo values( 10 , 10 , 20, NULL);
|
||||
insert into foo values( 20 , 11 , 20, NULL);
|
||||
insert into foo values( 30 , 12 , 20, NULL);
|
||||
analyze foo;
|
||||
-- check that approximate_row_count works with a regular table
|
||||
SELECT approximate_row_count('foo');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from foo;
|
||||
count
|
||||
-------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
alter table foo set (timescaledb.compress, timescaledb.compress_segmentby = 'a,b', timescaledb.compress_orderby = 'c desc, d asc nulls last');
|
||||
--test self-referencing updates
|
||||
SET timescaledb.enable_transparent_decompression to ON;
|
||||
@ -1259,6 +1273,7 @@ SELECT * FROM pg_stats WHERE tablename = :statchunk;
|
||||
(0 rows)
|
||||
|
||||
ALTER TABLE stattest SET (timescaledb.compress);
|
||||
-- check that approximate_row_count works with all normal chunks
|
||||
SELECT approximate_row_count('stattest');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
@ -1271,10 +1286,18 @@ SELECT compress_chunk(c) FROM show_chunks('stattest') c;
|
||||
_timescaledb_internal._hyper_27_55_chunk
|
||||
(1 row)
|
||||
|
||||
-- check that approximate_row_count works with all compressed chunks
|
||||
SELECT approximate_row_count('stattest');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
0
|
||||
26
|
||||
(1 row)
|
||||
|
||||
-- actual count should match with the above
|
||||
SELECT count(*) from stattest;
|
||||
count
|
||||
-------
|
||||
26
|
||||
(1 row)
|
||||
|
||||
-- Uncompressed chunk table is empty since we just compressed the chunk and moved everything to compressed chunk table.
|
||||
@ -1330,6 +1353,13 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME'
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
-- verify that approximate_row_count works fine on a chunk with compressed data
|
||||
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_COMP_CHUNK_NAME');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
26
|
||||
(1 row)
|
||||
|
||||
-- Verify partial chunk stats are handled correctly when analyzing
|
||||
-- for both uncompressed and compressed chunk tables
|
||||
INSERT INTO stattest SELECT '2020/02/20 01:00'::TIMESTAMPTZ + ('1 hour'::interval * v), 250 * v FROM generate_series(25,50) v;
|
||||
@ -1362,6 +1392,28 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME'
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
-- verify that approximate_row_count works fine on a chunk with a mix of uncompressed
|
||||
-- and compressed data
|
||||
SELECT table_name as "STAT_CHUNK_NAME" from temptable \gset
|
||||
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK_NAME');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
52
|
||||
(1 row)
|
||||
|
||||
-- should match with the result via the hypertable post in-memory decompression
|
||||
SELECT count(*) from stattest;
|
||||
count
|
||||
-------
|
||||
52
|
||||
(1 row)
|
||||
|
||||
SELECT count(*) from show_chunks('stattest');
|
||||
count
|
||||
-------
|
||||
1
|
||||
(1 row)
|
||||
|
||||
-- Verify that decompressing the chunk restores autoanalyze to the hypertable's setting
|
||||
SELECT reloptions FROM pg_class WHERE relname = :statchunk;
|
||||
reloptions
|
||||
@ -1423,7 +1475,6 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;
|
||||
1 | 52
|
||||
(1 row)
|
||||
|
||||
DROP TABLE stattest;
|
||||
--- Test that analyze on compression internal table updates stats on original chunks
|
||||
CREATE TABLE stattest2(time TIMESTAMPTZ NOT NULL, c1 int, c2 int);
|
||||
SELECT create_hypertable('stattest2', 'time', chunk_time_interval=>'1 day'::interval);
|
||||
@ -2626,3 +2677,30 @@ SELECT * FROM sensor_data_compressed ORDER BY time DESC LIMIT 5;
|
||||
(70 rows)
|
||||
|
||||
RESET timescaledb.enable_decompression_sorted_merge;
|
||||
-- create another chunk
|
||||
INSERT INTO stattest SELECT '2021/02/20 01:00'::TIMESTAMPTZ + ('1 hour'::interval * v), 250 * v FROM generate_series(125,140) v;
|
||||
ANALYZE stattest;
|
||||
SELECT count(*) from show_chunks('stattest');
|
||||
count
|
||||
-------
|
||||
2
|
||||
(1 row)
|
||||
|
||||
SELECT table_name INTO TEMPORARY temptable FROM _timescaledb_catalog.chunk WHERE hypertable_id = (SELECT id FROM _timescaledb_catalog.hypertable WHERE table_name = 'stattest') ORDER BY creation_time desc limit 1;
|
||||
SELECT table_name as "STAT_CHUNK2_NAME" FROM temptable \gset
|
||||
-- verify that approximate_row_count works ok on normal chunks
|
||||
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK2_NAME');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
16
|
||||
(1 row)
|
||||
|
||||
-- verify that approximate_row_count works fine on a hypertable with a mix of uncompressed
|
||||
-- and compressed data
|
||||
SELECT approximate_row_count('stattest');
|
||||
approximate_row_count
|
||||
-----------------------
|
||||
68
|
||||
(1 row)
|
||||
|
||||
DROP TABLE stattest;
|
||||
|
@ -19,6 +19,10 @@ insert into foo values( 3 , 16 , 20, NULL);
|
||||
insert into foo values( 10 , 10 , 20, NULL);
|
||||
insert into foo values( 20 , 11 , 20, NULL);
|
||||
insert into foo values( 30 , 12 , 20, NULL);
|
||||
analyze foo;
|
||||
-- check that approximate_row_count works with a regular table
|
||||
SELECT approximate_row_count('foo');
|
||||
SELECT count(*) from foo;
|
||||
|
||||
alter table foo set (timescaledb.compress, timescaledb.compress_segmentby = 'a,b', timescaledb.compress_orderby = 'c desc, d asc nulls last');
|
||||
|
||||
@ -538,9 +542,13 @@ SELECT table_name INTO TEMPORARY temptable FROM _timescaledb_catalog.chunk WHERE
|
||||
SELECT * FROM pg_stats WHERE tablename = :statchunk;
|
||||
|
||||
ALTER TABLE stattest SET (timescaledb.compress);
|
||||
-- check that approximate_row_count works with all normal chunks
|
||||
SELECT approximate_row_count('stattest');
|
||||
SELECT compress_chunk(c) FROM show_chunks('stattest') c;
|
||||
-- check that approximate_row_count works with all compressed chunks
|
||||
SELECT approximate_row_count('stattest');
|
||||
-- actual count should match with the above
|
||||
SELECT count(*) from stattest;
|
||||
-- Uncompressed chunk table is empty since we just compressed the chunk and moved everything to compressed chunk table.
|
||||
-- reltuples is initially -1 on PG14 before VACUUM/ANALYZE was run
|
||||
SELECT relpages, CASE WHEN reltuples > 0 THEN reltuples ELSE 0 END as reltuples FROM pg_class WHERE relname = :statchunk;
|
||||
@ -564,6 +572,8 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;
|
||||
|
||||
-- verify that corresponding compressed chunk table stats is updated as well.
|
||||
SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';
|
||||
-- verify that approximate_row_count works fine on a chunk with compressed data
|
||||
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_COMP_CHUNK_NAME');
|
||||
|
||||
-- Verify partial chunk stats are handled correctly when analyzing
|
||||
-- for both uncompressed and compressed chunk tables
|
||||
@ -577,7 +587,13 @@ SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;
|
||||
-- verify that corresponding compressed chunk table stats have not changed since
|
||||
-- we didn't compress anything new.
|
||||
SELECT relpages, reltuples FROM pg_class WHERE relname = :'STAT_COMP_CHUNK_NAME';
|
||||
|
||||
-- verify that approximate_row_count works fine on a chunk with a mix of uncompressed
|
||||
-- and compressed data
|
||||
SELECT table_name as "STAT_CHUNK_NAME" from temptable \gset
|
||||
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK_NAME');
|
||||
-- should match with the result via the hypertable post in-memory decompression
|
||||
SELECT count(*) from stattest;
|
||||
SELECT count(*) from show_chunks('stattest');
|
||||
|
||||
-- Verify that decompressing the chunk restores autoanalyze to the hypertable's setting
|
||||
SELECT reloptions FROM pg_class WHERE relname = :statchunk;
|
||||
@ -597,7 +613,6 @@ SET client_min_messages TO NOTICE;
|
||||
SELECT histogram_bounds FROM pg_stats WHERE tablename = :statchunk and attname = 'c1';
|
||||
SELECT relpages, reltuples FROM pg_class WHERE relname = :statchunk;
|
||||
|
||||
DROP TABLE stattest;
|
||||
|
||||
--- Test that analyze on compression internal table updates stats on original chunks
|
||||
CREATE TABLE stattest2(time TIMESTAMPTZ NOT NULL, c1 int, c2 int);
|
||||
@ -1052,3 +1067,17 @@ SET timescaledb.enable_decompression_sorted_merge = FALSE;
|
||||
:PREFIX
|
||||
SELECT * FROM sensor_data_compressed ORDER BY time DESC LIMIT 5;
|
||||
RESET timescaledb.enable_decompression_sorted_merge;
|
||||
|
||||
-- create another chunk
|
||||
INSERT INTO stattest SELECT '2021/02/20 01:00'::TIMESTAMPTZ + ('1 hour'::interval * v), 250 * v FROM generate_series(125,140) v;
|
||||
ANALYZE stattest;
|
||||
SELECT count(*) from show_chunks('stattest');
|
||||
SELECT table_name INTO TEMPORARY temptable FROM _timescaledb_catalog.chunk WHERE hypertable_id = (SELECT id FROM _timescaledb_catalog.hypertable WHERE table_name = 'stattest') ORDER BY creation_time desc limit 1;
|
||||
SELECT table_name as "STAT_CHUNK2_NAME" FROM temptable \gset
|
||||
-- verify that approximate_row_count works ok on normal chunks
|
||||
SELECT approximate_row_count('_timescaledb_internal.' || :'STAT_CHUNK2_NAME');
|
||||
-- verify that approximate_row_count works fine on a hypertable with a mix of uncompressed
|
||||
-- and compressed data
|
||||
SELECT approximate_row_count('stattest');
|
||||
|
||||
DROP TABLE stattest;
|
||||
|
Loading…
x
Reference in New Issue
Block a user