Set correct collation for segmentby columns of compressed chunks

Previously, the segmentby columns of a compressed chunk did not inherit the collation of the original columns, so some queries returned rows in the wrong order when there was an index on the compressed chunk. This fix only applies to newly created compressed chunks. We could add a migration that corrects the existing compressed chunks, but it seems too heavy and does not lend itself well to automation -- we would have to recreate any indexes on them. As a result, the old chunks continue to return wrong results.
2025-05-18 03:23:37 +08:00 · 2022-04-14 19:13:09 +03:00 · 2022-04-14 19:13:09 +03:00 · 0ab2d39f25
commit 0ab2d39f25
parent 472a68726c
5 changed files with 116 additions and 6 deletions
--- a/scripts/gh_matrix_builder.py
+++ b/scripts/gh_matrix_builder.py
@ -111,7 +111,7 @@ def macos_config(overrides):
    "tsdb_build_args": "-DASSERTIONS=ON -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl",
    "llvm_config": "/usr/local/opt/llvm/bin/llvm-config",
    "coverage": False,
-    "installcheck_args": "IGNORES='bgw_db_scheduler bgw_launcher pg_dump remote_connection'",
+    "installcheck_args": "IGNORES='bgw_db_scheduler bgw_launcher pg_dump remote_connection compressed_collation'",
    "extra_packages": "",
  })
  base_config.update(overrides)
--- a/tsl/src/compression/create.c
+++ b/tsl/src/compression/create.c
@ -270,6 +270,9 @@ compresscolinfo_init(CompressColInfo *cc, Oid srctbl_relid, List *segmentby_cols
 	for (attno = 0; attno < tupdesc->natts; attno++)
 	{
 		Oid attroid = InvalidOid;
 		int32 typmod = -1;
 		Oid collid = 0;
 		Form_pg_attribute attr = TupleDescAttr(tupdesc, attno);
 		ColumnDef *coldef;
 		if (attr->attisdropped)
@ -287,6 +290,8 @@ compresscolinfo_init(CompressColInfo *cc, Oid srctbl_relid, List *segmentby_cols
 			if (segorder_colindex[attno] <= seg_attnolen)
 			{
 				attroid = attr->atttypid; /*segment by columns have original type */
 				typmod = attr->atttypmod;
 				collid = attr->attcollation;
 				cc->col_meta[colno].segmentby_column_index = segorder_colindex[attno];
 			}
 			else
@ -307,7 +312,7 @@ compresscolinfo_init(CompressColInfo *cc, Oid srctbl_relid, List *segmentby_cols
 		{
 			cc->col_meta[colno].algo_id = 0; // invalid algo number
 		}
-		coldef = makeColumnDef(NameStr(attr->attname), attroid, -1 /*typmod*/, 0 /*collation*/);
+		coldef = makeColumnDef(NameStr(attr->attname), attroid, typmod, collid);
 		cc->coldeflist = lappend(cc->coldeflist, coldef);
 		colno++;
 	}
--- a/tsl/test/expected/compressed_collation.out
+++ b/tsl/test/expected/compressed_collation.out
@ -0,0 +1,58 @@
 -- This file and its contents are licensed under the Timescale License.
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
 \c :TEST_DBNAME :ROLE_SUPERUSER
 -- We have different collation names such as en_US, en-US-x-icu and so on,
 -- that are available on different platforms.
 select * from (
    select 3 priority, 'en_US' "COLLATION"
    union all (select 2, collname from pg_collation where collname ilike 'en_us%' order by collname limit 1)
    union all (select 1, collname from pg_collation where collname ilike 'en_us_utf%8%' order by collname limit 1)
 ) c
 order by priority limit 1 \gset
 create table compressed_collation_ht(time timestamp, name text collate :"COLLATION",
    value float);
 select create_hypertable('compressed_collation_ht', 'time');
 NOTICE:  adding not-null constraint to column "time"
          create_hypertable           
 --------------------------------------
 (1,public,compressed_collation_ht,t)
 (1 row)
 alter table compressed_collation_ht set (timescaledb.compress,
    timescaledb.compress_segmentby = 'name', timescaledb.compress_orderby = 'time');
 insert into compressed_collation_ht values ('2021-01-01 01:01:01', 'á', '1'),
    ('2021-01-01 01:01:02', 'b', '2'), ('2021-01-01 01:01:03', 'ç', '2');
 select 1 from (
 	select compress_chunk(chunk_schema || '.' || chunk_name)
 	from timescaledb_information.chunks
 	where hypertable_name = 'compressed_collation_ht'
 ) t;
 ?column? 
 ----------
        1
 (1 row)
 select ht.schema_name || '.' || ht.table_name as "CHUNK"
 from _timescaledb_catalog.hypertable ht
    inner join _timescaledb_catalog.hypertable ht2
    on ht.id = ht2.compressed_hypertable_id
        and ht2.table_name = 'compressed_collation_ht' \gset
 create index on :CHUNK (name);
 set enable_seqscan to off;
 explain (costs off)
 select * from compressed_collation_ht order by name;
                                                  QUERY PLAN                                                   
 ---------------------------------------------------------------------------------------------------------------
 Custom Scan (DecompressChunk) on _hyper_1_1_chunk
   ->  Index Scan using compress_hyper_2_2_chunk__compressed_hypertable_2_name_idx on compress_hyper_2_2_chunk
 (2 rows)
 select * from compressed_collation_ht order by name;
           time           | name | value 
 --------------------------+------+-------
 Fri Jan 01 01:01:01 2021 | á    |     1
 Fri Jan 01 01:01:02 2021 | b    |     2
 Fri Jan 01 01:01:03 2021 | ç    |     2
 (3 rows)
--- a/tsl/test/sql/CMakeLists.txt
+++ b/tsl/test/sql/CMakeLists.txt
@ -10,18 +10,19 @@ set(TEST_CONFIGURATIONS postgresql max_bgw_8)
 set(TEST_FILES_postgresql
    bgw_custom.sql
    bgw_policy.sql
    compression_bgw.sql
    compression_permissions.sql
    compression_qualpushdown.sql
    cagg_errors.sql
    cagg_invalidation.sql
    cagg_permissions.sql
    cagg_policy.sql
    cagg_refresh.sql
    cagg_watermark.sql
    compressed_collation.sql
    compression_bgw.sql
    compression_permissions.sql
    compression_qualpushdown.sql
    dist_views.sql
    exp_cagg_next_gen.sql
    exp_cagg_monthly.sql
    exp_cagg_next_gen.sql
    exp_cagg_origin.sql
    exp_cagg_timezone.sql
    move.sql
--- a/tsl/test/sql/compressed_collation.sql
+++ b/tsl/test/sql/compressed_collation.sql
@ -0,0 +1,46 @@
 -- This file and its contents are licensed under the Timescale License.
 -- Please see the included NOTICE for copyright information and
 -- LICENSE-TIMESCALE for a copy of the license.
 \c :TEST_DBNAME :ROLE_SUPERUSER
 -- We have different collation names such as en_US, en-US-x-icu and so on,
 -- that are available on different platforms.
 select * from (
    select 3 priority, 'en_US' "COLLATION"
    union all (select 2, collname from pg_collation where collname ilike 'en_us%' order by collname limit 1)
    union all (select 1, collname from pg_collation where collname ilike 'en_us_utf%8%' order by collname limit 1)
 ) c
 order by priority limit 1 \gset
 create table compressed_collation_ht(time timestamp, name text collate :"COLLATION",
    value float);
 select create_hypertable('compressed_collation_ht', 'time');
 alter table compressed_collation_ht set (timescaledb.compress,
    timescaledb.compress_segmentby = 'name', timescaledb.compress_orderby = 'time');
 insert into compressed_collation_ht values ('2021-01-01 01:01:01', 'á', '1'),
    ('2021-01-01 01:01:02', 'b', '2'), ('2021-01-01 01:01:03', 'ç', '2');
 select 1 from (
 	select compress_chunk(chunk_schema || '.' || chunk_name)
 	from timescaledb_information.chunks
 	where hypertable_name = 'compressed_collation_ht'
 ) t;
 select ht.schema_name || '.' || ht.table_name as "CHUNK"
 from _timescaledb_catalog.hypertable ht
    inner join _timescaledb_catalog.hypertable ht2
    on ht.id = ht2.compressed_hypertable_id
        and ht2.table_name = 'compressed_collation_ht' \gset
 create index on :CHUNK (name);
 set enable_seqscan to off;
 explain (costs off)
 select * from compressed_collation_ht order by name;
 select * from compressed_collation_ht order by name;