Don't guess historical/current chunk by ids
For the chunks that don't have the ANALYZE stats, we have to estimate
the number of pages and tuples in some other way. One thing we factor
into this estimate is the expected fill factor of the chunk. There are
two ways we calculate the fill factor:

1. For time dimensions, by looking at whether now() belongs to the
   chunk's [begin, end) interval, and calculating the fill factor
   accordingly.

2. For space dimensions, by looking at whether the chunk is one of the
   last chunks in the hypertable.

To check (2), we used to compare the chunk ids. Turns out it didn't
work correctly, because the chunk ids are global for all hypertables,
and looking at them doesn't tell us anything if there are many
hypertables. The code that did that was also very slow.

This commit just removes this logic and considers all the chunks on
space dimensions w/o the ANALYZE stats as recent chunks with a fill
factor of 0.5.
This commit is contained in:
parent c2bfc5d17c
commit 60edef6eba
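Before the diff itself, here is a minimal, self-contained sketch of the estimation rule the commit ends up with (illustrative only: the standalone function, its name, and the plain integer "internal time" arguments are inventions for this example; the committed code is estimate_chunk_fillfactor in the diff below, where 0.5 and 1.0 appear as FILL_FACTOR_CURRENT_CHUNK and FILL_FACTOR_HISTORICAL_CHUNK):

#include <assert.h>

/* Grounded in the patch: current chunks are assumed half full, historical full. */
#define SKETCH_FILL_FACTOR_CURRENT 0.5
#define SKETCH_FILL_FACTOR_HISTORICAL 1.0

/*
 * Simplified model of the committed logic: given a chunk's time range and
 * "now" (all in the same internal time units), estimate its fill factor.
 */
static double
estimate_fillfactor_sketch(long long range_start, long long range_end,
			   long long now, int has_timestamp_dim)
{
	/* No usable timestamp dimension (e.g. integer time, or only space
	 * dimensions) and no ANALYZE stats: assume a recent, half-full chunk
	 * instead of guessing from chunk ids. */
	if (!has_timestamp_dim)
		return SKETCH_FILL_FACTOR_CURRENT;

	/* Chunk ended before "now": historical, assumed completely full. */
	if (range_end <= now)
		return SKETCH_FILL_FACTOR_HISTORICAL;

	/* Chunk starts in the future (highly unlikely): treated as current. */
	if (range_start >= now)
		return SKETCH_FILL_FACTOR_CURRENT;

	/* "Now" is inside the range: interpolate linearly from the current
	 * chunk fill factor at range start to the historical one at range end. */
	double elapsed = (double) (now - range_start);
	double interval = (double) (range_end - range_start);
	assert(interval > 0 && elapsed <= interval);
	return SKETCH_FILL_FACTOR_CURRENT +
	       (SKETCH_FILL_FACTOR_HISTORICAL - SKETCH_FILL_FACTOR_CURRENT) *
		       (elapsed / interval);
}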
@@ -157,27 +157,34 @@ get_total_number_of_slices(Hyperspace *space)
 }
 
 /*
- * Fillfactor values are between 0 and 1. It's an indication of how much data is in the chunk.
+ * Estimate fill factor for the chunks that don't have ANALYZE statistics.
+ * Fill factor values are between 0 and 1. It's an indication of how much data is
+ * in the chunk, expressed as a fraction of its estimated final size.
  *
- * Two major drivers for estimation is current time and number of chunks created after.
- *
- * Fill factor estimation assumes that data written is 'recent' in regards to time dimension (eg.
- * almost real-time). For the case when writing historical data, given estimates might be more off
- * as we assume that historical chunks have fill factor 1 unless the number of chunks created after
- * is smaller then total number of slices. Even for writing historical data we might not be totally
+ * Fill factor estimation assumes that data written is 'recent' in regards to
+ * time dimension (eg. almost real-time). For the case when writing historical
+ * data, given estimates might be more off as we assume that historical chunks
+ * have fill factor 1. Even for writing historical data we might not be totally
  * wrong since most probably data has monotonically increasing time.
  *
- * Estimation handles two possible hypertable configurations: 1. time dimension is of timestamp
- * type 2. time dimension is of integer type. If hypertable uses timestamp type to partition data
- * then there are three possible scenarios here: we are beyond chunk end time (historical chunk), we
- * are somewhere in between chunk time boundaries (current chunk) or chunk start time is in the
- * future (highly unlikely). For integer type we assume that all chunks execpt for current have
- * factor 1.
+ * Estimation handles two possible hypertable configurations:
+ * 1. time dimension is of timestamp type
+ * 2. time dimension is of integer type.
  *
- * To explain how number of chunks created after the chunk affects estimation
- * let's imagine that table is space partitioned with one dimension and having 3 partitions. If data
- * is equaliy distributed amount partitions then there will be 3 current chunks. If there are two
- * new chunks created after chunk X then chunk X is the current chunk.
+ * If hypertable uses timestamp type to partition data then there are three
+ * possible scenarios here: we are beyond chunk end time (historical chunk), we
+ * are somewhere in between chunk time boundaries (current chunk) or chunk start
+ * time is in the future (highly unlikely, also treated as current chunk).
+ *
+ * For integer type we assume that all chunks w/o ANALYZE stats are current.
+ * We could use the user-specified integer time function here
+ * (set_integer_now_func()), but this logic is a fallback so we're keeping it
+ * simple for now.
+ *
+ * Earlier, this function used chunk ids to guess which chunks are created later,
+ * and treated such chunks as current. Unfortunately, the chunk ids are global
+ * for all hypertables, so this approach didn't really work if there was more
+ * than one hypertable.
  */
 static double
 estimate_chunk_fillfactor(Chunk *chunk, Hyperspace *space)
@@ -185,49 +192,60 @@ estimate_chunk_fillfactor(Chunk *chunk, Hyperspace *space)
 	const Dimension *time_dim = hyperspace_get_open_dimension(space, 0);
 	const DimensionSlice *time_slice = get_chunk_time_slice(chunk, space);
 	Oid time_dim_type = ts_dimension_get_partition_type(time_dim);
-	int num_created_after = ts_chunk_num_of_chunks_created_after(chunk);
-	int total_slices = get_total_number_of_slices(space);
 
 	if (IS_TIMESTAMP_TYPE(time_dim_type))
 	{
 		TimestampTz now = GetSQLCurrentTimestamp(-1);
-		int64 now_internal_time;
-		double elapsed;
-		double interval;
 
 #ifdef TS_DEBUG
 		if (ts_current_timestamp_override_value >= 0)
 			now = ts_current_timestamp_override_value;
 #endif
-		now_internal_time = ts_time_value_to_internal(TimestampTzGetDatum(now), TIMESTAMPTZOID);
+		int64 now_internal_time =
+			ts_time_value_to_internal(TimestampTzGetDatum(now), TIMESTAMPTZOID);
 
-		/* if we are beyond end range then chunk can possibly be totally filled */
 		if (time_slice->fd.range_end <= now_internal_time)
 		{
-			/* If there are less newly created chunks then the number of slices then this is current
-			 * chunk. This also works better when writing historical data */
-			return num_created_after < total_slices ? FILL_FACTOR_CURRENT_CHUNK :
-													  FILL_FACTOR_HISTORICAL_CHUNK;
+			/*
+			 * Current time is later than the end of the chunk time range, which
+			 * means it is a historical chunk.
+			 */
+			return FILL_FACTOR_HISTORICAL_CHUNK;
 		}
 
-		/* for chunks in future (highly unlikely) we assume same as for `current` chunk */
+		/*
+		 * The chunk time range starts later than current time, so we treat it
+		 * as a current chunk.
+		 */
 		if (time_slice->fd.range_start >= now_internal_time)
 			return FILL_FACTOR_CURRENT_CHUNK;
 
-		/* current time falls within chunk time constraints */
-		elapsed = (now_internal_time - time_slice->fd.range_start);
-		interval = (time_slice->fd.range_end - time_slice->fd.range_start);
-
-		Assert(interval != 0);
-
-		return elapsed / interval;
-	}
-	else
-	{
-		/* if current chunk is the last created we assume it has 0.5 fill factor */
-		return num_created_after < total_slices ? FILL_FACTOR_CURRENT_CHUNK :
-												  FILL_FACTOR_HISTORICAL_CHUNK;
+		/*
+		 * Current time falls within chunk time constraints. The fill factor is
+		 * interpolated linearly based on where the current time is inside the
+		 * range, from 'current chunk fill factor' at the start of the range, to
+		 * 'historical chunk fill factor' at the end of the range.
+		 */
+		double elapsed = (now_internal_time - time_slice->fd.range_start);
+		double interval = (time_slice->fd.range_end - time_slice->fd.range_start);
+		Assert(interval > 0);
+		Assert(elapsed <= interval);
+
+		Assert(FILL_FACTOR_HISTORICAL_CHUNK >= FILL_FACTOR_CURRENT_CHUNK);
+		double fill_factor =
+			FILL_FACTOR_CURRENT_CHUNK +
+			(FILL_FACTOR_HISTORICAL_CHUNK - FILL_FACTOR_CURRENT_CHUNK) * (elapsed / interval);
+
+		Assert(fill_factor >= 0.);
+		Assert(fill_factor <= 1.);
+		return fill_factor;
 	}
+
+	/*
+	 * This chunk doesn't have the ANALYZE data, so it's more likely to be a
+	 * recently created, current chunk, not an old historical chunk.
+	 */
+	return FILL_FACTOR_CURRENT_CHUNK;
 }
 
 static void
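To make the interpolation concrete, here is a hypothetical driver for the sketch given before the diff (made-up values; compiling the two snippets together as one C file should print the numbers shown in the comments):

#include <stdio.h>

int
main(void)
{
	/* A chunk covering internal times [100, 400) with "now" at 200: one
	 * third elapsed, so 0.5 + (1.0 - 0.5) * (100.0 / 300.0) = 0.667. */
	printf("current, 1/3 in: %.3f\n", estimate_fillfactor_sketch(100, 400, 200, 1));

	/* A chunk that ended before "now" is assumed completely full: 1.000. */
	printf("historical:      %.3f\n", estimate_fillfactor_sketch(100, 400, 500, 1));

	/* No timestamp dimension and no ANALYZE stats: the new flat 0.500. */
	printf("fallback:        %.3f\n", estimate_fillfactor_sketch(100, 400, 200, 0));
	return 0;
}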
@@ -3681,7 +3681,7 @@ NOTICE:  adding not-null constraint to column "time"
 -- This will enable us to more easily see estimates per chunk
 SET timescaledb.enable_per_data_node_queries = false;
 -- Estimating chunk progress uses current timestamp so we override it for test purposes
-SELECT test.tsl_override_current_timestamptz('2019-11-11 00:00'::timestamptz);
+SELECT test.tsl_override_current_timestamptz('2017-11-11 00:00'::timestamptz);
  tsl_override_current_timestamptz 
 ----------------------------------
 
@@ -3745,14 +3745,14 @@ SELECT *
 FROM hyper_estimate;
                                       QUERY PLAN                                      
 --------------------------------------------------------------------------------------
- Append  (cost=100.00..707.79 rows=17 width=20)
+ Append  (cost=100.00..707.77 rows=17 width=20)
    ->  Foreign Scan on _dist_hyper_16_38_chunk  (cost=100.00..101.40 rows=4 width=20)
    ->  Foreign Scan on _dist_hyper_16_39_chunk  (cost=100.00..101.20 rows=2 width=20)
    ->  Foreign Scan on _dist_hyper_16_40_chunk  (cost=100.00..101.30 rows=3 width=20)
    ->  Foreign Scan on _dist_hyper_16_41_chunk  (cost=100.00..101.20 rows=2 width=20)
    ->  Foreign Scan on _dist_hyper_16_42_chunk  (cost=100.00..101.30 rows=3 width=20)
    ->  Foreign Scan on _dist_hyper_16_43_chunk  (cost=100.00..101.10 rows=1 width=20)
-   ->  Foreign Scan on _dist_hyper_16_44_chunk  (cost=100.00..100.20 rows=2 width=20)
+   ->  Foreign Scan on _dist_hyper_16_44_chunk  (cost=100.00..100.19 rows=2 width=20)
 (8 rows)
 
 CREATE TABLE devices (
@@ -3680,7 +3680,7 @@ NOTICE:  adding not-null constraint to column "time"
 -- This will enable us to more easily see estimates per chunk
 SET timescaledb.enable_per_data_node_queries = false;
 -- Estimating chunk progress uses current timestamp so we override it for test purposes
-SELECT test.tsl_override_current_timestamptz('2019-11-11 00:00'::timestamptz);
+SELECT test.tsl_override_current_timestamptz('2017-11-11 00:00'::timestamptz);
  tsl_override_current_timestamptz 
 ----------------------------------
 
@@ -3744,14 +3744,14 @@ SELECT *
 FROM hyper_estimate;
                                       QUERY PLAN                                      
 --------------------------------------------------------------------------------------
- Append  (cost=100.00..707.79 rows=17 width=20)
+ Append  (cost=100.00..707.77 rows=17 width=20)
    ->  Foreign Scan on _dist_hyper_16_38_chunk  (cost=100.00..101.40 rows=4 width=20)
    ->  Foreign Scan on _dist_hyper_16_39_chunk  (cost=100.00..101.20 rows=2 width=20)
    ->  Foreign Scan on _dist_hyper_16_40_chunk  (cost=100.00..101.30 rows=3 width=20)
    ->  Foreign Scan on _dist_hyper_16_41_chunk  (cost=100.00..101.20 rows=2 width=20)
    ->  Foreign Scan on _dist_hyper_16_42_chunk  (cost=100.00..101.30 rows=3 width=20)
    ->  Foreign Scan on _dist_hyper_16_43_chunk  (cost=100.00..101.10 rows=1 width=20)
-   ->  Foreign Scan on _dist_hyper_16_44_chunk  (cost=100.00..100.20 rows=2 width=20)
+   ->  Foreign Scan on _dist_hyper_16_44_chunk  (cost=100.00..100.19 rows=2 width=20)
 (8 rows)
 
 CREATE TABLE devices (
@@ -3687,7 +3687,7 @@ NOTICE:  adding not-null constraint to column "time"
 -- This will enable us to more easily see estimates per chunk
 SET timescaledb.enable_per_data_node_queries = false;
 -- Estimating chunk progress uses current timestamp so we override it for test purposes
-SELECT test.tsl_override_current_timestamptz('2019-11-11 00:00'::timestamptz);
+SELECT test.tsl_override_current_timestamptz('2017-11-11 00:00'::timestamptz);
  tsl_override_current_timestamptz 
 ----------------------------------
 
@@ -3751,14 +3751,14 @@ SELECT *
 FROM hyper_estimate;
                                       QUERY PLAN                                      
 --------------------------------------------------------------------------------------
- Append  (cost=100.00..707.79 rows=17 width=20)
+ Append  (cost=100.00..707.77 rows=17 width=20)
    ->  Foreign Scan on _dist_hyper_16_38_chunk  (cost=100.00..101.40 rows=4 width=20)
    ->  Foreign Scan on _dist_hyper_16_39_chunk  (cost=100.00..101.20 rows=2 width=20)
    ->  Foreign Scan on _dist_hyper_16_40_chunk  (cost=100.00..101.30 rows=3 width=20)
    ->  Foreign Scan on _dist_hyper_16_41_chunk  (cost=100.00..101.20 rows=2 width=20)
    ->  Foreign Scan on _dist_hyper_16_42_chunk  (cost=100.00..101.30 rows=3 width=20)
    ->  Foreign Scan on _dist_hyper_16_43_chunk  (cost=100.00..101.10 rows=1 width=20)
-   ->  Foreign Scan on _dist_hyper_16_44_chunk  (cost=100.00..100.20 rows=2 width=20)
+   ->  Foreign Scan on _dist_hyper_16_44_chunk  (cost=100.00..100.19 rows=2 width=20)
 (8 rows)
 
 CREATE TABLE devices (
@@ -1061,7 +1061,7 @@ SELECT * FROM create_distributed_hypertable('hyper_estimate', 'time', 'device',
 SET timescaledb.enable_per_data_node_queries = false;
 
 -- Estimating chunk progress uses current timestamp so we override it for test purposes
-SELECT test.tsl_override_current_timestamptz('2019-11-11 00:00'::timestamptz);
+SELECT test.tsl_override_current_timestamptz('2017-11-11 00:00'::timestamptz);
 
 -- Test estimates when backfilling. 3 chunks should be historical and 3 should be considered current when estimating.
 -- Note that estimate numbers are way off since we are using shared buffer size as starting point. This will not be
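The test changes above rely on the TS_DEBUG override that the patched function keeps in place; for reference, these are the (unchanged) lines from the diff:

#ifdef TS_DEBUG
	if (ts_current_timestamp_override_value >= 0)
		now = ts_current_timestamp_override_value;
#endif

With the override moved back from 2019-11-11 to 2017-11-11, "now" falls earlier relative to the test data, so fewer chunks take the historical path; combined with the new flat 0.5 estimate for chunks without stats, this is presumably what nudges the Append cost estimate from 707.79 to 707.77 in the expected output.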