Fix restoring/scheduler entrypoint to avoid BGW death

There was a race condition between the post_restore function restarting the background worker and the setting of the restoring flag to "off". If the worker started before the change to the restoring flag had been committed, it would not see the change and would then die, because the worker is supposed to exit when the db is in a restoring state. This modifies the post_restore function to use a restart instead of a start, so that the worker waits for the commit before starting up. It also adds logic to the entrypoint to reload config changes caused by an `ALTER DATABASE SET` command. These changes are normally only seen at connection startup, but we have to wait until after our lock on the modifying transaction is released to know whether we should adopt them.
2025-05-17 02:53:51 +08:00 · 2020-02-27 10:28:59 -05:00 · 2020-02-27 10:28:59 -05:00 · ad97d266c0
commit ad97d266c0
parent a19be04d7b
4 changed files with 84 additions and 4 deletions
--- a/sql/restoring.sql
+++ b/sql/restoring.sql
@ -27,7 +27,7 @@ BEGIN
    SELECT current_database() INTO db;
    EXECUTE format($$ALTER DATABASE %I SET timescaledb.restoring ='off'$$, db);
    SET SESSION timescaledb.restoring='off';
-    PERFORM _timescaledb_internal.start_background_workers();
+    PERFORM _timescaledb_internal.restart_background_workers();

    --try to restore the backed up uuid, if the restore did not set one
    INSERT INTO _timescaledb_catalog.metadata
--- a/src/loader/bgw_launcher.c
+++ b/src/loader/bgw_launcher.c
@ -40,6 +40,9 @@
 /* for allocating the htab storage */
 #include <utils/memutils.h>

+/* for getting settings correct before loading the versioned scheduler */
+#include "catalog/pg_db_role_setting.h"
+
 #include "../compat.h"
 #include "../extension_constants.h"
 #include "loader.h"
@ -842,6 +845,39 @@ database_is_template_check(void)
 	ReleaseSysCache(tuple);
 }

+/*
+ * Before we morph into the scheduler, we also need to reload configs from their
+ * defaults if the database default has changed. Defaults are changed in the
+ * post_restore function, which changes the db default for the restoring guc;
+ * we wait until that txn commits and must then see whether it made the change.
+ * Checks for changes are normally run at connection startup, but because we
+ * have to connect in order to wait on the txn, we have to re-run them after the
+ * wait. This function is based on the postgres function of the same name in
+ * postinit.c.
+ */
+
+static void
+process_settings(Oid databaseid)
+{
+	Relation relsetting;
+	Snapshot snapshot;
+
+	if (!IsUnderPostmaster)
+		return;
+
+	relsetting = heap_open(DbRoleSettingRelationId, AccessShareLock);
+
+	/* Read all the settings under the same snapshot for efficiency. */
+	snapshot = RegisterSnapshot(GetCatalogSnapshot(DbRoleSettingRelationId));
+
+	/* Database settings are applied first; later settings are ignored if set earlier. */
+	ApplySetting(snapshot, databaseid, InvalidOid, relsetting, PGC_S_DATABASE);
+	ApplySetting(snapshot, InvalidOid, InvalidOid, relsetting, PGC_S_GLOBAL);
+
+	UnregisterSnapshot(snapshot);
+	heap_close(relsetting, AccessShareLock);
+}
+
 /*
 * This can be run either from the cluster launcher at db_startup time, or
 * in the case of an install/uninstall/update of the extension, in the
@ -890,6 +926,8 @@ ts_bgw_db_scheduler_entrypoint(PG_FUNCTION_ARGS)
 	 * so, as we don't want to run in template dbs.
 	 */
 	database_is_template_check();
+	/*  Process any config changes caused by an ALTER DATABASE */
+	process_settings(MyDatabaseId);
 	ts_installed = ts_loader_extension_exists();
 	if (ts_installed)
 		StrNCpy(version, ts_loader_extension_version(), MAX_VERSION_LEN);
--- a/test/expected/bgw_launcher.out
+++ b/test/expected/bgw_launcher.out
@ -350,7 +350,30 @@ SELECT wait_worker_counts(1,0,0,0);
 t
 (1 row)

--And post_restore starts them
+-- Make sure a restart with restoring on first starts the background worker 
+BEGIN;
+SELECT _timescaledb_internal.restart_background_workers();
+ restart_background_workers 
+----------------------------
+ t
+(1 row)
+
+SELECT wait_worker_counts(1,0,1,0);
+ wait_worker_counts 
+--------------------
+ t
+(1 row)
+
+COMMIT;
+-- Then the worker dies when it sees that restoring is on after the txn commits
+SELECT wait_worker_counts(1,0,0,0);
+ wait_worker_counts 
+--------------------
+ t
+(1 row)
+
+--And post_restore starts them 
+BEGIN;
 SELECT timescaledb_post_restore();
 timescaledb_post_restore 
 --------------------------
@ -363,6 +386,14 @@ SELECT wait_worker_counts(1,0,1,0);
 t
 (1 row)

+COMMIT;
+-- And they stay started
+SELECT wait_worker_counts(1,0,1,0);
+ wait_worker_counts 
+--------------------
+ t
+(1 row)
+
 -- Make sure dropping the extension means that the scheduler is stopped
 BEGIN;
 DROP EXTENSION timescaledb;
--- a/test/sql/bgw_launcher.sql
+++ b/test/sql/bgw_launcher.sql
@ -157,10 +157,21 @@ SELECT ((current_setting('server_version_num')::int < 100000) OR wait_greater(:'
 -- Make sure running pre_restore function stops background workers
 SELECT timescaledb_pre_restore();
 SELECT wait_worker_counts(1,0,0,0);
--And post_restore starts them
+-- Make sure a restart with restoring on first starts the background worker 
+BEGIN;
+SELECT _timescaledb_internal.restart_background_workers();
+SELECT wait_worker_counts(1,0,1,0);
+COMMIT;
+-- Then the worker dies when it sees that restoring is on after the txn commits
+SELECT wait_worker_counts(1,0,0,0);
+
+--And post_restore starts them 
+BEGIN;
 SELECT timescaledb_post_restore();
 SELECT wait_worker_counts(1,0,1,0);
-
+COMMIT;
+-- And they stay started
+SELECT wait_worker_counts(1,0,1,0);

 -- Make sure dropping the extension means that the scheduler is stopped
 BEGIN;