Fix restoring/scheduler entrypoint to avoid BGW death

There was a race condition between the post_restore function
restarting the background worker and the setting of the
restoring flag to "off". If the worker started before the
change to the restoring flag had been committed, it would not
see the change and then die because the worker should exit
when the db is in a restoring state. This modifies the
post_restore function to use a restart instead of a start
so that it waits on the commit to start up. It also adds
logic to the entrypoint to reload config changes caused
by an `ALTER DATABASE SET` command. These changes are
normally only seen at connection startup but we have to
wait until after our lock on the modifying transaction is
released to know whether we should adopt them.
This commit is contained in:
David Kohn 2020-02-27 10:28:59 -05:00 committed by davidkohn88
parent a19be04d7b
commit ad97d266c0
4 changed files with 84 additions and 4 deletions

View File

@ -27,7 +27,7 @@ BEGIN
SELECT current_database() INTO db;
EXECUTE format($$ALTER DATABASE %I SET timescaledb.restoring ='off'$$, db);
SET SESSION timescaledb.restoring='off';
PERFORM _timescaledb_internal.start_background_workers();
PERFORM _timescaledb_internal.restart_background_workers();
--try to restore the backed up uuid, if the restore did not set one
INSERT INTO _timescaledb_catalog.metadata

View File

@ -40,6 +40,9 @@
/* for allocating the htab storage */
#include <utils/memutils.h>
/* for getting settings correct before loading the versioned scheduler */
#include "catalog/pg_db_role_setting.h"
#include "../compat.h"
#include "../extension_constants.h"
#include "loader.h"
@ -842,6 +845,39 @@ database_is_template_check(void)
ReleaseSysCache(tuple);
}
/*
 * Re-apply the per-database and global configuration defaults before this
 * worker morphs into the scheduler.
 *
 * The post_restore function changes the database default for the restoring
 * GUC, and this worker waits for that transaction to commit. Checks for
 * changed defaults are normally run only at connection startup, but because
 * we have to connect in order to wait on the transaction, the check must be
 * re-run after the wait to find out whether the transaction made the change.
 *
 * Modeled on the PostgreSQL function of the same name in postinit.c.
 */
static void
process_settings(Oid databaseid)
{
	Relation	db_role_setting_rel;
	Snapshot	catalog_snapshot;

	/* Nothing to do unless we are running under the postmaster. */
	if (!IsUnderPostmaster)
	{
		return;
	}

	db_role_setting_rel = heap_open(DbRoleSettingRelationId, AccessShareLock);

	/* Use one snapshot for every lookup so all settings are read consistently and cheaply. */
	catalog_snapshot = RegisterSnapshot(GetCatalogSnapshot(DbRoleSettingRelationId));

	/*
	 * Order matters: a setting that was already applied by an earlier call is
	 * ignored by a later one, so the database-level pass runs before the
	 * global pass.
	 */
	ApplySetting(catalog_snapshot, databaseid, InvalidOid, db_role_setting_rel, PGC_S_DATABASE);
	ApplySetting(catalog_snapshot, InvalidOid, InvalidOid, db_role_setting_rel, PGC_S_GLOBAL);

	UnregisterSnapshot(catalog_snapshot);
	heap_close(db_role_setting_rel, AccessShareLock);
}
/*
* This can be run either from the cluster launcher at db_startup time, or
* in the case of an install/uninstall/update of the extension, in the
@ -890,6 +926,8 @@ ts_bgw_db_scheduler_entrypoint(PG_FUNCTION_ARGS)
* so, as we don't want to run in template dbs.
*/
database_is_template_check();
/* Process any config changes caused by an ALTER DATABASE */
process_settings(MyDatabaseId);
ts_installed = ts_loader_extension_exists();
if (ts_installed)
StrNCpy(version, ts_loader_extension_version(), MAX_VERSION_LEN);

View File

@ -350,7 +350,30 @@ SELECT wait_worker_counts(1,0,0,0);
t
(1 row)
--And post_restore starts them
-- Make sure a restart with restoring on first starts the background worker
BEGIN;
SELECT _timescaledb_internal.restart_background_workers();
restart_background_workers
----------------------------
t
(1 row)
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
COMMIT;
-- Then the worker dies when it sees that restoring is on after the txn commits
SELECT wait_worker_counts(1,0,0,0);
wait_worker_counts
--------------------
t
(1 row)
--And post_restore starts them
BEGIN;
SELECT timescaledb_post_restore();
timescaledb_post_restore
--------------------------
@ -363,6 +386,14 @@ SELECT wait_worker_counts(1,0,1,0);
t
(1 row)
COMMIT;
-- And they stay started
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
-- Make sure dropping the extension means that the scheduler is stopped
BEGIN;
DROP EXTENSION timescaledb;

View File

@ -157,10 +157,21 @@ SELECT ((current_setting('server_version_num')::int < 100000) OR wait_greater(:'
-- Make sure running pre_restore function stops background workers
SELECT timescaledb_pre_restore();
SELECT wait_worker_counts(1,0,0,0);
--And post_restore starts them
-- Make sure a restart with restoring on first starts the background worker
BEGIN;
SELECT _timescaledb_internal.restart_background_workers();
SELECT wait_worker_counts(1,0,1,0);
COMMIT;
-- Then the worker dies when it sees that restoring is on after the txn commits
SELECT wait_worker_counts(1,0,0,0);
--And post_restore starts them
BEGIN;
SELECT timescaledb_post_restore();
SELECT wait_worker_counts(1,0,1,0);
COMMIT;
-- And they stay started
SELECT wait_worker_counts(1,0,1,0);
-- Make sure dropping the extension means that the scheduler is stopped
BEGIN;