typesense/typesense · commit 2a89acd84e (parent 60c03bce19)

    Return 503 for both read and write lag.
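In short: this commit renames the read-max-lag / write-max-lag settings to healthy-read-lag / healthy-write-lag (raising the write default from 100 to 500), returns HTTP 503 for lagging writes (previously 429) so that lagging reads and writes are handled alike, exempts /stats.json and /metrics.json from the readiness check, and reworks refresh_catchup_status() so a node with no raft handle or no known leader is marked as not caught up.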
include/config.h

@@ -33,8 +33,8 @@ private:
     float max_memory_ratio;
     int snapshot_interval_seconds;

-    size_t read_max_lag;
-    size_t write_max_lag;
+    size_t healthy_read_lag;
+    size_t healthy_write_lag;

     std::string config_file;
     int config_file_validity;
@@ -55,8 +55,8 @@ protected:
         this->enable_cors = false;
         this->max_memory_ratio = 1.0f;
         this->snapshot_interval_seconds = 3600;
-        this->read_max_lag = 1000;
-        this->write_max_lag = 100;
+        this->healthy_read_lag = 1000;
+        this->healthy_write_lag = 500;
         this->log_slow_requests_time_ms = -1;
         this->num_collections_parallel_load = 0; // will be set dynamically if not overridden
         this->num_documents_parallel_load = 1000;
@@ -191,12 +191,12 @@ public:
         return this->snapshot_interval_seconds;
     }

-    int get_read_max_lag() const {
-        return this->read_max_lag;
+    int get_healthy_read_lag() const {
+        return this->healthy_read_lag;
     }

-    int get_write_max_lag() const {
-        return this->write_max_lag;
+    int get_healthy_write_lag() const {
+        return this->healthy_write_lag;
     }

     int get_log_slow_requests_time_ms() const {
@@ -279,12 +279,12 @@ public:
             this->snapshot_interval_seconds = std::stoi(get_env("TYPESENSE_SNAPSHOT_INTERVAL_SECONDS"));
         }

-        if(!get_env("TYPESENSE_READ_MAX_LAG").empty()) {
-            this->read_max_lag = std::stoi(get_env("TYPESENSE_READ_MAX_LAG"));
+        if(!get_env("TYPESENSE_HEALTHY_READ_LAG").empty()) {
+            this->healthy_read_lag = std::stoi(get_env("TYPESENSE_HEALTHY_READ_LAG"));
         }

-        if(!get_env("TYPESENSE_WRITE_MAX_LAG").empty()) {
-            this->write_max_lag = std::stoi(get_env("TYPESENSE_WRITE_MAX_LAG"));
+        if(!get_env("TYPESENSE_HEALTHY_WRITE_LAG").empty()) {
+            this->healthy_write_lag = std::stoi(get_env("TYPESENSE_HEALTHY_WRITE_LAG"));
         }

         if(!get_env("TYPESENSE_LOG_SLOW_REQUESTS_TIME_MS").empty()) {
@@ -396,12 +396,12 @@ public:
            this->snapshot_interval_seconds = (int) reader.GetInteger("server", "snapshot-interval-seconds", 3600);
        }

-       if(reader.Exists("server", "read-max-lag")) {
-           this->read_max_lag = (int) reader.GetInteger("server", "read-max-lag", 1000);
+       if(reader.Exists("server", "healthy-read-lag")) {
+           this->healthy_read_lag = (int) reader.GetInteger("server", "healthy-read-lag", 1000);
        }

-       if(reader.Exists("server", "write-max-lag")) {
-           this->write_max_lag = (int) reader.GetInteger("server", "write-max-lag", 100);
+       if(reader.Exists("server", "healthy-write-lag")) {
+           this->healthy_write_lag = (int) reader.GetInteger("server", "healthy-write-lag", 100);
        }

        if(reader.Exists("server", "log-slow-requests-time-ms")) {
@@ -495,12 +495,12 @@ public:
            this->snapshot_interval_seconds = options.get<int>("snapshot-interval-seconds");
        }

-       if(options.exist("read-max-lag")) {
-           this->read_max_lag = options.get<int>("read-max-lag");
+       if(options.exist("healthy-read-lag")) {
+           this->healthy_read_lag = options.get<int>("healthy-read-lag");
        }

-       if(options.exist("write-max-lag")) {
-           this->write_max_lag = options.get<int>("write-max-lag");
+       if(options.exist("healthy-write-lag")) {
+           this->healthy_write_lag = options.get<int>("healthy-write-lag");
        }

        if(options.exist("log-slow-requests-time-ms")) {
include/raft_server.h

@@ -108,8 +108,8 @@ private:

     const bool api_uses_ssl;

-    const size_t read_max_lag;
-    const size_t write_max_lag;
+    const size_t healthy_read_lag;
+    const size_t healthy_write_lag;

     const size_t num_collections_parallel_load;
     const size_t num_documents_parallel_load;
@@ -138,7 +138,7 @@ public:

     ReplicationState(HttpServer* server, Store* store, Store* meta_store,
                      ThreadPool* thread_pool, http_message_dispatcher* message_dispatcher,
-                     bool api_uses_ssl, size_t read_max_lag, size_t write_max_lag,
+                     bool api_uses_ssl, size_t healthy_read_lag, size_t healthy_write_lag,
                      size_t num_collections_parallel_load, size_t num_documents_parallel_load);

     // Starts this node
@@ -231,6 +231,7 @@ private:

     void on_leader_start(int64_t term) {
         leader_term.store(term, butil::memory_order_release);
+        refresh_catchup_status(true);
         LOG(INFO) << "Node becomes leader, term: " << term;
     }

@@ -253,6 +254,7 @@ private:
     }

     void on_start_following(const ::braft::LeaderChangeContext& ctx) {
+        refresh_catchup_status(true);
         LOG(INFO) << "Node starts following " << ctx;
     }
src/http_server.cpp

@@ -322,7 +322,9 @@ int HttpServer::catch_all_handler(h2o_handler_t *_h2o_handler, h2o_req_t *req) {
     // Except for health check, wait for replicating state to be ready before allowing requests
     // Follower or leader must have started AND data must also have been loaded
     bool needs_readiness_check = !(
-        path_without_query == "/health" || path_without_query == "/debug" || path_without_query == "/sequence"
+        path_without_query == "/health" || path_without_query == "/debug" ||
+        path_without_query == "/stats.json" || path_without_query == "/metrics.json" ||
+        path_without_query == "/sequence"
     );

     if(needs_readiness_check) {
@@ -332,14 +334,14 @@ int HttpServer::catch_all_handler(h2o_handler_t *_h2o_handler, h2o_req_t *req) {

         bool write_op = !is_read_op;

+        std::string message = "{ \"message\": \"Not Ready or Lagging\"}";
+
         if(is_read_op && !h2o_handler->http_server->get_replication_state()->is_read_caught_up()) {
-            std::string message = "{ \"message\": \"Not Ready\"}";
             return send_response(req, 503, message);
         }

-        if(write_op && !h2o_handler->http_server->get_replication_state()->is_write_caught_up()) {
-            std::string message = "{ \"message\": \"Too Many Writes\"}";
-            return send_response(req, 429, message);
+        else if(write_op && !h2o_handler->http_server->get_replication_state()->is_write_caught_up()) {
+            return send_response(req, 503, message);
         }
     }
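The visible behavior change is in this hunk: a lagging node previously answered stale writes with 429 ("Too Many Writes") but now answers both lagging reads and lagging writes with 503, per the commit title. A minimal sketch of the new decision logic, condensed into a hypothetical free function (the flag names follow the diff; the function itself is illustrative, not part of the codebase):

    // Hypothetical condensation of the readiness branch in catch_all_handler():
    // returns the HTTP status to reply with, or 0 to let the request proceed.
    int readiness_status(bool is_read_op, bool read_caught_up, bool write_caught_up) {
        bool write_op = !is_read_op;
        if(is_read_op && !read_caught_up) {
            return 503;              // was already 503 ("Not Ready")
        } else if(write_op && !write_caught_up) {
            return 503;              // was 429 ("Too Many Writes") before this commit
        }
        return 0;                    // caught up: route the request normally
    }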
src/raft_server.cpp

@@ -55,7 +55,7 @@ int ReplicationState::start(const butil::EndPoint & peering_endpoint, const int
     // flag controls snapshot download size of each RPC
     braft::FLAGS_raft_max_byte_count_per_rpc = 4 * 1024 * 1024; // 4 MB

-    node_options.catchup_margin = read_max_lag;
+    node_options.catchup_margin = healthy_read_lag;
     node_options.election_timeout_ms = election_timeout_ms;
     node_options.fsm = this;
     node_options.node_owns_fsm = false;
@@ -484,7 +484,6 @@ int ReplicationState::on_snapshot_load(braft::SnapshotReader* reader) {

     bool init_db_status = init_db();

-    read_caught_up = write_caught_up = (init_db_status == 0);
     return init_db_status;
 }

@@ -540,16 +539,14 @@ void ReplicationState::refresh_nodes(const std::string & nodes) {

 void ReplicationState::refresh_catchup_status(bool log_msg) {
     std::shared_lock lock(node_mutex);

-    if (!node) {
-        LOG_IF(WARNING, log_msg) << "Node state is not initialized: unable to refresh nodes.";
-        return;
+    if(node == nullptr ) {
+        read_caught_up = write_caught_up = false;
+        return ;
     }

-    if(!node->is_leader() && node->leader_id().is_empty()) {
-        // follower does not have a leader!
-        this->read_caught_up = false;
-        this->write_caught_up = false;
+    bool leader_or_follower = (node->is_leader() || !node->leader_id().is_empty());
+    if(!leader_or_follower) {
+        read_caught_up = write_caught_up = false;
         return ;
     }

@@ -557,33 +554,34 @@ void ReplicationState::refresh_catchup_status(bool log_msg) {
     node->get_status(&n_status);
     lock.unlock();

-    if (n_status.applying_index == 0) {
-        this->read_caught_up = true;
-        this->write_caught_up = true;
-        return ;
-    }
-
     size_t apply_lag = size_t(n_status.last_index - n_status.known_applied_index);

-    if (apply_lag > read_max_lag) {
-        LOG(ERROR) << apply_lag << " lagging entries > read max lag of " + std::to_string(read_max_lag);
+    //LOG(INFO) << "last_index: " << n_status.applying_index << ", known_applied_index: " << n_status.known_applied_index;
+    //LOG(INFO) << "apply_lag: " << apply_lag;
+
+    if (apply_lag > healthy_read_lag) {
+        LOG_IF(ERROR, log_msg) << apply_lag << " lagging entries > read max lag of " + std::to_string(healthy_read_lag);
         this->read_caught_up = false;
     } else {
         this->read_caught_up = true;
     }

-    if (apply_lag > write_max_lag) {
-        LOG(ERROR) << apply_lag << " lagging entries > write max lag of " + std::to_string(write_max_lag);
+    if (apply_lag > healthy_write_lag) {
+        LOG_IF(ERROR, log_msg) << apply_lag << " lagging entries > write max lag of " + std::to_string(healthy_write_lag);
         this->write_caught_up = false;
     } else {
         this->write_caught_up = true;
     }
 }

 ReplicationState::ReplicationState(HttpServer* server, Store *store, Store* meta_store, ThreadPool* thread_pool,
                                    http_message_dispatcher *message_dispatcher,
                                    bool api_uses_ssl,
-                                   size_t read_max_lag, size_t write_max_lag,
+                                   size_t healthy_read_lag, size_t healthy_write_lag,
                                    size_t num_collections_parallel_load, size_t num_documents_parallel_load):
     node(nullptr), leader_term(-1), server(server), store(store), meta_store(meta_store),
     thread_pool(thread_pool), message_dispatcher(message_dispatcher), api_uses_ssl(api_uses_ssl),
-    read_max_lag(read_max_lag), write_max_lag(write_max_lag),
+    healthy_read_lag(healthy_read_lag), healthy_write_lag(healthy_write_lag),
     num_collections_parallel_load(num_collections_parallel_load),
     num_documents_parallel_load(num_documents_parallel_load),
     ready(false), shutting_down(false), pending_writes(0) {
@@ -591,17 +589,6 @@ ReplicationState::ReplicationState(HttpServer* server, Store *store, Store* meta
 }

 bool ReplicationState::is_alive() const {
-    std::shared_lock lock(node_mutex);
-
-    if(node == nullptr ) {
-        return false;
-    }
-
-    bool leader_or_follower = (node->is_leader() || !node->leader_id().is_empty());
-    if(!leader_or_follower) {
-        return false;
-    }
-
     // for general health check we will only care about the `read_caught_up` threshold
     return read_caught_up;
 }
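After this change, refresh_catchup_status() reduces to a pure threshold rule on the apply lag (the gap between the last raft log index and the last applied index), with the node-missing and leaderless cases handled up front. A minimal sketch of that rule, using the diff's names in a hypothetical free function:

    #include <cstddef>
    #include <cstdint>
    #include <utility>

    // Hypothetical distillation of the threshold rule in refresh_catchup_status():
    // reads stay healthy while apply lag <= healthy_read_lag (default 1000),
    // writes while apply lag <= healthy_write_lag (default 500 per this diff).
    std::pair<bool, bool> catchup_flags(int64_t last_index, int64_t known_applied_index,
                                        std::size_t healthy_read_lag,
                                        std::size_t healthy_write_lag) {
        const std::size_t apply_lag = std::size_t(last_index - known_applied_index);
        return { apply_lag <= healthy_read_lag,     // read_caught_up
                 apply_lag <= healthy_write_lag };  // write_caught_up
    }

Note that is_alive(), used by the general /health check, now simply returns read_caught_up, since the node-null and no-leader cases already force both flags to false in refresh_catchup_status().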
src/typesense_server_utils.cpp

@@ -79,8 +79,8 @@ void init_cmdline_options(cmdline::parser & options, int argc, char **argv) {

     options.add<float>("max-memory-ratio", '\0', "Maximum fraction of system memory to be used.", false, 1.0f);
     options.add<int>("snapshot-interval-seconds", '\0', "Frequency of replication log snapshots.", false, 3600);
-    options.add<int>("read-max-lag", '\0', "Reads are rejected if the updates lag behind this threshold.", false, 1000);
-    options.add<int>("write-max-lag", '\0', "Writes are rejected if the updates lag behind this threshold.", false, 100);
+    options.add<int>("healthy-read-lag", '\0', "Reads are rejected if the updates lag behind this threshold.", false, 1000);
+    options.add<int>("healthy-write-lag", '\0', "Writes are rejected if the updates lag behind this threshold.", false, 500);
     options.add<int>("log-slow-requests-time-ms", '\0', "When > 0, requests that take longer than this duration are logged.", false, -1);

     options.add<uint32_t>("num-collections-parallel-load", '\0', "Number of collections that are loaded in parallel during start up.", false, 4);
@@ -395,8 +395,8 @@ int run_server(const Config & config, const std::string & version, void (*master

     ReplicationState replication_state(server, &store, &meta_store, &app_thread_pool, server->get_message_dispatcher(),
                                        ssl_enabled,
-                                       config.get_read_max_lag(),
-                                       config.get_write_max_lag(),
+                                       config.get_healthy_read_lag(),
+                                       config.get_healthy_write_lag(),
                                        num_collections_parallel_load,
                                        config.get_num_documents_parallel_load());
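For operators, the rename means the corresponding startup flags become --healthy-read-lag and --healthy-write-lag. A hypothetical invocation overriding the new defaults (other flags shown only for illustration):

    ./typesense-server --data-dir=/tmp/typesense-data --api-key=abcd \
        --healthy-read-lag=1000 --healthy-write-lag=500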