Address lag in health check resource error update.

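The health check endpoints now call `has_enough_resources()` directly instead of reading the cached `resource_error` value via `get_out_of_resource_error()`, so the reported resource error no longer lags behind the actual resource state.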
This commit is contained in:
Kishore Nallan 2024-03-03 19:29:19 +05:30
parent 51525108c2
commit 9c90290ea7
3 changed files with 33 additions and 29 deletions

View File

@ -1,5 +1,7 @@
#pragma once
#include <cstdint>
#include <atomic>
#include <mutex>
#include <chrono>
#include <string>
#include <sys/statvfs.h>
@ -14,18 +16,8 @@ public:
private:
const static size_t REFRESH_INTERVAL_SECS = 5;
uint64_t disk_total_bytes = 0;
uint64_t disk_used_bytes = 0;
uint64_t memory_total_bytes = 0;
uint64_t memory_available_bytes = 0;
uint64_t swap_total_bytes = 0;
uint64_t swap_free_bytes = 0;
uint64_t last_checked_ts = 0;
resource_check_t resource_error;
std::atomic<uint64_t> last_checked_ts = 0;
std::mutex m;
cached_resource_stat_t() = default;
@ -41,6 +33,4 @@ public:
resource_check_t has_enough_resources(const std::string& data_dir_path,
const int disk_used_max_percentage,
const int memory_used_max_percentage);
const resource_check_t get_out_of_resource_error() const;
};

View File

@ -11,6 +11,17 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
return cached_resource_stat_t::OK;
}
std::unique_lock lk(m);
uint64_t disk_total_bytes = 0;
uint64_t disk_used_bytes = 0;
uint64_t memory_total_bytes = 0;
uint64_t memory_available_bytes = 0;
uint64_t swap_total_bytes = 0;
uint64_t swap_free_bytes = 0;
uint64_t now = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
@ -61,12 +72,12 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
last_checked_ts = now;
}
lk.unlock();
double disk_used_percentage = (double(disk_used_bytes)/double(disk_total_bytes)) * 100;
if(disk_used_percentage > disk_used_max_percentage) {
LOG(INFO) << "disk_total_bytes: " << disk_total_bytes << ", disk_used_bytes: " << disk_used_bytes
<< ", disk_used_percentage: " << disk_used_percentage;
resource_error = cached_resource_stat_t::OUT_OF_DISK;
return cached_resource_stat_t::OUT_OF_DISK;
}
@ -79,7 +90,6 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
uint64_t all_memory_used = (memory_total_bytes - memory_available_bytes) + (swap_total_bytes - swap_free_bytes);
if(all_memory_used >= memory_total_bytes) {
resource_error = cached_resource_stat_t::OUT_OF_MEMORY;
return cached_resource_stat_t::OUT_OF_MEMORY;
}
@ -92,14 +102,8 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
LOG(INFO) << "memory_total: " << memory_total_bytes << ", memory_available: " << memory_available_bytes
<< ", all_memory_used: " << all_memory_used << ", free_mem: " << free_mem
<< ", memory_free_min: " << memory_free_min_bytes;
resource_error = cached_resource_stat_t::OUT_OF_MEMORY;
return cached_resource_stat_t::OUT_OF_MEMORY;
}
resource_error = cached_resource_stat_t::OK;
return cached_resource_stat_t::OK;
}
const cached_resource_stat_t::resource_check_t cached_resource_stat_t::get_out_of_resource_error() const {
return resource_error;
}

View File

@ -315,9 +315,14 @@ bool get_health_with_resource_usage(const std::shared_ptr<http_req>& req, const
nlohmann::json result;
bool alive = server->is_alive();
auto resource_error = cached_resource_stat_t::get_instance().get_out_of_resource_error();
if (resource_error != cached_resource_stat_t::resource_check_t::OK) {
result["resource_error"] = std::string(magic_enum::enum_name(resource_error));
auto resource_check = cached_resource_stat_t::get_instance().has_enough_resources(
Config::get_instance().get_data_dir(),
Config::get_instance().get_disk_used_max_percentage(),
Config::get_instance().get_memory_used_max_percentage()
);
if (resource_check != cached_resource_stat_t::resource_check_t::OK) {
result["resource_error"] = std::string(magic_enum::enum_name(resource_check));
}
if(req->params.count("cpu_threshold") != 0 && StringUtils::is_float(req->params["cpu_threshold"])) {
@ -345,9 +350,14 @@ bool get_health(const std::shared_ptr<http_req>& req, const std::shared_ptr<http
bool alive = server->is_alive();
result["ok"] = alive;
auto resource_error = cached_resource_stat_t::get_instance().get_out_of_resource_error();
if (resource_error != cached_resource_stat_t::resource_check_t::OK) {
result["resource_error"] = std::string(magic_enum::enum_name(resource_error));
auto resource_check = cached_resource_stat_t::get_instance().has_enough_resources(
Config::get_instance().get_data_dir(),
Config::get_instance().get_disk_used_max_percentage(),
Config::get_instance().get_memory_used_max_percentage()
);
if (resource_check != cached_resource_stat_t::resource_check_t::OK) {
result["resource_error"] = std::string(magic_enum::enum_name(resource_check));
}
if(alive) {