// monitor.go
//
// This source file is part of the FoundationDB open source project
//
// Copyright 2021 Apple Inc. and the FoundationDB project authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"net/http/pprof"
	"os"
	"os/exec"
	"os/signal"
	"path"
	"sync"
	"syscall"
	"time"

	"k8s.io/utils/pointer"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/apple/foundationdb/fdbkubernetesmonitor/api"
	"github.com/fsnotify/fsnotify"
	"github.com/go-logr/logr"
)

// errorBackoffSeconds is the time to wait, in seconds, after a process fails
// before starting another process.
// The delay is applied when generating the arguments or starting the
// subprocess fails, and when a subprocess exits after running for less than
// this many seconds.
const errorBackoffSeconds = 60

// Monitor provides the main monitor loop. It holds the active process
// configuration and the bookkeeping for the subprocess run loops.
type Monitor struct {
	// ConfigFile defines the path to the config file to load.
	ConfigFile string

	// CustomEnvironment defines the custom environment variables to use when
	// interpreting the monitor configuration.
	CustomEnvironment map[string]string

	// ActiveConfiguration defines the active process configuration.
	ActiveConfiguration *api.ProcessConfiguration

	// ActiveConfigurationBytes defines the source data for the active process
	// configuration.
	ActiveConfigurationBytes []byte

	// LastConfigurationTime is the last time we successfully reloaded the
	// configuration file.
	LastConfigurationTime time.Time

	// ProcessCount defines how many processes the monitor should run. Run
	// loops are started for the process numbers 1 through ProcessCount.
	ProcessCount int

	// ProcessIDs stores the PIDs of the processes that are running, indexed by
	// process number (index 0 is not used by any run loop). A PID of
	// zero will indicate that a process does not have a run loop. A PID of -1
	// will indicate that a process has a run loop but is not currently running
	// the subprocess.
	ProcessIDs []int

	// Mutex defines a mutex around working with configuration.
	// This is used to synchronize access to local state like the active
	// configuration and the process IDs from multiple goroutines.
	Mutex sync.Mutex

	// PodClient is a client for posting updates about this pod to
	// Kubernetes.
	PodClient *PodClient

	// Logger is the logger instance for this monitor.
	Logger logr.Logger
}

// StartMonitor builds a Monitor, starts its background workers and HTTP
// endpoints, and then blocks in the monitor's run loop.
//
// It panics if the pod client cannot be created, and terminates the process
// with exit code 1 if the HTTP server on listenAddr fails. The pprof endpoints
// are only registered when enableDebug is set; the Prometheus /metrics
// endpoint is always registered.
func StartMonitor(logger logr.Logger, configFile string, customEnvironment map[string]string, processCount int, listenAddr string, enableDebug bool) {
	podClient, err := CreatePodClient(logger)
	if err != nil {
		panic(err)
	}

	monitor := &Monitor{
		ConfigFile:        configFile,
		PodClient:         podClient,
		Logger:            logger,
		CustomEnvironment: customEnvironment,
		ProcessCount:      processCount,
	}

	go monitor.WatchPodTimestamps()

	mux := http.NewServeMux()
	// Enable pprof endpoints for debugging purposes.
	if enableDebug {
		// Named runtime profiles are all served by the generic pprof handler.
		for _, profile := range []string{"heap", "goroutine", "threadcreate", "allocs", "block", "mutex"} {
			mux.Handle("/debug/pprof/"+profile, pprof.Handler(profile))
		}
		mux.HandleFunc("/debug/pprof/", pprof.Index)
		mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
		mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
		mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
		mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
	}

	// Add Prometheus support
	mux.Handle("/metrics", promhttp.Handler())
	go func() {
		if serveErr := http.ListenAndServe(listenAddr, mux); serveErr != nil {
			logger.Error(serveErr, "could not start HTTP server")
			os.Exit(1)
		}
	}()

	monitor.Run()
}

// LoadConfiguration loads the latest configuration from the config file.
//
// The file is read and parsed as JSON, the binary path is derived from the
// configured version, and the configuration is validated by checking that the
// binary is executable and that arguments can be generated for process 1.
// Only a configuration that passes every step is handed to
// acceptConfiguration; on any failure the error is logged and the previously
// active configuration is left untouched.
func (monitor *Monitor) LoadConfiguration() {
	file, err := os.Open(monitor.ConfigFile)
	if err != nil {
		monitor.Logger.Error(err, "Error reading monitor config file", "monitorConfigPath", monitor.ConfigFile)
		return
	}
	defer file.Close()
	configuration := &api.ProcessConfiguration{}
	configurationBytes, err := io.ReadAll(file)
	if err != nil {
		monitor.Logger.Error(err, "Error reading monitor configuration", "monitorConfigPath", monitor.ConfigFile)
		// Do not try to parse a partially read file.
		return
	}
	err = json.Unmarshal(configurationBytes, configuration)
	if err != nil {
		monitor.Logger.Error(err, "Error parsing monitor configuration", "rawConfiguration", string(configurationBytes))
		return
	}

	// Use the binary from this container when the versions match, otherwise
	// the binary for the requested version from the shared binary directory.
	if currentContainerVersion == configuration.Version {
		configuration.BinaryPath = fdbserverPath
	} else {
		configuration.BinaryPath = path.Join(sharedBinaryDir, configuration.Version, "fdbserver")
	}

	err = checkOwnerExecutable(configuration.BinaryPath)
	if err != nil {
		monitor.Logger.Error(err, "Error with binary path for latest configuration", "configuration", configuration, "binaryPath", configuration.BinaryPath)
		return
	}

	// Generate the arguments once as a validation step; the result itself is
	// not needed here.
	_, err = configuration.GenerateArguments(1, monitor.CustomEnvironment)
	if err != nil {
		monitor.Logger.Error(err, "Error generating arguments for latest configuration", "configuration", configuration, "binaryPath", configuration.BinaryPath)
		return
	}

	monitor.acceptConfiguration(configuration, configurationBytes)
}

// checkOwnerExecutable validates that a path is a file that exists and is
// executable by its owner.
//
// It returns the error from os.Stat when the path cannot be inspected, and a
// descriptive error when the path is a directory or when the owner-execute
// permission bit is not set. It returns nil otherwise.
func checkOwnerExecutable(path string) error {
	binaryStat, err := os.Stat(path)
	if err != nil {
		return err
	}
	// Directories normally carry the execute bit (it means "searchable"), so
	// they have to be rejected explicitly.
	if binaryStat.IsDir() {
		return fmt.Errorf("binary path %q is a directory", path)
	}
	if binaryStat.Mode()&0o100 == 0 {
		return fmt.Errorf("binary %q is not executable", path)
	}
	return nil
}

// acceptConfiguration installs a configuration that was parsed and validated
// from the local config file.
//
// While holding the monitor mutex it makes sure ProcessIDs has a slot for
// every process number up to ProcessCount, records the configuration together
// with its raw bytes and the current time, starts a run loop for every process
// number that does not have one yet, and finally asks the pod client to update
// the pod annotations. A failure to update the annotations is only logged.
func (monitor *Monitor) acceptConfiguration(configuration *api.ProcessConfiguration, configurationBytes []byte) {
	monitor.Mutex.Lock()
	defer monitor.Mutex.Unlock()
	monitor.Logger.Info("Received new configuration file", "configuration", configuration)

	// Slot 0 is unused, so ProcessCount+1 slots are needed.
	requiredSlots := monitor.ProcessCount + 1
	if monitor.ProcessIDs == nil {
		monitor.ProcessIDs = make([]int, requiredSlots)
	}
	for len(monitor.ProcessIDs) < requiredSlots {
		monitor.ProcessIDs = append(monitor.ProcessIDs, 0)
	}

	monitor.ActiveConfiguration = configuration
	monitor.ActiveConfigurationBytes = configurationBytes
	monitor.LastConfigurationTime = time.Now()

	for number := 1; number <= monitor.ProcessCount; number++ {
		if monitor.ProcessIDs[number] != 0 {
			// A run loop already exists for this process number.
			continue
		}
		// Mark the slot as "run loop present, subprocess not running" before
		// the goroutine starts.
		monitor.ProcessIDs[number] = -1
		go monitor.RunProcess(number)
	}

	if err := monitor.PodClient.UpdateAnnotations(monitor); err != nil {
		monitor.Logger.Error(err, "Error updating pod annotations")
	}
}

// RunProcess runs a loop to continually start and watch a process.
//
// Each iteration checks whether the process is still required (returning if
// it is not), generates the command line from the active configuration, starts
// the subprocess, forwards its stdout and stderr to the logger line by line,
// and waits for it to exit. The recorded process ID is set to the PID while
// the subprocess runs and back to -1 once it has exited.
//
// The loop sleeps for errorBackoffSeconds before retrying when the arguments
// cannot be generated, when the subprocess cannot be started, or when the
// subprocess ran for less than errorBackoffSeconds.
func (monitor *Monitor) RunProcess(processNumber int) {
	pid := 0
	logger := monitor.Logger.WithValues("processNumber", processNumber, "area", "RunProcess")
	logger.Info("Starting run loop")
	for {
		if !monitor.checkProcessRequired(processNumber) {
			return
		}

		// Take a snapshot of the configuration under the mutex, because
		// acceptConfiguration may replace it from another goroutine.
		monitor.Mutex.Lock()
		configuration := monitor.ActiveConfiguration
		monitor.Mutex.Unlock()

		arguments, err := configuration.GenerateArguments(processNumber, monitor.CustomEnvironment)
		if err != nil {
			logger.Error(err, "Error generating arguments for subprocess", "configuration", configuration)
			time.Sleep(errorBackoffSeconds * time.Second)
			// Without arguments there is nothing to start; retry from the top.
			continue
		}
		if len(arguments) == 0 {
			logger.Error(nil, "No arguments generated for subprocess", "configuration", configuration)
			time.Sleep(errorBackoffSeconds * time.Second)
			continue
		}

		cmd := exec.Cmd{
			Path: arguments[0],
			Args: arguments,
		}

		logger.Info("Starting subprocess", "arguments", arguments)

		// A failure to set up a pipe is logged but does not prevent the start;
		// the corresponding stream is simply not forwarded.
		stdout, err := cmd.StdoutPipe()
		if err != nil {
			logger.Error(err, "Error getting stdout from subprocess")
		}

		stderr, err := cmd.StderrPipe()
		if err != nil {
			logger.Error(err, "Error getting stderr from subprocess")
		}

		err = cmd.Start()
		if err != nil {
			logger.Error(err, "Error starting subprocess")
			time.Sleep(errorBackoffSeconds * time.Second)
			continue
		}

		if cmd.Process != nil {
			pid = cmd.Process.Pid
		} else {
			logger.Error(nil, "No Process information available for subprocess")
		}

		startTime := time.Now()
		logger.Info("Subprocess started", "PID", pid)

		monitor.updateProcessID(processNumber, pid)

		// outputReaders tracks the goroutines that forward the subprocess
		// output. cmd.Wait closes the pipes, so it must only be called after
		// all reads from the pipes have completed; otherwise the last lines
		// written by the subprocess can be lost.
		var outputReaders sync.WaitGroup
		forwardOutput := func(stream io.Reader, streamName string, emit func(line string)) {
			defer outputReaders.Done()
			scanner := bufio.NewScanner(stream)
			for scanner.Scan() {
				emit(scanner.Text())
			}
			if scanErr := scanner.Err(); scanErr != nil {
				logger.Error(scanErr, "Error reading output from subprocess", "stream", streamName, "PID", pid)
				// Keep draining so that the subprocess cannot block on a full
				// pipe. The copy error is ignored on purpose: the stream is
				// already known to be unreadable as lines.
				_, _ = io.Copy(io.Discard, stream)
			}
		}

		if stdout != nil {
			outputReaders.Add(1)
			go forwardOutput(stdout, "stdout", func(line string) {
				logger.Info("Subprocess output", "msg", line, "PID", pid)
			})
		}

		if stderr != nil {
			outputReaders.Add(1)
			go forwardOutput(stderr, "stderr", func(line string) {
				logger.Error(nil, "Subprocess error log", "msg", line, "PID", pid)
			})
		}

		// The readers finish when the subprocess closes its end of the pipes,
		// which happens at the latest when it exits.
		outputReaders.Wait()

		err = cmd.Wait()
		if err != nil {
			logger.Error(err, "Error from subprocess", "PID", pid)
		}
		exitCode := -1
		if cmd.ProcessState != nil {
			exitCode = cmd.ProcessState.ExitCode()
		}

		logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid)

		endTime := time.Now()
		monitor.updateProcessID(processNumber, -1)

		// A subprocess that exits quickly is treated as failing; back off
		// before restarting it.
		processDuration := endTime.Sub(startTime)
		if processDuration.Seconds() < errorBackoffSeconds {
			logger.Info("Backing off from restarting subprocess", "backOffTimeSeconds", errorBackoffSeconds, "lastExecutionDurationSeconds", processDuration)
			time.Sleep(errorBackoffSeconds * time.Second)
		}
	}
}

// checkProcessRequired reports whether the latest configuration still wants
// the process with the given number to be running.
//
// The process is required when the configuration's RunServers flag is true
// (or unset) and the process number does not exceed ProcessCount. When the
// process is no longer required, its ProcessIDs entry is reset to zero and
// false is returned. The check runs while holding the monitor mutex.
func (monitor *Monitor) checkProcessRequired(processNumber int) bool {
	monitor.Mutex.Lock()
	defer monitor.Mutex.Unlock()

	serversEnabled := pointer.BoolDeref(monitor.ActiveConfiguration.RunServers, true)
	if serversEnabled && processNumber <= monitor.ProcessCount {
		return true
	}

	monitor.Logger.WithValues("processNumber", processNumber, "area", "checkProcessRequired").Info("Terminating run loop")
	// Zero marks the slot as having no run loop.
	monitor.ProcessIDs[processNumber] = 0
	return false
}

// updateProcessID stores pid as the process ID for the given process number
// while holding the monitor mutex.
func (monitor *Monitor) updateProcessID(processNumber, pid int) {
	monitor.Mutex.Lock()
	defer monitor.Mutex.Unlock()

	monitor.ProcessIDs[processNumber] = pid
}

// WatchConfiguration consumes file system notifications for the monitor
// configuration file until one of the watcher's channels is closed.
//
// Write and create events reload the configuration. A remove event first
// re-adds the config file to the watcher (panicking if that fails) and then
// reloads the configuration. Errors reported by the watcher are logged.
func (monitor *Monitor) WatchConfiguration(watcher *fsnotify.Watcher) {
	for {
		select {
		case fsEvent, open := <-watcher.Events:
			if !open {
				return
			}
			monitor.Logger.Info("Detected event on monitor conf file", "event", fsEvent)
			switch {
			case fsEvent.Op&(fsnotify.Write|fsnotify.Create) != 0:
				monitor.LoadConfiguration()
			case fsEvent.Op&fsnotify.Remove != 0:
				// The watch has to be registered again after a removal.
				if addErr := watcher.Add(monitor.ConfigFile); addErr != nil {
					panic(addErr)
				}
				monitor.LoadConfiguration()
			}
		case watchErr, open := <-watcher.Errors:
			if !open {
				return
			}
			monitor.Logger.Error(watchErr, "Error watching for file system events")
		}
	}
}

// Run runs the monitor loop.
//
// It loads the initial configuration, watches the config file for changes,
// and blocks until a SIGINT or SIGTERM is received. The first such signal is
// forwarded to every subprocess that currently has a recorded PID, after which
// Run returns. Run panics if the file watcher cannot be created or the config
// file cannot be added to it.
func (monitor *Monitor) Run() {
	done := make(chan bool, 1)
	signals := make(chan os.Signal, 1)
	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)

	go func() {
		latestSignal := <-signals
		monitor.Logger.Info("Received system signal", "signal", latestSignal)

		// Copy the process IDs under the mutex: the run loops update them
		// concurrently, and the mutex should not be held while signaling.
		monitor.Mutex.Lock()
		processIDs := append([]int(nil), monitor.ProcessIDs...)
		monitor.Mutex.Unlock()

		for processNumber, processID := range processIDs {
			// Only positive values are real PIDs; 0 and -1 are markers.
			if processID <= 0 {
				continue
			}
			subprocessLogger := monitor.Logger.WithValues("processNumber", processNumber, "PID", processID)
			process, err := os.FindProcess(processID)
			if err != nil {
				subprocessLogger.Error(err, "Error finding subprocess")
				continue
			}
			subprocessLogger.Info("Sending signal to subprocess", "signal", latestSignal)
			err = process.Signal(latestSignal)
			if err != nil {
				subprocessLogger.Error(err, "Error signaling subprocess")
				continue
			}
		}
		done <- true
	}()

	monitor.LoadConfiguration()
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		panic(err)
	}
	err = watcher.Add(monitor.ConfigFile)
	if err != nil {
		panic(err)
	}

	defer func(watcher *fsnotify.Watcher) {
		err := watcher.Close()
		if err != nil {
			monitor.Logger.Error(err, "could not close watcher")
		}
	}(watcher)
	go func() { monitor.WatchConfiguration(watcher) }()

	<-done
}

// WatchPodTimestamps watches the timestamp feed to reload the configuration.
//
// For every timestamp received from the pod client's TimestampFeed, the
// configuration is reloaded when the timestamp is newer than the Unix time of
// the last accepted configuration. The method returns when the feed is closed.
func (monitor *Monitor) WatchPodTimestamps() {
	for timestamp := range monitor.PodClient.TimestampFeed {
		// Read the last configuration time under the mutex, because
		// acceptConfiguration writes it from other goroutines. The mutex is
		// released before reloading, since the reload acquires it again.
		monitor.Mutex.Lock()
		lastConfigurationUnix := monitor.LastConfigurationTime.Unix()
		monitor.Mutex.Unlock()

		if timestamp > lastConfigurationUnix {
			monitor.LoadConfiguration()
		}
	}
}