Detect offline clusters #2933

Draft · wants to merge 13 commits into main
2 changes: 2 additions & 0 deletions charts/fleet/templates/configmap.yaml
@@ -12,6 +12,8 @@ data:
"apiServerCA": "{{b64enc .Values.apiServerCA}}",
"agentCheckinInterval": "{{.Values.agentCheckinInterval}}",
"agentTLSMode": "{{.Values.agentTLSMode}}",
"clusterMonitorInterval": "{{.Values.clusterMonitorInterval}}",
"clusterMonitorThreshold": "{{.Values.clusterMonitorThreshold}}",
{{ if .Values.garbageCollectionInterval }}
"garbageCollectionInterval": "{{.Values.garbageCollectionInterval}}",
{{ end }}
9 changes: 9 additions & 0 deletions charts/fleet/values.yaml
@@ -23,6 +23,15 @@ agentTLSMode: "system-store"
# A duration string for how often agents should report a heartbeat
agentCheckinInterval: "15m"

# How long must have elapsed since a downstream cluster's Fleet agent last reported its status to the
# management cluster before that cluster is considered offline.
# If this value is shorter than three times the agent check-in interval, three times the check-in interval
# is used instead, to prevent false positives.
clusterMonitorThreshold: "45m"

# How often the cluster monitor checks for offline downstream clusters.
clusterMonitorInterval: "10m"

# The amount of time that agents will wait before they clean up old Helm releases.
# A non-existent value or 0 will result in an interval of 15 minutes.
garbageCollectionInterval: "15m"
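The interplay of the two new settings is easiest to see in code. Below is a minimal sketch of the effective-threshold rule; the effectiveThreshold helper is hypothetical and for illustration only (monitor.go further down implements the same rule with math.Max):

package main

import (
	"fmt"
	"time"
)

// effectiveThreshold mirrors the rule described above: the offline threshold
// is never smaller than three times the agent check-in interval.
func effectiveThreshold(checkin, threshold time.Duration) time.Duration {
	if min := 3 * checkin; threshold < min {
		return min
	}
	return threshold
}

func main() {
	// Chart defaults: max(3*15m, 45m) = 45m.
	fmt.Println(effectiveThreshold(15*time.Minute, 45*time.Minute))
	// A too-short configured threshold is raised to 3*checkin: max(3*15m, 20m) = 45m.
	fmt.Println(effectiveThreshold(15*time.Minute, 20*time.Minute))
}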
8 changes: 8 additions & 0 deletions internal/cmd/agent/deployer/monitor/condition.go
@@ -44,6 +44,14 @@ func (c Cond) IsFalse(obj interface{}) bool {
return getStatus(obj, string(c)) == "False"
}

func (c Cond) Unknown(obj interface{}) {
setStatus(obj, string(c), "Unknown")
}

func (c Cond) IsUnknown(obj interface{}) bool {
return getStatus(obj, string(c)) == "Unknown"
}

func (c Cond) Reason(obj interface{}, reason string) {
cond := findOrCreateCond(obj, string(c))
getFieldValue(cond, "Reason").SetString(reason)
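A usage sketch for the new helpers (hypothetical snippet, not part of this PR; the reflection-based Cond helpers expect an object carrying a Conditions slice, such as a bundle deployment's status):

package example

import (
	"github.com/rancher/fleet/internal/cmd/agent/deployer/monitor"
	"github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

// markReadyUnknown flags a bundle deployment's Ready condition as "Unknown",
// then reads it back with the matching accessor.
func markReadyUnknown(bd *v1alpha1.BundleDeployment) bool {
	ready := monitor.Cond(v1alpha1.BundleDeploymentConditionReady)
	ready.Unknown(&bd.Status)          // sets the Ready condition's status to "Unknown"
	return ready.IsUnknown(&bd.Status) // true after the call above
}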
@@ -236,6 +236,10 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust
apiServerCA = secret.Data[config.APIServerCAKey]
)

if cfg.AgentCheckinInterval.Seconds() == 0 {
return status, fmt.Errorf("agent check-in interval cannot be 0")
}

if apiServerURL == "" {
if len(cfg.APIServerURL) == 0 {
return status, fmt.Errorf("missing apiServerURL in fleet config for cluster auto registration")
@@ -256,6 +256,10 @@ func (h *handler) newAgentBundle(ns string, cluster *fleet.Cluster) (runtime.Obj
agentNamespace = cluster.Spec.AgentNamespace
}

if cfg.AgentCheckinInterval.Seconds() == 0 {
return nil, fmt.Errorf("agent check-in interval cannot be 0")
}

// Notice we only set the agentScope when it's a non-default agentNamespace. This is for backwards compatibility
// for when we didn't have agent scope before
objs := agent.Manifest(
@@ -1,17 +1,37 @@
package manageagent

import (
"strings"
"testing"
"time"

"github.com/golang/mock/gomock"
"github.com/rancher/wrangler/v3/pkg/generic/fake"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

"github.com/rancher/fleet/internal/config"
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

func TestNewAgentBundle(t *testing.T) {
config.Set(&config.Config{AgentCheckinInterval: metav1.Duration{Duration: 0 * time.Second}})

h := handler{systemNamespace: "blah"}
obj, err := h.newAgentBundle("foo", &fleet.Cluster{Spec: fleet.ClusterSpec{AgentNamespace: "bar"}})

if obj != nil {
t.Fatalf("expected obj returned by newAgentBundle to be nil")
}

expectedStr := "interval cannot be 0"
if !strings.Contains(err.Error(), expectedStr) {
t.Fatalf("expected error returned by newAgentBundle to contain %q", expectedStr)
}
}

func TestOnClusterChangeAffinity(t *testing.T) {
ctrl := gomock.NewController(t)
namespaces := fake.NewMockNonNamespacedControllerInterface[*corev1.Namespace, *corev1.NamespaceList](ctrl)
145 changes: 145 additions & 0 deletions internal/cmd/controller/clustermonitor/monitor.go
@@ -0,0 +1,145 @@
package clustermonitor

import (
"context"
"errors"
"math"
"time"

"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/rancher/fleet/internal/cmd/agent/deployer/monitor"
"github.com/rancher/fleet/internal/config"
"github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

const offlineMsg = "cluster is offline"

// Run monitors Fleet cluster resources' agent last seen dates. If a cluster's agent was last seen longer ago than
// a certain threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact
// that the cluster is offline. This prevents those bundle deployments from displaying outdated status information.
//
// A cluster will be considered offline if its Fleet agent has not reported its status for more than:
// - three times the agent check-in interval
// - or any larger configured interval.
// Therefore, this function requires configuration to have been loaded into the config package using `Load` before
// running.
//
// Bundle deployment status updates made here are unlikely to conflict with those made by the bundle deployment
// reconciler, which either runs on an online target cluster (from its Fleet agent) or is triggered by other status
// updates such as this one (e.g. the bundle deployment reconciler living in the Fleet controller).
func Run(ctx context.Context, c client.Client, interval, threshold time.Duration) {
for {
select {
case <-ctx.Done():
return
case <-time.After(interval):
}

cfg := config.Get() // This enables config changes to take effect

thresholdSecs := math.Max(cfg.AgentCheckinInterval.Seconds()*3, threshold.Seconds())

UpdateOfflineBundleDeployments(ctx, c, time.Second*time.Duration(thresholdSecs))
}
}

// UpdateOfflineBundleDeployments looks for offline clusters based on the provided threshold duration. For each cluster
// considered offline, this updates its bundle deployments' statuses accordingly.
// If a cluster's bundle deployments have already been marked as offline, they will be skipped.
func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, threshold time.Duration) {
logger := ctrl.Log.WithName("cluster status monitor")

clusters := &v1alpha1.ClusterList{}
if err := c.List(ctx, clusters); err != nil {
logger.Error(err, "Failed to get list of clusters")
return
}

for _, cluster := range clusters.Items {
lastSeen := cluster.Status.Agent.LastSeen

logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String())

// lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle
// deployments should not be deployed there.
if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold {
continue
}

logger.Info("Detected offline cluster", "cluster", cluster.Name)

// Cluster is offline
bundleDeployments := &v1alpha1.BundleDeploymentList{}
if err := c.List(ctx, bundleDeployments, client.InNamespace(cluster.Status.Namespace)); err != nil {
logger.Error(
err,
"Failed to get list of bundle deployments for offline cluster",
"cluster",
cluster.Name,
"namespace",
cluster.Status.Namespace,
)
continue
}

bdUpdate:
for _, bd := range bundleDeployments.Items {
for _, cond := range bd.Status.Conditions {
switch cond.Type {
case "Ready", "Monitored":
if cond.Message == offlineMsg {
// This bundle deployment is already marked as offline; skip it
// and move on to the next one.
continue bdUpdate
}
}
}

logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name)
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
t := &v1alpha1.BundleDeployment{}
nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace}
if err := c.Get(ctx, nsn, t); err != nil {
return err
}
t.Status = bd.Status
// Any information about resources living in an offline cluster is likely to be
// outdated.
t.Status.ModifiedStatus = nil
t.Status.NonReadyStatus = nil

for _, cond := range bd.Status.Conditions {
switch cond.Type {
case "Ready":
mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady)
mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg))
mc.Unknown(&t.Status)
// XXX: do we want to set Deployed and Installed conditions as well?
case "Monitored":
mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored)
mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg))

}
}

return c.Status().Update(ctx, t)
})
if err != nil {
logger.Error(
err,
"Failed to update bundle deployment status for offline cluster",
"bundledeployment",
bd.Name,
"cluster",
cluster.Name,
"namespace",
cluster.Status.Namespace,
)
}
}
}
}
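For context, here is a sketch of how a caller might wire up the monitor at controller startup (hypothetical; the actual startup wiring is not part of this diff). Run blocks until the context is cancelled, so it gets its own goroutine:

package example

import (
	"context"
	"time"

	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/rancher/fleet/internal/cmd/controller/clustermonitor"
)

// startClusterMonitor launches the offline-cluster monitor in the background.
// interval and threshold would come from the clusterMonitorInterval and
// clusterMonitorThreshold chart values shown above.
func startClusterMonitor(ctx context.Context, c client.Client, interval, threshold time.Duration) {
	go clustermonitor.Run(ctx, c, interval, threshold)
}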