Detect offline clusters #2933

Draft · wants to merge 13 commits into main
2 changes: 2 additions & 0 deletions charts/fleet/templates/configmap.yaml
@@ -12,6 +12,8 @@ data:
"apiServerCA": "{{b64enc .Values.apiServerCA}}",
"agentCheckinInterval": "{{.Values.agentCheckinInterval}}",
"agentTLSMode": "{{.Values.agentTLSMode}}",
"clusterMonitorInterval": "{{.Values.clusterMonitorInterval}}",
"clusterMonitorThreshold": "{{.Values.clusterMonitorThreshold}}",
{{ if .Values.garbageCollectionInterval }}
"garbageCollectionInterval": "{{.Values.garbageCollectionInterval}}",
{{ end }}
9 changes: 9 additions & 0 deletions charts/fleet/values.yaml
@@ -23,6 +23,15 @@ agentTLSMode: "system-store"
# A duration string for how often agents should report a heartbeat
agentCheckinInterval: "15m"

# How long must have elapsed since a downstream cluster's Fleet agent last reported its status to the
# management cluster before that cluster is considered offline.
# If this value is shorter than three times the agent check-in interval, three times the check-in interval
# is used instead, to prevent false positives.
clusterMonitorThreshold: "45m"

# How often the cluster monitor checks for offline downstream clusters.
clusterMonitorInterval: "10m"

# The amount of time that agents will wait before they clean up old Helm releases.
# A non-existent value or 0 will result in an interval of 15 minutes.
garbageCollectionInterval: "15m"
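The interplay of the two new settings is easiest to see in code. Below is a minimal sketch of the effective-threshold rule; the effectiveThreshold helper is hypothetical and for illustration only (monitor.go further down implements the same rule with math.Max):

package main

import (
	"fmt"
	"time"
)

// effectiveThreshold mirrors the rule described above: the offline threshold
// is never smaller than three times the agent check-in interval.
func effectiveThreshold(checkin, threshold time.Duration) time.Duration {
	if min := 3 * checkin; threshold < min {
		return min
	}
	return threshold
}

func main() {
	// Chart defaults: max(3*15m, 45m) = 45m.
	fmt.Println(effectiveThreshold(15*time.Minute, 45*time.Minute))
	// A too-short configured threshold is raised to 3*checkin: max(3*15m, 20m) = 45m.
	fmt.Println(effectiveThreshold(15*time.Minute, 20*time.Minute))
}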
8 changes: 8 additions & 0 deletions internal/cmd/agent/deployer/monitor/condition.go
@@ -44,6 +44,14 @@ func (c Cond) IsFalse(obj interface{}) bool {
return getStatus(obj, string(c)) == "False"
}

func (c Cond) Unknown(obj interface{}) {
setStatus(obj, string(c), "Unknown")
}

func (c Cond) IsUnknown(obj interface{}) bool {
return getStatus(obj, string(c)) == "Unknown"
}

func (c Cond) Reason(obj interface{}, reason string) {
cond := findOrCreateCond(obj, string(c))
getFieldValue(cond, "Reason").SetString(reason)
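A usage sketch for the new helpers (hypothetical snippet, not part of this PR; the reflection-based Cond helpers expect an object carrying a Conditions slice, such as a bundle deployment's status):

package example

import (
	"github.com/rancher/fleet/internal/cmd/agent/deployer/monitor"
	"github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

// markReadyUnknown flags a bundle deployment's Ready condition as "Unknown",
// then reads it back with the matching accessor.
func markReadyUnknown(bd *v1alpha1.BundleDeployment) bool {
	ready := monitor.Cond(v1alpha1.BundleDeploymentConditionReady)
	ready.Unknown(&bd.Status)          // sets the Ready condition's status to "Unknown"
	return ready.IsUnknown(&bd.Status) // true after the call above
}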
@@ -236,6 +236,10 @@ func (i *importHandler) importCluster(cluster *fleet.Cluster, status fleet.Clust
apiServerCA = secret.Data[config.APIServerCAKey]
)

if cfg.AgentCheckinInterval.Seconds() == 0 {
return status, fmt.Errorf("agent check-in interval cannot be 0")
}

if apiServerURL == "" {
if len(cfg.APIServerURL) == 0 {
return status, fmt.Errorf("missing apiServerURL in fleet config for cluster auto registration")
@@ -256,6 +256,10 @@ func (h *handler) newAgentBundle(ns string, cluster *fleet.Cluster) (runtime.Obj
agentNamespace = cluster.Spec.AgentNamespace
}

if cfg.AgentCheckinInterval.Seconds() == 0 {
return nil, fmt.Errorf("agent check-in interval cannot be 0")
}

// Notice we only set the agentScope when it's a non-default agentNamespace. This is for backwards compatibility
// for when we didn't have agent scope before
objs := agent.Manifest(
@@ -1,17 +1,37 @@
package manageagent

import (
"strings"
"testing"
"time"

"github.com/golang/mock/gomock"
"github.com/rancher/wrangler/v3/pkg/generic/fake"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

"github.com/rancher/fleet/internal/config"
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

func TestNewAgentBundle(t *testing.T) {
config.Set(&config.Config{AgentCheckinInterval: metav1.Duration{Duration: 0 * time.Second}})

h := handler{systemNamespace: "blah"}
obj, err := h.newAgentBundle("foo", &fleet.Cluster{Spec: fleet.ClusterSpec{AgentNamespace: "bar"}})

if obj != nil {
t.Fatalf("expected obj returned by newAgentBundle to be nil")
}

expectedStr := "interval cannot be 0"
if !strings.Contains(err.Error(), expectedStr) {
t.Fatalf("expected error returned by newAgentBundle to contain %q", expectedStr)
}
}

func TestOnClusterChangeAffinity(t *testing.T) {
ctrl := gomock.NewController(t)
namespaces := fake.NewMockNonNamespacedControllerInterface[*corev1.Namespace, *corev1.NamespaceList](ctrl)
145 changes: 145 additions & 0 deletions internal/cmd/controller/clustermonitor/monitor.go
@@ -0,0 +1,145 @@
package clustermonitor

import (
"context"
"errors"
"math"
"time"

"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/util/retry"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/rancher/fleet/internal/cmd/agent/deployer/monitor"
"github.com/rancher/fleet/internal/config"
"github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

const offlineMsg = "cluster is offline"

// Run monitors Fleet cluster resources' agent last seen dates. If a cluster's agent was last seen longer ago than
// a certain threshold, then Run updates statuses of all bundle deployments targeting that cluster, to reflect the fact
// that the cluster is offline. This prevents those bundle deployments from displaying outdated status information.
//
// A cluster will be considered offline if its Fleet agent has not reported its status for more than:
// - three times the agent check-in interval
// - or any larger configured interval.
// Therefore, this function requires configuration to have been loaded into the config package using `Load` before
// running.
//
// Bundle deployment status updates made here are unlikely to conflict with those made by the bundle deployment
// reconciler, which either runs on an online target cluster (from its Fleet agent) or is triggered by other status
// updates such as this one (e.g. the bundle deployment reconciler living in the Fleet controller).
func Run(ctx context.Context, c client.Client, interval, threshold time.Duration) {
for {
select {
case <-ctx.Done():
return
case <-time.After(interval):
}

cfg := config.Get() // This enables config changes to take effect

thresholdSecs := math.Max(cfg.AgentCheckinInterval.Seconds()*3, threshold.Seconds())

UpdateOfflineBundleDeployments(ctx, c, time.Second*time.Duration(thresholdSecs))
}
}

// UpdateOfflineBundleDeployments looks for offline clusters based on the provided threshold duration. For each cluster
// considered offline, this updates its bundle deployments' statuses accordingly.
// If a cluster's bundle deployments have already been marked as offline, they will be skipped.
func UpdateOfflineBundleDeployments(ctx context.Context, c client.Client, threshold time.Duration) {
logger := ctrl.Log.WithName("cluster status monitor")

clusters := &v1alpha1.ClusterList{}
if err := c.List(ctx, clusters); err != nil {
logger.Error(err, "Failed to get list of clusters")
return
}

for _, cluster := range clusters.Items {
lastSeen := cluster.Status.Agent.LastSeen

logger.Info("Checking cluster status", "cluster", cluster.Name, "last seen", lastSeen.UTC().String())

// lastSeen being 0 would typically mean that the cluster is not registered yet, in which case bundle
// deployments should not be deployed there.
if lastSeen.IsZero() || time.Now().UTC().Sub(lastSeen.UTC()) < threshold {
continue
}

logger.Info("Detected offline cluster", "cluster", cluster.Name)

// Cluster is offline
bundleDeployments := &v1alpha1.BundleDeploymentList{}
if err := c.List(ctx, bundleDeployments, client.InNamespace(cluster.Status.Namespace)); err != nil {
logger.Error(
err,
"Failed to get list of bundle deployments for offline cluster",
"cluster",
cluster.Name,
"namespace",
cluster.Status.Namespace,
)
continue
}

bdUpdate:
for _, bd := range bundleDeployments.Items {
for _, cond := range bd.Status.Conditions {
switch cond.Type {
case "Ready", "Monitored":
if cond.Message == offlineMsg {
// This bundle deployment is already marked as offline; skip it
// and move on to the next one.
continue bdUpdate
}
}
}

logger.Info("Updating bundle deployment in offline cluster", "cluster", cluster.Name, "bundledeployment", bd.Name)
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
t := &v1alpha1.BundleDeployment{}
nsn := types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace}
if err := c.Get(ctx, nsn, t); err != nil {
return err
}
t.Status = bd.Status
// Any information about resources living in an offline cluster is likely to be
// outdated.
t.Status.ModifiedStatus = nil
t.Status.NonReadyStatus = nil

for _, cond := range bd.Status.Conditions {
switch cond.Type {
case "Ready":
mc := monitor.Cond(v1alpha1.BundleDeploymentConditionReady)
mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg))
mc.Unknown(&t.Status)
// XXX: do we want to set Deployed and Installed conditions as well?
case "Monitored":
mc := monitor.Cond(v1alpha1.BundleDeploymentConditionMonitored)
mc.SetError(&t.Status, "Cluster offline", errors.New(offlineMsg))

}
}

return c.Status().Update(ctx, t)
})
if err != nil {
logger.Error(
err,
"Failed to update bundle deployment status for offline cluster",
"bundledeployment",
bd.Name,
"cluster",
cluster.Name,
"namespace",
cluster.Status.Namespace,
)
}
}
}
}
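For context, here is a sketch of how a caller might wire up the monitor at controller startup (hypothetical; the actual startup wiring is not part of this diff). Run blocks until the context is cancelled, so it gets its own goroutine:

package example

import (
	"context"
	"time"

	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/rancher/fleet/internal/cmd/controller/clustermonitor"
)

// startClusterMonitor launches the offline-cluster monitor in the background.
// interval and threshold would come from the clusterMonitorInterval and
// clusterMonitorThreshold chart values shown above.
func startClusterMonitor(ctx context.Context, c client.Client, interval, threshold time.Duration) {
	go clustermonitor.Run(ctx, c, interval, threshold)
}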