Skip to content

Commit

Permalink
feat: auto approve CSR on GKE (#139)
Browse files Browse the repository at this point in the history
* try to approve csr

* tests

* testing

* feat: auto approve CAST AI nodes csr using watch

* lint

* lint

* lint

* lint

* lint

* add detection if running on GKE

* lint

* add log

* add log

* change how svc is run

* handle not found error

* handle already approved

* lint
  • Loading branch information
aldor007 authored Oct 1, 2024
1 parent 8fa47ed commit 5b9dfcc
Show file tree
Hide file tree
Showing 8 changed files with 339 additions and 378 deletions.
78 changes: 0 additions & 78 deletions actions/approve_csr_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"errors"
"fmt"
"reflect"
"sync"
"time"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -35,8 +34,6 @@ type approveCSRHandler struct {
clientset kubernetes.Interface
initialCSRFetchTimeout time.Duration
csrFetchInterval time.Duration
cancelAutoApprove context.CancelFunc
m sync.Mutex // Used to make sure there is just one watcher running as it may be triggered from multiple CSR actions.
}

func (h *approveCSRHandler) Handle(ctx context.Context, action *castai.ClusterAction) error {
Expand All @@ -51,15 +48,7 @@ func (h *approveCSRHandler) Handle(ctx context.Context, action *castai.ClusterAc
actionIDLogField: action.ID,
})

// If AllowAutoApprove is enabled, the CSR watcher will be triggered to handle Certificate Signing Requests (CSRs)
// for nodes that are older than 24 hours and managed by CastAI
if req.AllowAutoApprove != nil {
if *req.AllowAutoApprove {
go h.RunAutoApproveForCastAINodes(ctx)
} else {
h.StopAutoApproveForCastAINodes()
}

// CSR action may be used only to instruct whether to start / stop watcher responsible for auto-approving; in
// this case, there is nothing more to do.
if req.NodeName == "" {
Expand Down Expand Up @@ -171,73 +160,6 @@ func (h *approveCSRHandler) getInitialNodeCSR(ctx context.Context, log logrus.Fi
return cert, err
}

func (h *approveCSRHandler) RunAutoApproveForCastAINodes(ctx context.Context) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

if !h.startAutoApprove(cancel) {
return // already running
}
defer h.StopAutoApproveForCastAINodes()

log := h.log.WithField("RunAutoApprove", "auto-approve-csr")
c := make(chan *csr.Certificate, 1)
go csr.WatchCastAINodeCSRs(ctx, log, h.clientset, c)

for {
select {
case <-ctx.Done():
log.WithError(ctx.Err()).Errorf("auto approve csr finished")
return
case cert := <-c:
if cert == nil {
continue
}
go func(cert *csr.Certificate) {
log := log.WithField("node_name", cert.Name)
log.Info("auto approving csr")
err := h.handleWithRetry(ctx, log, cert)
if err != nil {
log.WithError(err).Errorf("failed to approve csr: %+v", cert)
}
}(cert)
}
}
}

func (h *approveCSRHandler) startAutoApprove(cancelFunc context.CancelFunc) bool {
h.m.Lock()
defer h.m.Unlock()
if h.cancelAutoApprove != nil {
return false
}

h.log.Info("starting auto approve CSRs for managed by Cast AI nodes")
h.cancelAutoApprove = cancelFunc

return true
}

func (h *approveCSRHandler) StopAutoApproveForCastAINodes() {
h.m.Lock()
defer h.m.Unlock()

if h.cancelAutoApprove == nil {
return
}

h.log.Info("stopping auto approve CSRs for managed by Cast AI nodes")
h.cancelAutoApprove()
h.cancelAutoApprove = nil
}

func (h *approveCSRHandler) getCancelAutoApprove() context.CancelFunc {
h.m.Lock()
defer h.m.Unlock()

return h.cancelAutoApprove
}

func newApproveCSRExponentialBackoff() wait.Backoff {
b := waitext.DefaultExponentialBackoff()
b.Factor = 2
Expand Down
171 changes: 0 additions & 171 deletions actions/approve_csr_handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,177 +224,6 @@ AiAHVYZXHxxspoV0hcfn2Pdsl89fIPCOFy/K1PqSUR6QNAIgYdt51ZbQt9rgM2BD
err := h.Handle(ctx, actionApproveCSR)
r.EqualError(err, "getting initial csr: context deadline exceeded")
})

t.Run("enable-->disable auto-approve", func(t *testing.T) {
r := require.New(t)

client := fake.NewSimpleClientset()

boolTrue := true
boolFalse := false
actionRunAutoApprove := &castai.ClusterAction{
ActionApproveCSR: &castai.ActionApproveCSR{AllowAutoApprove: &boolTrue},
CreatedAt: time.Time{},
}
actionStopAutoApprove := &castai.ClusterAction{
ActionApproveCSR: &castai.ActionApproveCSR{AllowAutoApprove: &boolFalse},
CreatedAt: time.Time{},
}
h := &approveCSRHandler{
log: log,
clientset: client,
csrFetchInterval: 100 * time.Millisecond,
initialCSRFetchTimeout: 1000 * time.Millisecond,
}

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
err := h.Handle(ctx, actionRunAutoApprove)
time.Sleep(time.Second)
r.NoError(err)
r.NotNil(h.getCancelAutoApprove())
err = h.Handle(ctx, actionStopAutoApprove)
time.Sleep(time.Second)
r.NoError(err)
r.Nil(h.getCancelAutoApprove())
})

t.Run("enable auto-approve + approve", func(t *testing.T) {
r := require.New(t)

csrRes := getCSR()
csr2 := csrRes.DeepCopy()
csr2.Name = "node-csr-456"
ch := make(chan struct{})
defer close(ch)

client := fake.NewSimpleClientset(csrRes, csr2)
client.PrependReactor("update", "certificatesigningrequests", func(action ktest.Action) (handled bool, ret runtime.Object, err error) {
approved := csrRes.DeepCopy()
approved.Status.Conditions = []certv1.CertificateSigningRequestCondition{
{
Type: certv1.CertificateApproved,
Reason: csr.ReasonApproved,
Message: "approved",
LastUpdateTime: metav1.Now(),
Status: v1.ConditionTrue,
},
}
ch <- struct{}{}
return true, approved, nil
})
client.PrependReactor("get", "nodes", func(action ktest.Action) (handled bool, ret runtime.Object, err error) {
return true, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
CreationTimestamp: metav1.NewTime(time.Now().Add(-time.Hour * 25)),
Labels: map[string]string{
castai.LabelManagedBy: castai.LabelValueManagedByCASTAI,
},
},
}, nil
})

watcher := watch.NewFake()
defer watcher.Stop()
var count int // check retry on error
client.PrependWatchReactor("certificatesigningrequests",
func(action ktest.Action) (handled bool, ret watch.Interface, err error) {
if count == 5 {
return true, watcher, nil
} else {
count++
return true, nil, fmt.Errorf("ups error")
}
})

boolTrue := true
actionRunAutoApprove := &castai.ClusterAction{
ActionApproveCSR: &castai.ActionApproveCSR{AllowAutoApprove: &boolTrue},
CreatedAt: time.Time{},
}

h := &approveCSRHandler{
log: log,
clientset: client,
csrFetchInterval: 100 * time.Millisecond,
initialCSRFetchTimeout: 1000 * time.Millisecond,
}

ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()

for i := 0; i < 5; i++ {
go func() {
err := h.Handle(ctx, actionRunAutoApprove)
time.Sleep(time.Millisecond)
r.NoError(err)
}()
}

time.Sleep(time.Second)
r.NotNil(h.getCancelAutoApprove())
go watcher.Add(csrRes)
go watcher.Add(csr2)
for i := 0; i < 2; i++ {
select {
case <-ch:
case <-ctx.Done():
r.Fail("timeout waiting for auto-approve")
}
}
})
t.Run("enable auto-approve + skip approve", func(t *testing.T) {
r := require.New(t)

csrRes := getCSR()
client := fake.NewSimpleClientset(csrRes)
ch := make(chan struct{})

client.PrependReactor("get", "nodes", func(action ktest.Action) (handled bool, ret runtime.Object, err error) {
close(ch)
return true, &v1.Node{
ObjectMeta: metav1.ObjectMeta{
CreationTimestamp: metav1.NewTime(time.Now().Add(-time.Hour * 25)),
Labels: map[string]string{
castai.LabelManagedBy: castai.LabelValueManagedByCASTAI,
},
},
}, nil
})

watcher := watch.NewFake()
defer watcher.Stop()

client.PrependWatchReactor("certificatesigningrequests", ktest.DefaultWatchReactor(watcher, nil))

boolTrue := true
actionRunAutoApprove := &castai.ClusterAction{
ActionApproveCSR: &castai.ActionApproveCSR{AllowAutoApprove: &boolTrue},
CreatedAt: time.Time{},
}

h := &approveCSRHandler{
log: log,
clientset: client,
csrFetchInterval: 100 * time.Millisecond,
initialCSRFetchTimeout: 1000 * time.Millisecond,
}

ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
defer cancel()

err := h.Handle(ctx, actionRunAutoApprove)
time.Sleep(time.Millisecond)
r.NoError(err)
r.NotNil(h.getCancelAutoApprove())
watcher.Add(csrRes)

select {
case <-ch:
case <-ctx.Done():
r.Fail("timeout waiting for auto-approve")
}
})
}

func TestApproveCSRExponentialBackoff(t *testing.T) {
Expand Down
3 changes: 2 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ type Config struct {
LeaderElection LeaderElection
PodName string
NodeName string
AutoApproveCSR bool
}

type Log struct {
Expand Down Expand Up @@ -73,9 +74,9 @@ func Get() Config {
_ = viper.BindEnv("leaderelection.lockname", "LEADER_ELECTION_LOCK_NAME")
_ = viper.BindEnv("leaderelection.leaseduration", "LEADER_ELECTION_LEASE_DURATION")
_ = viper.BindEnv("leaderelection.leaserenewdeadline", "LEADER_ELECTION_LEASE_RENEW_DEADLINE")
_ = viper.BindEnv("aksinitdata", "AKS_INIT_DATA")
_ = viper.BindEnv("nodename", "KUBERNETES_NODE_NAME")
_ = viper.BindEnv("podname", "KUBERNETES_POD")
_ = viper.BindEnv("autoapprovecsr", "AUTO_APPROVE_CSR")

cfg = &Config{}
if err := viper.Unmarshal(&cfg); err != nil {
Expand Down
Loading

0 comments on commit 5b9dfcc

Please sign in to comment.