Skip to content

Commit 9463a86

Browse files
committed
daemon: Avoid blocking during hive start
Execute startDaemon() in a goroutine, which resolves daemon promise when done. Change dependent start hooks to await for daemon promise in goroutines so that the execution of start hooks is not stalled. This way the hive may run concurrently with daemon start. Move some early validation from startDaemon() to the start hook so that we can fail out while we can simply return the error from the start hook. Move saving of daemon config to file also away from startDaemon, so that we can resolve the DaemonConfig promise from the start hook itself. The daemon promise will be resolved from the goroutine that runs startDaemon(), or rejected if there was any error. Signed-off-by: Jarno Rajahalme <jarno@isovalent.com>
1 parent d56b5d1 commit 9463a86

File tree

3 files changed

+95
-47
lines changed

3 files changed

+95
-47
lines changed

daemon/cmd/daemon_main.go

Lines changed: 63 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,7 +1706,15 @@ func newDaemonPromise(params daemonParams) (promise.Promise[*Daemon], promise.Pr
17061706
var wg sync.WaitGroup
17071707

17081708
params.Lifecycle.Append(cell.Hook{
1709-
OnStart: func(cell.HookContext) error {
1709+
OnStart: func(cell.HookContext) (err error) {
1710+
defer func() {
1711+
// Reject promises on error
1712+
if err != nil {
1713+
cfgResolver.Reject(err)
1714+
daemonResolver.Reject(err)
1715+
}
1716+
}()
1717+
17101718
d, restoredEndpoints, err := newDaemon(daemonCtx, cleaner, &params)
17111719
if err != nil {
17121720
cancelDaemonCtx()
@@ -1716,16 +1724,45 @@ func newDaemonPromise(params daemonParams) (promise.Promise[*Daemon], promise.Pr
17161724
daemon = d
17171725

17181726
if !option.Config.DryMode {
1719-
if err := startDaemon(daemon, restoredEndpoints, cleaner, params); err != nil {
1720-
daemonResolver.Reject(err)
1721-
cancelDaemonCtx()
1722-
cleaner.Clean()
1723-
cfgResolver.Reject(err)
1724-
return err
1727+
log.Info("Initializing daemon")
1728+
1729+
// This validation needs to be done outside of the agent until
1730+
// datapath.NodeAddressing is used consistently across the code base.
1731+
log.Info("Validating configured node address ranges")
1732+
if err := node.ValidatePostInit(); err != nil {
1733+
return fmt.Errorf("postinit failed: %w", err)
1734+
}
1735+
1736+
// Store config in file before resolving the DaemonConfig promise.
1737+
err = option.Config.StoreInFile(option.Config.StateDir)
1738+
if err != nil {
1739+
log.WithError(err).Error("Unable to store Cilium's configuration")
1740+
}
1741+
1742+
err = option.StoreViperInFile(option.Config.StateDir)
1743+
if err != nil {
1744+
log.WithError(err).Error("Unable to store Viper's configuration")
17251745
}
17261746
}
1727-
daemonResolver.Resolve(daemon)
1747+
1748+
// 'option.Config' is assumed to be stable at this point, execpt for
1749+
// 'option.Config.Opts' that are explicitly deemed to be runtime-changeable
17281750
cfgResolver.Resolve(option.Config)
1751+
1752+
if option.Config.DryMode {
1753+
daemonResolver.Resolve(daemon)
1754+
} else {
1755+
wg.Add(1)
1756+
go func() {
1757+
defer wg.Done()
1758+
if err := startDaemon(daemon, restoredEndpoints, cleaner, params); err != nil {
1759+
log.WithError(err).Error("Daemon start failed")
1760+
daemonResolver.Reject(err)
1761+
} else {
1762+
daemonResolver.Resolve(daemon)
1763+
}
1764+
}()
1765+
}
17291766
return nil
17301767
},
17311768
OnStop: func(cell.HookContext) error {
@@ -1739,16 +1776,9 @@ func newDaemonPromise(params daemonParams) (promise.Promise[*Daemon], promise.Pr
17391776
}
17401777

17411778
// startDaemon starts the old unmodular part of the cilium-agent.
1779+
// option.Config has already been exposed via *option.DaemonConfig promise,
1780+
// so it may not be modified here
17421781
func startDaemon(d *Daemon, restoredEndpoints *endpointRestoreState, cleaner *daemonCleanup, params daemonParams) error {
1743-
log.Info("Initializing daemon")
1744-
1745-
// This validation needs to be done outside of the agent until
1746-
// datapath.NodeAddressing is used consistently across the code base.
1747-
log.Info("Validating configured node address ranges")
1748-
if err := node.ValidatePostInit(); err != nil {
1749-
return fmt.Errorf("postinit failed: %w", err)
1750-
}
1751-
17521782
bootstrapStats.k8sInit.Start()
17531783
if params.Clientset.IsEnabled() {
17541784
// Wait only for certain caches, but not all!
@@ -1911,28 +1941,28 @@ func startDaemon(d *Daemon, restoredEndpoints *endpointRestoreState, cleaner *da
19111941
bootstrapStats.updateMetrics()
19121942
go d.launchHubble()
19131943

1914-
err = option.Config.StoreInFile(option.Config.StateDir)
1915-
if err != nil {
1916-
log.WithError(err).Error("Unable to store Cilium's configuration")
1917-
}
1918-
1919-
err = option.StoreViperInFile(option.Config.StateDir)
1920-
if err != nil {
1921-
log.WithError(err).Error("Unable to store Viper's configuration")
1922-
}
1923-
19241944
return nil
19251945
}
19261946

19271947
func registerEndpointStateResolver(lc cell.Lifecycle, daemonPromise promise.Promise[*Daemon], resolver promise.Resolver[endpointstate.Restorer]) {
1948+
var wg sync.WaitGroup
1949+
19281950
lc.Append(cell.Hook{
19291951
OnStart: func(ctx cell.HookContext) error {
1930-
daemon, err := daemonPromise.Await(context.Background())
1931-
if err != nil {
1932-
resolver.Reject(err)
1933-
return err
1934-
}
1935-
resolver.Resolve(daemon)
1952+
wg.Add(1)
1953+
go func() {
1954+
defer wg.Done()
1955+
daemon, err := daemonPromise.Await(context.Background())
1956+
if err != nil {
1957+
resolver.Reject(err)
1958+
} else {
1959+
resolver.Resolve(daemon)
1960+
}
1961+
}()
1962+
return nil
1963+
},
1964+
OnStop: func(ctx cell.HookContext) error {
1965+
wg.Wait()
19361966
return nil
19371967
},
19381968
})

daemon/cmd/deletion_queue.go

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"context"
88
"os"
99
"path/filepath"
10+
"sync"
1011

1112
"github.com/cilium/hive/cell"
1213

@@ -26,26 +27,39 @@ var deletionQueueCell = cell.Group(
2627
type deletionQueue struct {
2728
lf *lockfile.Lockfile
2829
daemonPromise promise.Promise[*Daemon]
30+
wg sync.WaitGroup
2931
}
3032

31-
func (dq *deletionQueue) Start(ctx cell.HookContext) error {
32-
d, err := dq.daemonPromise.Await(ctx)
33-
if err != nil {
34-
return err
35-
}
33+
func (dq *deletionQueue) Start(cell.HookContext) error {
34+
dq.wg.Add(1)
35+
go func() {
36+
defer dq.wg.Done()
3637

37-
if err := dq.lock(ctx); err != nil {
38-
return err
39-
}
38+
// hook context cancels when the start hooks have run, use context.Background()
39+
// as we may be running after that.
40+
d, err := dq.daemonPromise.Await(context.Background())
41+
if err != nil {
42+
log.WithError(err).Error("deletionQueue: Daemon promise failed")
43+
return
44+
}
4045

41-
bootstrapStats.deleteQueue.Start()
42-
err = dq.processQueuedDeletes(d, ctx)
43-
bootstrapStats.deleteQueue.EndError(err)
44-
return err
46+
if err := dq.lock(d.ctx); err != nil {
47+
log.WithError(err).Error("deletionQueue: lock failed")
48+
return
49+
}
4550

51+
bootstrapStats.deleteQueue.Start()
52+
err = dq.processQueuedDeletes(d, d.ctx)
53+
bootstrapStats.deleteQueue.EndError(err)
54+
if err != nil {
55+
log.WithError(err).Error("deletionQueue: processQueuedDeletes failed")
56+
}
57+
}()
58+
return nil
4659
}
4760

48-
func (dq *deletionQueue) Stop(ctx cell.HookContext) error {
61+
func (dq *deletionQueue) Stop(cell.HookContext) error {
62+
dq.wg.Wait()
4963
return nil
5064
}
5165

pkg/maps/nat/cell.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,15 @@ var Cell = cell.Module(
4949
ipv4Nat, ipv6Nat = GlobalMaps(cfg.EnableIPv4,
5050
cfg.EnableIPv6, true)
5151

52-
// Maps are still created in daemon.initMaps(...) under the same circumstances
52+
// Maps are still created before DaemonConfig promise is resolved in
53+
// daemon.initMaps(...) under the same circumstances
5354
// so we just open them here so they can be provided to hive.
5455
//
5556
// TODO: Refactor ctmap gc Enable() such that it can use the map descriptors from
5657
// here so we can move all nat map creation logic into here.
58+
// NOTE: This code runs concurrently with startDaemon(), so if any dependency to
59+
// daemon having finished endpoint restore, for example, is added, we should
60+
// await for an appropriate promise.
5761
if cfg.EnableIPv4 {
5862
if err := ipv4Nat.Open(); err != nil {
5963
return fmt.Errorf("open IPv4 nat map: %w", err)

0 commit comments

Comments
 (0)