From 380f5313421a68f34c1dc90a4c4ff7b801952cd3 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 8 Apr 2026 10:31:54 +0000 Subject: [PATCH] state: trigger PolicyChange on every Connect and Disconnect Connect and Disconnect previously only appended a PolicyChange when the affected node was a subnet router (routeChange) or the database persist returned a full change. For every other node the peers just received a small PeerChangedPatch{Online: ...} and no filter rules were recomputed. That was too narrow: a node going offline or coming online can affect policy compilation in ways beyond subnet routes. TestGrantCapRelay Phase 4 exposed this. When the cap/relay target node went down with `tailscale down`, headscale only sent an Online=false patch, peers never got a recomputed netmap, and their cached PeerRelay allocation stayed populated until the 120s assertion timeout. With a PolicyChange queued on Disconnect, peers immediately receive a full netmap on relay loss and clear PeerRelay as expected; the symmetric change on Connect lets Phase 5 re-publish the policy when the relay comes back. Drop the now-unused routeChange return from the Disconnect gate. Updates #2180 --- hscontrol/state/state.go | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go index 2bc828ad..723a35d3 100644 --- a/hscontrol/state/state.go +++ b/hscontrol/state/state.go @@ -546,6 +546,13 @@ func (s *State) Connect(id types.NodeID) ([]change.Change, uint64) { c = append(c, change.NodeAdded(id)) } + // Mirror Disconnect: a node coming online may (re)enable cap/relay + // grants targeting it, reintroduce identity-based aliases that + // resolve to its tags/IPs, and so on. Always trigger a PolicyChange + // so peers can recompute their netmap and pick up any policy + // elements that depend on this node being present. + c = append(c, change.PolicyChange()) + return c, gen } @@ -619,15 +626,18 @@ func (s *State) Disconnect(id types.NodeID, gen uint64) ([]change.Change, error) // The node is disconnecting so make sure that none of the routes it // announced are served to any nodes. - routeChange := s.primaryRoutes.SetRoutes(id) + s.primaryRoutes.SetRoutes(id) - cs := []change.Change{change.NodeOfflineFor(node), c} - - // If we have a policy change or route change, return that as it's more comprehensive - // Otherwise, return the NodeOffline change to ensure nodes are notified - if c.IsFull() || routeChange { - cs = append(cs, change.PolicyChange()) - } + // A node going offline can affect policy compilation in ways beyond + // subnet routes: cap/relay grants targeting this node, identity-based + // aliases (tags, groups, users) that reference its tags/IPs, via + // routes steered through it, and so on. Always trigger a PolicyChange + // so peers receive a recomputed netmap and drop any cached state + // derived from this node (including peer relay allocations). + // + // TODO(kradalby): fires one full netmap recompute per peer on + // every connect/disconnect. Coalesce in mapper/batcher.go:addToBatch. + cs := []change.Change{change.NodeOfflineFor(node), c, change.PolicyChange()} return cs, nil }