From 380f5313421a68f34c1dc90a4c4ff7b801952cd3 Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby <kristoffer@dalby.cc>
Date: Wed, 8 Apr 2026 10:31:54 +0000
Subject: [PATCH] state: trigger PolicyChange on every Connect and Disconnect

Connect and Disconnect previously only appended a PolicyChange when
the affected node was a subnet router (routeChange) or the database
persist returned a full change. For every other node the peers just
received a small PeerChangedPatch{Online: ...} and no filter rules
were recomputed. That was too narrow: a node going offline or coming
online can affect policy compilation in ways beyond subnet routes.

TestGrantCapRelay Phase 4 exposed this. When the cap/relay target node
went down with `tailscale down`, headscale only sent an Online=false
patch, peers never got a recomputed netmap, and their cached
PeerRelay allocation stayed populated until the 120s assertion
timeout. With a PolicyChange queued on Disconnect, peers immediately
receive a full netmap on relay loss and clear PeerRelay as expected;
the symmetric change on Connect lets Phase 5 re-publish the policy
when the relay comes back.

Drop the now-unused routeChange return from the Disconnect gate.

Updates #2180
---
 hscontrol/state/state.go | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go
index 2bc828ad..723a35d3 100644
--- a/hscontrol/state/state.go
+++ b/hscontrol/state/state.go
@@ -546,6 +546,13 @@ func (s *State) Connect(id types.NodeID) ([]change.Change, uint64) {
 		c = append(c, change.NodeAdded(id))
 	}
 
+	// Mirror Disconnect: a node coming online may (re)enable cap/relay
+	// grants targeting it, reintroduce identity-based aliases that
+	// resolve to its tags/IPs, and so on. Always trigger a PolicyChange
+	// so peers can recompute their netmap and pick up any policy
+	// elements that depend on this node being present.
+	c = append(c, change.PolicyChange())
+
 	return c, gen
 }
 
@@ -619,15 +626,18 @@ func (s *State) Disconnect(id types.NodeID, gen uint64) ([]change.Change, error)
 
 	// The node is disconnecting so make sure that none of the routes it
 	// announced are served to any nodes.
-	routeChange := s.primaryRoutes.SetRoutes(id)
+	s.primaryRoutes.SetRoutes(id)
 
-	cs := []change.Change{change.NodeOfflineFor(node), c}
-
-	// If we have a policy change or route change, return that as it's more comprehensive
-	// Otherwise, return the NodeOffline change to ensure nodes are notified
-	if c.IsFull() || routeChange {
-		cs = append(cs, change.PolicyChange())
-	}
+	// A node going offline can affect policy compilation in ways beyond
+	// subnet routes: cap/relay grants targeting this node, identity-based
+	// aliases (tags, groups, users) that reference its tags/IPs, via
+	// routes steered through it, and so on. Always trigger a PolicyChange
+	// so peers receive a recomputed netmap and drop any cached state
+	// derived from this node (including peer relay allocations).
+	//
+	// TODO(kradalby): fires one full netmap recompute per peer on
+	// every connect/disconnect. Coalesce in mapper/batcher.go:addToBatch.
+	cs := []change.Change{change.NodeOfflineFor(node), c, change.PolicyChange()}
 
 	return cs, nil
 }