From c7630b505b2b757dd14284273ee5e84bb580c987 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Sun, 17 May 2026 17:08:40 +0000 Subject: [PATCH] state: leave prefix unmapped when all primary candidates unhealthy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit electPrimaryRoutes' all-unhealthy fallback picked candidates[0] when the previous primary was no longer a candidate. The Phase-5 simultaneous dual-disconnect path in TestHASubnetRouterFailoverDockerDisconnect hits this asymmetrically: a batched probe cycle marks both routers unhealthy with prev=r2 preserved, then the grace-period Disconnect for r2 drops it from candidates. With prev gone and the remaining r1 still carrying its Unhealthy bit, the fallback pointed peers at the cable-pulled r1 — flapping primary to an unreachable node and tripping requirePrimaryStable. Leave the prefix unmapped when prev is gone and every candidate is unhealthy. Peers see no advertiser instead of an unreachable one, which is honest: the next probe cycle re-evaluates and picks whichever node responds. The property-test model that mirrored the old behaviour is updated to match. --- hscontrol/state/node_store.go | 10 ++++++---- hscontrol/state/primaries_property_test.go | 13 +++++++++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/hscontrol/state/node_store.go b/hscontrol/state/node_store.go index ec3a808e..18dd78d8 100644 --- a/hscontrol/state/node_store.go +++ b/hscontrol/state/node_store.go @@ -692,14 +692,16 @@ func electPrimaryRoutes( } } + // All-unhealthy fallback: preserve the previous primary only + // when it is still a candidate. Falling back to any candidate + // would point peers at a node the prober has already declared + // unreachable; leaving the prefix unmapped is honest until a + // probe cycle picks one that responds.
if !found && len(candidates) >= 1 { if cur, ok := prev[prefix]; ok && slices.Contains(candidates, cur) { selected = cur - } else { - selected = candidates[0] + found = true } - - found = true } if found { diff --git a/hscontrol/state/primaries_property_test.go b/hscontrol/state/primaries_property_test.go index 2c583f76..a64664d1 100644 --- a/hscontrol/state/primaries_property_test.go +++ b/hscontrol/state/primaries_property_test.go @@ -97,18 +97,23 @@ func (m *primariesModel) updatePrimaries() { } } + // All-unhealthy fallback: preserve the previous primary if it + // is still a candidate, otherwise leave the prefix unmapped. + // electPrimaryRoutes was changed to drop the candidates[0] + // fallback so the Phase-5 (simultaneous dual-disconnect) + // regression cannot pick an already-unhealthy node as + // primary; the model has to track the same behaviour. if !found && len(nodes) >= 1 { if cur, ok := m.primary[p]; ok && slices.Contains(nodes, cur) { selected = cur - } else { - selected = nodes[0] + found = true } - - found = true } if found { m.primary[p] = selected + } else { + delete(m.primary, p) } } }