Files
headscale/hscontrol/state/primaries_test.go
Kristoffer Dalby e2f2f9211f state, servertest: property-test HA election + invariant catalogue
Expand TestPrimaryRoutesProperty (5 -> 9 ops). New ops mirror the
production shapes the failure cases hit: BatchProbeResults via
UpdateNodes, SimultaneousDisconnect via UpdateNodes, SetApprovedRoutes
that leaves announced RoutableIPs intact, OfflineExpiry that keeps
Unhealthy set. The model now tracks announced and approved separately
and recomputes the intersection.

Strengthen the per-op assertions to cover invariants the model alone
cannot prove: every primary must be online, every primary must
currently advertise its prefix, no flap onto an unhealthy candidate
when a healthy one was available, no flap off a previous primary that
remains a healthy candidate. The check now takes a pre-op snapshot so
the anti-flap rule has a stable reference.

Add TestHAProberProperty in servertest. It drives a real TestServer
with three HA-route-advertising clients through rapid-drawn sequences
of ClientDisconnect / ClientReconnect / ProberTick / WaitForSnapshot
ops and re-checks the same shape invariants after every step.

Document the system in hscontrol/state/HA_INVARIANTS.md: a state
machine over (Healthy+Online, Unhealthy+Online, Offline,
OfflineExpired), fifteen numbered invariants with predicates and
violation paths, and a coverage matrix mapping each invariant to its
unit, servertest, and integration tests. Three rows pin the recent
fixes to the invariants they enforce.
2026-05-18 17:18:08 +02:00

342 lines
9.5 KiB
Go

package state
import (
"net/netip"
"slices"
"testing"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
)
// mp parses prefix into a netip.Prefix and panics with the parse error
// when the input is malformed, which is the same contract as
// netip.MustParsePrefix. It keeps prefix literals in the tests terse.
func mp(prefix string) netip.Prefix {
	parsed, err := netip.ParsePrefix(prefix)
	if err != nil {
		panic(err)
	}

	return parsed
}
// primariesFixture bundles a NodeStore with the test that owns it and
// provides terse helpers for driving the kinds of state transitions the
// primary-election algorithm cares about. newPrimariesFixture builds it
// with the requested node IDs already stored (described as offline, with
// no routes — that initial shape comes from nodeForRapid, defined
// elsewhere).
type primariesFixture struct {
// t is the test that helper assertions report failures against.
t *testing.T
// ns is the NodeStore the helpers mutate and query.
ns *NodeStore
}
// newPrimariesFixture starts a fresh NodeStore, registers its Stop with
// t.Cleanup so it is shut down when the test ends, and stores one node per
// requested ID (each built by nodeForRapid) before returning the fixture.
func newPrimariesFixture(t *testing.T, ids ...types.NodeID) *primariesFixture {
	t.Helper()

	fixture := &primariesFixture{
		t:  t,
		ns: NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout),
	}
	fixture.ns.Start()
	t.Cleanup(fixture.ns.Stop)

	for i := range ids {
		fixture.ns.PutNode(nodeForRapid(ids[i]))
	}

	return fixture
}
// advertise brings node id online as a healthy advertiser of prefs. In a
// single NodeStore update it sets IsOnline to true, clears Unhealthy,
// replaces Hostinfo with one whose RoutableIPs are prefs (the announced
// set), and sets ApprovedRoutes to the same prefs. It is modelled on
// State.Connect. Called with no prefixes, the node ends up online but
// advertising nothing.
func (f *primariesFixture) advertise(id types.NodeID, prefs ...netip.Prefix) {
	f.t.Helper()

	connect := func(node *types.Node) {
		node.Hostinfo = &tailcfg.Hostinfo{RoutableIPs: prefs}
		node.ApprovedRoutes = prefs
		node.Unhealthy = false
		node.IsOnline = new(true)
	}
	f.ns.UpdateNode(id, connect)
}
// approveRoutes replaces node id's announced (Hostinfo.RoutableIPs) and
// approved route sets with prefs, standing in for State.SetApprovedRoutes
// and Hostinfo updates. Unlike advertise it leaves Unhealthy untouched.
// Note that it also marks the node online. Called with no prefixes it
// clears both route sets.
func (f *primariesFixture) approveRoutes(id types.NodeID, prefs ...netip.Prefix) {
	f.t.Helper()

	setRoutes := func(node *types.Node) {
		node.Hostinfo = &tailcfg.Hostinfo{RoutableIPs: prefs}
		node.ApprovedRoutes = prefs
		node.IsOnline = new(true)
	}
	f.ns.UpdateNode(id, setRoutes)
}
// disconnect marks node id offline, standing in for State.Disconnect.
// Routes and the Unhealthy flag are left as they were; the snapshot
// rebuild treats an offline node as a non-advertiser.
func (f *primariesFixture) disconnect(id types.NodeID) {
	f.t.Helper()

	goOffline := func(node *types.Node) {
		node.IsOnline = new(false)
	}
	f.ns.UpdateNode(id, goOffline)
}
// unhealthy sets the Unhealthy flag on node id, standing in for
// State.SetNodeHealth(id, false). Nothing else on the node is changed.
func (f *primariesFixture) unhealthy(id types.NodeID) {
	f.t.Helper()

	markDown := func(node *types.Node) {
		node.Unhealthy = true
	}
	f.ns.UpdateNode(id, markDown)
}
// healthy clears the Unhealthy flag on node id, standing in for
// State.SetNodeHealth(id, true). Nothing else on the node is changed.
func (f *primariesFixture) healthy(id types.NodeID) {
	f.t.Helper()

	markUp := func(node *types.Node) {
		node.Unhealthy = false
	}
	f.ns.UpdateNode(id, markUp)
}
// requirePrimary fails the test immediately unless the NodeStore reports
// a primary for prefix and that primary is node id.
func (f *primariesFixture) requirePrimary(prefix netip.Prefix, id types.NodeID) {
	f.t.Helper()

	primary, found := f.ns.PrimaryRouteFor(prefix)
	require.Truef(f.t, found, "expected a primary for %s, got none", prefix)
	require.Equalf(f.t, id, primary, "primary for %s", prefix)
}
// requireNoPrimary fails the test immediately if the NodeStore reports
// any primary for prefix.
func (f *primariesFixture) requireNoPrimary(prefix netip.Prefix) {
	f.t.Helper()

	_, found := f.ns.PrimaryRouteFor(prefix)
	require.Falsef(f.t, found, "expected no primary for %s", prefix)
}
// requireNodeRoutes asserts that the set of prefixes for which id is the
// primary equals want, regardless of order. Calling it with no want
// prefixes asserts that id is primary for nothing.
//
// Both slices are cloned before sorting so neither the caller's arguments
// nor the slice returned by the NodeStore is reordered in place.
func (f *primariesFixture) requireNodeRoutes(id types.NodeID, want ...netip.Prefix) {
	f.t.Helper()

	got := f.ns.PrimaryRoutesForNode(id)

	// "No routes" is one state whether it is spelled as a nil or as an
	// empty non-nil slice. require.Equal falls back to a deep comparison
	// that tells the two apart, and slices.Clone preserves nil-ness, so
	// settle the empty case on length alone.
	if len(want) == 0 {
		require.Empty(f.t, got, "primary routes for node %d", id)
		return
	}

	gotSorted := slices.Clone(got)
	wantSorted := slices.Clone(want)
	slices.SortFunc(gotSorted, netip.Prefix.Compare)
	slices.SortFunc(wantSorted, netip.Prefix.Compare)
	require.Equal(f.t, wantSorted, gotSorted, "primary routes for node %d", id)
}
func TestPrimaries_SingleNodeSingleRoute(t *testing.T) {
f := newPrimariesFixture(t, 1)
f.advertise(1, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.requireNodeRoutes(1, mp("192.168.1.0/24"))
}
func TestPrimaries_TwoNodesDifferentRoutes(t *testing.T) {
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.2.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.requirePrimary(mp("192.168.2.0/24"), 2)
}
func TestPrimaries_OverlappingRoutesLowerIDWins(t *testing.T) {
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.requireNodeRoutes(1, mp("192.168.1.0/24"))
f.requireNodeRoutes(2)
}
func TestPrimaries_AntiFlapPreservesCurrentPrimary(t *testing.T) {
// A primary that disappears (advertiser leaves the set) should
// trigger failover. When the original primary returns, the new
// primary keeps the assignment — anti-flap.
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.disconnect(1)
f.requirePrimary(mp("192.168.1.0/24"), 2)
f.advertise(1, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 2)
}
func TestPrimaries_ClearRoutesDropsPrimary(t *testing.T) {
f := newPrimariesFixture(t, 1)
f.advertise(1, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.approveRoutes(1)
f.requireNoPrimary(mp("192.168.1.0/24"))
}
func TestPrimaries_DisconnectDropsLastAdvertiserPrimary(t *testing.T) {
f := newPrimariesFixture(t, 1)
f.advertise(1, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.disconnect(1)
f.requireNoPrimary(mp("192.168.1.0/24"))
}
func TestPrimaries_UnhealthyTriggersFailover(t *testing.T) {
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.requirePrimary(mp("192.168.1.0/24"), 1)
f.unhealthy(1)
f.requirePrimary(mp("192.168.1.0/24"), 2)
}
func TestPrimaries_RecoveryFromUnhealthyNoFlap(t *testing.T) {
f := newPrimariesFixture(t, 1, 2)
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.unhealthy(1)
f.requirePrimary(mp("192.168.1.0/24"), 2)
f.healthy(1)
f.requirePrimary(mp("192.168.1.0/24"), 2)
}
// TestPrimaries_AllUnhealthyKeepsAPrimary pins the anti-blackhole rule:
// when every advertiser is unhealthy the algorithm still keeps *some*
// primary so peers can recover once one flips healthy. Which node that is
// (the previous primary, when reachable) is covered by
// TestPrimaries_AllUnhealthyPreservesPrevious; this test only pins the
// existence rule.
func TestPrimaries_AllUnhealthyKeepsAPrimary(t *testing.T) {
	route := mp("192.168.1.0/24")
	nodes := []types.NodeID{1, 2}

	f := newPrimariesFixture(t, nodes...)
	for _, id := range nodes {
		f.advertise(id, route)
	}
	for _, id := range nodes {
		f.unhealthy(id)
	}

	_, found := f.ns.PrimaryRouteFor(route)
	require.True(t, found, "all-unhealthy must still produce some primary")
}
// TestPrimaries_AllUnhealthyPreservesPrevious checks that once a failover
// has moved the primary to a higher-ID node, a subsequent all-unhealthy
// state does NOT churn the primary back to the lowest-ID candidate. Under
// cable-pull semantics both nodes can linger as IsOnline=true (half-open
// TCP) while both go Unhealthy — a naive `candidates[0]` would flap the
// primary to a node that is itself unreachable.
func TestPrimaries_AllUnhealthyPreservesPrevious(t *testing.T) {
	route := mp("10.0.0.0/24")

	f := newPrimariesFixture(t, 1, 2)
	f.advertise(1, route)
	f.advertise(2, route)
	f.requirePrimary(route, 1)

	// First failure: failover from node 1 to node 2.
	f.unhealthy(1)
	f.requirePrimary(route, 2)

	// Second failure: everyone is unhealthy, node 2 must remain primary.
	f.unhealthy(2)
	f.requirePrimary(route, 2)
}
func TestPrimaries_ExitRouteNotElected(t *testing.T) {
// Exit routes (0.0.0.0/0, ::/0) are not subject to HA primary
// election — every approved exit-route advertiser keeps it.
f := newPrimariesFixture(t, 1)
exitV4 := mp("0.0.0.0/0")
f.advertise(1, exitV4)
f.requireNoPrimary(exitV4)
}
// TestPrimaries_BothOfflineThenOneReturns is a regression test: with two
// HA advertisers, dropping both and then bringing one back used to leave
// the prefix without any primary. The snapshot recomputes primaries on
// every NodeStore write, so the returning advertiser must be elected.
func TestPrimaries_BothOfflineThenOneReturns(t *testing.T) {
	route := mp("10.0.0.0/24")

	f := newPrimariesFixture(t, 1, 2)
	f.advertise(1, route)
	f.advertise(2, route)
	f.requirePrimary(route, 1)

	// Lose the primary: node 2 takes over.
	f.disconnect(1)
	f.requirePrimary(route, 2)

	// Lose the last advertiser: nothing left to elect.
	f.disconnect(2)
	f.requireNoPrimary(route)

	// One advertiser returns and must be elected again.
	f.advertise(2, route)
	f.requirePrimary(route, 2)
}
func TestPrimaries_HANodes(t *testing.T) {
tests := []struct {
name string
setup func(*primariesFixture)
want map[netip.Prefix][]types.NodeID
}{
{
name: "single-node-not-ha",
setup: func(f *primariesFixture) {
f.advertise(1, mp("192.168.1.0/24"))
},
want: map[netip.Prefix][]types.NodeID{},
},
{
name: "two-nodes-same-prefix-is-ha",
setup: func(f *primariesFixture) {
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
},
want: map[netip.Prefix][]types.NodeID{
mp("192.168.1.0/24"): {1, 2},
},
},
{
name: "two-nodes-different-prefixes-not-ha",
setup: func(f *primariesFixture) {
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.2.0/24"))
},
want: map[netip.Prefix][]types.NodeID{},
},
{
name: "three-nodes-two-share-prefix",
setup: func(f *primariesFixture) {
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.advertise(3, mp("10.0.0.0/8"))
},
want: map[netip.Prefix][]types.NodeID{
mp("192.168.1.0/24"): {1, 2},
},
},
{
name: "three-nodes-all-share",
setup: func(f *primariesFixture) {
f.advertise(1, mp("192.168.1.0/24"))
f.advertise(2, mp("192.168.1.0/24"))
f.advertise(3, mp("192.168.1.0/24"))
},
want: map[netip.Prefix][]types.NodeID{
mp("192.168.1.0/24"): {1, 2, 3},
},
},
{
name: "empty",
setup: func(*primariesFixture) {
},
want: map[netip.Prefix][]types.NodeID{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
f := newPrimariesFixture(t, 1, 2, 3)
tt.setup(f)
got := f.ns.HANodes()
assert.Equal(t, tt.want, got)
})
}
}