mirror of
https://github.com/juanfont/headscale.git
synced 2026-05-24 02:58:42 +09:00
Expand TestPrimaryRoutesProperty (5 -> 9 ops). New ops mirror the production shapes the failure cases hit: BatchProbeResults via UpdateNodes, SimultaneousDisconnect via UpdateNodes, SetApprovedRoutes that leaves announced RoutableIPs intact, OfflineExpiry that keeps Unhealthy set. The model now tracks announced and approved separately and recomputes the intersection. Strengthen the per-op assertions to cover invariants the model alone cannot prove: every primary must be online, every primary must currently advertise its prefix, no flap onto an unhealthy candidate when a healthy one was available, no flap off a previous primary that remains a healthy candidate. The check now takes a pre-op snapshot so the anti-flap rule has a stable reference. Add TestHAProberProperty in servertest. It drives a real TestServer with three HA-route-advertising clients through rapid-drawn sequences of ClientDisconnect / ClientReconnect / ProberTick / WaitForSnapshot ops and re-checks the same shape invariants after every step. Document the system in hscontrol/state/HA_INVARIANTS.md: a state machine over (Healthy+Online, Unhealthy+Online, Offline, OfflineExpired), fifteen numbered invariants with predicates and violation paths, and a coverage matrix mapping each invariant to its unit, servertest, and integration tests. Three rows pin the recent fixes to the invariants they enforce.
4519 lines
164 KiB
Go
4519 lines
164 KiB
Go
package integration
|
|
|
|
import (
|
|
"cmp"
|
|
"encoding/json"
|
|
"fmt"
|
|
"maps"
|
|
"net/netip"
|
|
"slices"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
|
|
cmpdiff "github.com/google/go-cmp/cmp"
|
|
"github.com/google/go-cmp/cmp/cmpopts"
|
|
v1 "github.com/juanfont/headscale/gen/go/headscale/v1"
|
|
policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
|
|
"github.com/juanfont/headscale/hscontrol/types"
|
|
"github.com/juanfont/headscale/hscontrol/util"
|
|
"github.com/juanfont/headscale/integration/hsic"
|
|
"github.com/juanfont/headscale/integration/integrationutil"
|
|
"github.com/juanfont/headscale/integration/tsic"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
xmaps "golang.org/x/exp/maps"
|
|
"tailscale.com/ipn/ipnstate"
|
|
"tailscale.com/net/tsaddr"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/types/ipproto"
|
|
"tailscale.com/types/views"
|
|
"tailscale.com/util/must"
|
|
"tailscale.com/util/slicesx"
|
|
"tailscale.com/wgengine/filter"
|
|
)
|
|
|
|
// allPorts is a port range spanning every port number, 0 through 0xffff.
var allPorts = filter.PortRange{First: 0, Last: 0xffff}
|
|
|
|
// This test is both testing the routes command and the propagation of
|
|
// routes.
|
|
func TestEnablingRoutes(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{tsic.WithAcceptRoutes()},
|
|
hsic.WithTestName("rt-enable"))
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
expectedRoutes := map[string]string{
|
|
"1": "10.0.0.0/24",
|
|
"2": "10.0.1.0/24",
|
|
"3": "10.0.2.0/24",
|
|
}
|
|
|
|
// advertise routes using the up command
|
|
for _, client := range allClients {
|
|
status := client.MustStatus()
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + expectedRoutes[string(status.Self.ID)],
|
|
}
|
|
_, _, err = client.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var nodes []*v1.Node
|
|
// Wait for route advertisements to propagate to NodeStore
|
|
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(ct, err)
|
|
|
|
for _, node := range nodes {
|
|
assert.Len(ct, node.GetAvailableRoutes(), 1)
|
|
assert.Empty(ct, node.GetApprovedRoutes())
|
|
assert.Empty(ct, node.GetSubnetRoutes())
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), 100*time.Millisecond, "route advertisements should propagate to all nodes")
|
|
|
|
// Verify that no routes has been sent to the client,
|
|
// they are not yet enabled.
|
|
for _, client := range allClients {
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.Nil(c, peerStatus.PrimaryRoutes)
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval")
|
|
}
|
|
|
|
for _, node := range nodes {
|
|
_, err := headscale.ApproveRoutes(
|
|
node.GetId(),
|
|
util.MustStringsToPrefixes(node.GetAvailableRoutes()),
|
|
)
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
// Wait for route approvals to propagate to NodeStore
|
|
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(ct, err)
|
|
|
|
for _, node := range nodes {
|
|
assert.Len(ct, node.GetAvailableRoutes(), 1)
|
|
assert.Len(ct, node.GetApprovedRoutes(), 1)
|
|
assert.Len(ct, node.GetSubnetRoutes(), 1)
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), 100*time.Millisecond, "route approvals should propagate to all nodes")
|
|
|
|
// Wait for route state changes to propagate to clients
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// Verify that the clients can see the new routes
|
|
for _, client := range allClients {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.NotNil(c, peerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, peerStatus.AllowedIPs)
|
|
|
|
if peerStatus.AllowedIPs != nil {
|
|
assert.Len(c, peerStatus.AllowedIPs.AsSlice(), 3)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes[string(peerStatus.ID)])})
|
|
}
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "clients should see new routes")
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
1,
|
|
[]netip.Prefix{netip.MustParsePrefix("10.0.1.0/24")},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
2,
|
|
[]netip.Prefix{},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate to nodes
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
|
|
for _, node := range nodes {
|
|
if node.GetId() == 1 {
|
|
assert.Len(c, node.GetAvailableRoutes(), 1) // 10.0.0.0/24
|
|
assert.Len(c, node.GetApprovedRoutes(), 1) // 10.0.1.0/24
|
|
assert.Empty(c, node.GetSubnetRoutes())
|
|
} else if node.GetId() == 2 {
|
|
assert.Len(c, node.GetAvailableRoutes(), 1) // 10.0.1.0/24
|
|
assert.Empty(c, node.GetApprovedRoutes())
|
|
assert.Empty(c, node.GetSubnetRoutes())
|
|
} else {
|
|
assert.Len(c, node.GetAvailableRoutes(), 1) // 10.0.2.0/24
|
|
assert.Len(c, node.GetApprovedRoutes(), 1) // 10.0.2.0/24
|
|
assert.Len(c, node.GetSubnetRoutes(), 1) // 10.0.2.0/24
|
|
}
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
|
|
|
|
// Verify that the clients can see the new routes
|
|
for _, client := range allClients {
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
switch peerStatus.ID {
|
|
case "1":
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
case "2":
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
default:
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix("10.0.2.0/24")})
|
|
}
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying final route state visible to clients")
|
|
}
|
|
}
|
|
|
|
//nolint:gocyclo // complex HA failover test scenario
|
|
func TestHASubnetRouterFailover(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
// HASlowConverge (not HAConverge): tailscale's wgengine sometimes
|
|
// keeps routing through the previous primary for ~2 min after
|
|
// the netmap update flips PrimaryRoutes server-side.
|
|
propagationTime := integrationutil.HASlowConvergeTimeout
|
|
|
|
// Helper function to validate primary routes table state
|
|
validatePrimaryRoutes := func(t *testing.T, headscale ControlServer, expectedRoutes *types.DebugRoutes, message string) {
|
|
t.Helper()
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
primaryRoutesState, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
|
|
if diff := cmpdiff.Diff(expectedRoutes, primaryRoutesState, util.PrefixComparer); diff != "" {
|
|
t.Log(message)
|
|
t.Errorf("validatePrimaryRoutes mismatch (-want +got):\n%s", diff)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Validating primary routes table")
|
|
}
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
// defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{tsic.WithAcceptRoutes()},
|
|
hsic.WithTestName("rt-hafailover"),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
pref := *prefp
|
|
t.Logf("usernet1 prefix: %s", pref.String())
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("webservice: %s, %s", webip.String(), weburl)
|
|
|
|
// Sort nodes by ID
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
statusI := allClients[i].MustStatus()
|
|
statusJ := allClients[j].MustStatus()
|
|
|
|
return statusI.Self.ID < statusJ.Self.ID
|
|
})
|
|
|
|
// This is ok because the scenario makes users in order, so the three first
|
|
// nodes, which are subnet routes, will be created first, and the last user
|
|
// will be created with the second.
|
|
subRouter1 := allClients[0]
|
|
subRouter2 := allClients[1]
|
|
subRouter3 := allClients[2]
|
|
|
|
client := allClients[3]
|
|
|
|
t.Logf("%s (%s) picked as client", client.Hostname(), client.MustID())
|
|
t.Logf("=== Initial Route Advertisement - Setting up HA configuration with 3 routers ===")
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" - Router 1 (%s): Advertising route %s - will become PRIMARY when approved", subRouter1.Hostname(), pref.String())
|
|
t.Logf(" - Router 2 (%s): Advertising route %s - will be STANDBY when approved", subRouter2.Hostname(), pref.String())
|
|
t.Logf(" - Router 3 (%s): Advertising route %s - will be STANDBY when approved", subRouter3.Hostname(), pref.String())
|
|
t.Logf(" Expected: All 3 routers advertise the same route for redundancy, but only one will be primary at a time")
|
|
|
|
for _, client := range allClients[:3] {
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + pref.String(),
|
|
}
|
|
_, _, err = client.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
// Wait for route configuration changes after advertising routes
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic")
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0)
|
|
}, propagationTime, 200*time.Millisecond, "Waiting for route advertisements: All 3 routers should have advertised routes (available=1) but none approved yet (approved=0, subnet=0)")
|
|
|
|
// Verify that no routes has been sent to the client,
|
|
// they are not yet enabled.
|
|
for _, client := range allClients {
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.Nil(c, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying no routes are active before approval")
|
|
}
|
|
|
|
// Declare variables that will be used across multiple EventuallyWithT blocks
|
|
var (
|
|
srs1, srs2, srs3 *ipnstate.Status
|
|
clientStatus *ipnstate.Status
|
|
srs1PeerStatus *ipnstate.PeerStatus
|
|
srs2PeerStatus *ipnstate.PeerStatus
|
|
srs3PeerStatus *ipnstate.PeerStatus
|
|
)
|
|
|
|
// Helper function to check test failure and print route map if needed
|
|
checkFailureAndPrintRoutes := func(t *testing.T, client TailscaleClient) { //nolint:thelper
|
|
if t.Failed() {
|
|
t.Logf("[%s] Test failed at this checkpoint", time.Now().Format(TimestampFormat))
|
|
|
|
status, err := client.Status()
|
|
if err == nil {
|
|
printCurrentRouteMap(t, xmaps.Values(status.Peer)...)
|
|
}
|
|
|
|
t.FailNow()
|
|
}
|
|
}
|
|
|
|
// Validate primary routes table state - no routes approved yet
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{},
|
|
PrimaryRoutes: map[string]types.NodeID{}, // No primary routes yet
|
|
}, "Primary routes table should be empty (no approved routes yet)")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Enable route on node 1
|
|
t.Logf("=== Approving route on router 1 (%s) - Single router mode (no HA yet) ===", subRouter1.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Expected: Router 1 becomes PRIMARY with route %s active", pref.String())
|
|
t.Logf(" Expected: Routers 2 & 3 remain with advertised but unapproved routes")
|
|
t.Logf(" Expected: Client can access webservice through router 1 only")
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route approval on first subnet router
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic")
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0)
|
|
}, propagationTime, 200*time.Millisecond, "Router 1 approval verification: Should be PRIMARY (available=1, approved=1, subnet=1), others still unapproved (available=1, approved=0, subnet=0)")
|
|
|
|
// Verify that the client has routes from the primary machine and can access
|
|
// the webservice.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
srs1 = subRouter1.MustStatus()
|
|
srs2 = subRouter2.MustStatus()
|
|
srs3 = subRouter3.MustStatus()
|
|
clientStatus = client.MustStatus()
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.True(c, srs1PeerStatus.Online, "Router 1 should be online and serving as PRIMARY")
|
|
assert.True(c, srs2PeerStatus.Online, "Router 2 should be online but NOT serving routes (unapproved)")
|
|
assert.True(c, srs3PeerStatus.Online, "Router 3 should be online but NOT serving routes (unapproved)")
|
|
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs1PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref})
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil)
|
|
|
|
if srs1PeerStatus.PrimaryRoutes != nil {
|
|
t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref)
|
|
assert.Contains(c,
|
|
srs1PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying Router 1 is PRIMARY with routes after approval")
|
|
|
|
t.Logf("=== Validating connectivity through PRIMARY router 1 (%s) to webservice at %s ===", must.Get(subRouter1.IPv4()).String(), webip.String())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Expected: Traffic flows through router 1 as it's the only approved route")
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter1.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter1") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 1")
|
|
|
|
// Validate primary routes table state - router 1 is primary
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
// Note: Router 2 and 3 are available but not approved
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 1 should be primary for route "+pref.String())
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Enable route on node 2, now we will have a HA subnet router
|
|
t.Logf("=== Enabling High Availability by approving route on router 2 (%s) ===", subRouter2.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 is PRIMARY and actively serving traffic")
|
|
t.Logf(" Expected: Router 2 becomes STANDBY (approved but not primary)")
|
|
t.Logf(" Expected: Router 1 remains PRIMARY (no flapping - stability preferred)")
|
|
t.Logf(" Expected: HA is now active - if router 1 fails, router 2 can take over")
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route approval on second subnet router
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
|
|
if len(nodes) >= 3 {
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0)
|
|
}
|
|
}, integrationutil.ScaledTimeout(3*time.Second), integrationutil.FastPoll, "HA setup verification: Router 2 approved as STANDBY (available=1, approved=1, subnet=0), Router 1 stays PRIMARY (subnet=1)")
|
|
|
|
// Verify that the client has routes from the primary machine
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
srs1 = subRouter1.MustStatus()
|
|
srs2 = subRouter2.MustStatus()
|
|
srs3 = subRouter3.MustStatus()
|
|
clientStatus = client.MustStatus()
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.True(c, srs1PeerStatus.Online, "Router 1 should be online and remain PRIMARY")
|
|
assert.True(c, srs2PeerStatus.Online, "Router 2 should be online and now approved as STANDBY")
|
|
assert.True(c, srs3PeerStatus.Online, "Router 3 should be online but still unapproved")
|
|
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs1PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref})
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil)
|
|
|
|
if srs1PeerStatus.PrimaryRoutes != nil {
|
|
t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref)
|
|
assert.Contains(c,
|
|
srs1PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying Router 1 remains PRIMARY after Router 2 approval")
|
|
|
|
// Validate primary routes table state - router 1 still primary, router 2 approved but standby
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
// Note: Router 3 is available but not approved
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 1 should remain primary after router 2 approval")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
t.Logf("=== Validating HA configuration - Router 1 PRIMARY, Router 2 STANDBY ===")
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current routing: Traffic through router 1 (%s) to %s", must.Get(subRouter1.IPv4()), webip.String())
|
|
t.Logf(" Expected: Router 1 continues to handle all traffic (no change from before)")
|
|
t.Logf(" Expected: Router 2 is ready to take over if router 1 fails")
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 in HA mode")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 in HA mode")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter1.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter1") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute still goes through router 1 in HA mode")
|
|
|
|
// Validate primary routes table state - router 1 primary, router 2 approved (standby)
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
// Note: Router 3 is available but not approved
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 1 primary with router 2 as standby")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Enable route on node 3, now we will have a second standby and all will
|
|
// be enabled.
|
|
t.Logf("=== Adding second STANDBY router by approving route on router 3 (%s) ===", subRouter3.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 PRIMARY, Router 2 STANDBY")
|
|
t.Logf(" Expected: Router 3 becomes second STANDBY (approved but not primary)")
|
|
t.Logf(" Expected: Router 1 remains PRIMARY, Router 2 remains first STANDBY")
|
|
t.Logf(" Expected: Full HA configuration with 1 PRIMARY + 2 STANDBY routers")
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter3.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route approval on third subnet router
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic")
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 1, 1, 0)
|
|
}, integrationutil.ScaledTimeout(3*time.Second), integrationutil.FastPoll, "Full HA verification: Router 3 approved as second STANDBY (available=1, approved=1, subnet=0), Router 1 PRIMARY, Router 2 first STANDBY")
|
|
|
|
// Verify that the client has routes from the primary machine
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
srs1 = subRouter1.MustStatus()
|
|
srs2 = subRouter2.MustStatus()
|
|
srs3 = subRouter3.MustStatus()
|
|
clientStatus = client.MustStatus()
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.True(c, srs1PeerStatus.Online, "Router 1 should be online and remain PRIMARY")
|
|
assert.True(c, srs2PeerStatus.Online, "Router 2 should be online as first STANDBY")
|
|
assert.True(c, srs3PeerStatus.Online, "Router 3 should be online as second STANDBY")
|
|
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs1PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref})
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil)
|
|
|
|
if srs1PeerStatus.PrimaryRoutes != nil {
|
|
t.Logf("got list: %v, want in: %v", srs1PeerStatus.PrimaryRoutes.AsSlice(), pref)
|
|
assert.Contains(c,
|
|
srs1PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying full HA with 3 routers: Router 1 PRIMARY, Routers 2 & 3 STANDBY")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 with full HA")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 with full HA")
|
|
|
|
// Wait for traceroute to work correctly through the expected router
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
// Get the expected router IP - use a more robust approach to handle temporary disconnections
|
|
ips, err := subRouter1.IPs()
|
|
assert.NoError(c, err)
|
|
assert.NotEmpty(c, ips, "subRouter1 should have IP addresses")
|
|
|
|
var expectedIP netip.Addr
|
|
|
|
for _, ip := range ips {
|
|
if ip.Is4() {
|
|
expectedIP = ip
|
|
break
|
|
}
|
|
}
|
|
|
|
assert.True(c, expectedIP.IsValid(), "subRouter1 should have a valid IPv4 address")
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, expectedIP)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traffic still flows through PRIMARY router 1 with full HA setup active")
|
|
|
|
// Validate primary routes table state - all 3 routers approved, router 1 still primary
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref},
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 1 primary with all 3 routers approved")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Take down the current primary
|
|
t.Logf("=== FAILOVER TEST: Taking down PRIMARY router 1 (%s) ===", subRouter1.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 PRIMARY (serving traffic), Router 2 & 3 STANDBY")
|
|
t.Logf(" Action: Shutting down router 1 to simulate failure")
|
|
t.Logf(" Expected: Router 2 (%s) should automatically become new PRIMARY", subRouter2.Hostname())
|
|
t.Logf(" Expected: Router 3 remains STANDBY")
|
|
t.Logf(" Expected: Traffic seamlessly fails over to router 2")
|
|
|
|
err = subRouter1.Down()
|
|
require.NoError(t, err)
|
|
|
|
// Wait for router status changes after r1 goes down
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
srs2 = subRouter2.MustStatus()
|
|
clientStatus = client.MustStatus()
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.False(c, srs1PeerStatus.Online, "r1 should be offline")
|
|
assert.True(c, srs2PeerStatus.Online, "r2 should be online")
|
|
assert.True(c, srs3PeerStatus.Online, "r3 should be online")
|
|
|
|
assert.Nil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, []netip.Prefix{pref})
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil)
|
|
|
|
if srs2PeerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c,
|
|
srs2PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Failover verification: Router 1 offline, Router 2 should be new PRIMARY with routes, Router 3 still STANDBY")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after failover")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after failover")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter2.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter2") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 2 after failover")
|
|
|
|
// Validate primary routes table state - router 2 is now primary after router 1 failure
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
// Router 1 is disconnected, so not in AvailableRoutes
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref},
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 2 should be primary after router 1 failure")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Take down subnet router 2, leaving only router 3 available
|
|
t.Logf("=== FAILOVER TEST: Taking down NEW PRIMARY router 2 (%s) ===", subRouter2.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 OFFLINE, Router 2 PRIMARY (serving traffic), Router 3 STANDBY")
|
|
t.Logf(" Action: Shutting down router 2 to simulate cascading failure")
|
|
t.Logf(" Expected: Router 3 (%s) should become new PRIMARY (last remaining router)", subRouter3.Hostname())
|
|
t.Logf(" Expected: With only 1 router left, HA is effectively disabled")
|
|
t.Logf(" Expected: Traffic continues through router 3")
|
|
|
|
err = subRouter2.Down()
|
|
require.NoError(t, err)
|
|
|
|
// Wait for router status changes after r2 goes down
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err = client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.False(c, srs1PeerStatus.Online, "Router 1 should still be offline")
|
|
assert.False(c, srs2PeerStatus.Online, "Router 2 should now be offline after failure")
|
|
assert.True(c, srs3PeerStatus.Online, "Router 3 should be online and taking over as PRIMARY")
|
|
|
|
assert.Nil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, []netip.Prefix{pref})
|
|
}, propagationTime, 200*time.Millisecond, "Second failover verification: Router 1 & 2 offline, Router 3 should be new PRIMARY (last router standing) with routes")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 3 after second failover")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after second failover")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter3.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter3") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 3 after second failover")
|
|
|
|
// Validate primary routes table state - router 3 is now primary after router 2 failure
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
// Routers 1 and 2 are disconnected, so not in AvailableRoutes
|
|
types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref},
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 3 should be primary after router 2 failure")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Bring up subnet router 1, making the route available from there.
|
|
t.Logf("=== RECOVERY TEST: Bringing router 1 (%s) back online ===", subRouter1.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 OFFLINE, Router 2 OFFLINE, Router 3 PRIMARY (only router)")
|
|
t.Logf(" Action: Starting router 1 to restore HA capability")
|
|
t.Logf(" Expected: Router 3 remains PRIMARY (stability - no unnecessary failover)")
|
|
t.Logf(" Expected: Router 1 becomes STANDBY (ready for HA)")
|
|
t.Logf(" Expected: HA is restored with 2 routers available")
|
|
|
|
err = subRouter1.Up()
|
|
require.NoError(t, err)
|
|
|
|
// Wait for router status changes after r1 comes back up
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err = client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.True(c, srs1PeerStatus.Online, "Router 1 should be back online as STANDBY")
|
|
assert.False(c, srs2PeerStatus.Online, "Router 2 should still be offline")
|
|
assert.True(c, srs3PeerStatus.Online, "Router 3 should remain online as PRIMARY")
|
|
|
|
assert.Nil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, []netip.Prefix{pref})
|
|
|
|
if srs3PeerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c,
|
|
srs3PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Recovery verification: Router 1 back online as STANDBY, Router 3 remains PRIMARY (no flapping) with routes")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can still reach webservice through router 3 after router 1 recovery")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can still reach webservice through router 3 after router 1 recovery")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter3.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter3") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute still goes through router 3 after router 1 recovery")
|
|
|
|
// Validate primary routes table state - router 3 remains primary after router 1 comes back
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
// Router 2 is still disconnected
|
|
types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref},
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 3 should remain primary after router 1 recovery")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Bring up subnet router 2, should result in no change.
|
|
t.Logf("=== FULL RECOVERY TEST: Bringing router 2 (%s) back online ===", subRouter2.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 STANDBY, Router 2 OFFLINE, Router 3 PRIMARY")
|
|
t.Logf(" Action: Starting router 2 to restore full HA (3 routers)")
|
|
t.Logf(" Expected: Router 3 (%s) remains PRIMARY (stability - avoid unnecessary failovers)", subRouter3.Hostname())
|
|
t.Logf(" Expected: Router 1 (%s) remains first STANDBY", subRouter1.Hostname())
|
|
t.Logf(" Expected: Router 2 (%s) becomes second STANDBY", subRouter2.Hostname())
|
|
t.Logf(" Expected: Full HA restored with all 3 routers online")
|
|
|
|
err = subRouter2.Up()
|
|
require.NoError(t, err)
|
|
|
|
// Wait for nodestore batch processing to complete and online status to be updated
|
|
// NodeStore batching timeout is 500ms, so we wait up to 10 seconds for all routers to be online
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err = client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.True(c, srs1PeerStatus.Online, "Router 1 should be online as STANDBY")
|
|
assert.True(c, srs2PeerStatus.Online, "Router 2 should be back online as STANDBY")
|
|
assert.True(c, srs3PeerStatus.Online, "Router 3 should remain online as PRIMARY")
|
|
|
|
assert.Nil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, []netip.Prefix{pref})
|
|
|
|
if srs3PeerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c,
|
|
srs3PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Full recovery verification: All 3 routers online, Router 3 remains PRIMARY (no flapping) with routes")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 3 after full recovery")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after full recovery")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter3.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter3") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 3 after full recovery")
|
|
|
|
// Validate primary routes table state - router 3 remains primary after all routers back online
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref},
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 3 should remain primary after full recovery")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
t.Logf("=== ROUTE DISABLE TEST: Removing approved route from PRIMARY router 3 (%s) ===", subRouter3.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 STANDBY, Router 2 STANDBY, Router 3 PRIMARY")
|
|
t.Logf(" Action: Disabling route approval on router 3 (route still advertised but not approved)")
|
|
t.Logf(" Expected: Router 1 (%s) should become new PRIMARY (lowest ID with approved route)", subRouter1.Hostname())
|
|
t.Logf(" Expected: Router 2 (%s) remains STANDBY", subRouter2.Hostname())
|
|
t.Logf(" Expected: Router 3 (%s) goes to advertised-only state (no longer serving)", subRouter3.Hostname())
|
|
_, err = headscale.ApproveRoutes(MustFindNode(subRouter3.Hostname(), nodes).GetId(), []netip.Prefix{})
|
|
|
|
// Wait for nodestore batch processing and route state changes to complete
|
|
// NodeStore batching timeout is 500ms, so we wait up to 10 seconds for route failover
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
|
|
// After disabling route on r3, r1 should become primary with 1 subnet route
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 0)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Route disable verification: Router 3 route disabled, Router 1 should be new PRIMARY, Router 2 STANDBY")
|
|
|
|
// Verify that the route is announced from subnet router 1
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err = client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.NotNil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{pref})
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil)
|
|
|
|
if srs1PeerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c,
|
|
srs1PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying Router 1 becomes PRIMARY after Router 3 route disabled")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 after route disable")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 after route disable")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter1.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter1") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 1 after route disable")
|
|
|
|
// Validate primary routes table state - router 1 is primary after router 3 route disabled
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
// Router 3's route is no longer approved, so not in AvailableRoutes
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 1 should be primary after router 3 route disabled")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Disable the route of subnet router 1, making it failover to 2
|
|
t.Logf("=== ROUTE DISABLE TEST: Removing approved route from NEW PRIMARY router 1 (%s) ===", subRouter1.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 PRIMARY, Router 2 STANDBY, Router 3 advertised-only")
|
|
t.Logf(" Action: Disabling route approval on router 1")
|
|
t.Logf(" Expected: Router 2 (%s) should become new PRIMARY (only remaining approved route)", subRouter2.Hostname())
|
|
t.Logf(" Expected: Router 1 (%s) goes to advertised-only state", subRouter1.Hostname())
|
|
t.Logf(" Expected: Router 3 (%s) remains advertised-only", subRouter3.Hostname())
|
|
_, err = headscale.ApproveRoutes(MustFindNode(subRouter1.Hostname(), nodes).GetId(), []netip.Prefix{})
|
|
|
|
// Wait for nodestore batch processing and route state changes to complete
|
|
// NodeStore batching timeout is 500ms, so we wait up to 10 seconds for route failover
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
|
|
// After disabling route on r1, r2 should become primary with 1 subnet route
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Second route disable verification: Router 1 route disabled, Router 2 should be new PRIMARY")
|
|
|
|
// Verify that the route is now announced from subnet router 2
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err = client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.Nil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, nil)
|
|
requirePeerSubnetRoutesWithCollect(c, srs2PeerStatus, []netip.Prefix{pref})
|
|
requirePeerSubnetRoutesWithCollect(c, srs3PeerStatus, nil)
|
|
|
|
if srs2PeerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c,
|
|
srs2PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying Router 2 becomes PRIMARY after Router 1 route disabled")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after second route disable")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after second route disable")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter2.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter2") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute goes through router 2 after second route disable")
|
|
|
|
// Validate primary routes table state - router 2 is primary after router 1 route disabled
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
// Router 1's route is no longer approved, so not in AvailableRoutes
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
// Router 3's route is still not approved
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 2 should be primary after router 1 route disabled")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Re-enable the route of subnet router 1; no change of primary is expected
|
|
t.Logf("=== ROUTE RE-ENABLE TEST: Re-approving route on router 1 (%s) ===", subRouter1.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 advertised-only, Router 2 PRIMARY, Router 3 advertised-only")
|
|
t.Logf(" Action: Re-enabling route approval on router 1")
|
|
t.Logf(" Expected: Router 2 (%s) remains PRIMARY (stability - no unnecessary flapping)", subRouter2.Hostname())
|
|
t.Logf(" Expected: Router 1 (%s) becomes STANDBY (approved but not primary)", subRouter1.Hostname())
|
|
t.Logf(" Expected: HA fully restored with Router 2 PRIMARY and Router 1 STANDBY")
|
|
|
|
r1Node := MustFindNode(subRouter1.Hostname(), nodes)
|
|
_, err = headscale.ApproveRoutes(
|
|
r1Node.GetId(),
|
|
util.MustStringsToPrefixes(r1Node.GetAvailableRoutes()),
|
|
)
|
|
|
|
// Wait for route state changes after re-enabling r1
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 1, 0)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0)
|
|
}, propagationTime, 200*time.Millisecond, "Re-enable verification: Router 1 approved as STANDBY, Router 2 remains PRIMARY (no flapping), full HA restored")
|
|
|
|
// Verify that the route is still announced from subnet router 2
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err = client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey]
|
|
srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey]
|
|
srs3PeerStatus = clientStatus.Peer[srs3.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
assert.NotNil(c, srs2PeerStatus, "Router 2 peer should exist")
|
|
assert.NotNil(c, srs3PeerStatus, "Router 3 peer should exist")
|
|
|
|
if srs1PeerStatus == nil || srs2PeerStatus == nil || srs3PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
assert.Nil(c, srs1PeerStatus.PrimaryRoutes)
|
|
assert.NotNil(c, srs2PeerStatus.PrimaryRoutes)
|
|
assert.Nil(c, srs3PeerStatus.PrimaryRoutes)
|
|
|
|
if srs2PeerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c,
|
|
srs2PeerStatus.PrimaryRoutes.AsSlice(),
|
|
pref,
|
|
)
|
|
}
|
|
}, propagationTime, 200*time.Millisecond, "Verifying Router 2 remains PRIMARY after Router 1 route re-enabled")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after route re-enable")
|
|
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after route re-enable")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter2.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for subRouter2") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "Verifying traceroute still goes through router 2 after route re-enable")
|
|
|
|
// Validate primary routes table state after router 1 re-approval
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
// Router 3 route is still not approved
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 2 should remain primary after router 1 re-approval")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
|
|
// Enable route on node 3, we now have all routes re-enabled
|
|
t.Logf("=== ROUTE RE-ENABLE TEST: Re-approving route on router 3 (%s) - Full HA Restoration ===", subRouter3.Hostname())
|
|
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
|
|
t.Logf(" Current state: Router 1 STANDBY, Router 2 PRIMARY, Router 3 advertised-only")
|
|
t.Logf(" Action: Re-enabling route approval on router 3")
|
|
t.Logf(" Expected: Router 2 (%s) remains PRIMARY (stability preferred)", subRouter2.Hostname())
|
|
t.Logf(" Expected: Routers 1 & 3 are both STANDBY")
|
|
t.Logf(" Expected: Full HA restored with all 3 routers available")
|
|
|
|
r3Node := MustFindNode(subRouter3.Hostname(), nodes)
|
|
_, err = headscale.ApproveRoutes(
|
|
r3Node.GetId(),
|
|
util.MustStringsToPrefixes(r3Node.GetAvailableRoutes()),
|
|
)
|
|
|
|
// Wait for route state changes after re-enabling r3
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 6)
|
|
require.GreaterOrEqual(t, len(nodes), 3, "need at least 3 nodes to avoid panic")
|
|
// After router 3 re-approval: Router 2 remains PRIMARY, Routers 1&3 are STANDBY
|
|
// SubnetRoutes should only show routes for PRIMARY node (actively serving)
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 0) // Router 1: STANDBY (available, approved, but not serving)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 1) // Router 2: PRIMARY (available, approved, and serving)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 1, 1, 0) // Router 3: STANDBY (available, approved, but not serving)
|
|
}, propagationTime, 200*time.Millisecond, "Waiting for route state after router 3 re-approval")
|
|
|
|
// Validate primary routes table state after router 3 re-approval
|
|
validatePrimaryRoutes(t, headscale, &types.DebugRoutes{
|
|
AvailableRoutes: map[types.NodeID][]netip.Prefix{
|
|
types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()): {pref},
|
|
types.NodeID(MustFindNode(subRouter3.Hostname(), nodes).GetId()): {pref},
|
|
},
|
|
PrimaryRoutes: map[string]types.NodeID{
|
|
pref.String(): types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId()),
|
|
},
|
|
}, "Router 2 should remain primary after router 3 re-approval")
|
|
|
|
checkFailureAndPrintRoutes(t, client)
|
|
}
|
|
|
|
// TestSubnetRouteACL verifies that Subnet routes are distributed
|
|
// as expected when ACLs are activated.
|
|
// It reproduces the scenario reported in
|
|
// https://github.com/juanfont/headscale/issues/1604
|
|
func TestSubnetRouteACL(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
user := "user4"
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{user},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv([]tsic.Option{
|
|
tsic.WithAcceptRoutes(),
|
|
}, hsic.WithTestName("rt-subnetacl"), hsic.WithACLPolicy(
|
|
&policyv2.Policy{
|
|
Groups: policyv2.Groups{
|
|
policyv2.Group("group:admins"): []policyv2.Username{policyv2.Username(user + "@")},
|
|
},
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{groupp("group:admins")},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(groupp("group:admins"), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{groupp("group:admins")},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(prefixp("10.33.0.0/16"), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
))
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
expectedRoutes := map[string]string{
|
|
"1": "10.33.0.0/16",
|
|
}
|
|
|
|
// Sort nodes by ID
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
statusI := allClients[i].MustStatus()
|
|
statusJ := allClients[j].MustStatus()
|
|
|
|
return statusI.Self.ID < statusJ.Self.ID
|
|
})
|
|
|
|
subRouter1 := allClients[0]
|
|
|
|
client := allClients[1]
|
|
|
|
// Read Self.ID inside a retry (it lags initial connection); apply
|
|
// the route mutation outside, since retrying a mutation hides
|
|
// real failures.
|
|
for _, client := range allClients {
|
|
var status *ipnstate.Status
|
|
|
|
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
s, err := client.Status()
|
|
assert.NoError(c, err)
|
|
assert.NotNil(c, s)
|
|
|
|
if s != nil {
|
|
status = s
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Reading client status before route advertisement")
|
|
|
|
route, ok := expectedRoutes[string(status.Self.ID)]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
_, _, err = client.Execute([]string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + route,
|
|
})
|
|
require.NoErrorf(t, err, "failed to advertise route on %s", client.Hostname())
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
// Wait for route advertisements to propagate to the server
|
|
var nodes []*v1.Node
|
|
|
|
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
// Find the node that should have the route by checking node IDs
|
|
var (
|
|
routeNode *v1.Node
|
|
otherNode *v1.Node
|
|
)
|
|
|
|
for _, node := range nodes {
|
|
nodeIDStr := strconv.FormatUint(node.GetId(), 10)
|
|
if _, shouldHaveRoute := expectedRoutes[nodeIDStr]; shouldHaveRoute {
|
|
routeNode = node
|
|
} else {
|
|
otherNode = node
|
|
}
|
|
}
|
|
|
|
assert.NotNil(c, routeNode, "could not find node that should have route")
|
|
assert.NotNil(c, otherNode, "could not find node that should not have route")
|
|
|
|
// After NodeStore fix: routes are properly tracked in route manager
|
|
// This test uses a policy with NO auto-approvers, so routes should be:
|
|
// announced=1, approved=0, subnet=0 (routes announced but not approved)
|
|
requireNodeRouteCountWithCollect(c, routeNode, 1, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, otherNode, 0, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), 100*time.Millisecond, "route advertisements should propagate to server")
|
|
|
|
// Verify that no routes has been sent to the client,
|
|
// they are not yet enabled.
|
|
for _, client := range allClients {
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.Nil(c, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval")
|
|
}
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
1,
|
|
[]netip.Prefix{netip.MustParsePrefix(expectedRoutes["1"])},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate to nodes
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 0, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
|
|
|
|
// Verify that the client has routes from the primary machine
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
srs1, err := subRouter1.Status()
|
|
assert.NoError(c, err)
|
|
|
|
clientStatus, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs1PeerStatus := clientStatus.Peer[srs1.Self.PublicKey]
|
|
|
|
assert.NotNil(c, srs1PeerStatus, "Router 1 peer should exist")
|
|
|
|
if srs1PeerStatus == nil {
|
|
return
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes["1"])})
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying client can see subnet routes from router")
|
|
|
|
// Wait for packet filter updates to propagate to client netmap
|
|
wantClientFilter := []filter.Match{
|
|
{
|
|
IPProto: views.SliceOf([]ipproto.Proto{
|
|
ipproto.TCP, ipproto.UDP, ipproto.ICMPv4, ipproto.ICMPv6,
|
|
}),
|
|
Srcs: []netip.Prefix{
|
|
netip.MustParsePrefix("100.64.0.1/32"),
|
|
netip.MustParsePrefix("100.64.0.2/32"),
|
|
netip.MustParsePrefix("fd7a:115c:a1e0::1/128"),
|
|
netip.MustParsePrefix("fd7a:115c:a1e0::2/128"),
|
|
},
|
|
Dsts: []filter.NetPortRange{
|
|
{
|
|
Net: netip.MustParsePrefix("100.64.0.2/32"),
|
|
Ports: allPorts,
|
|
},
|
|
{
|
|
Net: netip.MustParsePrefix("fd7a:115c:a1e0::2/128"),
|
|
Ports: allPorts,
|
|
},
|
|
},
|
|
Caps: []filter.CapMatch{},
|
|
},
|
|
}
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientNm, err := client.Netmap()
|
|
assert.NoError(c, err)
|
|
|
|
if diff := cmpdiff.Diff(wantClientFilter, clientNm.PacketFilter, util.ViewSliceIPProtoComparer, util.PrefixComparer); diff != "" {
|
|
assert.Fail(c, fmt.Sprintf("Client (%s) filter, unexpected result (-want +got):\n%s", client.Hostname(), diff))
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for client packet filter to update")
|
|
|
|
// Wait for packet filter updates to propagate to subnet router netmap
|
|
// The two ACL rules (group:admins -> group:admins:* and group:admins -> 10.33.0.0/16:*)
|
|
// are merged into one filter rule since they share the same SrcIPs and IPProto.
|
|
wantSubnetFilter := []filter.Match{
|
|
{
|
|
IPProto: views.SliceOf([]ipproto.Proto{
|
|
ipproto.TCP, ipproto.UDP, ipproto.ICMPv4, ipproto.ICMPv6,
|
|
}),
|
|
Srcs: []netip.Prefix{
|
|
netip.MustParsePrefix("100.64.0.1/32"),
|
|
netip.MustParsePrefix("100.64.0.2/32"),
|
|
netip.MustParsePrefix("fd7a:115c:a1e0::1/128"),
|
|
netip.MustParsePrefix("fd7a:115c:a1e0::2/128"),
|
|
},
|
|
Dsts: []filter.NetPortRange{
|
|
{
|
|
Net: netip.MustParsePrefix("100.64.0.1/32"),
|
|
Ports: allPorts,
|
|
},
|
|
{
|
|
Net: netip.MustParsePrefix("fd7a:115c:a1e0::1/128"),
|
|
Ports: allPorts,
|
|
},
|
|
{
|
|
Net: netip.MustParsePrefix("10.33.0.0/16"),
|
|
Ports: allPorts,
|
|
},
|
|
},
|
|
Caps: []filter.CapMatch{},
|
|
},
|
|
}
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
subnetNm, err := subRouter1.Netmap()
|
|
assert.NoError(c, err)
|
|
|
|
if diff := cmpdiff.Diff(wantSubnetFilter, subnetNm.PacketFilter, util.ViewSliceIPProtoComparer, util.PrefixComparer); diff != "" {
|
|
assert.Fail(c, fmt.Sprintf("Subnet (%s) filter, unexpected result (-want +got):\n%s", subRouter1.Hostname(), diff))
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for subnet router packet filter to update")
|
|
}
|
|
|
|
// TestEnablingExitRoutes tests enabling exit routes for clients.
|
|
// Its more or less the same as TestEnablingRoutes, but with the --advertise-exit-node flag
|
|
// set during login instead of set.
|
|
func TestEnablingExitRoutes(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
user := "user2" //nolint:goconst // test-specific value, not related to userToDelete constant
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{user},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario")
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv([]tsic.Option{
|
|
tsic.WithExtraLoginArgs([]string{"--advertise-exit-node"}),
|
|
}, hsic.WithTestName("rt-exitroute"))
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 2, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 2, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for route advertisements to propagate")
|
|
|
|
// Verify that no routes has been sent to the client,
|
|
// they are not yet enabled.
|
|
for _, client := range allClients {
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.Nil(c, peerStatus.PrimaryRoutes)
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no exit routes are active before approval")
|
|
}
|
|
|
|
// Enable all routes, but do v4 on one and v6 on other to ensure they
|
|
// are both added since they are exit routes.
|
|
_, err = headscale.ApproveRoutes(
|
|
nodes[0].GetId(),
|
|
[]netip.Prefix{tsaddr.AllIPv4()},
|
|
)
|
|
require.NoError(t, err)
|
|
_, err = headscale.ApproveRoutes(
|
|
nodes[1].GetId(),
|
|
[]netip.Prefix{tsaddr.AllIPv6()},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to both nodes")
|
|
|
|
// Wait for route state changes to propagate to clients
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// Verify that the clients can see the new routes
|
|
for _, client := range allClients {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.NotNil(c, peerStatus.AllowedIPs)
|
|
|
|
if peerStatus.AllowedIPs != nil {
|
|
assert.Len(c, peerStatus.AllowedIPs.AsSlice(), 4)
|
|
assert.Contains(c, peerStatus.AllowedIPs.AsSlice(), tsaddr.AllIPv4())
|
|
assert.Contains(c, peerStatus.AllowedIPs.AsSlice(), tsaddr.AllIPv6())
|
|
}
|
|
}
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "clients should see new routes")
|
|
}
|
|
|
|
// TestExitRoutesWithAutogroupInternetACL reproduces juanfont/headscale#3212.
|
|
// When an ACL grants access via autogroup:internet, the source nodes must
|
|
// still see approved exit nodes as peers with 0.0.0.0/0 and ::/0 in their
|
|
// AllowedIPs — that visibility is what drives `tailscale exit-node list`.
|
|
//
|
|
// Tailscale SaaS surfaces exit nodes on the autogroup:internet path
|
|
// (verified against a live tailnet on 2026-04-28; see captures
|
|
// routes-b17/b18 in tscap). The bug was that headscale stripped
|
|
// autogroup:internet rules from both the client packet filter AND the
|
|
// matcher source used by Node.CanAccess, breaking exit-node visibility.
|
|
func TestExitRoutesWithAutogroupInternetACL(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
user := "user2"
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{user},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
require.NoErrorf(t, err, "failed to create scenario")
|
|
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{
|
|
tsic.WithExtraLoginArgs([]string{"--advertise-exit-node"}),
|
|
},
|
|
hsic.WithTestName("rt-exit-aginternet"),
|
|
hsic.WithACLPolicy(&policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{usernamep(user + "@")},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(
|
|
new(policyv2.AutoGroupInternet),
|
|
tailcfg.PortRangeAny,
|
|
),
|
|
},
|
|
},
|
|
},
|
|
}),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
// The autogroup:internet ACL grants no peer visibility until the
|
|
// exit routes are approved (Node.IsExitNode() flips on approval),
|
|
// so the standard WaitForTailscaleSync wait would deadlock here —
|
|
// the post-approval EventuallyWithT block below covers the peer
|
|
// state we actually care about.
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 2, 0, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 2, 0, 0)
|
|
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.FastPoll,
|
|
"Waiting for exit-route advertisements to propagate")
|
|
|
|
// Approve exit routes on both nodes so either could serve as
|
|
// alice's exit. The bug fix is about visibility, not which node
|
|
// is chosen.
|
|
_, err = headscale.ApproveRoutes(
|
|
nodes[0].GetId(),
|
|
[]netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6()},
|
|
)
|
|
require.NoError(t, err)
|
|
_, err = headscale.ApproveRoutes(
|
|
nodes[1].GetId(),
|
|
[]netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6()},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll,
|
|
"approved exit routes should propagate to nodes")
|
|
|
|
// The end-to-end UX assertion: every client must see the OTHER
|
|
// node as a peer carrying both default-route prefixes in
|
|
// AllowedIPs. Tailscale derives PeerStatus.ExitNodeOption from
|
|
// those AllowedIPs, which is what `tailscale exit-node list`
|
|
// reads (see tailscale.com/ipn/ipnlocal/local.go).
|
|
for _, client := range allClients {
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
peerCount := 0
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
peerCount++
|
|
|
|
assert.NotNilf(c, peerStatus.AllowedIPs,
|
|
"peer %s has nil AllowedIPs", peerStatus.HostName)
|
|
|
|
if peerStatus.AllowedIPs != nil {
|
|
ips := peerStatus.AllowedIPs.AsSlice()
|
|
assert.Containsf(c, ips, tsaddr.AllIPv4(),
|
|
"peer %s lacks 0.0.0.0/0 in AllowedIPs",
|
|
peerStatus.HostName)
|
|
assert.Containsf(c, ips, tsaddr.AllIPv6(),
|
|
"peer %s lacks ::/0 in AllowedIPs",
|
|
peerStatus.HostName)
|
|
}
|
|
|
|
assert.Truef(c, peerStatus.ExitNodeOption,
|
|
"peer %s should be exposed as an exit-node "+
|
|
"option (autogroup:internet ACL must "+
|
|
"keep exit-node visibility — #3212)",
|
|
peerStatus.HostName)
|
|
}
|
|
|
|
assert.Equalf(c, 1, peerCount,
|
|
"client %s should see the other node as a peer "+
|
|
"via the autogroup:internet ACL",
|
|
status.Self.HostName)
|
|
}, integrationutil.ScaledTimeout(15*time.Second), integrationutil.SlowPoll,
|
|
"client should see exit nodes as peers via autogroup:internet ACL")
|
|
}
|
|
}
|
|
|
|
// TestSubnetRouterMultiNetwork is an evolution of the subnet router test.
|
|
// This test will set up multiple docker networks and use two isolated tailscale
|
|
// clients and a service available in one of the networks to validate that a
|
|
// subnet router is working as expected.
|
|
func TestSubnetRouterMultiNetwork(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 1,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv([]tsic.Option{tsic.WithAcceptRoutes()},
|
|
hsic.WithTestName("rt-multinet"),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
assert.NotNil(t, headscale)
|
|
|
|
pref, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
var user1c, user2c TailscaleClient
|
|
|
|
for _, c := range allClients {
|
|
s := c.MustStatus()
|
|
if s.User[s.Self.UserID].LoginName == "user1@test.no" {
|
|
user1c = c
|
|
}
|
|
|
|
if s.User[s.Self.UserID].LoginName == "user2@test.no" {
|
|
user2c = c
|
|
}
|
|
}
|
|
|
|
require.NotNil(t, user1c)
|
|
require.NotNil(t, user2c)
|
|
|
|
// Advertise the route for the dockersubnet of user1
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + pref.String(),
|
|
}
|
|
_, _, err = user1c.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
|
|
var nodes []*v1.Node
|
|
// Wait for route advertisements to propagate to NodeStore
|
|
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(ct, err)
|
|
assert.Len(ct, nodes, 2)
|
|
requireNodeRouteCountWithCollect(ct, nodes[0], 1, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), 100*time.Millisecond, "route advertisements should propagate")
|
|
|
|
// Verify that no routes has been sent to the client,
|
|
// they are not yet enabled.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := user1c.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.Nil(c, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval")
|
|
|
|
// Enable route
|
|
_, err = headscale.ApproveRoutes(
|
|
nodes[0].GetId(),
|
|
[]netip.Prefix{*pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate to nodes
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
|
|
|
|
// Verify that the routes have been sent to the client
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := user2c.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *pref)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*pref})
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "routes should be visible to client")
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
|
|
url := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("url from %s to %s", user2c.Hostname(), url)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, user2c, url, "Verifying client can reach webservice through subnet route")
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying client can reach webservice through subnet route")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := user2c.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := user1c.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for user1c") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying traceroute goes through subnet router")
|
|
}
|
|
|
|
func TestSubnetRouterMultiNetworkExitNode(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 1,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
err = scenario.CreateHeadscaleEnv([]tsic.Option{},
|
|
hsic.WithTestName("rt-multinetexit"),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
assert.NotNil(t, headscale)
|
|
|
|
var user1c, user2c TailscaleClient
|
|
|
|
for _, c := range allClients {
|
|
s := c.MustStatus()
|
|
if s.User[s.Self.UserID].LoginName == "user1@test.no" {
|
|
user1c = c
|
|
}
|
|
|
|
if s.User[s.Self.UserID].LoginName == "user2@test.no" {
|
|
user2c = c
|
|
}
|
|
}
|
|
|
|
require.NotNil(t, user1c)
|
|
require.NotNil(t, user2c)
|
|
|
|
route, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
// Advertise exit route AND the usernet1 subnet. The subnet route is
|
|
// required because Tailscale exit nodes strip locally-connected subnets
|
|
// from their forwarding filter (shrinkDefaultRoute + localInterfaceRoutes).
|
|
// Explicitly advertising the subnet adds it to localNets via the
|
|
// non-default-route path, allowing curl/traceroute to Docker IPs.
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-exit-node",
|
|
"--advertise-routes=" + route.String(),
|
|
}
|
|
_, _, err = user1c.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise routes: %s", err)
|
|
|
|
var nodes []*v1.Node
|
|
// Wait for route advertisements to propagate (3 routes: v4 exit + v6 exit + subnet).
|
|
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
|
|
var err error
|
|
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(ct, err)
|
|
assert.Len(ct, nodes, 2)
|
|
requireNodeRouteCountWithCollect(ct, nodes[0], 3, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), 100*time.Millisecond, "route advertisements should propagate")
|
|
|
|
// Verify that no routes has been sent to the client,
|
|
// they are not yet enabled.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := user1c.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
assert.Nil(c, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes sent to client before approval")
|
|
|
|
// Approve exit routes and subnet route.
|
|
_, err = headscale.ApproveRoutes(nodes[0].GetId(), []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6(), *route})
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
requireNodeRouteCountWithCollect(c, nodes[0], 3, 3, 3)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
|
|
|
|
// Wait for exit routes to be visible to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := user2c.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
assert.True(c, peerStatus.ExitNodeOption, "peer should be an exit node option")
|
|
}
|
|
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "exit routes should be visible to client")
|
|
|
|
// Tell user2c to use user1c as an exit node.
|
|
command = []string{
|
|
"tailscale",
|
|
"set",
|
|
"--exit-node",
|
|
user1c.Hostname(),
|
|
}
|
|
_, _, err = user2c.Execute(command)
|
|
require.NoErrorf(t, err, "failed to set exit node: %s", err)
|
|
|
|
// Wait for exit node to become active.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := user2c.Status()
|
|
assert.NoError(c, err)
|
|
assert.NotNil(c, status.ExitNodeStatus, "exit node should be active")
|
|
}, 30*time.Second, 500*time.Millisecond, "exit node activation")
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, user2c, weburl, "user2 should reach webservice via exit node")
|
|
}, 10*time.Second, 200*time.Millisecond, "user2 should reach webservice via exit node")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := user2c.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := user1c.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for user1c") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, 10*time.Second, 200*time.Millisecond, "user2 traceroute should go through user1 exit node")
|
|
}
|
|
|
|
func MustFindNode(hostname string, nodes []*v1.Node) *v1.Node {
|
|
for _, node := range nodes {
|
|
if node.GetName() == hostname {
|
|
return node
|
|
}
|
|
}
|
|
|
|
panic("node not found")
|
|
}
|
|
|
|
// TestAutoApproveMultiNetwork tests auto approving of routes
|
|
// by setting up two networks where network1 has three subnet
|
|
// routers:
|
|
// - routerUsernet1: advertising the docker network
|
|
// - routerSubRoute: advertising a subroute, a /24 inside an auto approved /16
|
|
// - routeExitNode: advertising an exit node
|
|
//
|
|
// Each router is tested step by step through the following scenarios
|
|
// - Policy is set to auto approve the nodes route
|
|
// - Node advertises route and it is verified that it is auto approved and sent to nodes
|
|
// - Policy is changed to _not_ auto approve the route
|
|
// - Verify that peers can still see the node
|
|
// - Disable route, making it unavailable
|
|
// - Verify that peers can no longer use node
|
|
// - Policy is changed back to auto approve the route; check that already existing routes are approved.
|
|
// - Verify that routes can now be seen by peers.
|
|
//
|
|
//nolint:gocyclo // complex multi-network auto-approve test scenario
|
|
func TestAutoApproveMultiNetwork(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
// Timeout for EventuallyWithT assertions.
|
|
// Set generously to account for CI infrastructure variability.
|
|
assertTimeout := integrationutil.ScaledTimeout(60 * time.Second)
|
|
|
|
bigRoute := netip.MustParsePrefix("10.42.0.0/16")
|
|
subRoute := netip.MustParsePrefix("10.42.7.0/24")
|
|
notApprovedRoute := netip.MustParsePrefix("192.168.0.0/24")
|
|
|
|
tests := []struct {
|
|
name string
|
|
pol *policyv2.Policy
|
|
approver string
|
|
spec ScenarioSpec
|
|
withURL bool
|
|
}{
|
|
{
|
|
name: "authkey-tag",
|
|
pol: &policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{wildcard()},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
TagOwners: policyv2.TagOwners{
|
|
policyv2.Tag("tag:approve"): policyv2.Owners{usernameOwner("user1@")},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
bigRoute: {tagApprover("tag:approve")},
|
|
},
|
|
ExitNode: policyv2.AutoApprovers{tagApprover("tag:approve")},
|
|
},
|
|
},
|
|
approver: "tag:approve",
|
|
spec: ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
},
|
|
},
|
|
{
|
|
name: "authkey-user",
|
|
pol: &policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{wildcard()},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
bigRoute: {usernameApprover("user1@")},
|
|
},
|
|
ExitNode: policyv2.AutoApprovers{usernameApprover("user1@")},
|
|
},
|
|
},
|
|
approver: "user1@",
|
|
spec: ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
},
|
|
},
|
|
{
|
|
name: "authkey-group",
|
|
pol: &policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{wildcard()},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
Groups: policyv2.Groups{
|
|
policyv2.Group("group:approve"): []policyv2.Username{policyv2.Username("user1@")},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
bigRoute: {groupApprover("group:approve")},
|
|
},
|
|
ExitNode: policyv2.AutoApprovers{groupApprover("group:approve")},
|
|
},
|
|
},
|
|
approver: "group:approve",
|
|
spec: ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
},
|
|
},
|
|
{
|
|
name: "webauth-user",
|
|
pol: &policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{wildcard()},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
bigRoute: {usernameApprover("user1@")},
|
|
},
|
|
ExitNode: policyv2.AutoApprovers{usernameApprover("user1@")},
|
|
},
|
|
},
|
|
approver: "user1@",
|
|
spec: ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
},
|
|
withURL: true,
|
|
},
|
|
{
|
|
name: "webauth-tag",
|
|
pol: &policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{wildcard()},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
TagOwners: policyv2.TagOwners{
|
|
policyv2.Tag("tag:approve"): policyv2.Owners{usernameOwner("user1@")},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
bigRoute: {tagApprover("tag:approve")},
|
|
},
|
|
ExitNode: policyv2.AutoApprovers{tagApprover("tag:approve")},
|
|
},
|
|
},
|
|
approver: "tag:approve",
|
|
spec: ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
},
|
|
withURL: true,
|
|
},
|
|
{
|
|
name: "webauth-group",
|
|
pol: &policyv2.Policy{
|
|
ACLs: []policyv2.ACL{
|
|
{
|
|
Action: "accept",
|
|
Sources: []policyv2.Alias{wildcard()},
|
|
Destinations: []policyv2.AliasWithPorts{
|
|
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
|
|
},
|
|
},
|
|
},
|
|
Groups: policyv2.Groups{
|
|
policyv2.Group("group:approve"): []policyv2.Username{policyv2.Username("user1@")},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
bigRoute: {groupApprover("group:approve")},
|
|
},
|
|
ExitNode: policyv2.AutoApprovers{groupApprover("group:approve")},
|
|
},
|
|
},
|
|
approver: "group:approve",
|
|
spec: ScenarioSpec{
|
|
NodesPerUser: 3,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
},
|
|
withURL: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
for _, polMode := range []types.PolicyMode{types.PolicyModeDB, types.PolicyModeFile} {
|
|
for _, advertiseDuringUp := range []bool{false, true} {
|
|
name := fmt.Sprintf("%s-advertiseduringup-%t-pol-%s", tt.name, advertiseDuringUp, polMode)
|
|
t.Run(name, func(t *testing.T) {
|
|
// Create a deep copy of the policy to avoid mutating the shared test case.
|
|
// Each subtest modifies AutoApprovers.Routes (add then delete), so we need
|
|
// an isolated copy to prevent state leakage between sequential test runs.
|
|
pol := &policyv2.Policy{
|
|
ACLs: slices.Clone(tt.pol.ACLs),
|
|
Groups: maps.Clone(tt.pol.Groups),
|
|
TagOwners: maps.Clone(tt.pol.TagOwners),
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
ExitNode: slices.Clone(tt.pol.AutoApprovers.ExitNode),
|
|
Routes: maps.Clone(tt.pol.AutoApprovers.Routes),
|
|
},
|
|
}
|
|
|
|
scenario, err := NewScenario(tt.spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
var nodes []*v1.Node
|
|
|
|
opts := []hsic.Option{
|
|
hsic.WithTestName("autoapprovemulti"),
|
|
hsic.WithACLPolicy(pol),
|
|
hsic.WithPolicyMode(polMode), // test iterates over file and DB policy modes
|
|
}
|
|
|
|
tsOpts := []tsic.Option{
|
|
tsic.WithAcceptRoutes(),
|
|
}
|
|
|
|
route, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
// For tag-based approvers, nodes must be tagged with that tag
|
|
// (tags-as-identity model: tagged nodes are identified by their tags)
|
|
var (
|
|
preAuthKeyTags []string
|
|
webauthTagUser string
|
|
)
|
|
|
|
if strings.HasPrefix(tt.approver, "tag:") {
|
|
preAuthKeyTags = []string{tt.approver}
|
|
if tt.withURL {
|
|
// For webauth, only user1 can request tags (per tagOwners policy)
|
|
webauthTagUser = "user1" //nolint:goconst // test value, not a constant
|
|
}
|
|
}
|
|
|
|
err = scenario.createHeadscaleEnvWithTags(tt.withURL, tsOpts, preAuthKeyTags, webauthTagUser,
|
|
opts...,
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
assert.NotNil(t, headscale)
|
|
|
|
// Add the Docker network route to the auto-approvers
|
|
// Keep existing auto-approvers (like bigRoute) in place
|
|
var approvers policyv2.AutoApprovers
|
|
|
|
switch {
|
|
case strings.HasPrefix(tt.approver, "tag:"):
|
|
approvers = append(approvers, tagApprover(tt.approver))
|
|
case strings.HasPrefix(tt.approver, "group:"):
|
|
approvers = append(approvers, groupApprover(tt.approver))
|
|
default:
|
|
approvers = append(approvers, usernameApprover(tt.approver))
|
|
}
|
|
// pol.AutoApprovers.Routes is already initialized in the deep copy above
|
|
prefix := *route
|
|
pol.AutoApprovers.Routes[prefix] = approvers
|
|
err = headscale.SetPolicy(pol)
|
|
require.NoError(t, err)
|
|
|
|
if advertiseDuringUp {
|
|
tsOpts = append(tsOpts,
|
|
tsic.WithExtraLoginArgs([]string{"--advertise-routes=" + route.String()}),
|
|
)
|
|
}
|
|
|
|
// For webauth with tag approver, the node needs to advertise the tag during registration
|
|
// (tags-as-identity model: webauth nodes can use --advertise-tags if authorized by tagOwners)
|
|
if tt.withURL && strings.HasPrefix(tt.approver, "tag:") {
|
|
tsOpts = append(tsOpts, tsic.WithTags([]string{tt.approver}))
|
|
}
|
|
|
|
tsOpts = append(tsOpts, tsic.WithNetwork(usernet1))
|
|
|
|
// This whole dance is to add a node _after_ all the other nodes
|
|
// with an additional tsOpt which advertises the route as part
|
|
// of the `tailscale up` command. If we do this as part of the
|
|
// scenario creation, it will be added to all nodes and turn
|
|
// into a HA node, which isn't something we are testing here.
|
|
routerUsernet1, err := scenario.CreateTailscaleNode("head", tsOpts...)
|
|
require.NoError(t, err)
|
|
|
|
defer func() {
|
|
_, _, err := routerUsernet1.Shutdown()
|
|
require.NoError(t, err)
|
|
}()
|
|
|
|
if tt.withURL {
|
|
u, err := routerUsernet1.LoginWithURL(headscale.GetEndpoint())
|
|
require.NoError(t, err)
|
|
|
|
body, err := doLoginURL(routerUsernet1.Hostname(), u)
|
|
require.NoError(t, err)
|
|
|
|
err = scenario.runHeadscaleRegister("user1", body)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for the client to sync with the server after webauth registration.
|
|
// Unlike authkey login which blocks until complete, webauth registration
|
|
// happens on the server side and the client needs time to receive the network map.
|
|
err = routerUsernet1.WaitForRunning(integrationutil.PeerSyncTimeout())
|
|
require.NoError(t, err, "webauth client failed to reach Running state")
|
|
} else {
|
|
userMap, err := headscale.MapUsers()
|
|
require.NoError(t, err)
|
|
|
|
// If the approver is a tag, create a tagged PreAuthKey
|
|
// (tags-as-identity model: tags come from PreAuthKey, not --advertise-tags)
|
|
var pak *v1.PreAuthKey
|
|
if strings.HasPrefix(tt.approver, "tag:") {
|
|
pak, err = scenario.CreatePreAuthKeyWithTags(userMap["user1"].GetId(), false, false, []string{tt.approver})
|
|
} else {
|
|
pak, err = scenario.CreatePreAuthKey(userMap["user1"].GetId(), false, false)
|
|
}
|
|
|
|
require.NoError(t, err)
|
|
|
|
err = routerUsernet1.Login(headscale.GetEndpoint(), pak.GetKey())
|
|
require.NoError(t, err)
|
|
}
|
|
// extra creation end.
|
|
|
|
// Wait for the node to be fully running before getting its ID
|
|
// This is especially important for webauth flow where login is asynchronous
|
|
err = routerUsernet1.WaitForRunning(integrationutil.ScaledTimeout(30 * time.Second))
|
|
require.NoError(t, err)
|
|
|
|
// Wait for bidirectional peer synchronization.
|
|
// Both the router and all existing clients must see each other.
|
|
// This is critical for connectivity - without this, the WireGuard
|
|
// tunnels may not be established despite peers appearing in netmaps.
|
|
|
|
// Router waits for all existing clients
|
|
err = routerUsernet1.WaitForPeers(len(allClients), integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
|
|
require.NoError(t, err, "router failed to see all peers")
|
|
|
|
// All clients wait for the router (they should see 6 peers including the router)
|
|
for _, existingClient := range allClients {
|
|
err = existingClient.WaitForPeers(len(allClients), integrationutil.PeerSyncTimeout(), integrationutil.PeerSyncRetryInterval())
|
|
require.NoErrorf(t, err, "client %s failed to see all peers including router", existingClient.Hostname())
|
|
}
|
|
|
|
routerUsernet1ID := routerUsernet1.MustID()
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("webservice: %s, %s", webip.String(), weburl)
|
|
|
|
// Sort nodes by ID
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
statusI := allClients[i].MustStatus()
|
|
statusJ := allClients[j].MustStatus()
|
|
|
|
return statusI.Self.ID < statusJ.Self.ID
|
|
})
|
|
|
|
// This is ok because the scenario creates users in order: the three nodes of
|
|
// the first user, which include the routers used below, are created first,
|
|
// and the nodes of the second user are created after them.
|
|
routerSubRoute := allClients[1]
|
|
routerExitNode := allClients[2]
|
|
|
|
client := allClients[3]
|
|
|
|
if !advertiseDuringUp {
|
|
// Advertise the route for the dockersubnet of user1
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + route.String(),
|
|
}
|
|
_, _, err = routerUsernet1.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
}
|
|
|
|
// Wait for route state changes to propagate.
|
|
// Use a longer timeout (30s) to account for CI infrastructure variability -
|
|
// when advertiseDuringUp=true, routes are sent during registration and may
|
|
// take longer to propagate through the server's auto-approval logic in slow
|
|
// environments.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// These route should auto approve, so the node is expected to have a route
|
|
// for all counts.
|
|
nodes, err := headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
|
|
routerNode := MustFindNode(routerUsernet1.Hostname(), nodes)
|
|
t.Logf("Initial auto-approval check - Router node %s: announced=%v, approved=%v, subnet=%v",
|
|
routerNode.GetName(),
|
|
routerNode.GetAvailableRoutes(),
|
|
routerNode.GetApprovedRoutes(),
|
|
routerNode.GetSubnetRoutes())
|
|
|
|
requireNodeRouteCountWithCollect(c, routerNode, 1, 1, 1)
|
|
}, assertTimeout, 500*time.Millisecond, "Initial route auto-approval: Route should be approved via policy")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
// Debug output to understand peer visibility
|
|
t.Logf("Client %s sees %d peers", client.Hostname(), len(status.Peers()))
|
|
|
|
routerPeerFound := false
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.ID == routerUsernet1ID.StableID() {
|
|
routerPeerFound = true
|
|
|
|
t.Logf("Client sees router peer %s (ID=%s): AllowedIPs=%v, PrimaryRoutes=%v",
|
|
peerStatus.HostName,
|
|
peerStatus.ID,
|
|
peerStatus.AllowedIPs,
|
|
peerStatus.PrimaryRoutes)
|
|
|
|
assert.NotNil(c, peerStatus.PrimaryRoutes)
|
|
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
} else {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
|
|
assert.True(c, routerPeerFound, "Client should see the router peer")
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying routes sent to client after auto-approval")
|
|
|
|
// Verify WireGuard tunnel connectivity to the router before testing route.
|
|
// The client may have the route in its netmap but the actual tunnel may not
|
|
// be established yet, especially in CI environments with higher latency.
|
|
routerIPv4, err := routerUsernet1.IPv4()
|
|
require.NoError(t, err, "failed to get router IPv4")
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
err := client.Ping(
|
|
routerIPv4.String(),
|
|
tsic.WithPingUntilDirect(false), // DERP relay is fine
|
|
tsic.WithPingCount(1),
|
|
tsic.WithPingTimeout(5*time.Second),
|
|
)
|
|
assert.NoError(c, err, "ping to router should succeed")
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying WireGuard tunnel to router is established")
|
|
|
|
url := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("url from %s to %s", client.Hostname(), url)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, url, "Verifying client can reach webservice through auto-approved route")
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying client can reach webservice through auto-approved route")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := routerUsernet1.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for routerUsernet1") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying traceroute goes through auto-approved router")
|
|
|
|
// Remove the auto approval from the policy, any routes already enabled should be allowed.
|
|
prefix = *route
|
|
delete(pol.AutoApprovers.Routes, prefix)
|
|
err = headscale.SetPolicy(pol)
|
|
require.NoError(t, err)
|
|
t.Logf("Policy updated: removed auto-approver for route %s", prefix)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// Routes already approved should remain approved even after policy change
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
|
|
routerNode := MustFindNode(routerUsernet1.Hostname(), nodes)
|
|
t.Logf("After policy removal - Router node %s: announced=%v, approved=%v, subnet=%v",
|
|
routerNode.GetName(),
|
|
routerNode.GetAvailableRoutes(),
|
|
routerNode.GetApprovedRoutes(),
|
|
routerNode.GetSubnetRoutes())
|
|
|
|
requireNodeRouteCountWithCollect(c, routerNode, 1, 1, 1)
|
|
}, assertTimeout, 500*time.Millisecond, "Routes should remain approved after auto-approver removal")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.ID == routerUsernet1ID.StableID() {
|
|
assert.NotNil(c, peerStatus.PrimaryRoutes)
|
|
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
} else {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying routes remain after policy change")
|
|
|
|
url = fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("url from %s to %s", client.Hostname(), url)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, url, "Verifying client can still reach webservice after policy change")
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying client can still reach webservice after policy change")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := routerUsernet1.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for routerUsernet1") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying traceroute still goes through router after policy change")
|
|
|
|
// Disable the route, making it unavailable since it is no longer auto-approved
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(routerUsernet1.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// The route was un-approved above, so the node is expected to still
|
|
// announce it but to have no approved or subnet routes.
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 0, 0)
|
|
}, assertTimeout, 500*time.Millisecond, "route state changes should propagate")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying routes disabled after route removal")
|
|
|
|
// Add the route back to the auto approver in the policy, the route should
|
|
// now become available again.
|
|
var newApprovers policyv2.AutoApprovers
|
|
|
|
switch {
|
|
case strings.HasPrefix(tt.approver, "tag:"):
|
|
newApprovers = append(newApprovers, tagApprover(tt.approver))
|
|
case strings.HasPrefix(tt.approver, "group:"):
|
|
newApprovers = append(newApprovers, groupApprover(tt.approver))
|
|
default:
|
|
newApprovers = append(newApprovers, usernameApprover(tt.approver))
|
|
}
|
|
// pol.AutoApprovers.Routes is already initialized in the deep copy above
|
|
prefix = *route
|
|
pol.AutoApprovers.Routes[prefix] = newApprovers
|
|
err = headscale.SetPolicy(pol)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// These route should auto approve, so the node is expected to have a route
|
|
// for all counts.
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1)
|
|
}, assertTimeout, 500*time.Millisecond, "route state changes should propagate")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.ID == routerUsernet1ID.StableID() {
|
|
assert.NotNil(c, peerStatus.PrimaryRoutes)
|
|
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
} else {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying routes re-enabled after policy re-approval")
|
|
|
|
url = fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("url from %s to %s", client.Hostname(), url)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, url, "Verifying client can reach webservice after route re-approval")
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying client can reach webservice after route re-approval")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := routerUsernet1.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for routerUsernet1") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying traceroute goes through router after re-approval")
|
|
|
|
// Advertise and validate a subnet of an auto approved route, /24 inside the
|
|
// auto approved /16.
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + subRoute.String(),
|
|
}
|
|
_, _, err = routerSubRoute.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// These route should auto approve, so the node is expected to have a route
|
|
// for all counts.
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 1)
|
|
}, assertTimeout, 500*time.Millisecond, "route state changes should propagate")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.ID == routerUsernet1ID.StableID() {
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
} else if peerStatus.ID == "2" {
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), subRoute)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{subRoute})
|
|
} else {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying sub-route propagated to client")
|
|
|
|
// Advertise a not approved route will not end up anywhere
|
|
command = []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-routes=" + notApprovedRoute.String(),
|
|
}
|
|
_, _, err = routerSubRoute.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// These route should auto approve, so the node is expected to have a route
|
|
// for all counts.
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 0, 0, 0)
|
|
}, assertTimeout, 500*time.Millisecond, "route state changes should propagate")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.ID == routerUsernet1ID.StableID() {
|
|
assert.NotNil(c, peerStatus.PrimaryRoutes)
|
|
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
} else {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying unapproved route not propagated")
|
|
|
|
// Exit routes are also automatically approved
|
|
command = []string{
|
|
"tailscale",
|
|
"set",
|
|
"--advertise-exit-node",
|
|
}
|
|
_, _, err = routerExitNode.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route: %s", err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
requireNodeRouteCountWithCollect(c, MustFindNode(routerUsernet1.Hostname(), nodes), 1, 1, 1)
|
|
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0)
|
|
requireNodeRouteCountWithCollect(c, nodes[2], 2, 2, 2)
|
|
}, assertTimeout, 500*time.Millisecond, "route state changes should propagate")
|
|
|
|
// Verify that the routes have been sent to the client.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
if peerStatus.ID == routerUsernet1ID.StableID() {
|
|
if peerStatus.PrimaryRoutes != nil {
|
|
assert.Contains(c, peerStatus.PrimaryRoutes.AsSlice(), *route)
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
} else if peerStatus.ID == "3" {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6()})
|
|
} else {
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
}, assertTimeout, 200*time.Millisecond, "Verifying exit node routes propagated to client")
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// assertTracerouteViaIPWithCollect is a version of assertTracerouteViaIP that works with assert.CollectT.
|
|
func assertTracerouteViaIPWithCollect(c *assert.CollectT, tr util.Traceroute, ip netip.Addr) {
|
|
assert.NotNil(c, tr)
|
|
assert.True(c, tr.Success)
|
|
assert.NoError(c, tr.Err) //nolint:testifylint // using assert.CollectT
|
|
assert.NotEmpty(c, tr.Route)
|
|
// Since we're inside EventuallyWithT, we can't use require.Greater with t
|
|
// but assert.NotEmpty above ensures len(tr.Route) > 0
|
|
if len(tr.Route) > 0 {
|
|
assert.Equal(c, tr.Route[0].IP.String(), ip.String())
|
|
}
|
|
}
|
|
|
|
func SortPeerStatus(a, b *ipnstate.PeerStatus) int {
|
|
return cmp.Compare(a.ID, b.ID)
|
|
}
|
|
|
|
func printCurrentRouteMap(t *testing.T, routers ...*ipnstate.PeerStatus) {
|
|
t.Helper()
|
|
t.Logf("== Current routing map ==")
|
|
slices.SortFunc(routers, SortPeerStatus)
|
|
|
|
for _, router := range routers {
|
|
got := filterNonRoutes(router)
|
|
t.Logf(" Router %s (%s) is serving:", router.HostName, router.ID)
|
|
t.Logf(" AllowedIPs: %v", got)
|
|
|
|
if router.PrimaryRoutes != nil {
|
|
t.Logf(" PrimaryRoutes: %v", router.PrimaryRoutes.AsSlice())
|
|
}
|
|
}
|
|
}
|
|
|
|
// filterNonRoutes returns the list of routes that a [ipnstate.PeerStatus] is serving.
|
|
func filterNonRoutes(status *ipnstate.PeerStatus) []netip.Prefix {
|
|
return slicesx.Filter(nil, status.AllowedIPs.AsSlice(), func(p netip.Prefix) bool {
|
|
if tsaddr.IsExitRoute(p) {
|
|
return true
|
|
}
|
|
|
|
return !slices.ContainsFunc(status.TailscaleIPs, p.Contains)
|
|
})
|
|
}
|
|
|
|
func requirePeerSubnetRoutesWithCollect(c *assert.CollectT, status *ipnstate.PeerStatus, expected []netip.Prefix) {
|
|
if status.AllowedIPs.Len() <= 2 && len(expected) != 0 {
|
|
assert.Fail(c, fmt.Sprintf("peer %s (%s) has no subnet routes, expected %v", status.HostName, status.ID, expected))
|
|
return
|
|
}
|
|
|
|
if len(expected) == 0 {
|
|
expected = []netip.Prefix{}
|
|
}
|
|
|
|
got := filterNonRoutes(status)
|
|
|
|
if diff := cmpdiff.Diff(expected, got, util.PrefixComparer, cmpopts.EquateEmpty()); diff != "" {
|
|
assert.Fail(c, fmt.Sprintf("peer %s (%s) subnet routes, unexpected result (-want +got):\n%s", status.HostName, status.ID, diff))
|
|
}
|
|
}
|
|
|
|
func requireNodeRouteCountWithCollect(c *assert.CollectT, node *v1.Node, announced, approved, subnet int) {
|
|
assert.Lenf(c, node.GetAvailableRoutes(), announced, "expected %q announced routes(%v) to have %d route, had %d", node.GetName(), node.GetAvailableRoutes(), announced, len(node.GetAvailableRoutes()))
|
|
assert.Lenf(c, node.GetApprovedRoutes(), approved, "expected %q approved routes(%v) to have %d route, had %d", node.GetName(), node.GetApprovedRoutes(), approved, len(node.GetApprovedRoutes()))
|
|
assert.Lenf(c, node.GetSubnetRoutes(), subnet, "expected %q subnet routes(%v) to have %d route, had %d", node.GetName(), node.GetSubnetRoutes(), subnet, len(node.GetSubnetRoutes()))
|
|
}
|
|
|
|
// TestSubnetRouteACLFiltering tests that a node can only access subnet routes
|
|
// that are explicitly allowed in the ACL.
|
|
func TestSubnetRouteACLFiltering(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
// Use router and node users for better clarity
|
|
routerUser := "router"
|
|
nodeUser := "node"
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 1,
|
|
Users: []string{routerUser, nodeUser},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{routerUser, nodeUser}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
// We build the head image with curl and traceroute, so only use
|
|
// that for this test.
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
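// Set up the ACL policy that allows the node to access only one of the subnet routes; the placeholder hosts and the node's destination are replaced with the real IPs and the usernet1 subnet further down.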
|
|
aclPolicyStr := `{
|
|
"hosts": {
|
|
"router": "100.64.0.1/32",
|
|
"node": "100.64.0.2/32"
|
|
},
|
|
"acls": [
|
|
{
|
|
"action": "accept",
|
|
"src": [
|
|
"*"
|
|
],
|
|
"dst": [
|
|
"router:8000"
|
|
]
|
|
},
|
|
{
|
|
"action": "accept",
|
|
"src": [
|
|
"node"
|
|
],
|
|
"dst": [
|
|
"*:*"
|
|
]
|
|
}
|
|
]
|
|
}`
|
|
|
|
route, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("webservice: %s, %s", webip.String(), weburl)
|
|
|
|
aclPolicy := &policyv2.Policy{}
|
|
err = json.Unmarshal([]byte(aclPolicyStr), aclPolicy)
|
|
require.NoError(t, err)
|
|
|
|
err = scenario.CreateHeadscaleEnv([]tsic.Option{
|
|
tsic.WithAcceptRoutes(),
|
|
}, hsic.WithTestName("routeaclfilter"),
|
|
hsic.WithACLPolicy(aclPolicy),
|
|
hsic.WithPolicyMode(types.PolicyModeDB), // test updates policy at runtime via CLI
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
// Get the router and node clients by user
|
|
routerClients, err := scenario.ListTailscaleClients(routerUser)
|
|
require.NoError(t, err)
|
|
require.Len(t, routerClients, 1)
|
|
routerClient := routerClients[0]
|
|
|
|
nodeClients, err := scenario.ListTailscaleClients(nodeUser)
|
|
require.NoError(t, err)
|
|
require.Len(t, nodeClients, 1)
|
|
nodeClient := nodeClients[0]
|
|
|
|
routerIP, err := routerClient.IPv4()
|
|
require.NoError(t, err, "failed to get router IPv4")
|
|
nodeIP, err := nodeClient.IPv4()
|
|
require.NoError(t, err, "failed to get node IPv4")
|
|
|
|
aclPolicy.Hosts = policyv2.Hosts{
|
|
policyv2.Host(routerUser): policyv2.Prefix(must.Get(routerIP.Prefix(32))),
|
|
policyv2.Host(nodeUser): policyv2.Prefix(must.Get(nodeIP.Prefix(32))),
|
|
}
|
|
aclPolicy.ACLs[1].Destinations = []policyv2.AliasWithPorts{
|
|
aliasWithPorts(prefixp(route.String()), tailcfg.PortRangeAny),
|
|
}
|
|
require.NoError(t, headscale.SetPolicy(aclPolicy))
|
|
|
|
// Set up the subnet routes for the router
|
|
routes := []netip.Prefix{
|
|
*route, // This should be accessible by the client
|
|
netip.MustParsePrefix("10.10.11.0/24"), // These should NOT be accessible
|
|
netip.MustParsePrefix("10.10.12.0/24"),
|
|
}
|
|
|
|
routeArg := "--advertise-routes=" + routes[0].String() + "," + routes[1].String() + "," + routes[2].String()
|
|
command := []string{
|
|
"tailscale",
|
|
"set",
|
|
routeArg,
|
|
}
|
|
|
|
_, _, err = routerClient.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise routes: %s", err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var routerNode, nodeNode *v1.Node
|
|
// Wait for route advertisements to propagate to NodeStore
|
|
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
|
|
// List nodes and verify the router has 3 available routes
|
|
nodes, err := headscale.NodesByUser()
|
|
assert.NoError(ct, err)
|
|
assert.Len(ct, nodes, 2)
|
|
|
|
// Find the router node
|
|
routerNode = nodes[routerUser][0]
|
|
nodeNode = nodes[nodeUser][0]
|
|
|
|
assert.NotNil(ct, routerNode, "Router node not found")
|
|
assert.NotNil(ct, nodeNode, "Client node not found")
|
|
|
|
// Check that the router has 3 routes available but not approved yet
|
|
requireNodeRouteCountWithCollect(ct, routerNode, 3, 0, 0)
|
|
requireNodeRouteCountWithCollect(ct, nodeNode, 0, 0, 0)
|
|
}, integrationutil.ScaledTimeout(10*time.Second), 100*time.Millisecond, "route advertisements should propagate to router node")
|
|
|
|
// Approve all routes for the router
|
|
_, err = headscale.ApproveRoutes(
|
|
routerNode.GetId(),
|
|
util.MustStringsToPrefixes(routerNode.GetAvailableRoutes()),
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for route state changes to propagate
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
// List nodes and verify the router has 3 available routes
|
|
var err error
|
|
|
|
nodes, err := headscale.NodesByUser()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 2)
|
|
|
|
// Find the router node
|
|
routerNode = nodes[routerUser][0]
|
|
|
|
// Check that the router has 3 routes now approved and available
|
|
requireNodeRouteCountWithCollect(c, routerNode, 3, 3, 3)
|
|
}, integrationutil.ScaledTimeout(15*time.Second), integrationutil.SlowPoll, "route state changes should propagate")
|
|
|
|
// Now check the client node status
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodeStatus, err := nodeClient.Status()
|
|
assert.NoError(c, err)
|
|
|
|
routerStatus, err := routerClient.Status()
|
|
assert.NoError(c, err)
|
|
|
|
// Check that the node can see the subnet routes from the router
|
|
routerPeerStatus := nodeStatus.Peer[routerStatus.Self.PublicKey]
|
|
|
|
// The node should only have 1 subnet route
|
|
requirePeerSubnetRoutesWithCollect(c, routerPeerStatus, []netip.Prefix{*route})
|
|
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying node sees filtered subnet routes")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, nodeClient, weburl, "Verifying node can reach webservice through allowed route")
|
|
}, integrationutil.HAConvergeTimeout, integrationutil.FastPoll, "Verifying node can reach webservice through allowed route")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := nodeClient.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := routerClient.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for routerClient") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, integrationutil.HAConvergeTimeout, integrationutil.FastPoll, "Verifying traceroute goes through router")
|
|
}
|
|
|
|
// TestGrantViaSubnetSteering validates that via grants steer different source
|
|
// groups through different tagged subnet routers to the same destination.
|
|
// Per Tailscale docs, via enables traffic steering: routing specific source
|
|
// groups through specific tagged intermediate nodes (subnet routers).
|
|
func TestGrantViaSubnetSteering(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
assertTimeout := 60 * time.Second
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 0,
|
|
Users: []string{"router", "client"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"router"}},
|
|
"usernet2": {Users: []string{"client"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
defer scenario.ShutdownAssertNoPanics(t)
|
|
|
|
// Get the subnet for usernet1 before creating headscale
|
|
// (needed for policy construction).
|
|
route, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
pol := &policyv2.Policy{
|
|
TagOwners: policyv2.TagOwners{
|
|
policyv2.Tag("tag:router-a"): policyv2.Owners{usernameOwner("router@")},
|
|
policyv2.Tag("tag:router-b"): policyv2.Owners{usernameOwner("router@")},
|
|
policyv2.Tag("tag:group-a"): policyv2.Owners{usernameOwner("client@")},
|
|
policyv2.Tag("tag:group-b"): policyv2.Owners{usernameOwner("client@")},
|
|
},
|
|
Grants: []policyv2.Grant{
|
|
// Allow all tagged nodes to communicate with each other (peer connectivity).
|
|
// Uses tag-based src/dst to avoid creating rules for the subnet prefix,
|
|
// so only via grants control subnet route visibility.
|
|
{
|
|
Sources: policyv2.Aliases{
|
|
tagp("tag:router-a"), tagp("tag:router-b"),
|
|
tagp("tag:group-a"), tagp("tag:group-b"),
|
|
},
|
|
Destinations: policyv2.Aliases{
|
|
tagp("tag:router-a"), tagp("tag:router-b"),
|
|
tagp("tag:group-a"), tagp("tag:group-b"),
|
|
},
|
|
InternetProtocols: []policyv2.ProtocolPort{
|
|
{Protocol: "*", Ports: []tailcfg.PortRange{tailcfg.PortRangeAny}},
|
|
},
|
|
},
|
|
// Via grant: steer tag:group-a traffic to usernet1 subnet through tag:router-a.
|
|
{
|
|
Sources: policyv2.Aliases{tagp("tag:group-a")},
|
|
Destinations: policyv2.Aliases{prefixp(route.String())},
|
|
InternetProtocols: []policyv2.ProtocolPort{
|
|
{Protocol: "*", Ports: []tailcfg.PortRange{tailcfg.PortRangeAny}},
|
|
},
|
|
Via: []policyv2.Tag{policyv2.Tag("tag:router-a")},
|
|
},
|
|
// Via grant: steer tag:group-b traffic to usernet1 subnet through tag:router-b.
|
|
{
|
|
Sources: policyv2.Aliases{tagp("tag:group-b")},
|
|
Destinations: policyv2.Aliases{prefixp(route.String())},
|
|
InternetProtocols: []policyv2.ProtocolPort{
|
|
{Protocol: "*", Ports: []tailcfg.PortRange{tailcfg.PortRangeAny}},
|
|
},
|
|
Via: []policyv2.Tag{policyv2.Tag("tag:router-b")},
|
|
},
|
|
},
|
|
AutoApprovers: policyv2.AutoApproverPolicy{
|
|
Routes: map[netip.Prefix]policyv2.AutoApprovers{
|
|
*route: {tagApprover("tag:router-a"), tagApprover("tag:router-b")},
|
|
},
|
|
},
|
|
}
|
|
|
|
headscale, err := scenario.Headscale(
|
|
hsic.WithTestName("grantvia-subnet"),
|
|
hsic.WithACLPolicy(pol),
|
|
hsic.WithPolicyMode(types.PolicyModeDB),
|
|
)
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
usernet2, err := scenario.Network("usernet2")
|
|
require.NoError(t, err)
|
|
|
|
// Create users on headscale server.
|
|
_, err = scenario.CreateUser("router")
|
|
require.NoError(t, err)
|
|
_, err = scenario.CreateUser("client")
|
|
require.NoError(t, err)
|
|
|
|
userMap, err := headscale.MapUsers()
|
|
require.NoError(t, err)
|
|
|
|
// Create Router A (tag:router-a) on usernet1.
|
|
// Routers advertise routes but must NOT accept peer routes — with
|
|
// co-router visibility the HA primary's subnet appears in the
|
|
// co-router's AllowedIPs, and --accept-routes would install a
|
|
// system route that conflicts with local subnet forwarding.
|
|
routerA, err := scenario.CreateTailscaleNode("head",
|
|
tsic.WithNetwork(usernet1),
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
defer func() { _, _, _ = routerA.Shutdown() }()
|
|
|
|
pakRouterA, err := scenario.CreatePreAuthKeyWithTags(
|
|
userMap["router"].GetId(), false, false, []string{"tag:router-a"},
|
|
)
|
|
require.NoError(t, err)
|
|
err = routerA.Login(headscale.GetEndpoint(), pakRouterA.GetKey())
|
|
require.NoError(t, err)
|
|
err = routerA.WaitForRunning(30 * time.Second)
|
|
require.NoError(t, err)
|
|
|
|
// Create Router B (tag:router-b) on usernet1.
|
|
routerB, err := scenario.CreateTailscaleNode("head",
|
|
tsic.WithNetwork(usernet1),
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
defer func() { _, _, _ = routerB.Shutdown() }()
|
|
|
|
pakRouterB, err := scenario.CreatePreAuthKeyWithTags(
|
|
userMap["router"].GetId(), false, false, []string{"tag:router-b"},
|
|
)
|
|
require.NoError(t, err)
|
|
err = routerB.Login(headscale.GetEndpoint(), pakRouterB.GetKey())
|
|
require.NoError(t, err)
|
|
err = routerB.WaitForRunning(30 * time.Second)
|
|
require.NoError(t, err)
|
|
|
|
// Create Client A (tag:group-a) on usernet2.
|
|
clientA, err := scenario.CreateTailscaleNode("head",
|
|
tsic.WithNetwork(usernet2),
|
|
tsic.WithAcceptRoutes(),
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
defer func() { _, _, _ = clientA.Shutdown() }()
|
|
|
|
pakClientA, err := scenario.CreatePreAuthKeyWithTags(
|
|
userMap["client"].GetId(), false, false, []string{"tag:group-a"},
|
|
)
|
|
require.NoError(t, err)
|
|
err = clientA.Login(headscale.GetEndpoint(), pakClientA.GetKey())
|
|
require.NoError(t, err)
|
|
err = clientA.WaitForRunning(30 * time.Second)
|
|
require.NoError(t, err)
|
|
|
|
// Create Client B (tag:group-b) on usernet2.
|
|
clientB, err := scenario.CreateTailscaleNode("head",
|
|
tsic.WithNetwork(usernet2),
|
|
tsic.WithAcceptRoutes(),
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
defer func() { _, _, _ = clientB.Shutdown() }()
|
|
|
|
pakClientB, err := scenario.CreatePreAuthKeyWithTags(
|
|
userMap["client"].GetId(), false, false, []string{"tag:group-b"},
|
|
)
|
|
require.NoError(t, err)
|
|
err = clientB.Login(headscale.GetEndpoint(), pakClientB.GetKey())
|
|
require.NoError(t, err)
|
|
err = clientB.WaitForRunning(30 * time.Second)
|
|
require.NoError(t, err)
|
|
|
|
// Wait for all peers to see each other (4 nodes, each sees 3 peers).
|
|
allNodes := []TailscaleClient{routerA, routerB, clientA, clientB}
|
|
for _, node := range allNodes {
|
|
err = node.WaitForPeers(len(allNodes)-1, 60*time.Second, 1*time.Second)
|
|
require.NoErrorf(t, err, "node %s failed to see all peers", node.Hostname())
|
|
}
|
|
|
|
// Both routers advertise usernet1 subnet.
|
|
for _, router := range []TailscaleClient{routerA, routerB} {
|
|
command := []string{
|
|
"tailscale", "set",
|
|
"--advertise-routes=" + route.String(),
|
|
}
|
|
_, _, err = router.Execute(command)
|
|
require.NoErrorf(t, err, "failed to advertise route on %s", router.Hostname())
|
|
}
|
|
|
|
// Wait for auto-approval on both routers.
|
|
// Only check announced and approved counts. SubnetRoutes (primary election)
|
|
// is a global single-primary-per-prefix model, so only one router wins.
|
|
// Via steering should override this per-client, which is what we test below.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err := headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
|
|
routerANode := MustFindNode(routerA.Hostname(), nodes)
|
|
t.Logf("Router A %s: announced=%v, approved=%v, subnet=%v",
|
|
routerANode.GetName(),
|
|
routerANode.GetAvailableRoutes(),
|
|
routerANode.GetApprovedRoutes(),
|
|
routerANode.GetSubnetRoutes())
|
|
assert.Len(c, routerANode.GetAvailableRoutes(), 1, "Router A should have 1 announced route")
|
|
assert.Len(c, routerANode.GetApprovedRoutes(), 1, "Router A should have 1 approved route")
|
|
|
|
routerBNode := MustFindNode(routerB.Hostname(), nodes)
|
|
t.Logf("Router B %s: announced=%v, approved=%v, subnet=%v",
|
|
routerBNode.GetName(),
|
|
routerBNode.GetAvailableRoutes(),
|
|
routerBNode.GetApprovedRoutes(),
|
|
routerBNode.GetSubnetRoutes())
|
|
assert.Len(c, routerBNode.GetAvailableRoutes(), 1, "Router B should have 1 announced route")
|
|
assert.Len(c, routerBNode.GetApprovedRoutes(), 1, "Router B should have 1 approved route")
|
|
}, assertTimeout, 500*time.Millisecond, "Both routers should have auto-approved routes")
|
|
|
|
// Get webservice info.
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
t.Logf("webservice: %s, %s", webip.String(), weburl)
|
|
|
|
// Verify Client A sees only Router A's subnet route (via steering).
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := clientA.Status()
|
|
assert.NoError(c, err)
|
|
|
|
routerAID := routerA.MustID()
|
|
routerBID := routerB.MustID()
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
switch peerStatus.ID {
|
|
case routerAID.StableID():
|
|
// Client A should see Router A's subnet route.
|
|
t.Logf("Client A sees Router A: AllowedIPs=%v, PrimaryRoutes=%v",
|
|
peerStatus.AllowedIPs, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
case routerBID.StableID():
|
|
// Client A should NOT see Router B's subnet route.
|
|
t.Logf("Client A sees Router B: AllowedIPs=%v, PrimaryRoutes=%v",
|
|
peerStatus.AllowedIPs, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
}
|
|
}
|
|
}, assertTimeout, 500*time.Millisecond, "Client A should see only Router A's subnet route")
|
|
|
|
// Verify Client B sees only Router B's subnet route (via steering).
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
status, err := clientB.Status()
|
|
assert.NoError(c, err)
|
|
|
|
routerAID := routerA.MustID()
|
|
routerBID := routerB.MustID()
|
|
|
|
for _, peerKey := range status.Peers() {
|
|
peerStatus := status.Peer[peerKey]
|
|
|
|
switch peerStatus.ID {
|
|
case routerAID.StableID():
|
|
// Client B should NOT see Router A's subnet route.
|
|
t.Logf("Client B sees Router A: AllowedIPs=%v, PrimaryRoutes=%v",
|
|
peerStatus.AllowedIPs, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
|
|
case routerBID.StableID():
|
|
// Client B should see Router B's subnet route.
|
|
t.Logf("Client B sees Router B: AllowedIPs=%v, PrimaryRoutes=%v",
|
|
peerStatus.AllowedIPs, peerStatus.PrimaryRoutes)
|
|
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*route})
|
|
}
|
|
}
|
|
}, assertTimeout, 500*time.Millisecond, "Client B should see only Router B's subnet route")
|
|
|
|
// Verify Client A can reach the webservice.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, clientA, weburl, "Client A should reach webservice")
|
|
}, assertTimeout, 200*time.Millisecond, "Client A should reach webservice")
|
|
|
|
// Verify Client B can reach the webservice.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, clientB, weburl, "Client B should reach webservice")
|
|
}, assertTimeout, 200*time.Millisecond, "Client B should reach webservice")
|
|
|
|
// Verify Client A's traffic goes through Router A.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := clientA.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := routerA.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for routerA") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, assertTimeout, 200*time.Millisecond, "Client A traceroute should go through Router A")
|
|
|
|
// Verify Client B's traffic goes through Router B.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := clientB.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := routerB.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for routerB") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, assertTimeout, 200*time.Millisecond, "Client B traceroute should go through Router B")
|
|
}
|
|
|
|
// TestHASubnetRouterPingFailover tests HA failover triggered by the
|
|
// health prober rather than by a full disconnect. The primary router
|
|
// stays connected (Noise session alive) but its ping callback is
|
|
// blocked via iptables, so the prober marks it unhealthy and fails
|
|
// over to the next available router.
|
|
func TestHASubnetRouterPingFailover(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
propagationTime := integrationutil.HAConvergeTimeout
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{
|
|
tsic.WithAcceptRoutes(),
|
|
tsic.WithPackages("iptables"),
|
|
},
|
|
hsic.WithTestName("rt-hapingfail"),
|
|
hsic.WithHAProbing(10*time.Second, 5*time.Second),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
pref := *prefp
|
|
t.Logf("usernet1 prefix: %s", pref.String())
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
|
|
})
|
|
|
|
subRouter1 := allClients[0]
|
|
subRouter2 := allClients[1]
|
|
client := allClients[2]
|
|
|
|
t.Logf("Router 1: %s, Router 2: %s, Client: %s",
|
|
subRouter1.Hostname(), subRouter2.Hostname(), client.Hostname())
|
|
|
|
// Advertise same route on both routers.
|
|
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
|
|
_, _, err = r.Execute([]string{
|
|
"tailscale", "set", "--advertise-routes=" + pref.String(),
|
|
})
|
|
require.NoError(t, err)
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 4)
|
|
}, propagationTime, 200*time.Millisecond)
|
|
|
|
// Approve routes on both routers.
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
|
|
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
|
|
|
|
// Wait for HA to be set up: router 1 primary, router 2 standby.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID1,
|
|
}, pr.PrimaryRoutes, "router 1 should be primary")
|
|
|
|
assert.Contains(c, pr.AvailableRoutes, nodeID1)
|
|
assert.Contains(c, pr.AvailableRoutes, nodeID2)
|
|
}, propagationTime, 200*time.Millisecond, "waiting for HA setup")
|
|
|
|
// Verify connectivity through router 1.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "client should reach webservice through router 1")
|
|
}, propagationTime, 200*time.Millisecond, "client should reach webservice through router 1")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter1.IPv4()
|
|
if !assert.NoError(c, err) {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "traceroute should go through router 1")
|
|
|
|
t.Log("=== HA setup verified. Blocking ping callbacks on router 1 via iptables ===")
|
|
|
|
// Block NEW outbound TCP from router 1 to headscale.
|
|
// Preserves the existing Noise HTTP/2 long-poll (ESTABLISHED).
|
|
hsIP := headscale.GetIPInNetwork(usernet1)
|
|
iptablesAdd := []string{
|
|
"iptables", "-A", "OUTPUT",
|
|
"-d", hsIP,
|
|
"-p", "tcp", "--dport", "8080",
|
|
"-m", "state", "--state", "NEW",
|
|
"-j", "DROP",
|
|
}
|
|
|
|
_, _, err = subRouter1.Execute(iptablesAdd)
|
|
require.NoError(t, err, "failed to add iptables rule")
|
|
|
|
t.Logf("Blocked new TCP connections from %s to headscale at %s:8080",
|
|
subRouter1.Hostname(), hsIP)
|
|
|
|
// Wait for the prober to detect the failure and trigger failover.
|
|
// Probe interval=10s, timeout=5s → failover within ~15s.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID2,
|
|
}, pr.PrimaryRoutes, "router 2 should be primary after ping failover")
|
|
|
|
assert.Contains(c, pr.UnhealthyNodes, nodeID1,
|
|
"router 1 should be marked unhealthy")
|
|
|
|
// Router 1 still in available routes (still connected, just unhealthy).
|
|
assert.Contains(c, pr.AvailableRoutes, nodeID1)
|
|
assert.Contains(c, pr.AvailableRoutes, nodeID2)
|
|
}, propagationTime, 1*time.Second, "waiting for ping-based failover")
|
|
|
|
t.Log("Failover detected. Verifying connectivity through router 2.")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "client should reach webservice through router 2")
|
|
}, propagationTime, 200*time.Millisecond, "client should reach webservice through router 2")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter2.IPv4()
|
|
if !assert.NoError(c, err) {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "traceroute should go through router 2")
|
|
|
|
t.Log("=== Recovery: removing iptables block on router 1 ===")
|
|
|
|
iptablesDel := []string{
|
|
"iptables", "-D", "OUTPUT",
|
|
"-d", hsIP,
|
|
"-p", "tcp", "--dport", "8080",
|
|
"-m", "state", "--state", "NEW",
|
|
"-j", "DROP",
|
|
}
|
|
|
|
_, _, err = subRouter1.Execute(iptablesDel)
|
|
require.NoError(t, err, "failed to remove iptables rule")
|
|
|
|
// Wait for the prober to detect recovery.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
|
|
// Router 1 should be healthy again but NOT primary (no flapping).
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID2,
|
|
}, pr.PrimaryRoutes, "router 2 should remain primary (no flapping)")
|
|
|
|
assert.Empty(c, pr.UnhealthyNodes,
|
|
"no nodes should be unhealthy after recovery")
|
|
}, propagationTime, 1*time.Second, "waiting for recovery without flapping")
|
|
|
|
// Traffic should still go through router 2 (stability).
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter2.IPv4()
|
|
if !assert.NoError(c, err) {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "traceroute should still go through router 2 after recovery")
|
|
}
|
|
|
|
// TestHASubnetRouterFailoverBothOffline reproduces issue #3203:
|
|
// HA tracking loses the secondary subnet router after all routers serving
|
|
// the route have been offline simultaneously and one of them returns.
|
|
// See https://github.com/juanfont/headscale/issues/3203.
|
|
//
|
|
// Existing TestHASubnetRouterFailover keeps subRouter3 online across both
|
|
// failover steps, so the all-offline transition is uncovered. This test
|
|
// uses two routers and walks them both offline before bringing r2 back.
|
|
//
|
|
// Two assertion sets split the failure surface:
|
|
// - R1: server-side primary route table restores after reconnect.
|
|
// If R1 fails, the bug is in state.Connect / primaryRoutes.
|
|
// - R2: client's view shows r2 online with the route in PrimaryRoutes.
|
|
// If R1 passes and R2 fails, the bug is in change broadcast /
|
|
// mapBatcher / multiChannelNodeConn.
|
|
func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
propagationTime := integrationutil.HAConvergeTimeout
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{tsic.WithAcceptRoutes()},
|
|
hsic.WithTestName("rt-haboth"),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
pref := *prefp
|
|
t.Logf("usernet1 prefix: %s", pref.String())
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
|
|
})
|
|
|
|
subRouter1 := allClients[0]
|
|
subRouter2 := allClients[1]
|
|
client := allClients[2]
|
|
|
|
t.Logf("Router 1: %s, Router 2: %s, Client: %s",
|
|
subRouter1.Hostname(), subRouter2.Hostname(), client.Hostname())
|
|
|
|
// Advertise the same route on both routers.
|
|
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
|
|
_, _, err = r.Execute([]string{
|
|
"tailscale", "set", "--advertise-routes=" + pref.String(),
|
|
})
|
|
require.NoErrorf(t, err, "failed to advertise route on %s", r.Hostname())
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 4)
|
|
}, propagationTime, 200*time.Millisecond, "nodes should be registered")
|
|
|
|
// Approve the route on both routers explicitly.
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
|
|
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
|
|
|
|
// Sanity: r1 starts as primary (lower NodeID).
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID1,
|
|
}, pr.PrimaryRoutes, "router 1 should be primary initially")
|
|
}, propagationTime, 200*time.Millisecond, "waiting for HA setup")
|
|
|
|
// Confirm initial connectivity through r1.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r1")
|
|
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r1")
|
|
|
|
t.Log("=== Step 1: r1 goes offline. r2 should take over. ===")
|
|
|
|
require.NoError(t, subRouter1.Down())
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID2,
|
|
}, pr.PrimaryRoutes, "r2 should be primary after r1 offline")
|
|
}, propagationTime, 500*time.Millisecond, "waiting for failover to r2")
|
|
|
|
t.Log("=== Step 2: r2 also goes offline. No primary should remain. ===")
|
|
|
|
require.NoError(t, subRouter2.Down())
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Empty(c, pr.PrimaryRoutes,
|
|
"no primary should be assigned while both routers are offline")
|
|
}, propagationTime, 500*time.Millisecond, "waiting for both routers to be offline")
|
|
|
|
t.Log("=== Step 3: r2 returns. ===")
|
|
t.Log(" R1: server-side primary route state must restore r2 as primary.")
|
|
t.Log(" R2: client must observe r2 online with the route in PrimaryRoutes.")
|
|
|
|
require.NoError(t, subRouter2.Up())
|
|
|
|
// R1 — server side.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID2,
|
|
}, pr.PrimaryRoutes,
|
|
"R1: r2 should be re-registered as primary after reconnect — issue #3203")
|
|
}, propagationTime, 500*time.Millisecond, "R1: waiting for server-side primary restore")
|
|
|
|
// R2 — client view.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs2 := subRouter2.MustStatus()
|
|
|
|
peer := clientStatus.Peer[srs2.Self.PublicKey]
|
|
if !assert.NotNil(c, peer, "r2 peer should be in client status") {
|
|
return
|
|
}
|
|
|
|
assert.True(c, peer.Online,
|
|
"R2: client should see r2 online after reconnect — issue #3203")
|
|
|
|
if assert.NotNil(c, peer.PrimaryRoutes,
|
|
"R2: r2 should have PrimaryRoutes set in client status") {
|
|
assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref,
|
|
"R2: client's view of r2 should include the route as primary")
|
|
}
|
|
|
|
requirePeerSubnetRoutesWithCollect(c, peer, []netip.Prefix{pref})
|
|
}, propagationTime, 500*time.Millisecond, "R2: waiting for client to see r2 with primary route")
|
|
|
|
// End-to-end traffic should reach the webservice via r2.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r2 after recovery")
|
|
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r2 after recovery")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
tr, err := client.Traceroute(webip)
|
|
assert.NoError(c, err)
|
|
|
|
ip, err := subRouter2.IPv4()
|
|
if !assert.NoError(c, err, "failed to get IPv4 for r2") {
|
|
return
|
|
}
|
|
|
|
assertTracerouteViaIPWithCollect(c, tr, ip)
|
|
}, propagationTime, 200*time.Millisecond, "traceroute should go through r2 after recovery")
|
|
}
|
|
|
|
// TestHASubnetRouterFailoverBothOfflineCablePull is a stricter variant of
|
|
// TestHASubnetRouterFailoverBothOffline that simulates a cable pull rather
|
|
// than a graceful tailscale down. The two differ in what the server sees:
|
|
//
|
|
// - tailscale down: poll connection closes cleanly; defer fires
|
|
// immediately; grace period starts and ends predictably.
|
|
// - cable pull: server's noise long-poll is wedged in a half-open TCP
|
|
// connection until kernel keepalives time out (often >60 s). When
|
|
// the cable returns, two server-side longpoll sessions can overlap.
|
|
//
|
|
// This variant blocks all traffic between the router container and
|
|
// headscale via iptables and then removes the block to mimic the
|
|
// cable-pull behaviour.
|
|
func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
propagationTime := integrationutil.HASlowConvergeTimeout
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{
|
|
tsic.WithAcceptRoutes(),
|
|
tsic.WithPackages("iptables"),
|
|
},
|
|
hsic.WithTestName("rt-hacable"),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
pref := *prefp
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
|
|
})
|
|
|
|
subRouter1 := allClients[0]
|
|
subRouter2 := allClients[1]
|
|
client := allClients[2]
|
|
|
|
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
|
|
_, _, err = r.Execute([]string{
|
|
"tailscale", "set", "--advertise-routes=" + pref.String(),
|
|
})
|
|
require.NoErrorf(t, err, "advertise route on %s", r.Hostname())
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 4)
|
|
}, propagationTime, 200*time.Millisecond, "nodes registered")
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
|
|
|
|
// Sanity: r1 starts as primary.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.NotEmpty(c, pr.PrimaryRoutes, "a primary should exist")
|
|
}, propagationTime, 200*time.Millisecond, "HA setup")
|
|
|
|
hsIP := headscale.GetIPInNetwork(usernet1)
|
|
|
|
// "Cable pull" — drop all traffic in BOTH directions to/from headscale.
|
|
// Unlike the NEW-state-only filter used by TestHASubnetRouterPingFailover,
|
|
// this also breaks the existing ESTABLISHED long-poll, mimicking a
|
|
// physically severed link.
|
|
cablePull := func(r TailscaleClient) {
|
|
t.Helper()
|
|
|
|
for _, chain := range []string{"OUTPUT", "INPUT"} {
|
|
_, _, err := r.Execute([]string{
|
|
"iptables", "-A", chain,
|
|
"-d", hsIP, "-j", "DROP",
|
|
})
|
|
require.NoErrorf(t, err, "iptables -A %s on %s", chain, r.Hostname())
|
|
_, _, err = r.Execute([]string{
|
|
"iptables", "-A", chain,
|
|
"-s", hsIP, "-j", "DROP",
|
|
})
|
|
require.NoErrorf(t, err, "iptables -A %s -s on %s", chain, r.Hostname())
|
|
}
|
|
}
|
|
|
|
cableReplug := func(r TailscaleClient) {
|
|
t.Helper()
|
|
|
|
for _, chain := range []string{"OUTPUT", "INPUT"} {
|
|
_, _, err := r.Execute([]string{
|
|
"iptables", "-D", chain,
|
|
"-d", hsIP, "-j", "DROP",
|
|
})
|
|
require.NoErrorf(t, err, "iptables -D %s on %s", chain, r.Hostname())
|
|
_, _, err = r.Execute([]string{
|
|
"iptables", "-D", chain,
|
|
"-s", hsIP, "-j", "DROP",
|
|
})
|
|
require.NoErrorf(t, err, "iptables -D %s -s on %s", chain, r.Hostname())
|
|
}
|
|
}
|
|
|
|
t.Log("=== Cable-pull r1. Server should eventually fail r1 over to r2. ===")
|
|
cablePull(subRouter1)
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID2,
|
|
}, pr.PrimaryRoutes, "r2 should become primary after r1 cable pull")
|
|
}, propagationTime, 1*time.Second, "waiting for r2 promotion")
|
|
|
|
t.Log("=== Cable-pull r2 while r1 is still cable-pulled. ===")
|
|
cablePull(subRouter2)
|
|
|
|
// Some primary may transiently flip back to r1 (offline) here — see the
|
|
// user's "failover to n1 (offline)" observation in the issue. We do not
|
|
// assert on that intermediate state; we just assert recovery below.
|
|
|
|
t.Log("=== Reconnect r2 (cable plugged back in). ===")
|
|
cableReplug(subRouter2)
|
|
|
|
// R1 — server side primary table should restore r2 as primary.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): nodeID2,
|
|
}, pr.PrimaryRoutes,
|
|
"R1: r2 should be re-registered as primary after cable replug — issue #3203")
|
|
}, propagationTime, 1*time.Second, "R1: waiting for r2 to be primary again")
|
|
|
|
// R2 — client should observe r2 online with the route.
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
clientStatus, err := client.Status()
|
|
assert.NoError(c, err)
|
|
|
|
srs2 := subRouter2.MustStatus()
|
|
|
|
peer := clientStatus.Peer[srs2.Self.PublicKey]
|
|
if !assert.NotNil(c, peer) {
|
|
return
|
|
}
|
|
|
|
assert.True(c, peer.Online,
|
|
"R2: client should see r2 online — issue #3203")
|
|
|
|
if assert.NotNil(c, peer.PrimaryRoutes) {
|
|
assert.Contains(c, peer.PrimaryRoutes.AsSlice(), pref)
|
|
}
|
|
}, propagationTime, 1*time.Second, "R2: waiting for client to see r2")
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r2 after recovery")
|
|
}, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery")
|
|
}
|
|
|
|
// TestHASubnetRouterFailoverDockerDisconnect drives a multi-phase
|
|
// up/down/up/down lifecycle of two HA subnet routers using real
|
|
// docker network disconnects — the same failure primitive nblock
|
|
// observed when pulling a Proxmox interface in issue #3203.
|
|
// iptables-based simulations cannot reproduce this because the
|
|
// container's kernel still owns the socket; only daemon-level
|
|
// disconnect leaves the long-poll TCP half-open at the peer.
|
|
//
|
|
// Phases:
|
|
// 1. r1 starts as primary (lowest NodeID).
|
|
// 2. r1 alone fails and recovers — failover to r2, then traffic
|
|
// resumes when r1 returns.
|
|
// 3. r2 alone fails and recovers — failover, then traffic resumes.
|
|
// 4. Sequential dual failure — the issue #3203 bug.
|
|
// 4a. r1 down → r2 promoted.
|
|
// 4b. r2 down → primary must NOT flap to offline r1.
|
|
// 4c. r2 up → r2 primary again, traffic resumes.
|
|
// 5. Simultaneous dual failure.
|
|
// 5a. r1 + r2 down → primary must NOT flap to offline r1.
|
|
// 5b. both up → primary stays r2, traffic resumes.
|
|
//
|
|
// The no-flap assertions in 4b and 5a are the regression barriers
|
|
// for #3203. Phases 2/3 are functional checks (failover works,
|
|
// traffic recovers) without strict identity assertions on the
|
|
// "return" leg, since `docker network disconnect` triggers bridge
|
|
// reconfiguration that can transiently affect probing of OTHER
|
|
// containers on the same network — a test-infrastructure quirk
|
|
// that does not occur with a real cable pull.
|
|
func TestHASubnetRouterFailoverDockerDisconnect(t *testing.T) {
|
|
IntegrationSkip(t)
|
|
|
|
propagationTime := integrationutil.HASlowConvergeTimeout
|
|
flapWindow := integrationutil.ScaledTimeout(40 * time.Second)
|
|
|
|
spec := ScenarioSpec{
|
|
NodesPerUser: 2,
|
|
Users: []string{"user1", "user2"},
|
|
Networks: map[string]NetworkSpec{
|
|
"usernet1": {Users: []string{"user1"}},
|
|
"usernet2": {Users: []string{"user2"}},
|
|
},
|
|
ExtraService: map[string][]extraServiceFunc{
|
|
"usernet1": {Webservice},
|
|
},
|
|
Versions: []string{"head"},
|
|
}
|
|
|
|
scenario, err := NewScenario(spec)
|
|
require.NoErrorf(t, err, "failed to create scenario: %s", err)
|
|
|
|
err = scenario.CreateHeadscaleEnv(
|
|
[]tsic.Option{tsic.WithAcceptRoutes()},
|
|
hsic.WithTestName("rt-hadocker"),
|
|
)
|
|
requireNoErrHeadscaleEnv(t, err)
|
|
|
|
allClients, err := scenario.ListTailscaleClients()
|
|
requireNoErrListClients(t, err)
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
headscale, err := scenario.Headscale()
|
|
requireNoErrGetHeadscale(t, err)
|
|
|
|
prefp, err := scenario.SubnetOfNetwork("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
pref := *prefp
|
|
|
|
usernet1, err := scenario.Network("usernet1")
|
|
require.NoError(t, err)
|
|
|
|
services, err := scenario.Services("usernet1")
|
|
require.NoError(t, err)
|
|
require.Len(t, services, 1)
|
|
|
|
web := services[0]
|
|
webip := netip.MustParseAddr(web.GetIPInNetwork(usernet1))
|
|
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
|
|
|
|
sort.SliceStable(allClients, func(i, j int) bool {
|
|
return allClients[i].MustStatus().Self.ID < allClients[j].MustStatus().Self.ID
|
|
})
|
|
|
|
subRouter1 := allClients[0]
|
|
subRouter2 := allClients[1]
|
|
client := allClients[2]
|
|
|
|
for _, r := range []TailscaleClient{subRouter1, subRouter2} {
|
|
_, _, err = r.Execute([]string{
|
|
"tailscale", "set", "--advertise-routes=" + pref.String(),
|
|
})
|
|
require.NoErrorf(t, err, "advertise route on %s", r.Hostname())
|
|
}
|
|
|
|
err = scenario.WaitForTailscaleSync()
|
|
requireNoErrSync(t, err)
|
|
|
|
var nodes []*v1.Node
|
|
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
nodes, err = headscale.ListNodes()
|
|
assert.NoError(c, err)
|
|
assert.Len(c, nodes, 4)
|
|
}, propagationTime, 200*time.Millisecond, "nodes registered")
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter1.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
_, err = headscale.ApproveRoutes(
|
|
MustFindNode(subRouter2.Hostname(), nodes).GetId(),
|
|
[]netip.Prefix{pref},
|
|
)
|
|
require.NoError(t, err)
|
|
|
|
nodeID1 := types.NodeID(MustFindNode(subRouter1.Hostname(), nodes).GetId())
|
|
nodeID2 := types.NodeID(MustFindNode(subRouter2.Hostname(), nodes).GetId())
|
|
|
|
// requirePrimary blocks until headscale reports want as the
|
|
// primary advertiser for pref.
|
|
requirePrimary := func(want types.NodeID, msg string) {
|
|
t.Helper()
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
assert.NoError(c, err)
|
|
assert.Equal(c, map[string]types.NodeID{
|
|
pref.String(): want,
|
|
}, pr.PrimaryRoutes, msg)
|
|
}, propagationTime, 1*time.Second, msg)
|
|
}
|
|
|
|
// requireTrafficWorks asserts the client can reach the webservice
|
|
// across the tailnet (i.e. via whichever router is primary).
|
|
requireTrafficWorks := func(msg string) {
|
|
t.Helper()
|
|
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
|
assertCurlDockerHostname(c, client, weburl, msg)
|
|
}, propagationTime, 1*time.Second, msg)
|
|
}
|
|
|
|
// requirePrimaryStable asserts primary == want for the entire
|
|
// window. Catches transient flaps and verifies anti-flap on
|
|
// prev-primary return.
|
|
requirePrimaryStable := func(want types.NodeID, window time.Duration, msg string) {
|
|
t.Helper()
|
|
require.Never(t, func() bool {
|
|
pr, err := headscale.PrimaryRoutes()
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
owner, ok := pr.PrimaryRoutes[pref.String()]
|
|
|
|
return !ok || owner != want
|
|
}, window, 1*time.Second, msg)
|
|
}
|
|
|
|
// ============================================================
|
|
// Phase 1: initial state — r1 (lowest NodeID) is primary.
|
|
// ============================================================
|
|
t.Log("=== Phase 1: initial state — r1 should be primary. ===")
|
|
requirePrimary(nodeID1, "phase 1: r1 primary at start")
|
|
requireTrafficWorks("phase 1: client reaches webservice via r1")
|
|
|
|
// ============================================================
|
|
// Phase 2: r1 alone fails and returns. Failover to r2, traffic
|
|
// resumes; reconnect r1 and verify traffic still flows. We do
|
|
// not assert primary identity across the r1-return leg because
|
|
// docker bridge reconfiguration can transiently fail probes on
|
|
// r2 (real cable pulls do not have this side effect).
|
|
// ============================================================
|
|
t.Log("=== Phase 2a: cable-pull r1, expect failover to r2. ===")
|
|
require.NoError(t, subRouter1.DisconnectFromNetwork(usernet1),
|
|
"phase 2a: docker disconnect r1")
|
|
requirePrimary(nodeID2, "phase 2a: r2 promoted after r1 down")
|
|
requireTrafficWorks("phase 2a: client reaches webservice via r2")
|
|
|
|
t.Log("=== Phase 2b: reconnect r1, traffic should still flow. ===")
|
|
require.NoError(t, subRouter1.ReconnectToNetwork(usernet1),
|
|
"phase 2b: docker reconnect r1")
|
|
requireTrafficWorks("phase 2b: client still reaches webservice")
|
|
|
|
// ============================================================
|
|
// Phase 3: r2 alone fails and returns. Same caveats as phase 2
|
|
// on identity assertions during the return leg.
|
|
// ============================================================
|
|
t.Log("=== Phase 3a: cable-pull r2, traffic should fail over. ===")
|
|
require.NoError(t, subRouter2.DisconnectFromNetwork(usernet1),
|
|
"phase 3a: docker disconnect r2")
|
|
requireTrafficWorks("phase 3a: client reaches webservice via remaining router")
|
|
|
|
t.Log("=== Phase 3b: reconnect r2, traffic should still flow. ===")
|
|
require.NoError(t, subRouter2.ReconnectToNetwork(usernet1),
|
|
"phase 3b: docker reconnect r2")
|
|
requireTrafficWorks("phase 3b: client still reaches webservice")
|
|
|
|
// ============================================================
|
|
// Phase 4: sequential dual failure — the issue #3203 bug. The
|
|
// flap target is r1 because under cable-pull both routers
|
|
// linger as IsOnline=true (half-open TCP), both go Unhealthy,
|
|
// and electPrimaryRoutes' all-unhealthy fallback selects the
|
|
// lowest NodeID regardless of who was prev primary.
|
|
// ============================================================
|
|
t.Log("=== Phase 4a: cable-pull r1, expect failover to r2. ===")
|
|
require.NoError(t, subRouter1.DisconnectFromNetwork(usernet1),
|
|
"phase 4a: docker disconnect r1")
|
|
requirePrimary(nodeID2, "phase 4a: r2 promoted after r1 down")
|
|
|
|
t.Log("=== Phase 4b: cable-pull r2, primary must NOT flap to offline r1. ===")
|
|
require.NoError(t, subRouter2.DisconnectFromNetwork(usernet1),
|
|
"phase 4b: docker disconnect r2")
|
|
requirePrimaryStable(nodeID2, flapWindow,
|
|
"phase 4b: primary must not flap to offline r1 (issue #3203)")
|
|
|
|
t.Log("=== Phase 4c: reconnect r2, r2 should resume as primary. ===")
|
|
require.NoError(t, subRouter2.ReconnectToNetwork(usernet1),
|
|
"phase 4c: docker reconnect r2")
|
|
requirePrimary(nodeID2, "phase 4c: r2 primary after reconnect")
|
|
requireTrafficWorks("phase 4c: client reaches webservice via r2 after recovery")
|
|
|
|
t.Log("=== Phase 4d: reconnect r1, traffic should still flow. ===")
|
|
require.NoError(t, subRouter1.ReconnectToNetwork(usernet1),
|
|
"phase 4d: docker reconnect r1")
|
|
requireTrafficWorks("phase 4d: client still reaches webservice")
|
|
|
|
// ============================================================
|
|
// Phase 5: simultaneous dual failure (whole-segment outage).
|
|
// prev going in is r2 — primary must not flap to offline r1.
|
|
// ============================================================
|
|
t.Log("=== Phase 5a: cable-pull r1 and r2 simultaneously. ===")
|
|
require.NoError(t, subRouter1.DisconnectFromNetwork(usernet1),
|
|
"phase 5a: docker disconnect r1")
|
|
require.NoError(t, subRouter2.DisconnectFromNetwork(usernet1),
|
|
"phase 5a: docker disconnect r2")
|
|
requirePrimaryStable(nodeID2, flapWindow,
|
|
"phase 5a: primary must not flap to offline r1 (issue #3203)")
|
|
|
|
t.Log("=== Phase 5b: reconnect both, r2 should remain primary. ===")
|
|
require.NoError(t, subRouter1.ReconnectToNetwork(usernet1),
|
|
"phase 5b: docker reconnect r1")
|
|
require.NoError(t, subRouter2.ReconnectToNetwork(usernet1),
|
|
"phase 5b: docker reconnect r2")
|
|
requirePrimary(nodeID2, "phase 5b: r2 primary after both reconnect")
|
|
requireTrafficWorks("phase 5b: client reaches webservice via r2")
|
|
}
|