integration: harden ACL test ergonomics

tsic.Curl returned ("", nil) when curl exited 0 with a zero-byte body —
the usual signature of a mid-stream reset — so EventuallyWithT could
not retry. Return an error on empty body instead.

Replace the 56 inline curl + assert.Len(13) blocks with
assertCurlDockerHostname so the empty-body fix benefits every callsite
without further touch-ups.

Gate ACL waits on actual filter visibility: snapshotClientFilters +
waitForClientFilterChange ensure the new PacketFilter has reached the
client before assertions fire; SyncOption + WithPreBarrier feeds a
server-side policy-loaded check into WaitForTailscaleSyncPerUser.

Move advertise-routes mutation out of EventuallyWithT in route_test
(cmd/hi/README forbids retrying state-mutating calls). Pace the
TestNodeOnlineStatus outer loop with a Ticker, not a Sleep.
This commit is contained in:
Kristoffer Dalby
2026-05-13 13:19:49 +00:00
parent 78fd6efb38
commit dfcc96d808
6 changed files with 321 additions and 300 deletions

View File

@@ -325,7 +325,7 @@ func TestACLHostsInNetMapTable(t *testing.T) {
user := status.User[status.Self.UserID].LoginName
assert.Len(c, status.Peer, (testCase.want[user]))
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for expected peer visibility")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for expected peer visibility")
}
})
}
@@ -372,10 +372,8 @@ func TestACLAllowUser80Dst(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2")
assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2")
}
}
@@ -390,7 +388,7 @@ func TestACLAllowUser80Dst(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlFailWithCollect(c, client, url, "user2 should not reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 cannot reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 cannot reach user1")
}
}
}
@@ -437,7 +435,7 @@ func TestACLDenyAllPort80(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlFailWithCollect(c, client, url, "all traffic should be denied")
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying all traffic is denied")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying all traffic is denied")
}
}
}
@@ -481,10 +479,8 @@ func TestACLAllowUserDst(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2")
assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2")
}
}
@@ -499,7 +495,7 @@ func TestACLAllowUserDst(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlFailWithCollect(c, client, url, "user2 should not reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 cannot reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 cannot reach user1")
}
}
}
@@ -542,10 +538,8 @@ func TestACLAllowStarDst(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2")
assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2")
}
}
@@ -560,7 +554,7 @@ func TestACLAllowStarDst(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlFailWithCollect(c, client, url, "user2 should not reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 cannot reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 cannot reach user1")
}
}
}
@@ -608,10 +602,8 @@ func TestACLNamedHostsCanReachBySubnet(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2")
assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2")
}
}
@@ -627,10 +619,8 @@ func TestACLNamedHostsCanReachBySubnet(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 can reach user1")
assertCurlDockerHostname(c, client, url, "Verifying user2 can reach user1")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 can reach user1")
}
}
}
@@ -790,7 +780,7 @@ func TestACLNamedHostsCanReach(t *testing.T) {
test3URL,
result,
)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test1 should reach test3")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test1 should reach test3")
// test2 can query test3 (everyone -> test3)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -804,7 +794,7 @@ func TestACLNamedHostsCanReach(t *testing.T) {
test3URL,
result,
)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test2 should reach test3")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test2 should reach test3")
// test3 cannot query test1
_, err = test3.CurlFailFast(test1URL)
@@ -826,7 +816,7 @@ func TestACLNamedHostsCanReach(t *testing.T) {
test2URL,
result,
)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test1 should reach test2")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test1 should reach test2")
// test2 cannot query test1
_, err = test2.CurlFailFast(test1URL)
@@ -981,12 +971,12 @@ func TestACLDevice1CanAccessDevice2(t *testing.T) {
test2URL,
result,
)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test1 should reach test2")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test1 should reach test2")
// test2 cannot query test1 (asymmetric policy)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlFailWithCollect(c, test2, test1URL, "test2 should not reach test1")
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test2 should NOT reach test1")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test2 should NOT reach test1")
})
}
}
@@ -1047,10 +1037,8 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2")
assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2")
}
}
@@ -1096,7 +1084,7 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) {
if diff := cmp.Diff(p, *output, cmpopts.IgnoreUnexported(policyv2.Policy{}), cmpopts.EquateEmpty()); diff != "" {
ct.Errorf("unexpected policy(-want +got):\n%s", diff)
}
}, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second, "verifying that the new policy took place")
}, integrationutil.StatusReadyTimeout, 1*time.Second, "verifying that the new policy took place")
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
// Test that user1 can visit all user2
@@ -1108,9 +1096,7 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) {
url := fmt.Sprintf("http://%s/etc/hostname", fqdn)
t.Logf("url from %s to %s", client.Hostname(), url)
result, err := client.Curl(url)
assert.Len(ct, result, 13)
assert.NoError(ct, err)
assertCurlDockerHostname(ct, client, url, "new policy did not get propagated to nodes")
}
}
@@ -1126,7 +1112,7 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) {
assertCurlFailWithCollect(ct, client, url, "user2 should not reach user1")
}
}
}, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second, "new policy did not get propagated to nodes")
}, integrationutil.StatusReadyTimeout, 1*time.Second, "new policy did not get propagated to nodes")
}
func TestACLAutogroupMember(t *testing.T) {
@@ -1165,7 +1151,7 @@ func TestACLAutogroupMember(t *testing.T) {
clientIsUntagged = status.Self.Tags == nil || status.Self.Tags.Len() == 0
assert.True(c, clientIsUntagged, "Expected client %s to be untagged for autogroup:member test", client.Hostname())
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for client %s to be untagged", client.Hostname())
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for client %s to be untagged", client.Hostname())
if !clientIsUntagged {
continue
@@ -1184,7 +1170,7 @@ func TestACLAutogroupMember(t *testing.T) {
peerIsUntagged = status.Self.Tags == nil || status.Self.Tags.Len() == 0
assert.True(c, peerIsUntagged, "Expected peer %s to be untagged for autogroup:member test", peer.Hostname())
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for peer %s to be untagged", peer.Hostname())
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for peer %s to be untagged", peer.Hostname())
if !peerIsUntagged {
continue
@@ -1197,10 +1183,8 @@ func TestACLAutogroupMember(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying autogroup:member connectivity")
assertCurlDockerHostname(c, client, url, "Verifying autogroup:member connectivity")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying autogroup:member connectivity")
}
}
}
@@ -1376,7 +1360,7 @@ func TestACLAutogroupTagged(t *testing.T) {
untaggedClients = append(untaggedClients, client)
}
}
}, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second, "verifying peer visibility for node %s", hostname)
}, integrationutil.StatusReadyTimeout, 1*time.Second, "verifying peer visibility for node %s", hostname)
}
// Verify we have the expected number of tagged and untagged nodes
@@ -1390,7 +1374,7 @@ func TestACLAutogroupTagged(t *testing.T) {
assert.NoError(c, err)
assert.NotNil(c, status.Self.Tags, "tagged node %s should have tags", client.Hostname())
assert.Positive(c, status.Self.Tags.Len(), "tagged node %s should have at least one tag", client.Hostname())
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for tags to be applied to tagged nodes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for tags to be applied to tagged nodes")
}
// Verify untagged nodes have no tags
@@ -1402,7 +1386,7 @@ func TestACLAutogroupTagged(t *testing.T) {
if status.Self.Tags != nil {
assert.Equal(c, 0, status.Self.Tags.Len(), "untagged node %s should have no tags", client.Hostname())
}
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting to verify untagged nodes have no tags")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting to verify untagged nodes have no tags")
}
// Test that tagged nodes can communicate with each other
@@ -1420,10 +1404,8 @@ func TestACLAutogroupTagged(t *testing.T) {
t.Logf("Testing connection from tagged node %s to tagged node %s", client.Hostname(), peer.Hostname())
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(ct, err)
assert.Len(ct, result, 13)
}, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "tagged nodes should be able to communicate")
assertCurlDockerHostname(ct, client, url, "tagged nodes should be able to communicate")
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "tagged nodes should be able to communicate")
}
}
@@ -1442,7 +1424,7 @@ func TestACLAutogroupTagged(t *testing.T) {
result, err := client.CurlFailFast(url)
assert.Empty(ct, result)
assert.Error(ct, err)
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "untagged nodes should not be able to reach tagged nodes")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "untagged nodes should not be able to reach tagged nodes")
}
// Try to reach other untagged nodes (should also fail)
@@ -1462,7 +1444,7 @@ func TestACLAutogroupTagged(t *testing.T) {
result, err := client.CurlFailFast(url)
assert.Empty(ct, result)
assert.Error(ct, err)
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "untagged nodes should not be able to reach other untagged nodes")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "untagged nodes should not be able to reach other untagged nodes")
}
}
@@ -1480,7 +1462,7 @@ func TestACLAutogroupTagged(t *testing.T) {
result, err := client.CurlFailFast(url)
assert.Empty(ct, result)
assert.Error(ct, err)
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "tagged nodes should not be able to reach untagged nodes")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "tagged nodes should not be able to reach untagged nodes")
}
}
}
@@ -1664,10 +1646,8 @@ func TestACLAutogroupSelf(t *testing.T) {
t.Logf("url from %s (user1) to %s (user1)", client.Hostname(), fqdn)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user1 device should reach other user1 device via autogroup:self")
assertCurlDockerHostname(c, client, url, "user1 device should reach other user1 device via autogroup:self")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user1 device should reach other user1 device via autogroup:self")
}
}
@@ -1685,10 +1665,8 @@ func TestACLAutogroupSelf(t *testing.T) {
t.Logf("url from %s (user2) to %s (user2)", client.Hostname(), fqdn)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user2 device should reach other user2 device via autogroup:self")
assertCurlDockerHostname(c, client, url, "user2 device should reach other user2 device via autogroup:self")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user2 device should reach other user2 device via autogroup:self")
}
}
@@ -1704,7 +1682,7 @@ func TestACLAutogroupSelf(t *testing.T) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.NotEmpty(c, result, "user1 should be able to access router-node via group:home -> tag:router-node rule")
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user1 device should reach router-node (proves autogroup:self doesn't interfere)")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user1 device should reach router-node (proves autogroup:self doesn't interfere)")
}
// Test that user2's regular devices can access router-node
@@ -1719,7 +1697,7 @@ func TestACLAutogroupSelf(t *testing.T) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.NotEmpty(c, result, "user2 should be able to access router-node via group:home -> tag:router-node rule")
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user2 device should reach router-node (proves autogroup:self doesn't interfere)")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user2 device should reach router-node (proves autogroup:self doesn't interfere)")
}
// Test that devices from different users cannot access each other's regular devices
@@ -1867,20 +1845,32 @@ func TestACLPolicyPropagationOverTime(t *testing.T) {
assert.Len(ct, result, 13, "iteration %d: response from %s to %s should be valid", iteration, client.Hostname(), fqdn)
}
}
}, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 1 - all connectivity tests with allow-all policy", iteration)
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 1 - all connectivity tests with allow-all policy", iteration)
// Phase 2: Autogroup:self policy (only same user can access)
t.Logf("Iteration %d: Phase 2 - Setting autogroup:self policy", iteration)
preFilters := snapshotClientFilters(t, allClients)
err = headscale.SetPolicy(autogroupSelfPolicy)
require.NoError(t, err)
// Wait for peer lists to sync with autogroup:self - ensures cross-user peers are removed
t.Logf("Iteration %d: Phase 2 - Waiting for peer lists to sync with autogroup:self", iteration)
err = scenario.WaitForTailscaleSyncPerUser(integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond)
err = scenario.WaitForTailscaleSyncPerUser(
integrationutil.PolicyPropagationTimeout,
integrationutil.SlowPoll,
)
require.NoError(t, err, "iteration %d: Phase 2 - failed to sync after autogroup:self policy", iteration)
// Gate on each client's wgengine having applied the new
// PacketFilter before asserting reachability. WaitForTailscale
// SyncPerUser only checks peer-count parity, which trips
// before the filter rules have actually replaced the
// allow-all rules.
waitForClientFilterChange(t, allClients, preFilters, integrationutil.PolicyPropagationTimeout)
// Test ALL connectivity (positive and negative) in one block after state is settled
t.Logf("Iteration %d: Phase 2 - Testing all connectivity with autogroup:self", iteration)
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
@@ -1947,7 +1937,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) {
assertCurlFailWithCollect(ct, client, url, fmt.Sprintf("iteration %d: user2 %s should NOT reach user1 %s", iteration, client.Hostname(), peer.Hostname()))
}
}
}, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 2 - all connectivity tests with autogroup:self", iteration)
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 2 - all connectivity tests with autogroup:self", iteration)
// Phase 2b: Add a new node to user1 and validate policy propagation
t.Logf("Iteration %d: Phase 2b - Adding new node to user1 during autogroup:self policy", iteration)
@@ -2011,7 +2001,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) {
assertCurlFailWithCollect(ct, client, url, fmt.Sprintf("iteration %d: user1 %s should NOT reach user2 %s", iteration, client.Hostname(), peer.Hostname()))
}
}
}, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 2b - all connectivity tests after new node addition", iteration)
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 2b - all connectivity tests after new node addition", iteration)
// Delete the newly added node before Phase 3
t.Logf("Iteration %d: Phase 2b - Deleting the newly added node from user1", iteration)
@@ -2033,7 +2023,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) {
nodeToDeleteID = node.GetId()
}
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "iteration %d: Phase 2b - listing nodes before deletion", iteration)
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "iteration %d: Phase 2b - listing nodes before deletion", iteration)
// Delete the node via headscale helper
t.Logf("Iteration %d: Phase 2b - Deleting node ID %d from headscale", iteration, nodeToDeleteID)
@@ -2066,7 +2056,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) {
nodeListAfter, err := headscale.ListNodes("user1")
assert.NoError(ct, err, "failed to list nodes after deletion")
assert.Len(ct, nodeListAfter, 2, "iteration %d: should have 2 user1 nodes after deletion, got %d", iteration, len(nodeListAfter))
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "iteration %d: Phase 2b - node should be deleted", iteration)
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "iteration %d: Phase 2b - node should be deleted", iteration)
// Wait for sync after deletion to ensure peer counts are correct
// Use WaitForTailscaleSyncPerUser because autogroup:self is still active,
@@ -2128,7 +2118,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) {
assertCurlFailWithCollect(ct, client, url, fmt.Sprintf("iteration %d: user2 %s should NOT reach user1 %s", iteration, client.Hostname(), peer.Hostname()))
}
}
}, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 3 - all connectivity tests with directional policy", iteration)
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 3 - all connectivity tests with directional policy", iteration)
t.Logf("=== Iteration %d/5 completed successfully - All 3 phases passed ===", iteration)
}
@@ -2591,7 +2581,7 @@ func TestACLTagPropagation(t *testing.T) {
} else {
assertCurlFailWithCollect(c, sourceClient, targetURL, "initial access should fail")
}
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "verifying initial access state")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "verifying initial access state")
// Step 1b: Verify initial NetMap visibility
t.Logf("Step 1b: Verifying initial NetMap visibility (expect visible=%v)", tt.initialAccess)
@@ -2614,7 +2604,7 @@ func TestACLTagPropagation(t *testing.T) {
} else {
assert.False(c, found, "Target should NOT be visible in NetMap initially")
}
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "verifying initial NetMap visibility")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "verifying initial NetMap visibility")
// Step 2: Apply tag change
t.Logf("Step 2: Setting tags on node %d to %v", targetNodeID, tt.tagChange)
@@ -2632,7 +2622,7 @@ func TestACLTagPropagation(t *testing.T) {
if node != nil {
assert.ElementsMatch(c, tt.tagChange, node.GetTags(), "Tags should be updated")
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "verifying tag change applied")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "verifying tag change applied")
// Step 3: Verify final NetMap visibility first (fast signal that
// the MapResponse propagated to the client).
@@ -2660,7 +2650,7 @@ func TestACLTagPropagation(t *testing.T) {
} else {
assert.False(c, found, "Target should NOT be visible in NetMap after tag change")
}
}, integrationutil.ScaledTimeout(120*time.Second), 500*time.Millisecond, "verifying NetMap visibility propagated after tag change")
}, integrationutil.HASlowConvergeTimeout, integrationutil.SlowPoll, "verifying NetMap visibility propagated after tag change")
// Step 4: Verify final access state (this is the key test for #2389).
// Even though Step 3 confirmed the MapResponse arrived, the full
@@ -2674,7 +2664,7 @@ func TestACLTagPropagation(t *testing.T) {
} else {
assertCurlFailWithCollect(c, sourceClient, targetURL, "final access should fail after tag change")
}
}, integrationutil.ScaledTimeout(120*time.Second), 500*time.Millisecond, "verifying access propagated after tag change")
}, integrationutil.HASlowConvergeTimeout, integrationutil.SlowPoll, "verifying access propagated after tag change")
t.Logf("Test %s PASSED: Tag change propagated correctly", tt.name)
})
@@ -2823,7 +2813,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) {
t.Log("Step 1: Verifying HTTP access with tag:webserver (should succeed)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlSuccessWithCollect(c, user2Node, targetURL, "HTTP should work with tag:webserver")
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "initial HTTP access with tag:webserver")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "initial HTTP access with tag:webserver")
// Step 2: Change tag from webserver to sshonly
t.Logf("Step 2: Changing tag from webserver to sshonly on node %d", targetNodeID)
@@ -2844,7 +2834,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) {
if node != nil {
assert.ElementsMatch(c, []string{"tag:sshonly"}, node.GetTags(), "Tags should be updated to sshonly")
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "verifying tag change applied")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "verifying tag change applied")
// Step 3: Verify peer is still visible in NetMap (partial access, not full removal)
t.Log("Step 3: Verifying peer remains visible in NetMap after tag change")
@@ -2863,7 +2853,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) {
}
assert.True(c, found, "Peer should still be visible with tag:sshonly (port 22 access)")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "peer visibility after tag change")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "peer visibility after tag change")
// Step 4: Verify HTTP on port 80 now fails (tag:sshonly only allows port 22).
// Port-specific filter changes are harder than peer removal because
@@ -2872,7 +2862,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) {
t.Log("Step 4: Verifying HTTP access is now blocked (tag:sshonly only allows port 22)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
assertCurlFailWithCollect(c, user2Node, targetURL, "HTTP should fail with tag:sshonly (only port 22 allowed)")
}, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "HTTP blocked after tag change to sshonly")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "HTTP blocked after tag change to sshonly")
t.Log("Test PASSED: Port-specific ACL changes propagated correctly")
}
@@ -2963,19 +2953,15 @@ func TestACLGroupWithUnknownUser(t *testing.T) {
t.Log("Testing connectivity: user1 -> user2 (should succeed despite unknown user in group)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should be able to reach user2")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "user1 should reach user2")
assertCurlDockerHostname(c, user1, url, "user1 should be able to reach user2")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "user1 should reach user2")
// Test that user2 can reach user1 (bidirectional)
t.Log("Testing connectivity: user2 -> user1 (should succeed despite unknown user in group)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should be able to reach user1")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "user2 should reach user1")
assertCurlDockerHostname(c, user2, url, "user2 should be able to reach user1")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "user2 should reach user1")
t.Log("Test PASSED: Valid users can communicate despite unknown user reference in group")
}
@@ -3069,17 +3055,13 @@ func TestACLGroupAfterUserDeletion(t *testing.T) {
t.Log("Step 1: Verifying initial connectivity between all users")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should be able to reach user2 initially")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "initial user1 -> user2 connectivity")
assertCurlDockerHostname(c, user1, url, "user1 should be able to reach user2 initially")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "initial user1 -> user2 connectivity")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should be able to reach user1 initially")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "initial user2 -> user1 connectivity")
assertCurlDockerHostname(c, user2, url, "user2 should be able to reach user1 initially")
}, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "initial user2 -> user1 connectivity")
// Step 2: Get user3's node and user, then delete them
t.Log("Step 2: Deleting user3's node and user from headscale")
@@ -3114,10 +3096,8 @@ func TestACLGroupAfterUserDeletion(t *testing.T) {
// Test that user1 can still reach user2
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should still be able to reach user2 after user3 deletion (stale cache)")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after user3 deletion")
assertCurlDockerHostname(c, user1, url, "user1 should still be able to reach user2 after user3 deletion (stale cache)")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user1 -> user2 after user3 deletion")
// Step 4: Create a NEW user - this triggers updatePolicyManagerUsers() which
// re-evaluates the policy. According to issue #2967, this is when the bug manifests:
@@ -3139,18 +3119,14 @@ func TestACLGroupAfterUserDeletion(t *testing.T) {
// Test that user1 can still reach user2 AFTER the policy refresh triggered by user creation
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should still reach user2 after policy refresh (BUG if this fails)")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after policy refresh (issue #2967)")
assertCurlDockerHostname(c, user1, url, "user1 should still reach user2 after policy refresh (BUG if this fails)")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user2 after policy refresh (issue #2967)")
// Test that user2 can still reach user1
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should still reach user1 after policy refresh (BUG if this fails)")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user2 -> user1 after policy refresh (issue #2967)")
assertCurlDockerHostname(c, user2, url, "user2 should still reach user1 after policy refresh (BUG if this fails)")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user2 -> user1 after policy refresh (issue #2967)")
t.Log("Test PASSED: Remaining users can communicate after deleted user and policy refresh")
}
@@ -3256,17 +3232,13 @@ func TestACLGroupDeletionExactReproduction(t *testing.T) {
t.Log("Step 1: Verifying initial connectivity (user1 <-> user3)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user3FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should reach user3")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user3")
assertCurlDockerHostname(c, user1, url, "user1 should reach user3")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user3")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user3.Curl(url)
assert.NoError(c, err, "user3 should reach user1")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user3 -> user1")
assertCurlDockerHostname(c, user3, url, "user3 should reach user1")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user3 -> user1")
t.Log("Step 1: PASSED - initial connectivity works")
@@ -3293,17 +3265,13 @@ func TestACLGroupDeletionExactReproduction(t *testing.T) {
t.Log("Step 3: Verifying connectivity STILL works after user2 deletion")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user3FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should still reach user3 after user2 deletion")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user3 after user2 deletion")
assertCurlDockerHostname(c, user1, url, "user1 should still reach user3 after user2 deletion")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user3 after user2 deletion")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user3.Curl(url)
assert.NoError(c, err, "user3 should still reach user1 after user2 deletion")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user3 -> user1 after user2 deletion")
assertCurlDockerHostname(c, user3, url, "user3 should still reach user1 after user2 deletion")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user3 -> user1 after user2 deletion")
t.Log("Step 3: PASSED - connectivity works after user2 deletion")
@@ -3322,17 +3290,13 @@ func TestACLGroupDeletionExactReproduction(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user3FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "BUG #2967: user1 should still reach user3 after user4 creation")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user3 after user4 creation (issue #2967)")
assertCurlDockerHostname(c, user1, url, "BUG #2967: user1 should still reach user3 after user4 creation")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user3 after user4 creation (issue #2967)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user3.Curl(url)
assert.NoError(c, err, "BUG #2967: user3 should still reach user1 after user4 creation")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user3 -> user1 after user4 creation (issue #2967)")
assertCurlDockerHostname(c, user3, url, "BUG #2967: user3 should still reach user1 after user4 creation")
}, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user3 -> user1 after user4 creation (issue #2967)")
// Additional verification: check filter rules are not empty
filter, err := headscale.DebugFilter()
@@ -3432,17 +3396,13 @@ func TestACLDynamicUnknownUserAddition(t *testing.T) {
t.Log("Step 1: Verifying initial connectivity with valid policy (no unknown users)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should reach user2")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user1 -> user2")
assertCurlDockerHostname(c, user1, url, "user1 should reach user2")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user1 -> user2")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should reach user1")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user2 -> user1")
assertCurlDockerHostname(c, user2, url, "user2 should reach user1")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user2 -> user1")
t.Log("Step 1: PASSED - connectivity works with valid policy")
@@ -3483,17 +3443,13 @@ func TestACLDynamicUnknownUserAddition(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should STILL reach user2 after adding unknown user")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after unknown user added")
assertCurlDockerHostname(c, user1, url, "user1 should STILL reach user2 after adding unknown user")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user1 -> user2 after unknown user added")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should STILL reach user1 after adding unknown user")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user2 -> user1 after unknown user added")
assertCurlDockerHostname(c, user2, url, "user2 should STILL reach user1 after adding unknown user")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user2 -> user1 after unknown user added")
t.Log("Step 3: PASSED - connectivity maintained after adding unknown user")
t.Log("Test PASSED: v0.28.0-beta.1 scenario - unknown user added dynamically, valid users still work")
@@ -3589,17 +3545,13 @@ func TestACLDynamicUnknownUserRemoval(t *testing.T) {
t.Log("Step 1: Verifying connectivity with unknown user in policy (v2 graceful handling)")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should reach user2 even with unknown user in policy")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user1 -> user2 with unknown")
assertCurlDockerHostname(c, user1, url, "user1 should reach user2 even with unknown user in policy")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user1 -> user2 with unknown")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should reach user1 even with unknown user in policy")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user2 -> user1 with unknown")
assertCurlDockerHostname(c, user2, url, "user2 should reach user1 even with unknown user in policy")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user2 -> user1 with unknown")
t.Log("Step 1: PASSED - connectivity works even with unknown user (v2 graceful handling)")
@@ -3638,17 +3590,13 @@ func TestACLDynamicUnknownUserRemoval(t *testing.T) {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN)
result, err := user1.Curl(url)
assert.NoError(c, err, "user1 should reach user2 after removing unknown user")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after unknown removed")
assertCurlDockerHostname(c, user1, url, "user1 should reach user2 after removing unknown user")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user1 -> user2 after unknown removed")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN)
result, err := user2.Curl(url)
assert.NoError(c, err, "user2 should reach user1 after removing unknown user")
assert.Len(c, result, 13, "expected hostname response")
}, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user2 -> user1 after unknown removed")
assertCurlDockerHostname(c, user2, url, "user2 should reach user1 after removing unknown user")
}, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user2 -> user1 after unknown removed")
t.Log("Step 3: PASSED - connectivity maintained after removing unknown user")
t.Log("Test PASSED: Removing unknown users from policy works correctly")

View File

@@ -13,7 +13,6 @@ import (
v1 "github.com/juanfont/headscale/gen/go/headscale/v1"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/integration/hsic"
"tailscale.com/util/dnsname"
"github.com/juanfont/headscale/integration/integrationutil"
"github.com/juanfont/headscale/integration/tsic"
"github.com/rs/zerolog/log"
@@ -23,6 +22,7 @@ import (
"golang.org/x/sync/errgroup"
"tailscale.com/client/tailscale/apitype"
"tailscale.com/types/key"
"tailscale.com/util/dnsname"
)
func TestPingAllByIP(t *testing.T) {
@@ -211,7 +211,7 @@ func testEphemeralWithOptions(t *testing.T, opts ...hsic.Option) {
nodes, err := headscale.ListNodes()
assert.NoError(ct, err)
assert.Len(ct, nodes, 0, "All ephemeral nodes should be cleaned up after logout")
}, integrationutil.ScaledTimeout(30*time.Second), 2*time.Second)
}, integrationutil.StatusReadyTimeout, 2*time.Second)
}
// TestEphemeral2006DeletedTooQuickly verifies that ephemeral nodes are not
@@ -298,7 +298,7 @@ func TestEphemeral2006DeletedTooQuickly(t *testing.T) {
assert.NoError(ct, err)
assertPingAllWithCollect(ct, allClients, allAddrs)
}, integrationutil.ScaledTimeout(60*time.Second), 2*time.Second)
}, integrationutil.HAConvergeTimeout, 2*time.Second)
// Take down all clients, this should start an expiry timer for each.
for _, client := range allClients {
@@ -894,7 +894,7 @@ func TestUpdateHostnameFromClient(t *testing.T) {
}
}
}
}, integrationutil.ScaledTimeout(60*time.Second), 2*time.Second)
}, integrationutil.HAConvergeTimeout, 2*time.Second)
for _, client := range allClients {
status := client.MustStatus()
@@ -980,7 +980,7 @@ func TestExpireNode(t *testing.T) {
// Assert that we have the original count - self
assert.Len(ct, status.Peers(), spec.NodesPerUser-1, "Client %s should see correct number of peers", client.Hostname())
}, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second)
}, integrationutil.StatusReadyTimeout, 1*time.Second)
}
headscale, err := scenario.Headscale()
@@ -1069,7 +1069,7 @@ func TestExpireNode(t *testing.T) {
)
}
}
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for expired node status to propagate")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for expired node status to propagate")
}
}
@@ -1307,7 +1307,7 @@ func TestNodeOnlineStatus(t *testing.T) {
// Assert that we have the original count - self
assert.Len(c, status.Peers(), len(MustTestVersions)-1)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for expected peer count")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for expected peer count")
}
headscale, err := scenario.Headscale()
@@ -1320,6 +1320,11 @@ func TestNodeOnlineStatus(t *testing.T) {
log.Printf("Starting online test from %v to %v", start, end)
// Pace the outer loop at one iteration per second so the
// continuous online-check does not hammer the docker daemon.
tick := time.NewTicker(time.Second)
defer tick.Stop()
for {
// Let the test run continuously for X minutes to verify
// all nodes stay connected and has the expected status over time.
@@ -1384,8 +1389,7 @@ func TestNodeOnlineStatus(t *testing.T) {
}, integrationutil.ScaledTimeout(15*time.Second), 1*time.Second)
}
// Check maximum once per second
time.Sleep(time.Second)
<-tick.C
}
}

View File

@@ -8,6 +8,7 @@ import (
"io"
"maps"
"net/netip"
"reflect"
"slices"
"strconv"
"strings"
@@ -27,6 +28,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
"tailscale.com/wgengine/filter"
)
const (
@@ -558,6 +560,56 @@ func assertCurlSuccessWithCollect(c *assert.CollectT, client TailscaleClient, ur
assert.NotEmpty(c, result, msg)
}
// assertCurlDockerHostname curls url and asserts the body is the
// 13-byte Docker auto-generated container hostname (12 hex chars +
// trailing newline from /etc/hostname). For use inside EventuallyWithT.
func assertCurlDockerHostname(c *assert.CollectT, client TailscaleClient, url, msg string) {
const dockerHostnameLen = 13
result, err := client.Curl(url)
assert.NoError(c, err, msg) //nolint:testifylint // CollectT requires assert, not require
assert.Len(c, result, dockerHostnameLen, msg)
}
// snapshotClientFilters snapshots each client's current netmap
// PacketFilter keyed by hostname. Pair with waitForClientFilterChange.
func snapshotClientFilters(t *testing.T, clients []TailscaleClient) map[string][]filter.Match {
t.Helper()
out := make(map[string][]filter.Match, len(clients))
for _, c := range clients {
nm, err := c.Netmap()
require.NoError(t, err, "snapshot netmap for %s", c.Hostname())
out[c.Hostname()] = nm.PacketFilter
}
return out
}
// waitForClientFilterChange polls each client until its netmap
// PacketFilter differs from baselines[Hostname]. Use after SetPolicy
// to gate on client-side filter application before asserting
// reachability.
func waitForClientFilterChange(t *testing.T, clients []TailscaleClient, baselines map[string][]filter.Match, timeout time.Duration) {
t.Helper()
for _, client := range clients {
c := client
baseline := baselines[c.Hostname()]
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
nm, err := c.Netmap()
if !assert.NoError(ct, err, "fetch netmap for %s", c.Hostname()) {
return
}
assert.False(ct, reflect.DeepEqual(baseline, nm.PacketFilter), "client %s PacketFilter unchanged since baseline", c.Hostname())
}, timeout, integrationutil.SlowPoll, "client %s PacketFilter should change after SetPolicy", c.Hostname())
}
}
// assertCurlFailWithCollect asserts that a curl request fails. Uses
// CurlFailFast internally for aggressive timeouts, avoiding wasted
// time on retries when we expect the connection to be blocked.

View File

@@ -114,7 +114,7 @@ func TestEnablingRoutes(t *testing.T) {
assert.Nil(c, peerStatus.PrimaryRoutes)
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes are active before approval")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval")
}
for _, node := range nodes {
@@ -159,7 +159,7 @@ func TestEnablingRoutes(t *testing.T) {
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes[string(peerStatus.ID)])})
}
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "clients should see new routes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "clients should see new routes")
_, err = headscale.ApproveRoutes(
1,
@@ -195,7 +195,7 @@ func TestEnablingRoutes(t *testing.T) {
assert.Len(c, node.GetSubnetRoutes(), 1) // 10.0.2.0/24
}
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
// Verify that the clients can see the new routes
for _, client := range allClients {
@@ -215,7 +215,7 @@ func TestEnablingRoutes(t *testing.T) {
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix("10.0.2.0/24")})
}
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying final route state visible to clients")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying final route state visible to clients")
}
}
@@ -223,7 +223,10 @@ func TestEnablingRoutes(t *testing.T) {
func TestHASubnetRouterFailover(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(60 * time.Second)
// HASlowConverge (not HAConverge): tailscale's wgengine sometimes
// keeps routing through the previous primary for ~2 min after
// the netmap update flips PrimaryRoutes server-side.
propagationTime := integrationutil.HASlowConvergeTimeout
// Helper function to validate primary routes table state
validatePrimaryRoutes := func(t *testing.T, headscale ControlServer, expectedRoutes *types.DebugRoutes, message string) {
@@ -458,9 +461,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat))
t.Logf(" Expected: Traffic flows through router 1 as it's the only approved route")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -513,7 +514,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0)
requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0)
}
}, integrationutil.ScaledTimeout(3*time.Second), 200*time.Millisecond, "HA setup verification: Router 2 approved as STANDBY (available=1, approved=1, subnet=0), Router 1 stays PRIMARY (subnet=1)")
}, integrationutil.ScaledTimeout(3*time.Second), integrationutil.FastPoll, "HA setup verification: Router 2 approved as STANDBY (available=1, approved=1, subnet=0), Router 1 stays PRIMARY (subnet=1)")
// Verify that the client has routes from the primary machine
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -575,9 +576,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
t.Logf(" Expected: Router 1 continues to handle all traffic (no change from before)")
t.Logf(" Expected: Router 2 is ready to take over if router 1 fails")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 in HA mode")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 in HA mode")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -630,7 +629,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0)
requireNodeRouteCountWithCollect(c, nodes[2], 1, 1, 0)
}, integrationutil.ScaledTimeout(3*time.Second), 200*time.Millisecond, "Full HA verification: Router 3 approved as second STANDBY (available=1, approved=1, subnet=0), Router 1 PRIMARY, Router 2 first STANDBY")
}, integrationutil.ScaledTimeout(3*time.Second), integrationutil.FastPoll, "Full HA verification: Router 3 approved as second STANDBY (available=1, approved=1, subnet=0), Router 1 PRIMARY, Router 2 first STANDBY")
// Verify that the client has routes from the primary machine
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -673,9 +672,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Verifying full HA with 3 routers: Router 1 PRIMARY, Routers 2 & 3 STANDBY")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 with full HA")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 with full HA")
// Wait for traceroute to work correctly through the expected router
@@ -766,9 +763,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Failover verification: Router 1 offline, Router 2 should be new PRIMARY with routes, Router 3 still STANDBY")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after failover")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after failover")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -840,9 +835,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Second failover verification: Router 1 & 2 offline, Router 3 should be new PRIMARY (last router standing) with routes")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 3 after second failover")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after second failover")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -920,9 +913,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Recovery verification: Router 1 back online as STANDBY, Router 3 remains PRIMARY (no flapping) with routes")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can still reach webservice through router 3 after router 1 recovery")
}, propagationTime, 200*time.Millisecond, "Verifying client can still reach webservice through router 3 after router 1 recovery")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1000,12 +991,10 @@ func TestHASubnetRouterFailover(t *testing.T) {
pref,
)
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "Full recovery verification: All 3 routers online, Router 3 remains PRIMARY (no flapping) with routes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Full recovery verification: All 3 routers online, Router 3 remains PRIMARY (no flapping) with routes")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 3 after full recovery")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after full recovery")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1054,7 +1043,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 1, 1)
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 0)
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "Route disable verification: Router 3 route disabled, Router 1 should be new PRIMARY, Router 2 STANDBY")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Route disable verification: Router 3 route disabled, Router 1 should be new PRIMARY, Router 2 STANDBY")
// Verify that the route is announced from subnet router 1
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1090,9 +1079,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Verifying Router 1 becomes PRIMARY after Router 3 route disabled")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 after route disable")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 after route disable")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1142,7 +1129,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 0, 0)
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 1)
requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "Second route disable verification: Router 1 route disabled, Router 2 should be new PRIMARY")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Second route disable verification: Router 1 route disabled, Router 2 should be new PRIMARY")
// Verify that the route is announced from subnet router 1
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1178,9 +1165,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Verifying Router 2 becomes PRIMARY after Router 1 route disabled")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after second route disable")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after second route disable")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1265,9 +1250,7 @@ func TestHASubnetRouterFailover(t *testing.T) {
}, propagationTime, 200*time.Millisecond, "Verifying Router 2 remains PRIMARY after Router 1 route re-enabled")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after route re-enable")
}, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after route re-enable")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1410,21 +1393,33 @@ func TestSubnetRouteACL(t *testing.T) {
client := allClients[1]
// Read Self.ID inside a retry (it lags initial connection); apply
// the route mutation outside, since retrying a mutation hides
// real failures.
for _, client := range allClients {
assert.EventuallyWithT(t, func(c *assert.CollectT) {
status, err := client.Status()
assert.NoError(c, err)
var status *ipnstate.Status
if route, ok := expectedRoutes[string(status.Self.ID)]; ok {
command := []string{
"tailscale",
"set",
"--advertise-routes=" + route,
}
_, _, err = client.Execute(command)
assert.NoErrorf(c, err, "failed to advertise route: %s", err)
require.EventuallyWithT(t, func(c *assert.CollectT) {
s, err := client.Status()
assert.NoError(c, err)
assert.NotNil(c, s)
if s != nil {
status = s
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Configuring route advertisements")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Reading client status before route advertisement")
route, ok := expectedRoutes[string(status.Self.ID)]
if !ok {
continue
}
_, _, err = client.Execute([]string{
"tailscale",
"set",
"--advertise-routes=" + route,
})
require.NoErrorf(t, err, "failed to advertise route on %s", client.Hostname())
}
err = scenario.WaitForTailscaleSync()
@@ -1478,7 +1473,7 @@ func TestSubnetRouteACL(t *testing.T) {
assert.Nil(c, peerStatus.PrimaryRoutes)
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes are active before approval")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval")
}
_, err = headscale.ApproveRoutes(
@@ -1495,7 +1490,7 @@ func TestSubnetRouteACL(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
requireNodeRouteCountWithCollect(c, nodes[1], 0, 0, 0)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
// Verify that the client has routes from the primary machine
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1514,7 +1509,7 @@ func TestSubnetRouteACL(t *testing.T) {
}
requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes["1"])})
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying client can see subnet routes from router")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying client can see subnet routes from router")
// Wait for packet filter updates to propagate to client netmap
wantClientFilter := []filter.Match{
@@ -1549,7 +1544,7 @@ func TestSubnetRouteACL(t *testing.T) {
if diff := cmpdiff.Diff(wantClientFilter, clientNm.PacketFilter, util.ViewSliceIPProtoComparer, util.PrefixComparer); diff != "" {
assert.Fail(c, fmt.Sprintf("Client (%s) filter, unexpected result (-want +got):\n%s", client.Hostname(), diff))
}
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for client packet filter to update")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for client packet filter to update")
// Wait for packet filter updates to propagate to subnet router netmap
// The two ACL rules (group:admins -> group:admins:* and group:admins -> 10.33.0.0/16:*)
@@ -1590,7 +1585,7 @@ func TestSubnetRouteACL(t *testing.T) {
if diff := cmpdiff.Diff(wantSubnetFilter, subnetNm.PacketFilter, util.ViewSliceIPProtoComparer, util.PrefixComparer); diff != "" {
assert.Fail(c, fmt.Sprintf("Subnet (%s) filter, unexpected result (-want +got):\n%s", subRouter1.Hostname(), diff))
}
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for subnet router packet filter to update")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for subnet router packet filter to update")
}
// TestEnablingExitRoutes tests enabling exit routes for clients.
@@ -1639,7 +1634,7 @@ func TestEnablingExitRoutes(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[0], 2, 0, 0)
requireNodeRouteCountWithCollect(c, nodes[1], 2, 0, 0)
}, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for route advertisements to propagate")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for route advertisements to propagate")
// Verify that no routes has been sent to the client,
// they are not yet enabled.
@@ -1653,7 +1648,7 @@ func TestEnablingExitRoutes(t *testing.T) {
assert.Nil(c, peerStatus.PrimaryRoutes)
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no exit routes are active before approval")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no exit routes are active before approval")
}
// Enable all routes, but do v4 on one and v6 on other to ensure they
@@ -1677,7 +1672,7 @@ func TestEnablingExitRoutes(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2)
requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to both nodes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to both nodes")
// Wait for route state changes to propagate to clients
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1698,7 +1693,7 @@ func TestEnablingExitRoutes(t *testing.T) {
}
}
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "clients should see new routes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "clients should see new routes")
}
// TestExitRoutesWithAutogroupInternetACL reproduces juanfont/headscale#3212.
@@ -1768,7 +1763,7 @@ func TestExitRoutesWithAutogroupInternetACL(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[0], 2, 0, 0)
requireNodeRouteCountWithCollect(c, nodes[1], 2, 0, 0)
}, integrationutil.ScaledTimeout(20*time.Second), 200*time.Millisecond,
}, integrationutil.ScaledTimeout(20*time.Second), integrationutil.FastPoll,
"Waiting for exit-route advertisements to propagate")
// Approve exit routes on both nodes so either could serve as
@@ -1792,7 +1787,7 @@ func TestExitRoutesWithAutogroupInternetACL(t *testing.T) {
requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2)
requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond,
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll,
"approved exit routes should propagate to nodes")
// The end-to-end UX assertion: every client must see the OTHER
@@ -1835,7 +1830,7 @@ func TestExitRoutesWithAutogroupInternetACL(t *testing.T) {
"client %s should see the other node as a peer "+
"via the autogroup:internet ACL",
status.Self.HostName)
}, integrationutil.ScaledTimeout(15*time.Second), 500*time.Millisecond,
}, integrationutil.ScaledTimeout(15*time.Second), integrationutil.SlowPoll,
"client should see exit nodes as peers via autogroup:internet ACL")
}
}
@@ -1930,7 +1925,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) {
assert.Nil(c, peerStatus.PrimaryRoutes)
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes are active before approval")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval")
// Enable route
_, err = headscale.ApproveRoutes(
@@ -1947,7 +1942,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) {
assert.NoError(c, err)
assert.Len(c, nodes, 2)
requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
// Verify that the routes have been sent to the client
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -1963,7 +1958,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) {
requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*pref})
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "routes should be visible to client")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "routes should be visible to client")
usernet1, err := scenario.Network("usernet1")
require.NoError(t, err)
@@ -1979,10 +1974,8 @@ func TestSubnetRouterMultiNetwork(t *testing.T) {
t.Logf("url from %s to %s", user2c.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := user2c.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying client can reach webservice through subnet route")
assertCurlDockerHostname(c, user2c, url, "Verifying client can reach webservice through subnet route")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying client can reach webservice through subnet route")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
tr, err := user2c.Traceroute(webip)
@@ -1994,7 +1987,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) {
}
assertTracerouteViaIPWithCollect(c, tr, ip)
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying traceroute goes through subnet router")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying traceroute goes through subnet router")
}
func TestSubnetRouterMultiNetworkExitNode(t *testing.T) {
@@ -2088,7 +2081,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) {
assert.Nil(c, peerStatus.PrimaryRoutes)
requirePeerSubnetRoutesWithCollect(c, peerStatus, nil)
}
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes sent to client before approval")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes sent to client before approval")
// Approve exit routes and subnet route.
_, err = headscale.ApproveRoutes(nodes[0].GetId(), []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6(), *route})
@@ -2100,7 +2093,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) {
assert.NoError(c, err)
assert.Len(c, nodes, 2)
requireNodeRouteCountWithCollect(c, nodes[0], 3, 3, 3)
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes")
// Wait for exit routes to be visible to the client.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -2111,7 +2104,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) {
peerStatus := status.Peer[peerKey]
assert.True(c, peerStatus.ExitNodeOption, "peer should be an exit node option")
}
}, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "exit routes should be visible to client")
}, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "exit routes should be visible to client")
// Tell user2c to use user1c as an exit node.
command = []string{
@@ -2142,9 +2135,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) {
weburl := fmt.Sprintf("http://%s/etc/hostname", webip)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := user2c.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, user2c, weburl, "user2 should reach webservice via exit node")
}, 10*time.Second, 200*time.Millisecond, "user2 should reach webservice via exit node")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -2719,9 +2710,7 @@ func TestAutoApproveMultiNetwork(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, url, "Verifying client can reach webservice through auto-approved route")
}, assertTimeout, 200*time.Millisecond, "Verifying client can reach webservice through auto-approved route")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -2785,9 +2774,7 @@ func TestAutoApproveMultiNetwork(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, url, "Verifying client can still reach webservice after policy change")
}, assertTimeout, 200*time.Millisecond, "Verifying client can still reach webservice after policy change")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -2882,9 +2869,7 @@ func TestAutoApproveMultiNetwork(t *testing.T) {
t.Logf("url from %s to %s", client.Hostname(), url)
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(url)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, url, "Verifying client can reach webservice after route re-approval")
}, assertTimeout, 200*time.Millisecond, "Verifying client can reach webservice after route re-approval")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -3274,7 +3259,7 @@ func TestSubnetRouteACLFiltering(t *testing.T) {
// Check that the router has 3 routes now approved and available
requireNodeRouteCountWithCollect(c, routerNode, 3, 3, 3)
}, integrationutil.ScaledTimeout(15*time.Second), 500*time.Millisecond, "route state changes should propagate")
}, integrationutil.ScaledTimeout(15*time.Second), integrationutil.SlowPoll, "route state changes should propagate")
// Now check the client node status
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -3289,13 +3274,11 @@ func TestSubnetRouteACLFiltering(t *testing.T) {
// The node should only have 1 subnet route
requirePeerSubnetRoutesWithCollect(c, routerPeerStatus, []netip.Prefix{*route})
}, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying node sees filtered subnet routes")
}, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying node sees filtered subnet routes")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := nodeClient.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
}, integrationutil.ScaledTimeout(60*time.Second), 200*time.Millisecond, "Verifying node can reach webservice through allowed route")
assertCurlDockerHostname(c, nodeClient, weburl, "Verifying node can reach webservice through allowed route")
}, integrationutil.HAConvergeTimeout, integrationutil.FastPoll, "Verifying node can reach webservice through allowed route")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
tr, err := nodeClient.Traceroute(webip)
@@ -3307,7 +3290,7 @@ func TestSubnetRouteACLFiltering(t *testing.T) {
}
assertTracerouteViaIPWithCollect(c, tr, ip)
}, integrationutil.ScaledTimeout(60*time.Second), 200*time.Millisecond, "Verifying traceroute goes through router")
}, integrationutil.HAConvergeTimeout, integrationutil.FastPoll, "Verifying traceroute goes through router")
}
// TestGrantViaSubnetSteering validates that via grants steer different source
@@ -3595,16 +3578,12 @@ func TestGrantViaSubnetSteering(t *testing.T) {
// Verify Client A can reach the webservice.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := clientA.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, clientA, weburl, "Client A should reach webservice")
}, assertTimeout, 200*time.Millisecond, "Client A should reach webservice")
// Verify Client B can reach the webservice.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := clientB.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, clientB, weburl, "Client B should reach webservice")
}, assertTimeout, 200*time.Millisecond, "Client B should reach webservice")
// Verify Client A's traffic goes through Router A.
@@ -3642,7 +3621,7 @@ func TestGrantViaSubnetSteering(t *testing.T) {
func TestHASubnetRouterPingFailover(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(60 * time.Second)
propagationTime := integrationutil.HAConvergeTimeout
spec := ScenarioSpec{
NodesPerUser: 2,
@@ -3757,9 +3736,7 @@ func TestHASubnetRouterPingFailover(t *testing.T) {
// Verify connectivity through router 1.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "client should reach webservice through router 1")
}, propagationTime, 200*time.Millisecond, "client should reach webservice through router 1")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -3814,9 +3791,7 @@ func TestHASubnetRouterPingFailover(t *testing.T) {
t.Log("Failover detected. Verifying connectivity through router 2.")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "client should reach webservice through router 2")
}, propagationTime, 200*time.Millisecond, "client should reach webservice through router 2")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -3890,7 +3865,7 @@ func TestHASubnetRouterPingFailover(t *testing.T) {
func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(60 * time.Second)
propagationTime := integrationutil.HAConvergeTimeout
spec := ScenarioSpec{
NodesPerUser: 2,
@@ -3997,9 +3972,7 @@ func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
// Confirm initial connectivity through r1.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r1")
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r1")
t.Log("=== Step 1: r1 goes offline. r2 should take over. ===")
@@ -4067,9 +4040,7 @@ func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
// End-to-end traffic should reach the webservice via r2.
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r2 after recovery")
}, propagationTime, 200*time.Millisecond, "client reaches webservice via r2 after recovery")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
@@ -4101,7 +4072,7 @@ func TestHASubnetRouterFailoverBothOffline(t *testing.T) {
func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(120 * time.Second)
propagationTime := integrationutil.HASlowConvergeTimeout
spec := ScenarioSpec{
NodesPerUser: 2,
@@ -4292,9 +4263,7 @@ func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
}, propagationTime, 1*time.Second, "R2: waiting for client to see r2")
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r2 after recovery")
}, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery")
}
@@ -4329,7 +4298,7 @@ func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) {
func TestHASubnetRouterFailoverDockerDisconnect(t *testing.T) {
IntegrationSkip(t)
propagationTime := integrationutil.ScaledTimeout(120 * time.Second)
propagationTime := integrationutil.HASlowConvergeTimeout
flapWindow := integrationutil.ScaledTimeout(40 * time.Second)
spec := ScenarioSpec{
@@ -4438,9 +4407,7 @@ func TestHASubnetRouterFailoverDockerDisconnect(t *testing.T) {
requireTrafficWorks := func(msg string) {
t.Helper()
assert.EventuallyWithT(t, func(c *assert.CollectT) {
result, err := client.Curl(weburl)
assert.NoError(c, err)
assert.Len(c, result, 13)
assertCurlDockerHostname(c, client, weburl, msg)
}, propagationTime, 1*time.Second, msg)
}

View File

@@ -784,10 +784,42 @@ func (s *Scenario) WaitForTailscaleSync() error {
return err
}
// WaitForTailscaleSyncPerUser blocks execution until each TailscaleClient has the expected
// number of peers for its user. This is useful for policies like autogroup:self where nodes
// only see same-user peers, not all nodes in the network.
func (s *Scenario) WaitForTailscaleSyncPerUser(timeout, retryInterval time.Duration) error {
// SyncOption configures WaitForTailscaleSyncPerUser.
type SyncOption func(*syncOptions)
type syncOptions struct {
preBarrier func(context.Context) error
}
// WithPreBarrier runs a precondition check before per-user peer-count
// waits begin, sharing the outer timeout via context. Use to gate on
// a server-side signal (e.g. policy compile) that the peer-count
// alone cannot observe.
func WithPreBarrier(barrier func(context.Context) error) SyncOption {
return func(o *syncOptions) { o.preBarrier = barrier }
}
// WaitForTailscaleSyncPerUser blocks until each TailscaleClient has
// the expected per-user peer count (necessary for policies like
// autogroup:self where cross-user peers are invisible).
func (s *Scenario) WaitForTailscaleSyncPerUser(timeout, retryInterval time.Duration, opts ...SyncOption) error {
options := syncOptions{}
for _, opt := range opts {
opt(&options)
}
if options.preBarrier != nil {
barrierCtx, cancel := context.WithTimeout(context.Background(), timeout)
err := options.preBarrier(barrierCtx)
cancel()
if err != nil {
return fmt.Errorf("pre-barrier: %w", err)
}
}
var allErrors []error
for _, user := range s.users {

View File

@@ -67,6 +67,7 @@ var (
errTailscaleImageRequiredInCI = errors.New("HEADSCALE_INTEGRATION_TAILSCALE_IMAGE must be set in CI for HEAD version")
errContainerNotInitialized = errors.New("container not initialized")
errFQDNNotYetAvailable = errors.New("FQDN not yet available")
errCurlEmptyResponseBody = errors.New("curl returned empty response body")
)
const (
@@ -1532,6 +1533,15 @@ func (t *TailscaleInContainer) Curl(url string, opts ...CurlOption) (string, err
return result, err
}
// curl exit 0 with an empty body usually means a mid-stream reset
// after headers (HTTP 200 with the connection torn down before the
// body arrived). Without this signal, callers wrapping the call in
// EventuallyWithT see assert.NoError pass and assert.Len fail with
// no error to drive a retry.
if result == "" {
return result, fmt.Errorf("%w: %s from %s", errCurlEmptyResponseBody, url, t.Hostname())
}
return result, nil
}
@@ -1547,8 +1557,16 @@ func (t *TailscaleInContainer) CurlFailFast(url string) (string, error) {
}
func (t *TailscaleInContainer) Traceroute(ip netip.Addr) (util.Traceroute, error) {
// -w 1: wait at most 1s for each probe response. busybox's default
// is 5s, which means an EventuallyWithT loop at 200ms ticks
// can spend 25 ticks worth of budget on a single Traceroute.
// -q 1: send 1 probe per hop instead of 3. The HA tests only care
// about the first hop's identity; the other probes are dead
// weight for the same retry budget.
command := []string{
"traceroute",
"-w", "1",
"-q", "1",
ip.String(),
}