From dfcc96d808058a9b0819b626aa018b5531c7264f Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 13 May 2026 13:19:49 +0000 Subject: [PATCH] integration: harden ACL test ergonomics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tsic.Curl returned ("", nil) when curl exited 0 with a zero-byte body — the usual signature of a mid-stream reset — so EventuallyWithT could not retry. Return an error on empty body instead. Replace the 56 inline curl + assert.Len(13) blocks with assertCurlDockerHostname so the empty-body fix benefits every callsite without further touch-ups. Gate ACL waits on actual filter visibility: snapshotClientFilters + waitForClientFilterChange ensure the new PacketFilter has reached the client before assertions fire; SyncOption + WithPreBarrier feeds a server-side policy-loaded check into WaitForTailscaleSyncPerUser. Move advertise-routes mutation out of EventuallyWithT in route_test (cmd/hi/README forbids retrying state-mutating calls). Pace the TestNodeOnlineStatus outer loop with a Ticker, not a Sleep. --- integration/acl_test.go | 278 +++++++++++++++--------------------- integration/general_test.go | 22 +-- integration/helpers.go | 52 +++++++ integration/route_test.go | 211 ++++++++++++--------------- integration/scenario.go | 40 +++++- integration/tsic/tsic.go | 18 +++ 6 files changed, 321 insertions(+), 300 deletions(-) diff --git a/integration/acl_test.go b/integration/acl_test.go index de339fb5..d090390d 100644 --- a/integration/acl_test.go +++ b/integration/acl_test.go @@ -325,7 +325,7 @@ func TestACLHostsInNetMapTable(t *testing.T) { user := status.User[status.Self.UserID].LoginName assert.Len(c, status.Peer, (testCase.want[user])) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for expected peer visibility") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for expected peer visibility") } }) } @@ -372,10 +372,8 @@ func TestACLAllowUser80Dst(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2") + assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2") } } @@ -390,7 +388,7 @@ func TestACLAllowUser80Dst(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlFailWithCollect(c, client, url, "user2 should not reach user1") - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 cannot reach user1") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 cannot reach user1") } } } @@ -437,7 +435,7 @@ func TestACLDenyAllPort80(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlFailWithCollect(c, client, url, "all traffic should be denied") - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying all traffic is denied") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying all traffic is denied") } } } @@ -481,10 +479,8 @@ func TestACLAllowUserDst(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2") + assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2") } } @@ -499,7 +495,7 @@ func TestACLAllowUserDst(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlFailWithCollect(c, client, url, "user2 should not reach user1") - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 cannot reach user1") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 cannot reach user1") } } } @@ -542,10 +538,8 @@ func TestACLAllowStarDst(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2") + assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2") } } @@ -560,7 +554,7 @@ func TestACLAllowStarDst(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlFailWithCollect(c, client, url, "user2 should not reach user1") - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 cannot reach user1") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 cannot reach user1") } } } @@ -608,10 +602,8 @@ func TestACLNamedHostsCanReachBySubnet(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2") + assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2") } } @@ -627,10 +619,8 @@ func TestACLNamedHostsCanReachBySubnet(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user2 can reach user1") + assertCurlDockerHostname(c, client, url, "Verifying user2 can reach user1") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user2 can reach user1") } } } @@ -790,7 +780,7 @@ func TestACLNamedHostsCanReach(t *testing.T) { test3URL, result, ) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test1 should reach test3") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test1 should reach test3") // test2 can query test3 (everyone -> test3) assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -804,7 +794,7 @@ func TestACLNamedHostsCanReach(t *testing.T) { test3URL, result, ) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test2 should reach test3") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test2 should reach test3") // test3 cannot query test1 _, err = test3.CurlFailFast(test1URL) @@ -826,7 +816,7 @@ func TestACLNamedHostsCanReach(t *testing.T) { test2URL, result, ) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test1 should reach test2") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test1 should reach test2") // test2 cannot query test1 _, err = test2.CurlFailFast(test1URL) @@ -981,12 +971,12 @@ func TestACLDevice1CanAccessDevice2(t *testing.T) { test2URL, result, ) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test1 should reach test2") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test1 should reach test2") // test2 cannot query test1 (asymmetric policy) assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlFailWithCollect(c, test2, test1URL, "test2 should not reach test1") - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "test2 should NOT reach test1") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "test2 should NOT reach test1") }) } } @@ -1047,10 +1037,8 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying user1 can reach user2") + assertCurlDockerHostname(c, client, url, "Verifying user1 can reach user2") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying user1 can reach user2") } } @@ -1096,7 +1084,7 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) { if diff := cmp.Diff(p, *output, cmpopts.IgnoreUnexported(policyv2.Policy{}), cmpopts.EquateEmpty()); diff != "" { ct.Errorf("unexpected policy(-want +got):\n%s", diff) } - }, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second, "verifying that the new policy took place") + }, integrationutil.StatusReadyTimeout, 1*time.Second, "verifying that the new policy took place") assert.EventuallyWithT(t, func(ct *assert.CollectT) { // Test that user1 can visit all user2 @@ -1108,9 +1096,7 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) { url := fmt.Sprintf("http://%s/etc/hostname", fqdn) t.Logf("url from %s to %s", client.Hostname(), url) - result, err := client.Curl(url) - assert.Len(ct, result, 13) - assert.NoError(ct, err) + assertCurlDockerHostname(ct, client, url, "new policy did not get propagated to nodes") } } @@ -1126,7 +1112,7 @@ func TestPolicyUpdateWhileRunningWithCLIInDatabase(t *testing.T) { assertCurlFailWithCollect(ct, client, url, "user2 should not reach user1") } } - }, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second, "new policy did not get propagated to nodes") + }, integrationutil.StatusReadyTimeout, 1*time.Second, "new policy did not get propagated to nodes") } func TestACLAutogroupMember(t *testing.T) { @@ -1165,7 +1151,7 @@ func TestACLAutogroupMember(t *testing.T) { clientIsUntagged = status.Self.Tags == nil || status.Self.Tags.Len() == 0 assert.True(c, clientIsUntagged, "Expected client %s to be untagged for autogroup:member test", client.Hostname()) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for client %s to be untagged", client.Hostname()) + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for client %s to be untagged", client.Hostname()) if !clientIsUntagged { continue @@ -1184,7 +1170,7 @@ func TestACLAutogroupMember(t *testing.T) { peerIsUntagged = status.Self.Tags == nil || status.Self.Tags.Len() == 0 assert.True(c, peerIsUntagged, "Expected peer %s to be untagged for autogroup:member test", peer.Hostname()) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for peer %s to be untagged", peer.Hostname()) + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for peer %s to be untagged", peer.Hostname()) if !peerIsUntagged { continue @@ -1197,10 +1183,8 @@ func TestACLAutogroupMember(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "Verifying autogroup:member connectivity") + assertCurlDockerHostname(c, client, url, "Verifying autogroup:member connectivity") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "Verifying autogroup:member connectivity") } } } @@ -1376,7 +1360,7 @@ func TestACLAutogroupTagged(t *testing.T) { untaggedClients = append(untaggedClients, client) } } - }, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second, "verifying peer visibility for node %s", hostname) + }, integrationutil.StatusReadyTimeout, 1*time.Second, "verifying peer visibility for node %s", hostname) } // Verify we have the expected number of tagged and untagged nodes @@ -1390,7 +1374,7 @@ func TestACLAutogroupTagged(t *testing.T) { assert.NoError(c, err) assert.NotNil(c, status.Self.Tags, "tagged node %s should have tags", client.Hostname()) assert.Positive(c, status.Self.Tags.Len(), "tagged node %s should have at least one tag", client.Hostname()) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for tags to be applied to tagged nodes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for tags to be applied to tagged nodes") } // Verify untagged nodes have no tags @@ -1402,7 +1386,7 @@ func TestACLAutogroupTagged(t *testing.T) { if status.Self.Tags != nil { assert.Equal(c, 0, status.Self.Tags.Len(), "untagged node %s should have no tags", client.Hostname()) } - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting to verify untagged nodes have no tags") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting to verify untagged nodes have no tags") } // Test that tagged nodes can communicate with each other @@ -1420,10 +1404,8 @@ func TestACLAutogroupTagged(t *testing.T) { t.Logf("Testing connection from tagged node %s to tagged node %s", client.Hostname(), peer.Hostname()) assert.EventuallyWithT(t, func(ct *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(ct, err) - assert.Len(ct, result, 13) - }, integrationutil.ScaledTimeout(20*time.Second), 500*time.Millisecond, "tagged nodes should be able to communicate") + assertCurlDockerHostname(ct, client, url, "tagged nodes should be able to communicate") + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.SlowPoll, "tagged nodes should be able to communicate") } } @@ -1442,7 +1424,7 @@ func TestACLAutogroupTagged(t *testing.T) { result, err := client.CurlFailFast(url) assert.Empty(ct, result) assert.Error(ct, err) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "untagged nodes should not be able to reach tagged nodes") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "untagged nodes should not be able to reach tagged nodes") } // Try to reach other untagged nodes (should also fail) @@ -1462,7 +1444,7 @@ func TestACLAutogroupTagged(t *testing.T) { result, err := client.CurlFailFast(url) assert.Empty(ct, result) assert.Error(ct, err) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "untagged nodes should not be able to reach other untagged nodes") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "untagged nodes should not be able to reach other untagged nodes") } } @@ -1480,7 +1462,7 @@ func TestACLAutogroupTagged(t *testing.T) { result, err := client.CurlFailFast(url) assert.Empty(ct, result) assert.Error(ct, err) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "tagged nodes should not be able to reach untagged nodes") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "tagged nodes should not be able to reach untagged nodes") } } } @@ -1664,10 +1646,8 @@ func TestACLAutogroupSelf(t *testing.T) { t.Logf("url from %s (user1) to %s (user1)", client.Hostname(), fqdn) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user1 device should reach other user1 device via autogroup:self") + assertCurlDockerHostname(c, client, url, "user1 device should reach other user1 device via autogroup:self") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user1 device should reach other user1 device via autogroup:self") } } @@ -1685,10 +1665,8 @@ func TestACLAutogroupSelf(t *testing.T) { t.Logf("url from %s (user2) to %s (user2)", client.Hostname(), fqdn) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user2 device should reach other user2 device via autogroup:self") + assertCurlDockerHostname(c, client, url, "user2 device should reach other user2 device via autogroup:self") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user2 device should reach other user2 device via autogroup:self") } } @@ -1704,7 +1682,7 @@ func TestACLAutogroupSelf(t *testing.T) { result, err := client.Curl(url) assert.NoError(c, err) assert.NotEmpty(c, result, "user1 should be able to access router-node via group:home -> tag:router-node rule") - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user1 device should reach router-node (proves autogroup:self doesn't interfere)") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user1 device should reach router-node (proves autogroup:self doesn't interfere)") } // Test that user2's regular devices can access router-node @@ -1719,7 +1697,7 @@ func TestACLAutogroupSelf(t *testing.T) { result, err := client.Curl(url) assert.NoError(c, err) assert.NotEmpty(c, result, "user2 should be able to access router-node via group:home -> tag:router-node rule") - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "user2 device should reach router-node (proves autogroup:self doesn't interfere)") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "user2 device should reach router-node (proves autogroup:self doesn't interfere)") } // Test that devices from different users cannot access each other's regular devices @@ -1867,20 +1845,32 @@ func TestACLPolicyPropagationOverTime(t *testing.T) { assert.Len(ct, result, 13, "iteration %d: response from %s to %s should be valid", iteration, client.Hostname(), fqdn) } } - }, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 1 - all connectivity tests with allow-all policy", iteration) + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 1 - all connectivity tests with allow-all policy", iteration) // Phase 2: Autogroup:self policy (only same user can access) t.Logf("Iteration %d: Phase 2 - Setting autogroup:self policy", iteration) + preFilters := snapshotClientFilters(t, allClients) + err = headscale.SetPolicy(autogroupSelfPolicy) require.NoError(t, err) // Wait for peer lists to sync with autogroup:self - ensures cross-user peers are removed t.Logf("Iteration %d: Phase 2 - Waiting for peer lists to sync with autogroup:self", iteration) - err = scenario.WaitForTailscaleSyncPerUser(integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond) + err = scenario.WaitForTailscaleSyncPerUser( + integrationutil.PolicyPropagationTimeout, + integrationutil.SlowPoll, + ) require.NoError(t, err, "iteration %d: Phase 2 - failed to sync after autogroup:self policy", iteration) + // Gate on each client's wgengine having applied the new + // PacketFilter before asserting reachability. WaitForTailscale + // SyncPerUser only checks peer-count parity, which trips + // before the filter rules have actually replaced the + // allow-all rules. + waitForClientFilterChange(t, allClients, preFilters, integrationutil.PolicyPropagationTimeout) + // Test ALL connectivity (positive and negative) in one block after state is settled t.Logf("Iteration %d: Phase 2 - Testing all connectivity with autogroup:self", iteration) assert.EventuallyWithT(t, func(ct *assert.CollectT) { @@ -1947,7 +1937,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) { assertCurlFailWithCollect(ct, client, url, fmt.Sprintf("iteration %d: user2 %s should NOT reach user1 %s", iteration, client.Hostname(), peer.Hostname())) } } - }, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 2 - all connectivity tests with autogroup:self", iteration) + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 2 - all connectivity tests with autogroup:self", iteration) // Phase 2b: Add a new node to user1 and validate policy propagation t.Logf("Iteration %d: Phase 2b - Adding new node to user1 during autogroup:self policy", iteration) @@ -2011,7 +2001,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) { assertCurlFailWithCollect(ct, client, url, fmt.Sprintf("iteration %d: user1 %s should NOT reach user2 %s", iteration, client.Hostname(), peer.Hostname())) } } - }, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 2b - all connectivity tests after new node addition", iteration) + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 2b - all connectivity tests after new node addition", iteration) // Delete the newly added node before Phase 3 t.Logf("Iteration %d: Phase 2b - Deleting the newly added node from user1", iteration) @@ -2033,7 +2023,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) { nodeToDeleteID = node.GetId() } } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "iteration %d: Phase 2b - listing nodes before deletion", iteration) + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "iteration %d: Phase 2b - listing nodes before deletion", iteration) // Delete the node via headscale helper t.Logf("Iteration %d: Phase 2b - Deleting node ID %d from headscale", iteration, nodeToDeleteID) @@ -2066,7 +2056,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) { nodeListAfter, err := headscale.ListNodes("user1") assert.NoError(ct, err, "failed to list nodes after deletion") assert.Len(ct, nodeListAfter, 2, "iteration %d: should have 2 user1 nodes after deletion, got %d", iteration, len(nodeListAfter)) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "iteration %d: Phase 2b - node should be deleted", iteration) + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "iteration %d: Phase 2b - node should be deleted", iteration) // Wait for sync after deletion to ensure peer counts are correct // Use WaitForTailscaleSyncPerUser because autogroup:self is still active, @@ -2128,7 +2118,7 @@ func TestACLPolicyPropagationOverTime(t *testing.T) { assertCurlFailWithCollect(ct, client, url, fmt.Sprintf("iteration %d: user2 %s should NOT reach user1 %s", iteration, client.Hostname(), peer.Hostname())) } } - }, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "iteration %d: Phase 3 - all connectivity tests with directional policy", iteration) + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "iteration %d: Phase 3 - all connectivity tests with directional policy", iteration) t.Logf("=== Iteration %d/5 completed successfully - All 3 phases passed ===", iteration) } @@ -2591,7 +2581,7 @@ func TestACLTagPropagation(t *testing.T) { } else { assertCurlFailWithCollect(c, sourceClient, targetURL, "initial access should fail") } - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "verifying initial access state") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "verifying initial access state") // Step 1b: Verify initial NetMap visibility t.Logf("Step 1b: Verifying initial NetMap visibility (expect visible=%v)", tt.initialAccess) @@ -2614,7 +2604,7 @@ func TestACLTagPropagation(t *testing.T) { } else { assert.False(c, found, "Target should NOT be visible in NetMap initially") } - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "verifying initial NetMap visibility") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "verifying initial NetMap visibility") // Step 2: Apply tag change t.Logf("Step 2: Setting tags on node %d to %v", targetNodeID, tt.tagChange) @@ -2632,7 +2622,7 @@ func TestACLTagPropagation(t *testing.T) { if node != nil { assert.ElementsMatch(c, tt.tagChange, node.GetTags(), "Tags should be updated") } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "verifying tag change applied") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "verifying tag change applied") // Step 3: Verify final NetMap visibility first (fast signal that // the MapResponse propagated to the client). @@ -2660,7 +2650,7 @@ func TestACLTagPropagation(t *testing.T) { } else { assert.False(c, found, "Target should NOT be visible in NetMap after tag change") } - }, integrationutil.ScaledTimeout(120*time.Second), 500*time.Millisecond, "verifying NetMap visibility propagated after tag change") + }, integrationutil.HASlowConvergeTimeout, integrationutil.SlowPoll, "verifying NetMap visibility propagated after tag change") // Step 4: Verify final access state (this is the key test for #2389). // Even though Step 3 confirmed the MapResponse arrived, the full @@ -2674,7 +2664,7 @@ func TestACLTagPropagation(t *testing.T) { } else { assertCurlFailWithCollect(c, sourceClient, targetURL, "final access should fail after tag change") } - }, integrationutil.ScaledTimeout(120*time.Second), 500*time.Millisecond, "verifying access propagated after tag change") + }, integrationutil.HASlowConvergeTimeout, integrationutil.SlowPoll, "verifying access propagated after tag change") t.Logf("Test %s PASSED: Tag change propagated correctly", tt.name) }) @@ -2823,7 +2813,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) { t.Log("Step 1: Verifying HTTP access with tag:webserver (should succeed)") assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlSuccessWithCollect(c, user2Node, targetURL, "HTTP should work with tag:webserver") - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "initial HTTP access with tag:webserver") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "initial HTTP access with tag:webserver") // Step 2: Change tag from webserver to sshonly t.Logf("Step 2: Changing tag from webserver to sshonly on node %d", targetNodeID) @@ -2844,7 +2834,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) { if node != nil { assert.ElementsMatch(c, []string{"tag:sshonly"}, node.GetTags(), "Tags should be updated to sshonly") } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "verifying tag change applied") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "verifying tag change applied") // Step 3: Verify peer is still visible in NetMap (partial access, not full removal) t.Log("Step 3: Verifying peer remains visible in NetMap after tag change") @@ -2863,7 +2853,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) { } assert.True(c, found, "Peer should still be visible with tag:sshonly (port 22 access)") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "peer visibility after tag change") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "peer visibility after tag change") // Step 4: Verify HTTP on port 80 now fails (tag:sshonly only allows port 22). // Port-specific filter changes are harder than peer removal because @@ -2872,7 +2862,7 @@ func TestACLTagPropagationPortSpecific(t *testing.T) { t.Log("Step 4: Verifying HTTP access is now blocked (tag:sshonly only allows port 22)") assert.EventuallyWithT(t, func(c *assert.CollectT) { assertCurlFailWithCollect(c, user2Node, targetURL, "HTTP should fail with tag:sshonly (only port 22 allowed)") - }, integrationutil.ScaledTimeout(90*time.Second), 500*time.Millisecond, "HTTP blocked after tag change to sshonly") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "HTTP blocked after tag change to sshonly") t.Log("Test PASSED: Port-specific ACL changes propagated correctly") } @@ -2963,19 +2953,15 @@ func TestACLGroupWithUnknownUser(t *testing.T) { t.Log("Testing connectivity: user1 -> user2 (should succeed despite unknown user in group)") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should be able to reach user2") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "user1 should reach user2") + assertCurlDockerHostname(c, user1, url, "user1 should be able to reach user2") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "user1 should reach user2") // Test that user2 can reach user1 (bidirectional) t.Log("Testing connectivity: user2 -> user1 (should succeed despite unknown user in group)") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should be able to reach user1") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "user2 should reach user1") + assertCurlDockerHostname(c, user2, url, "user2 should be able to reach user1") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "user2 should reach user1") t.Log("Test PASSED: Valid users can communicate despite unknown user reference in group") } @@ -3069,17 +3055,13 @@ func TestACLGroupAfterUserDeletion(t *testing.T) { t.Log("Step 1: Verifying initial connectivity between all users") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should be able to reach user2 initially") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "initial user1 -> user2 connectivity") + assertCurlDockerHostname(c, user1, url, "user1 should be able to reach user2 initially") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "initial user1 -> user2 connectivity") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should be able to reach user1 initially") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(30*time.Second), 500*time.Millisecond, "initial user2 -> user1 connectivity") + assertCurlDockerHostname(c, user2, url, "user2 should be able to reach user1 initially") + }, integrationutil.StatusReadyTimeout, integrationutil.SlowPoll, "initial user2 -> user1 connectivity") // Step 2: Get user3's node and user, then delete them t.Log("Step 2: Deleting user3's node and user from headscale") @@ -3114,10 +3096,8 @@ func TestACLGroupAfterUserDeletion(t *testing.T) { // Test that user1 can still reach user2 assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should still be able to reach user2 after user3 deletion (stale cache)") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after user3 deletion") + assertCurlDockerHostname(c, user1, url, "user1 should still be able to reach user2 after user3 deletion (stale cache)") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user1 -> user2 after user3 deletion") // Step 4: Create a NEW user - this triggers updatePolicyManagerUsers() which // re-evaluates the policy. According to issue #2967, this is when the bug manifests: @@ -3139,18 +3119,14 @@ func TestACLGroupAfterUserDeletion(t *testing.T) { // Test that user1 can still reach user2 AFTER the policy refresh triggered by user creation assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should still reach user2 after policy refresh (BUG if this fails)") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after policy refresh (issue #2967)") + assertCurlDockerHostname(c, user1, url, "user1 should still reach user2 after policy refresh (BUG if this fails)") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user2 after policy refresh (issue #2967)") // Test that user2 can still reach user1 assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should still reach user1 after policy refresh (BUG if this fails)") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user2 -> user1 after policy refresh (issue #2967)") + assertCurlDockerHostname(c, user2, url, "user2 should still reach user1 after policy refresh (BUG if this fails)") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user2 -> user1 after policy refresh (issue #2967)") t.Log("Test PASSED: Remaining users can communicate after deleted user and policy refresh") } @@ -3256,17 +3232,13 @@ func TestACLGroupDeletionExactReproduction(t *testing.T) { t.Log("Step 1: Verifying initial connectivity (user1 <-> user3)") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user3FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should reach user3") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user3") + assertCurlDockerHostname(c, user1, url, "user1 should reach user3") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user3") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user3.Curl(url) - assert.NoError(c, err, "user3 should reach user1") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user3 -> user1") + assertCurlDockerHostname(c, user3, url, "user3 should reach user1") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user3 -> user1") t.Log("Step 1: PASSED - initial connectivity works") @@ -3293,17 +3265,13 @@ func TestACLGroupDeletionExactReproduction(t *testing.T) { t.Log("Step 3: Verifying connectivity STILL works after user2 deletion") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user3FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should still reach user3 after user2 deletion") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user3 after user2 deletion") + assertCurlDockerHostname(c, user1, url, "user1 should still reach user3 after user2 deletion") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user3 after user2 deletion") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user3.Curl(url) - assert.NoError(c, err, "user3 should still reach user1 after user2 deletion") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user3 -> user1 after user2 deletion") + assertCurlDockerHostname(c, user3, url, "user3 should still reach user1 after user2 deletion") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user3 -> user1 after user2 deletion") t.Log("Step 3: PASSED - connectivity works after user2 deletion") @@ -3322,17 +3290,13 @@ func TestACLGroupDeletionExactReproduction(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user3FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "BUG #2967: user1 should still reach user3 after user4 creation") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user3 after user4 creation (issue #2967)") + assertCurlDockerHostname(c, user1, url, "BUG #2967: user1 should still reach user3 after user4 creation") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user1 -> user3 after user4 creation (issue #2967)") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user3.Curl(url) - assert.NoError(c, err, "BUG #2967: user3 should still reach user1 after user4 creation") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user3 -> user1 after user4 creation (issue #2967)") + assertCurlDockerHostname(c, user3, url, "BUG #2967: user3 should still reach user1 after user4 creation") + }, integrationutil.PolicyPropagationTimeout, integrationutil.SlowPoll, "user3 -> user1 after user4 creation (issue #2967)") // Additional verification: check filter rules are not empty filter, err := headscale.DebugFilter() @@ -3432,17 +3396,13 @@ func TestACLDynamicUnknownUserAddition(t *testing.T) { t.Log("Step 1: Verifying initial connectivity with valid policy (no unknown users)") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should reach user2") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user1 -> user2") + assertCurlDockerHostname(c, user1, url, "user1 should reach user2") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user1 -> user2") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should reach user1") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user2 -> user1") + assertCurlDockerHostname(c, user2, url, "user2 should reach user1") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user2 -> user1") t.Log("Step 1: PASSED - connectivity works with valid policy") @@ -3483,17 +3443,13 @@ func TestACLDynamicUnknownUserAddition(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should STILL reach user2 after adding unknown user") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after unknown user added") + assertCurlDockerHostname(c, user1, url, "user1 should STILL reach user2 after adding unknown user") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user1 -> user2 after unknown user added") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should STILL reach user1 after adding unknown user") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user2 -> user1 after unknown user added") + assertCurlDockerHostname(c, user2, url, "user2 should STILL reach user1 after adding unknown user") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user2 -> user1 after unknown user added") t.Log("Step 3: PASSED - connectivity maintained after adding unknown user") t.Log("Test PASSED: v0.28.0-beta.1 scenario - unknown user added dynamically, valid users still work") @@ -3589,17 +3545,13 @@ func TestACLDynamicUnknownUserRemoval(t *testing.T) { t.Log("Step 1: Verifying connectivity with unknown user in policy (v2 graceful handling)") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should reach user2 even with unknown user in policy") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user1 -> user2 with unknown") + assertCurlDockerHostname(c, user1, url, "user1 should reach user2 even with unknown user in policy") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user1 -> user2 with unknown") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should reach user1 even with unknown user in policy") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "initial user2 -> user1 with unknown") + assertCurlDockerHostname(c, user2, url, "user2 should reach user1 even with unknown user in policy") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "initial user2 -> user1 with unknown") t.Log("Step 1: PASSED - connectivity works even with unknown user (v2 graceful handling)") @@ -3638,17 +3590,13 @@ func TestACLDynamicUnknownUserRemoval(t *testing.T) { assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user2FQDN) - result, err := user1.Curl(url) - assert.NoError(c, err, "user1 should reach user2 after removing unknown user") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user1 -> user2 after unknown removed") + assertCurlDockerHostname(c, user1, url, "user1 should reach user2 after removing unknown user") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user1 -> user2 after unknown removed") assert.EventuallyWithT(t, func(c *assert.CollectT) { url := fmt.Sprintf("http://%s/etc/hostname", user1FQDN) - result, err := user2.Curl(url) - assert.NoError(c, err, "user2 should reach user1 after removing unknown user") - assert.Len(c, result, 13, "expected hostname response") - }, integrationutil.ScaledTimeout(60*time.Second), 500*time.Millisecond, "user2 -> user1 after unknown removed") + assertCurlDockerHostname(c, user2, url, "user2 should reach user1 after removing unknown user") + }, integrationutil.HAConvergeTimeout, integrationutil.SlowPoll, "user2 -> user1 after unknown removed") t.Log("Step 3: PASSED - connectivity maintained after removing unknown user") t.Log("Test PASSED: Removing unknown users from policy works correctly") diff --git a/integration/general_test.go b/integration/general_test.go index 71d4371a..02a2b53b 100644 --- a/integration/general_test.go +++ b/integration/general_test.go @@ -13,7 +13,6 @@ import ( v1 "github.com/juanfont/headscale/gen/go/headscale/v1" "github.com/juanfont/headscale/hscontrol/types" "github.com/juanfont/headscale/integration/hsic" - "tailscale.com/util/dnsname" "github.com/juanfont/headscale/integration/integrationutil" "github.com/juanfont/headscale/integration/tsic" "github.com/rs/zerolog/log" @@ -23,6 +22,7 @@ import ( "golang.org/x/sync/errgroup" "tailscale.com/client/tailscale/apitype" "tailscale.com/types/key" + "tailscale.com/util/dnsname" ) func TestPingAllByIP(t *testing.T) { @@ -211,7 +211,7 @@ func testEphemeralWithOptions(t *testing.T, opts ...hsic.Option) { nodes, err := headscale.ListNodes() assert.NoError(ct, err) assert.Len(ct, nodes, 0, "All ephemeral nodes should be cleaned up after logout") - }, integrationutil.ScaledTimeout(30*time.Second), 2*time.Second) + }, integrationutil.StatusReadyTimeout, 2*time.Second) } // TestEphemeral2006DeletedTooQuickly verifies that ephemeral nodes are not @@ -298,7 +298,7 @@ func TestEphemeral2006DeletedTooQuickly(t *testing.T) { assert.NoError(ct, err) assertPingAllWithCollect(ct, allClients, allAddrs) - }, integrationutil.ScaledTimeout(60*time.Second), 2*time.Second) + }, integrationutil.HAConvergeTimeout, 2*time.Second) // Take down all clients, this should start an expiry timer for each. for _, client := range allClients { @@ -894,7 +894,7 @@ func TestUpdateHostnameFromClient(t *testing.T) { } } } - }, integrationutil.ScaledTimeout(60*time.Second), 2*time.Second) + }, integrationutil.HAConvergeTimeout, 2*time.Second) for _, client := range allClients { status := client.MustStatus() @@ -980,7 +980,7 @@ func TestExpireNode(t *testing.T) { // Assert that we have the original count - self assert.Len(ct, status.Peers(), spec.NodesPerUser-1, "Client %s should see correct number of peers", client.Hostname()) - }, integrationutil.ScaledTimeout(30*time.Second), 1*time.Second) + }, integrationutil.StatusReadyTimeout, 1*time.Second) } headscale, err := scenario.Headscale() @@ -1069,7 +1069,7 @@ func TestExpireNode(t *testing.T) { ) } } - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for expired node status to propagate") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for expired node status to propagate") } } @@ -1307,7 +1307,7 @@ func TestNodeOnlineStatus(t *testing.T) { // Assert that we have the original count - self assert.Len(c, status.Peers(), len(MustTestVersions)-1) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for expected peer count") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for expected peer count") } headscale, err := scenario.Headscale() @@ -1320,6 +1320,11 @@ func TestNodeOnlineStatus(t *testing.T) { log.Printf("Starting online test from %v to %v", start, end) + // Pace the outer loop at one iteration per second so the + // continuous online-check does not hammer the docker daemon. + tick := time.NewTicker(time.Second) + defer tick.Stop() + for { // Let the test run continuously for X minutes to verify // all nodes stay connected and has the expected status over time. @@ -1384,8 +1389,7 @@ func TestNodeOnlineStatus(t *testing.T) { }, integrationutil.ScaledTimeout(15*time.Second), 1*time.Second) } - // Check maximum once per second - time.Sleep(time.Second) + <-tick.C } } diff --git a/integration/helpers.go b/integration/helpers.go index f51f4703..aadb6309 100644 --- a/integration/helpers.go +++ b/integration/helpers.go @@ -8,6 +8,7 @@ import ( "io" "maps" "net/netip" + "reflect" "slices" "strconv" "strings" @@ -27,6 +28,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "tailscale.com/tailcfg" + "tailscale.com/wgengine/filter" ) const ( @@ -558,6 +560,56 @@ func assertCurlSuccessWithCollect(c *assert.CollectT, client TailscaleClient, ur assert.NotEmpty(c, result, msg) } +// assertCurlDockerHostname curls url and asserts the body is the +// 13-byte Docker auto-generated container hostname (12 hex chars + +// trailing newline from /etc/hostname). For use inside EventuallyWithT. +func assertCurlDockerHostname(c *assert.CollectT, client TailscaleClient, url, msg string) { + const dockerHostnameLen = 13 + + result, err := client.Curl(url) + assert.NoError(c, err, msg) //nolint:testifylint // CollectT requires assert, not require + assert.Len(c, result, dockerHostnameLen, msg) +} + +// snapshotClientFilters snapshots each client's current netmap +// PacketFilter keyed by hostname. Pair with waitForClientFilterChange. +func snapshotClientFilters(t *testing.T, clients []TailscaleClient) map[string][]filter.Match { + t.Helper() + + out := make(map[string][]filter.Match, len(clients)) + + for _, c := range clients { + nm, err := c.Netmap() + require.NoError(t, err, "snapshot netmap for %s", c.Hostname()) + + out[c.Hostname()] = nm.PacketFilter + } + + return out +} + +// waitForClientFilterChange polls each client until its netmap +// PacketFilter differs from baselines[Hostname]. Use after SetPolicy +// to gate on client-side filter application before asserting +// reachability. +func waitForClientFilterChange(t *testing.T, clients []TailscaleClient, baselines map[string][]filter.Match, timeout time.Duration) { + t.Helper() + + for _, client := range clients { + c := client + baseline := baselines[c.Hostname()] + + assert.EventuallyWithT(t, func(ct *assert.CollectT) { + nm, err := c.Netmap() + if !assert.NoError(ct, err, "fetch netmap for %s", c.Hostname()) { + return + } + + assert.False(ct, reflect.DeepEqual(baseline, nm.PacketFilter), "client %s PacketFilter unchanged since baseline", c.Hostname()) + }, timeout, integrationutil.SlowPoll, "client %s PacketFilter should change after SetPolicy", c.Hostname()) + } +} + // assertCurlFailWithCollect asserts that a curl request fails. Uses // CurlFailFast internally for aggressive timeouts, avoiding wasted // time on retries when we expect the connection to be blocked. diff --git a/integration/route_test.go b/integration/route_test.go index a9b1f7f5..911e5c84 100644 --- a/integration/route_test.go +++ b/integration/route_test.go @@ -114,7 +114,7 @@ func TestEnablingRoutes(t *testing.T) { assert.Nil(c, peerStatus.PrimaryRoutes) } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes are active before approval") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval") } for _, node := range nodes { @@ -159,7 +159,7 @@ func TestEnablingRoutes(t *testing.T) { requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes[string(peerStatus.ID)])}) } } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "clients should see new routes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "clients should see new routes") _, err = headscale.ApproveRoutes( 1, @@ -195,7 +195,7 @@ func TestEnablingRoutes(t *testing.T) { assert.Len(c, node.GetSubnetRoutes(), 1) // 10.0.2.0/24 } } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes") // Verify that the clients can see the new routes for _, client := range allClients { @@ -215,7 +215,7 @@ func TestEnablingRoutes(t *testing.T) { requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{netip.MustParsePrefix("10.0.2.0/24")}) } } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying final route state visible to clients") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying final route state visible to clients") } } @@ -223,7 +223,10 @@ func TestEnablingRoutes(t *testing.T) { func TestHASubnetRouterFailover(t *testing.T) { IntegrationSkip(t) - propagationTime := integrationutil.ScaledTimeout(60 * time.Second) + // HASlowConverge (not HAConverge): tailscale's wgengine sometimes + // keeps routing through the previous primary for ~2 min after + // the netmap update flips PrimaryRoutes server-side. + propagationTime := integrationutil.HASlowConvergeTimeout // Helper function to validate primary routes table state validatePrimaryRoutes := func(t *testing.T, headscale ControlServer, expectedRoutes *types.DebugRoutes, message string) { @@ -458,9 +461,7 @@ func TestHASubnetRouterFailover(t *testing.T) { t.Logf("[%s] Starting test section", time.Now().Format(TimestampFormat)) t.Logf(" Expected: Traffic flows through router 1 as it's the only approved route") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -513,7 +514,7 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0) requireNodeRouteCountWithCollect(c, nodes[2], 1, 0, 0) } - }, integrationutil.ScaledTimeout(3*time.Second), 200*time.Millisecond, "HA setup verification: Router 2 approved as STANDBY (available=1, approved=1, subnet=0), Router 1 stays PRIMARY (subnet=1)") + }, integrationutil.ScaledTimeout(3*time.Second), integrationutil.FastPoll, "HA setup verification: Router 2 approved as STANDBY (available=1, approved=1, subnet=0), Router 1 stays PRIMARY (subnet=1)") // Verify that the client has routes from the primary machine assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -575,9 +576,7 @@ func TestHASubnetRouterFailover(t *testing.T) { t.Logf(" Expected: Router 1 continues to handle all traffic (no change from before)") t.Logf(" Expected: Router 2 is ready to take over if router 1 fails") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 in HA mode") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 in HA mode") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -630,7 +629,7 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) requireNodeRouteCountWithCollect(c, nodes[1], 1, 1, 0) requireNodeRouteCountWithCollect(c, nodes[2], 1, 1, 0) - }, integrationutil.ScaledTimeout(3*time.Second), 200*time.Millisecond, "Full HA verification: Router 3 approved as second STANDBY (available=1, approved=1, subnet=0), Router 1 PRIMARY, Router 2 first STANDBY") + }, integrationutil.ScaledTimeout(3*time.Second), integrationutil.FastPoll, "Full HA verification: Router 3 approved as second STANDBY (available=1, approved=1, subnet=0), Router 1 PRIMARY, Router 2 first STANDBY") // Verify that the client has routes from the primary machine assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -673,9 +672,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Verifying full HA with 3 routers: Router 1 PRIMARY, Routers 2 & 3 STANDBY") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 with full HA") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 with full HA") // Wait for traceroute to work correctly through the expected router @@ -766,9 +763,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Failover verification: Router 1 offline, Router 2 should be new PRIMARY with routes, Router 3 still STANDBY") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after failover") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after failover") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -840,9 +835,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Second failover verification: Router 1 & 2 offline, Router 3 should be new PRIMARY (last router standing) with routes") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 3 after second failover") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after second failover") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -920,9 +913,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Recovery verification: Router 1 back online as STANDBY, Router 3 remains PRIMARY (no flapping) with routes") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can still reach webservice through router 3 after router 1 recovery") }, propagationTime, 200*time.Millisecond, "Verifying client can still reach webservice through router 3 after router 1 recovery") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1000,12 +991,10 @@ func TestHASubnetRouterFailover(t *testing.T) { pref, ) } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "Full recovery verification: All 3 routers online, Router 3 remains PRIMARY (no flapping) with routes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Full recovery verification: All 3 routers online, Router 3 remains PRIMARY (no flapping) with routes") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 3 after full recovery") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 3 after full recovery") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1054,7 +1043,7 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 1, 1) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 0) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "Route disable verification: Router 3 route disabled, Router 1 should be new PRIMARY, Router 2 STANDBY") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Route disable verification: Router 3 route disabled, Router 1 should be new PRIMARY, Router 2 STANDBY") // Verify that the route is announced from subnet router 1 assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1090,9 +1079,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Verifying Router 1 becomes PRIMARY after Router 3 route disabled") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 1 after route disable") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 1 after route disable") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1142,7 +1129,7 @@ func TestHASubnetRouterFailover(t *testing.T) { requireNodeRouteCountWithCollect(c, MustFindNode(subRouter1.Hostname(), nodes), 1, 0, 0) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter2.Hostname(), nodes), 1, 1, 1) requireNodeRouteCountWithCollect(c, MustFindNode(subRouter3.Hostname(), nodes), 1, 0, 0) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "Second route disable verification: Router 1 route disabled, Router 2 should be new PRIMARY") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "Second route disable verification: Router 1 route disabled, Router 2 should be new PRIMARY") // Verify that the route is announced from subnet router 1 assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1178,9 +1165,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Verifying Router 2 becomes PRIMARY after Router 1 route disabled") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after second route disable") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after second route disable") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1265,9 +1250,7 @@ func TestHASubnetRouterFailover(t *testing.T) { }, propagationTime, 200*time.Millisecond, "Verifying Router 2 remains PRIMARY after Router 1 route re-enabled") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "Verifying client can reach webservice through router 2 after route re-enable") }, propagationTime, 200*time.Millisecond, "Verifying client can reach webservice through router 2 after route re-enable") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1410,21 +1393,33 @@ func TestSubnetRouteACL(t *testing.T) { client := allClients[1] + // Read Self.ID inside a retry (it lags initial connection); apply + // the route mutation outside, since retrying a mutation hides + // real failures. for _, client := range allClients { - assert.EventuallyWithT(t, func(c *assert.CollectT) { - status, err := client.Status() - assert.NoError(c, err) + var status *ipnstate.Status - if route, ok := expectedRoutes[string(status.Self.ID)]; ok { - command := []string{ - "tailscale", - "set", - "--advertise-routes=" + route, - } - _, _, err = client.Execute(command) - assert.NoErrorf(c, err, "failed to advertise route: %s", err) + require.EventuallyWithT(t, func(c *assert.CollectT) { + s, err := client.Status() + assert.NoError(c, err) + assert.NotNil(c, s) + + if s != nil { + status = s } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Configuring route advertisements") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Reading client status before route advertisement") + + route, ok := expectedRoutes[string(status.Self.ID)] + if !ok { + continue + } + + _, _, err = client.Execute([]string{ + "tailscale", + "set", + "--advertise-routes=" + route, + }) + require.NoErrorf(t, err, "failed to advertise route on %s", client.Hostname()) } err = scenario.WaitForTailscaleSync() @@ -1478,7 +1473,7 @@ func TestSubnetRouteACL(t *testing.T) { assert.Nil(c, peerStatus.PrimaryRoutes) requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes are active before approval") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval") } _, err = headscale.ApproveRoutes( @@ -1495,7 +1490,7 @@ func TestSubnetRouteACL(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) requireNodeRouteCountWithCollect(c, nodes[1], 0, 0, 0) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes") // Verify that the client has routes from the primary machine assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1514,7 +1509,7 @@ func TestSubnetRouteACL(t *testing.T) { } requirePeerSubnetRoutesWithCollect(c, srs1PeerStatus, []netip.Prefix{netip.MustParsePrefix(expectedRoutes["1"])}) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying client can see subnet routes from router") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying client can see subnet routes from router") // Wait for packet filter updates to propagate to client netmap wantClientFilter := []filter.Match{ @@ -1549,7 +1544,7 @@ func TestSubnetRouteACL(t *testing.T) { if diff := cmpdiff.Diff(wantClientFilter, clientNm.PacketFilter, util.ViewSliceIPProtoComparer, util.PrefixComparer); diff != "" { assert.Fail(c, fmt.Sprintf("Client (%s) filter, unexpected result (-want +got):\n%s", client.Hostname(), diff)) } - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for client packet filter to update") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for client packet filter to update") // Wait for packet filter updates to propagate to subnet router netmap // The two ACL rules (group:admins -> group:admins:* and group:admins -> 10.33.0.0/16:*) @@ -1590,7 +1585,7 @@ func TestSubnetRouteACL(t *testing.T) { if diff := cmpdiff.Diff(wantSubnetFilter, subnetNm.PacketFilter, util.ViewSliceIPProtoComparer, util.PrefixComparer); diff != "" { assert.Fail(c, fmt.Sprintf("Subnet (%s) filter, unexpected result (-want +got):\n%s", subRouter1.Hostname(), diff)) } - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for subnet router packet filter to update") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for subnet router packet filter to update") } // TestEnablingExitRoutes tests enabling exit routes for clients. @@ -1639,7 +1634,7 @@ func TestEnablingExitRoutes(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[0], 2, 0, 0) requireNodeRouteCountWithCollect(c, nodes[1], 2, 0, 0) - }, integrationutil.ScaledTimeout(10*time.Second), 200*time.Millisecond, "Waiting for route advertisements to propagate") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.FastPoll, "Waiting for route advertisements to propagate") // Verify that no routes has been sent to the client, // they are not yet enabled. @@ -1653,7 +1648,7 @@ func TestEnablingExitRoutes(t *testing.T) { assert.Nil(c, peerStatus.PrimaryRoutes) } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no exit routes are active before approval") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no exit routes are active before approval") } // Enable all routes, but do v4 on one and v6 on other to ensure they @@ -1677,7 +1672,7 @@ func TestEnablingExitRoutes(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2) requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to both nodes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to both nodes") // Wait for route state changes to propagate to clients assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1698,7 +1693,7 @@ func TestEnablingExitRoutes(t *testing.T) { } } } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "clients should see new routes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "clients should see new routes") } // TestExitRoutesWithAutogroupInternetACL reproduces juanfont/headscale#3212. @@ -1768,7 +1763,7 @@ func TestExitRoutesWithAutogroupInternetACL(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[0], 2, 0, 0) requireNodeRouteCountWithCollect(c, nodes[1], 2, 0, 0) - }, integrationutil.ScaledTimeout(20*time.Second), 200*time.Millisecond, + }, integrationutil.ScaledTimeout(20*time.Second), integrationutil.FastPoll, "Waiting for exit-route advertisements to propagate") // Approve exit routes on both nodes so either could serve as @@ -1792,7 +1787,7 @@ func TestExitRoutesWithAutogroupInternetACL(t *testing.T) { requireNodeRouteCountWithCollect(c, nodes[0], 2, 2, 2) requireNodeRouteCountWithCollect(c, nodes[1], 2, 2, 2) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "approved exit routes should propagate to nodes") // The end-to-end UX assertion: every client must see the OTHER @@ -1835,7 +1830,7 @@ func TestExitRoutesWithAutogroupInternetACL(t *testing.T) { "client %s should see the other node as a peer "+ "via the autogroup:internet ACL", status.Self.HostName) - }, integrationutil.ScaledTimeout(15*time.Second), 500*time.Millisecond, + }, integrationutil.ScaledTimeout(15*time.Second), integrationutil.SlowPoll, "client should see exit nodes as peers via autogroup:internet ACL") } } @@ -1930,7 +1925,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { assert.Nil(c, peerStatus.PrimaryRoutes) requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes are active before approval") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes are active before approval") // Enable route _, err = headscale.ApproveRoutes( @@ -1947,7 +1942,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { assert.NoError(c, err) assert.Len(c, nodes, 2) requireNodeRouteCountWithCollect(c, nodes[0], 1, 1, 1) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes") // Verify that the routes have been sent to the client assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -1963,7 +1958,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { requirePeerSubnetRoutesWithCollect(c, peerStatus, []netip.Prefix{*pref}) } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "routes should be visible to client") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "routes should be visible to client") usernet1, err := scenario.Network("usernet1") require.NoError(t, err) @@ -1979,10 +1974,8 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { t.Logf("url from %s to %s", user2c.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := user2c.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying client can reach webservice through subnet route") + assertCurlDockerHostname(c, user2c, url, "Verifying client can reach webservice through subnet route") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying client can reach webservice through subnet route") assert.EventuallyWithT(t, func(c *assert.CollectT) { tr, err := user2c.Traceroute(webip) @@ -1994,7 +1987,7 @@ func TestSubnetRouterMultiNetwork(t *testing.T) { } assertTracerouteViaIPWithCollect(c, tr, ip) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying traceroute goes through subnet router") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying traceroute goes through subnet router") } func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { @@ -2088,7 +2081,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { assert.Nil(c, peerStatus.PrimaryRoutes) requirePeerSubnetRoutesWithCollect(c, peerStatus, nil) } - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying no routes sent to client before approval") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying no routes sent to client before approval") // Approve exit routes and subnet route. _, err = headscale.ApproveRoutes(nodes[0].GetId(), []netip.Prefix{tsaddr.AllIPv4(), tsaddr.AllIPv6(), *route}) @@ -2100,7 +2093,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { assert.NoError(c, err) assert.Len(c, nodes, 2) requireNodeRouteCountWithCollect(c, nodes[0], 3, 3, 3) - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "route state changes should propagate to nodes") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "route state changes should propagate to nodes") // Wait for exit routes to be visible to the client. assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -2111,7 +2104,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { peerStatus := status.Peer[peerKey] assert.True(c, peerStatus.ExitNodeOption, "peer should be an exit node option") } - }, integrationutil.ScaledTimeout(10*time.Second), 500*time.Millisecond, "exit routes should be visible to client") + }, integrationutil.ScaledTimeout(10*time.Second), integrationutil.SlowPoll, "exit routes should be visible to client") // Tell user2c to use user1c as an exit node. command = []string{ @@ -2142,9 +2135,7 @@ func TestSubnetRouterMultiNetworkExitNode(t *testing.T) { weburl := fmt.Sprintf("http://%s/etc/hostname", webip) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := user2c.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, user2c, weburl, "user2 should reach webservice via exit node") }, 10*time.Second, 200*time.Millisecond, "user2 should reach webservice via exit node") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -2719,9 +2710,7 @@ func TestAutoApproveMultiNetwork(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, url, "Verifying client can reach webservice through auto-approved route") }, assertTimeout, 200*time.Millisecond, "Verifying client can reach webservice through auto-approved route") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -2785,9 +2774,7 @@ func TestAutoApproveMultiNetwork(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, url, "Verifying client can still reach webservice after policy change") }, assertTimeout, 200*time.Millisecond, "Verifying client can still reach webservice after policy change") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -2882,9 +2869,7 @@ func TestAutoApproveMultiNetwork(t *testing.T) { t.Logf("url from %s to %s", client.Hostname(), url) assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(url) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, url, "Verifying client can reach webservice after route re-approval") }, assertTimeout, 200*time.Millisecond, "Verifying client can reach webservice after route re-approval") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -3274,7 +3259,7 @@ func TestSubnetRouteACLFiltering(t *testing.T) { // Check that the router has 3 routes now approved and available requireNodeRouteCountWithCollect(c, routerNode, 3, 3, 3) - }, integrationutil.ScaledTimeout(15*time.Second), 500*time.Millisecond, "route state changes should propagate") + }, integrationutil.ScaledTimeout(15*time.Second), integrationutil.SlowPoll, "route state changes should propagate") // Now check the client node status assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -3289,13 +3274,11 @@ func TestSubnetRouteACLFiltering(t *testing.T) { // The node should only have 1 subnet route requirePeerSubnetRoutesWithCollect(c, routerPeerStatus, []netip.Prefix{*route}) - }, integrationutil.ScaledTimeout(5*time.Second), 200*time.Millisecond, "Verifying node sees filtered subnet routes") + }, integrationutil.ScaledTimeout(5*time.Second), integrationutil.FastPoll, "Verifying node sees filtered subnet routes") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := nodeClient.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) - }, integrationutil.ScaledTimeout(60*time.Second), 200*time.Millisecond, "Verifying node can reach webservice through allowed route") + assertCurlDockerHostname(c, nodeClient, weburl, "Verifying node can reach webservice through allowed route") + }, integrationutil.HAConvergeTimeout, integrationutil.FastPoll, "Verifying node can reach webservice through allowed route") assert.EventuallyWithT(t, func(c *assert.CollectT) { tr, err := nodeClient.Traceroute(webip) @@ -3307,7 +3290,7 @@ func TestSubnetRouteACLFiltering(t *testing.T) { } assertTracerouteViaIPWithCollect(c, tr, ip) - }, integrationutil.ScaledTimeout(60*time.Second), 200*time.Millisecond, "Verifying traceroute goes through router") + }, integrationutil.HAConvergeTimeout, integrationutil.FastPoll, "Verifying traceroute goes through router") } // TestGrantViaSubnetSteering validates that via grants steer different source @@ -3595,16 +3578,12 @@ func TestGrantViaSubnetSteering(t *testing.T) { // Verify Client A can reach the webservice. assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := clientA.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, clientA, weburl, "Client A should reach webservice") }, assertTimeout, 200*time.Millisecond, "Client A should reach webservice") // Verify Client B can reach the webservice. assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := clientB.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, clientB, weburl, "Client B should reach webservice") }, assertTimeout, 200*time.Millisecond, "Client B should reach webservice") // Verify Client A's traffic goes through Router A. @@ -3642,7 +3621,7 @@ func TestGrantViaSubnetSteering(t *testing.T) { func TestHASubnetRouterPingFailover(t *testing.T) { IntegrationSkip(t) - propagationTime := integrationutil.ScaledTimeout(60 * time.Second) + propagationTime := integrationutil.HAConvergeTimeout spec := ScenarioSpec{ NodesPerUser: 2, @@ -3757,9 +3736,7 @@ func TestHASubnetRouterPingFailover(t *testing.T) { // Verify connectivity through router 1. assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "client should reach webservice through router 1") }, propagationTime, 200*time.Millisecond, "client should reach webservice through router 1") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -3814,9 +3791,7 @@ func TestHASubnetRouterPingFailover(t *testing.T) { t.Log("Failover detected. Verifying connectivity through router 2.") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "client should reach webservice through router 2") }, propagationTime, 200*time.Millisecond, "client should reach webservice through router 2") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -3890,7 +3865,7 @@ func TestHASubnetRouterPingFailover(t *testing.T) { func TestHASubnetRouterFailoverBothOffline(t *testing.T) { IntegrationSkip(t) - propagationTime := integrationutil.ScaledTimeout(60 * time.Second) + propagationTime := integrationutil.HAConvergeTimeout spec := ScenarioSpec{ NodesPerUser: 2, @@ -3997,9 +3972,7 @@ func TestHASubnetRouterFailoverBothOffline(t *testing.T) { // Confirm initial connectivity through r1. assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r1") }, propagationTime, 200*time.Millisecond, "client reaches webservice via r1") t.Log("=== Step 1: r1 goes offline. r2 should take over. ===") @@ -4067,9 +4040,7 @@ func TestHASubnetRouterFailoverBothOffline(t *testing.T) { // End-to-end traffic should reach the webservice via r2. assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r2 after recovery") }, propagationTime, 200*time.Millisecond, "client reaches webservice via r2 after recovery") assert.EventuallyWithT(t, func(c *assert.CollectT) { @@ -4101,7 +4072,7 @@ func TestHASubnetRouterFailoverBothOffline(t *testing.T) { func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) { IntegrationSkip(t) - propagationTime := integrationutil.ScaledTimeout(120 * time.Second) + propagationTime := integrationutil.HASlowConvergeTimeout spec := ScenarioSpec{ NodesPerUser: 2, @@ -4292,9 +4263,7 @@ func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) { }, propagationTime, 1*time.Second, "R2: waiting for client to see r2") assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, "client reaches webservice via r2 after recovery") }, propagationTime, 1*time.Second, "client reaches webservice via r2 after recovery") } @@ -4329,7 +4298,7 @@ func TestHASubnetRouterFailoverBothOfflineCablePull(t *testing.T) { func TestHASubnetRouterFailoverDockerDisconnect(t *testing.T) { IntegrationSkip(t) - propagationTime := integrationutil.ScaledTimeout(120 * time.Second) + propagationTime := integrationutil.HASlowConvergeTimeout flapWindow := integrationutil.ScaledTimeout(40 * time.Second) spec := ScenarioSpec{ @@ -4438,9 +4407,7 @@ func TestHASubnetRouterFailoverDockerDisconnect(t *testing.T) { requireTrafficWorks := func(msg string) { t.Helper() assert.EventuallyWithT(t, func(c *assert.CollectT) { - result, err := client.Curl(weburl) - assert.NoError(c, err) - assert.Len(c, result, 13) + assertCurlDockerHostname(c, client, weburl, msg) }, propagationTime, 1*time.Second, msg) } diff --git a/integration/scenario.go b/integration/scenario.go index 779a9bff..eca239b9 100644 --- a/integration/scenario.go +++ b/integration/scenario.go @@ -784,10 +784,42 @@ func (s *Scenario) WaitForTailscaleSync() error { return err } -// WaitForTailscaleSyncPerUser blocks execution until each TailscaleClient has the expected -// number of peers for its user. This is useful for policies like autogroup:self where nodes -// only see same-user peers, not all nodes in the network. -func (s *Scenario) WaitForTailscaleSyncPerUser(timeout, retryInterval time.Duration) error { +// SyncOption configures WaitForTailscaleSyncPerUser. +type SyncOption func(*syncOptions) + +type syncOptions struct { + preBarrier func(context.Context) error +} + +// WithPreBarrier runs a precondition check before per-user peer-count +// waits begin, sharing the outer timeout via context. Use to gate on +// a server-side signal (e.g. policy compile) that the peer-count +// alone cannot observe. +func WithPreBarrier(barrier func(context.Context) error) SyncOption { + return func(o *syncOptions) { o.preBarrier = barrier } +} + +// WaitForTailscaleSyncPerUser blocks until each TailscaleClient has +// the expected per-user peer count (necessary for policies like +// autogroup:self where cross-user peers are invisible). +func (s *Scenario) WaitForTailscaleSyncPerUser(timeout, retryInterval time.Duration, opts ...SyncOption) error { + options := syncOptions{} + for _, opt := range opts { + opt(&options) + } + + if options.preBarrier != nil { + barrierCtx, cancel := context.WithTimeout(context.Background(), timeout) + + err := options.preBarrier(barrierCtx) + + cancel() + + if err != nil { + return fmt.Errorf("pre-barrier: %w", err) + } + } + var allErrors []error for _, user := range s.users { diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go index fcabb5e9..4f0c2dd5 100644 --- a/integration/tsic/tsic.go +++ b/integration/tsic/tsic.go @@ -67,6 +67,7 @@ var ( errTailscaleImageRequiredInCI = errors.New("HEADSCALE_INTEGRATION_TAILSCALE_IMAGE must be set in CI for HEAD version") errContainerNotInitialized = errors.New("container not initialized") errFQDNNotYetAvailable = errors.New("FQDN not yet available") + errCurlEmptyResponseBody = errors.New("curl returned empty response body") ) const ( @@ -1532,6 +1533,15 @@ func (t *TailscaleInContainer) Curl(url string, opts ...CurlOption) (string, err return result, err } + // curl exit 0 with an empty body usually means a mid-stream reset + // after headers (HTTP 200 with the connection torn down before the + // body arrived). Without this signal, callers wrapping the call in + // EventuallyWithT see assert.NoError pass and assert.Len fail with + // no error to drive a retry. + if result == "" { + return result, fmt.Errorf("%w: %s from %s", errCurlEmptyResponseBody, url, t.Hostname()) + } + return result, nil } @@ -1547,8 +1557,16 @@ func (t *TailscaleInContainer) CurlFailFast(url string) (string, error) { } func (t *TailscaleInContainer) Traceroute(ip netip.Addr) (util.Traceroute, error) { + // -w 1: wait at most 1s for each probe response. busybox's default + // is 5s, which means an EventuallyWithT loop at 200ms ticks + // can spend 25 ticks worth of budget on a single Traceroute. + // -q 1: send 1 probe per hop instead of 3. The HA tests only care + // about the first hop's identity; the other probes are dead + // weight for the same retry budget. command := []string{ "traceroute", + "-w", "1", + "-q", "1", ip.String(), }