stability and race conditions in auth and node store (#2781)

This PR addresses some consistency issues that were introduced or discovered with the nodestore.

nodestore:
Now returns the node that is being put or updated once the operation has finished. This closes a race condition where, when we read the node back, we did not necessarily get the node with the given change, and it ensures we also get all the other updates from that batch write.

auth:
Authentication paths have been unified and simplified. It removes a lot of bad branches and ensures we only do the minimal work.
A comprehensive auth test set has been created so we do not have to run integration tests to validate auth and it has allowed us to generate test cases for all the branches we currently know of.

integration:
Added a lot more tooling and checks to validate that nodes reach the expected state when they come up and go down. Standardised between the different auth models. A lot of this is to support or detect issues in the changes to nodestore (races) and auth (inconsistencies after login and in reaching the correct state).

This PR was assisted by Claude Code, particularly for the tests.
This commit is contained in:
Kristoffer Dalby
2025-10-16 12:17:43 +02:00
committed by GitHub
parent 881a6b9227
commit fddc7117e4
34 changed files with 7408 additions and 1876 deletions

View File

@@ -11,6 +11,7 @@ import (
"github.com/juanfont/headscale/integration/hsic"
"github.com/juanfont/headscale/integration/tsic"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
)
@@ -30,7 +31,7 @@ func sshScenario(t *testing.T, policy *policyv2.Policy, clientsPerUser int) *Sce
Users: []string{"user1", "user2"},
}
scenario, err := NewScenario(spec)
assertNoErr(t, err)
require.NoError(t, err)
err = scenario.CreateHeadscaleEnv(
[]tsic.Option{
@@ -50,13 +51,13 @@ func sshScenario(t *testing.T, policy *policyv2.Policy, clientsPerUser int) *Sce
hsic.WithACLPolicy(policy),
hsic.WithTestName("ssh"),
)
assertNoErr(t, err)
require.NoError(t, err)
err = scenario.WaitForTailscaleSync()
assertNoErr(t, err)
require.NoError(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
assertNoErr(t, err)
require.NoError(t, err)
return scenario
}
@@ -93,19 +94,19 @@ func TestSSHOneUserToAll(t *testing.T) {
defer scenario.ShutdownAssertNoPanics(t)
allClients, err := scenario.ListTailscaleClients()
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
user1Clients, err := scenario.ListTailscaleClients("user1")
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
user2Clients, err := scenario.ListTailscaleClients("user2")
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
assertNoErrSync(t, err)
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
assertNoErrListFQDN(t, err)
requireNoErrListFQDN(t, err)
for _, client := range user1Clients {
for _, peer := range allClients {
@@ -160,16 +161,16 @@ func TestSSHMultipleUsersAllToAll(t *testing.T) {
defer scenario.ShutdownAssertNoPanics(t)
nsOneClients, err := scenario.ListTailscaleClients("user1")
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
nsTwoClients, err := scenario.ListTailscaleClients("user2")
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
assertNoErrSync(t, err)
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
assertNoErrListFQDN(t, err)
requireNoErrListFQDN(t, err)
testInterUserSSH := func(sourceClients []TailscaleClient, targetClients []TailscaleClient) {
for _, client := range sourceClients {
@@ -208,13 +209,13 @@ func TestSSHNoSSHConfigured(t *testing.T) {
defer scenario.ShutdownAssertNoPanics(t)
allClients, err := scenario.ListTailscaleClients()
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
assertNoErrSync(t, err)
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
assertNoErrListFQDN(t, err)
requireNoErrListFQDN(t, err)
for _, client := range allClients {
for _, peer := range allClients {
@@ -259,13 +260,13 @@ func TestSSHIsBlockedInACL(t *testing.T) {
defer scenario.ShutdownAssertNoPanics(t)
allClients, err := scenario.ListTailscaleClients()
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
assertNoErrSync(t, err)
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
assertNoErrListFQDN(t, err)
requireNoErrListFQDN(t, err)
for _, client := range allClients {
for _, peer := range allClients {
@@ -317,16 +318,16 @@ func TestSSHUserOnlyIsolation(t *testing.T) {
defer scenario.ShutdownAssertNoPanics(t)
ssh1Clients, err := scenario.ListTailscaleClients("user1")
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
ssh2Clients, err := scenario.ListTailscaleClients("user2")
assertNoErrListClients(t, err)
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
assertNoErrSync(t, err)
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
assertNoErrListFQDN(t, err)
requireNoErrListFQDN(t, err)
for _, client := range ssh1Clients {
for _, peer := range ssh2Clients {
@@ -422,9 +423,9 @@ func assertSSHHostname(t *testing.T, client TailscaleClient, peer TailscaleClien
t.Helper()
result, _, err := doSSH(t, client, peer)
assertNoErr(t, err)
require.NoError(t, err)
assertContains(t, peer.ContainerID(), strings.ReplaceAll(result, "\n", ""))
require.Contains(t, peer.ContainerID(), strings.ReplaceAll(result, "\n", ""))
}
func assertSSHPermissionDenied(t *testing.T, client TailscaleClient, peer TailscaleClient) {