Files
headscale/integration/ssh_test.go
Kristoffer Dalby fddc7117e4 stability and race conditions in auth and node store (#2781)
This PR addresses some consistency issues that was introduced or discovered with the nodestore.

nodestore:
Now returns the node that is being put or updated when it is finished. This closes a race condition where when we read it back, we do not necessarily get the node with the given change and it ensures we get all the other updates from that batch write.

auth:
Authentication paths have been unified and simplified. It removes a lot of bad branches and ensures we only do the minimal work.
A comprehensive auth test set has been created so we do not have to run integration tests to validate auth and it has allowed us to generate test cases for all the branches we currently know of.

integration:
added a lot more tooling and checks to validate that nodes reach the expected state when they come up and down. Standardised between the different auth models. A lot of this is to support or detect issues in the changes to nodestore (races) and auth (inconsistencies after login and reaching correct state)

This PR was assisted, particularly tests, by claude code.
2025-10-16 12:17:43 +02:00

461 lines
12 KiB
Go

package integration
import (
"fmt"
"log"
"strings"
"testing"
"time"
policyv2 "github.com/juanfont/headscale/hscontrol/policy/v2"
"github.com/juanfont/headscale/integration/hsic"
"github.com/juanfont/headscale/integration/tsic"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/tailcfg"
)
func isSSHNoAccessStdError(stderr string) bool {
return strings.Contains(stderr, "Permission denied (tailscale)") ||
// Since https://github.com/tailscale/tailscale/pull/14853
strings.Contains(stderr, "failed to evaluate SSH policy") ||
// Since https://github.com/tailscale/tailscale/pull/16127
strings.Contains(stderr, "tailnet policy does not permit you to SSH to this node")
}
func sshScenario(t *testing.T, policy *policyv2.Policy, clientsPerUser int) *Scenario {
t.Helper()
spec := ScenarioSpec{
NodesPerUser: clientsPerUser,
Users: []string{"user1", "user2"},
}
scenario, err := NewScenario(spec)
require.NoError(t, err)
err = scenario.CreateHeadscaleEnv(
[]tsic.Option{
tsic.WithSSH(),
// Alpine containers dont have ip6tables set up, which causes
// tailscaled to stop configuring the wgengine, causing it
// to not configure DNS.
tsic.WithNetfilter("off"),
tsic.WithDockerEntrypoint([]string{
"/bin/sh",
"-c",
"/bin/sleep 3 ; apk add openssh ; adduser ssh-it-user ; update-ca-certificates ; tailscaled --tun=tsdev",
}),
tsic.WithDockerWorkdir("/"),
},
hsic.WithACLPolicy(policy),
hsic.WithTestName("ssh"),
)
require.NoError(t, err)
err = scenario.WaitForTailscaleSync()
require.NoError(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
require.NoError(t, err)
return scenario
}
func TestSSHOneUserToAll(t *testing.T) {
IntegrationSkip(t)
scenario := sshScenario(t,
&policyv2.Policy{
Groups: policyv2.Groups{
policyv2.Group("group:integration-test"): []policyv2.Username{policyv2.Username("user1@")},
},
ACLs: []policyv2.ACL{
{
Action: "accept",
Protocol: "tcp",
Sources: []policyv2.Alias{wildcard()},
Destinations: []policyv2.AliasWithPorts{
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
},
},
},
SSHs: []policyv2.SSH{
{
Action: "accept",
Sources: policyv2.SSHSrcAliases{groupp("group:integration-test")},
Destinations: policyv2.SSHDstAliases{wildcard()},
Users: []policyv2.SSHUser{policyv2.SSHUser("ssh-it-user")},
},
},
},
len(MustTestVersions),
)
defer scenario.ShutdownAssertNoPanics(t)
allClients, err := scenario.ListTailscaleClients()
requireNoErrListClients(t, err)
user1Clients, err := scenario.ListTailscaleClients("user1")
requireNoErrListClients(t, err)
user2Clients, err := scenario.ListTailscaleClients("user2")
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
requireNoErrListFQDN(t, err)
for _, client := range user1Clients {
for _, peer := range allClients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHHostname(t, client, peer)
}
}
for _, client := range user2Clients {
for _, peer := range allClients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHPermissionDenied(t, client, peer)
}
}
}
func TestSSHMultipleUsersAllToAll(t *testing.T) {
IntegrationSkip(t)
scenario := sshScenario(t,
&policyv2.Policy{
Groups: policyv2.Groups{
policyv2.Group("group:integration-test"): []policyv2.Username{policyv2.Username("user1@"), policyv2.Username("user2@")},
},
ACLs: []policyv2.ACL{
{
Action: "accept",
Protocol: "tcp",
Sources: []policyv2.Alias{wildcard()},
Destinations: []policyv2.AliasWithPorts{
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
},
},
},
SSHs: []policyv2.SSH{
{
Action: "accept",
Sources: policyv2.SSHSrcAliases{groupp("group:integration-test")},
Destinations: policyv2.SSHDstAliases{usernamep("user1@"), usernamep("user2@")},
Users: []policyv2.SSHUser{policyv2.SSHUser("ssh-it-user")},
},
},
},
len(MustTestVersions),
)
defer scenario.ShutdownAssertNoPanics(t)
nsOneClients, err := scenario.ListTailscaleClients("user1")
requireNoErrListClients(t, err)
nsTwoClients, err := scenario.ListTailscaleClients("user2")
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
requireNoErrListFQDN(t, err)
testInterUserSSH := func(sourceClients []TailscaleClient, targetClients []TailscaleClient) {
for _, client := range sourceClients {
for _, peer := range targetClients {
assertSSHHostname(t, client, peer)
}
}
}
testInterUserSSH(nsOneClients, nsTwoClients)
testInterUserSSH(nsTwoClients, nsOneClients)
}
func TestSSHNoSSHConfigured(t *testing.T) {
IntegrationSkip(t)
scenario := sshScenario(t,
&policyv2.Policy{
Groups: policyv2.Groups{
policyv2.Group("group:integration-test"): []policyv2.Username{policyv2.Username("user1@")},
},
ACLs: []policyv2.ACL{
{
Action: "accept",
Protocol: "tcp",
Sources: []policyv2.Alias{wildcard()},
Destinations: []policyv2.AliasWithPorts{
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
},
},
},
SSHs: []policyv2.SSH{},
},
len(MustTestVersions),
)
defer scenario.ShutdownAssertNoPanics(t)
allClients, err := scenario.ListTailscaleClients()
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
requireNoErrListFQDN(t, err)
for _, client := range allClients {
for _, peer := range allClients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHPermissionDenied(t, client, peer)
}
}
}
func TestSSHIsBlockedInACL(t *testing.T) {
IntegrationSkip(t)
scenario := sshScenario(t,
&policyv2.Policy{
Groups: policyv2.Groups{
policyv2.Group("group:integration-test"): []policyv2.Username{policyv2.Username("user1@")},
},
ACLs: []policyv2.ACL{
{
Action: "accept",
Protocol: "tcp",
Sources: []policyv2.Alias{wildcard()},
Destinations: []policyv2.AliasWithPorts{
aliasWithPorts(wildcard(), tailcfg.PortRange{First: 80, Last: 80}),
},
},
},
SSHs: []policyv2.SSH{
{
Action: "accept",
Sources: policyv2.SSHSrcAliases{groupp("group:integration-test")},
Destinations: policyv2.SSHDstAliases{usernamep("user1@")},
Users: []policyv2.SSHUser{policyv2.SSHUser("ssh-it-user")},
},
},
},
len(MustTestVersions),
)
defer scenario.ShutdownAssertNoPanics(t)
allClients, err := scenario.ListTailscaleClients()
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
requireNoErrListFQDN(t, err)
for _, client := range allClients {
for _, peer := range allClients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHTimeout(t, client, peer)
}
}
}
func TestSSHUserOnlyIsolation(t *testing.T) {
IntegrationSkip(t)
scenario := sshScenario(t,
&policyv2.Policy{
Groups: policyv2.Groups{
policyv2.Group("group:ssh1"): []policyv2.Username{policyv2.Username("user1@")},
policyv2.Group("group:ssh2"): []policyv2.Username{policyv2.Username("user2@")},
},
ACLs: []policyv2.ACL{
{
Action: "accept",
Protocol: "tcp",
Sources: []policyv2.Alias{wildcard()},
Destinations: []policyv2.AliasWithPorts{
aliasWithPorts(wildcard(), tailcfg.PortRangeAny),
},
},
},
SSHs: []policyv2.SSH{
{
Action: "accept",
Sources: policyv2.SSHSrcAliases{groupp("group:ssh1")},
Destinations: policyv2.SSHDstAliases{usernamep("user1@")},
Users: []policyv2.SSHUser{policyv2.SSHUser("ssh-it-user")},
},
{
Action: "accept",
Sources: policyv2.SSHSrcAliases{groupp("group:ssh2")},
Destinations: policyv2.SSHDstAliases{usernamep("user2@")},
Users: []policyv2.SSHUser{policyv2.SSHUser("ssh-it-user")},
},
},
},
len(MustTestVersions),
)
defer scenario.ShutdownAssertNoPanics(t)
ssh1Clients, err := scenario.ListTailscaleClients("user1")
requireNoErrListClients(t, err)
ssh2Clients, err := scenario.ListTailscaleClients("user2")
requireNoErrListClients(t, err)
err = scenario.WaitForTailscaleSync()
requireNoErrSync(t, err)
_, err = scenario.ListTailscaleClientsFQDNs()
requireNoErrListFQDN(t, err)
for _, client := range ssh1Clients {
for _, peer := range ssh2Clients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHPermissionDenied(t, client, peer)
}
}
for _, client := range ssh2Clients {
for _, peer := range ssh1Clients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHPermissionDenied(t, client, peer)
}
}
for _, client := range ssh1Clients {
for _, peer := range ssh1Clients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHHostname(t, client, peer)
}
}
for _, client := range ssh2Clients {
for _, peer := range ssh2Clients {
if client.Hostname() == peer.Hostname() {
continue
}
assertSSHHostname(t, client, peer)
}
}
}
func doSSH(t *testing.T, client TailscaleClient, peer TailscaleClient) (string, string, error) {
t.Helper()
return doSSHWithRetry(t, client, peer, true)
}
func doSSHWithoutRetry(t *testing.T, client TailscaleClient, peer TailscaleClient) (string, string, error) {
t.Helper()
return doSSHWithRetry(t, client, peer, false)
}
func doSSHWithRetry(t *testing.T, client TailscaleClient, peer TailscaleClient, retry bool) (string, string, error) {
t.Helper()
peerFQDN, _ := peer.FQDN()
command := []string{
"/usr/bin/ssh", "-o StrictHostKeyChecking=no", "-o ConnectTimeout=1",
fmt.Sprintf("%s@%s", "ssh-it-user", peerFQDN),
"'hostname'",
}
log.Printf("Running from %s to %s", client.Hostname(), peer.Hostname())
log.Printf("Command: %s", strings.Join(command, " "))
var result, stderr string
var err error
if retry {
// Use assert.EventuallyWithT to retry SSH connections for success cases
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
result, stderr, err = client.Execute(command)
// If we get a permission denied error, we can fail immediately
// since that is something we won't recover from by retrying.
if err != nil && isSSHNoAccessStdError(stderr) {
return // Don't retry permission denied errors
}
// For all other errors, assert no error to trigger retry
assert.NoError(ct, err)
}, 10*time.Second, 200*time.Millisecond)
} else {
// For failure cases, just execute once
result, stderr, err = client.Execute(command)
}
return result, stderr, err
}
func assertSSHHostname(t *testing.T, client TailscaleClient, peer TailscaleClient) {
t.Helper()
result, _, err := doSSH(t, client, peer)
require.NoError(t, err)
require.Contains(t, peer.ContainerID(), strings.ReplaceAll(result, "\n", ""))
}
func assertSSHPermissionDenied(t *testing.T, client TailscaleClient, peer TailscaleClient) {
t.Helper()
result, stderr, err := doSSHWithoutRetry(t, client, peer)
assert.Empty(t, result)
assertSSHNoAccessStdError(t, err, stderr)
}
func assertSSHTimeout(t *testing.T, client TailscaleClient, peer TailscaleClient) {
t.Helper()
result, stderr, _ := doSSHWithoutRetry(t, client, peer)
assert.Empty(t, result)
if !strings.Contains(stderr, "Connection timed out") &&
!strings.Contains(stderr, "Operation timed out") {
t.Fatalf("connection did not time out")
}
}
func assertSSHNoAccessStdError(t *testing.T, err error, stderr string) {
t.Helper()
assert.Error(t, err)
if !isSSHNoAccessStdError(stderr) {
t.Errorf("expected stderr output suggesting access denied, got: %s", stderr)
}
}