mapper/batcher: fix pointer retention, done-channel init, and connected-map races

M7: Nil out trailing *connectionEntry pointers in the backing array
after slice removal in removeConnectionAtIndexLocked and send().
Without this, the GC cannot collect removed entries until the slice
is reallocated.

M1: Initialize the done channel in NewBatcher instead of Start().
Previously, calling Close() or queueWork before Start() would select
on a nil channel, blocking forever. Moving the make() to the
constructor ensures the channel is always usable.

M2: Move b.connected.Delete and b.totalNodes decrement inside the
Compute callback in cleanupOfflineNodes. Previously these ran after
the Compute returned, allowing a concurrent AddNode to reconnect
between the delete and the bookkeeping update, which would wipe the
fresh connected state.

M3: Call markDisconnectedIfNoConns on AddNode error paths. Previously,
when initial map generation or send timed out, the connection was
removed but b.connected retained its old nil (= connected) value,
making IsConnected return true for a node with zero connections.

Updates #2545
This commit is contained in:
Kristoffer Dalby
2026-03-13 15:47:58 +00:00
parent 8e26651f2c
commit 50e8b21471

View File

@@ -43,6 +43,7 @@ func NewBatcher(batchTime time.Duration, workers int, mapper *mapper) *Batcher {
// The size of this channel is arbitrarily chosen; the sizing should be revisited.
workCh: make(chan work, workers*200),
done: make(chan struct{}),
nodes: xsync.NewMap[types.NodeID, *multiChannelNodeConn](),
connected: xsync.NewMap[types.NodeID, *time.Time](),
}
@@ -258,6 +259,7 @@ func (b *Batcher) AddNode(
if err != nil {
nlog.Error().Err(err).Msg("initial map generation failed")
nodeConn.removeConnectionByChannel(c)
b.markDisconnectedIfNoConns(id, nodeConn)
return fmt.Errorf("generating initial map for node %d: %w", id, err)
}
@@ -272,6 +274,7 @@ func (b *Batcher) AddNode(
nlog.Debug().Caller().Dur("timeout.duration", 5*time.Second). //nolint:mnd
Msg("initial map send timed out because channel was blocked or receiver not ready")
nodeConn.removeConnectionByChannel(c)
b.markDisconnectedIfNoConns(id, nodeConn)
return fmt.Errorf("%w for node %d", ErrInitialMapSendTimeout, id)
}
@@ -335,8 +338,6 @@ func (b *Batcher) Start() {
return
}
b.done = make(chan struct{})
b.wg.Add(1)
go b.doWork()
@@ -349,9 +350,7 @@ func (b *Batcher) Close() {
// close workCh here because processBatchedChanges or
// MapResponseFromChange may still be sending on it concurrently.
b.doneOnce.Do(func() {
if b.done != nil {
close(b.done)
}
close(b.done)
})
// Wait for all worker goroutines (and doWork) to exit before
@@ -619,8 +618,6 @@ func (b *Batcher) cleanupOfflineNodes() {
cleaned := 0
for _, nodeID := range nodesToCleanup {
deleted := false
b.nodes.Compute(
nodeID,
func(conn *multiChannelNodeConn, loaded bool) (*multiChannelNodeConn, xsync.ComputeOp) {
@@ -628,22 +625,22 @@ func (b *Batcher) cleanupOfflineNodes() {
return conn, xsync.CancelOp
}
deleted = true
// Perform all bookkeeping inside the Compute callback so
// that a concurrent AddNode (which calls LoadOrStore on
// b.nodes) cannot slip in between the delete and the
// connected/counter updates.
b.connected.Delete(nodeID)
b.totalNodes.Add(-1)
cleaned++
log.Info().Uint64(zf.NodeID, nodeID.Uint64()).
Dur("offline_duration", cleanupThreshold).
Msg("cleaning up node that has been offline for too long")
return conn, xsync.DeleteOp
},
)
if deleted {
log.Info().Uint64(zf.NodeID, nodeID.Uint64()).
Dur("offline_duration", cleanupThreshold).
Msg("cleaning up node that has been offline for too long")
b.connected.Delete(nodeID)
b.totalNodes.Add(-1)
cleaned++
}
}
if cleaned > 0 {
@@ -711,6 +708,17 @@ func (b *Batcher) ConnectedMap() *xsync.Map[types.NodeID, bool] {
return ret
}
// markDisconnectedIfNoConns records a disconnect timestamp for id in
// b.connected when the node no longer has any active connections. Without
// this, IsConnected would keep reporting a stale true after every
// connection has been torn down on an error path (e.g. AddNode failure).
func (b *Batcher) markDisconnectedIfNoConns(id types.NodeID, nc *multiChannelNodeConn) {
	if nc.hasActiveConnections() {
		return
	}
	ts := time.Now()
	b.connected.Store(id, &ts)
}
// MapResponseFromChange queues work to generate a map response and waits for the result.
// This allows synchronous map generation using the same worker pool.
func (b *Batcher) MapResponseFromChange(id types.NodeID, ch change.Change) (*tailcfg.MapResponse, error) {
@@ -808,7 +816,9 @@ func (mc *multiChannelNodeConn) stopConnection(conn *connectionEntry) {
// Caller must hold mc.mutex.
func (mc *multiChannelNodeConn) removeConnectionAtIndexLocked(i int, stopConnection bool) *connectionEntry {
conn := mc.connections[i]
mc.connections = append(mc.connections[:i], mc.connections[i+1:]...)
copy(mc.connections[i:], mc.connections[i+1:])
mc.connections[len(mc.connections)-1] = nil // release pointer for GC
mc.connections = mc.connections[:len(mc.connections)-1]
if stopConnection {
mc.stopConnection(conn)
@@ -964,6 +974,12 @@ func (mc *multiChannelNodeConn) send(data *tailcfg.MapResponse) error {
}
}
// Nil out trailing slots so removed *connectionEntry values
// are not retained by the backing array.
for i := len(clean); i < len(mc.connections); i++ {
mc.connections[i] = nil
}
mc.connections = clean
mc.mutex.Unlock()
}