Skip to content

Commit

Permalink
backport of commit a04c53e (#27377)
Browse files Browse the repository at this point in the history
Co-authored-by: Paul Banks <[email protected]>
  • Loading branch information
hc-github-team-secure-vault-core and banks committed Jun 6, 2024
1 parent d750a1f commit b0ea89b
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 11 deletions.
4 changes: 4 additions & 0 deletions changelog/27277.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
```release-note:bug
storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes
autopilot to fail to discover new server versions and so not trigger an upgrade.
```
18 changes: 12 additions & 6 deletions physical/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/physical"
"github.com/hashicorp/vault/vault/cluster"
"github.com/hashicorp/vault/version"
etcdbolt "go.etcd.io/bbolt"
)

Expand Down Expand Up @@ -582,6 +583,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
failGetInTxn: new(uint32),
raftLogVerifierEnabled: backendConfig.RaftLogVerifierEnabled,
raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
effectiveSDKVersion: version.GetVersion().Version,
}, nil
}

Expand Down Expand Up @@ -660,12 +662,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) {
atomic.StoreUint32(b.failGetInTxn, val)
}

func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) {
b.l.Lock()
b.effectiveSDKVersion = sdkVersion
b.l.Unlock()
}

func (b *RaftBackend) RedundancyZone() string {
b.l.RLock()
defer b.l.RUnlock()
Expand Down Expand Up @@ -1089,6 +1085,11 @@ type SetupOpts struct {
// RecoveryModeConfig is the configuration for the raft cluster in recovery
// mode.
RecoveryModeConfig *raft.Configuration

// EffectiveSDKVersion is typically the version string baked into the binary.
// We pass it in though because it can be overridden in tests or via ENV in
// core.
EffectiveSDKVersion string
}

func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
Expand Down Expand Up @@ -1132,6 +1133,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
return errors.New("no local node id configured")
}

if opts.EffectiveSDKVersion != "" {
// Override the SDK version
b.effectiveSDKVersion = opts.EffectiveSDKVersion
}

// Setup the raft config
raftConfig := raft.DefaultConfig()
if err := b.applyConfigSettings(raftConfig); err != nil {
Expand Down
5 changes: 4 additions & 1 deletion physical/raft/raft_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
}
followerVersion = leaderVersion
} else {
delete(d.emptyVersionLogs, currentServerID)
if _, ok := d.emptyVersionLogs[currentServerID]; ok {
d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion)
delete(d.emptyVersionLogs, currentServerID)
}
}
d.dl.Unlock()

Expand Down
35 changes: 35 additions & 0 deletions vault/external_tests/raft/raft_autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) {
require.Nil(t, nil, state)
}

// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that
// ensures that the default plumbing in Vault core to configure the binary
// version of the raft library is working. Hopefully this will trivially pass
// from now on, however it would have caught a regression in the past!
func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) {
t.Parallel()

coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{
EnableAutopilot: true,
// We need 2 nodes because the code path that regressed was different on a
// standby vs active node so we'd not detect the problem if we only test on
// an active node.
NumCores: 2,
})

// Default options should not set EffectiveSDKVersion(Map) which would defeat
// the point of this test by plumbing versions via config.
require.Nil(t, clusterOpts.EffectiveSDKVersionMap)
require.Empty(t, coreCfg.EffectiveSDKVersion)

c := vault.NewTestCluster(t, coreCfg, &clusterOpts)
defer c.Cleanup()

// Wait for follower to be perf standby (in Ent, in CE it will only wait for
// active). In either Enterprise or CE case, this should pass if we've plumbed
// the version correctly so it's valuable for both.
testhelpers.WaitForActiveNodeAndStandbys(t, c)
for _, core := range c.Cores {
be := core.UnderlyingRawStorage.(*raft.RaftBackend)
require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(),
"expected raft upgrade version to default to Vault version for core %q",
core.NodeID)
}
}

// TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted
// to be voters after the stabilization time has elapsed. Also checks that
// the autopilot state is Healthy once all nodes are available.
Expand Down
8 changes: 4 additions & 4 deletions vault/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))

if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{
TLSKeyring: raftTLS,
ClusterListener: c.getClusterListener(),
StartAsLeader: creating,
TLSKeyring: raftTLS,
ClusterListener: c.getClusterListener(),
StartAsLeader: creating,
EffectiveSDKVersion: c.effectiveSDKVersion,
}); err != nil {
return err
}
Expand Down Expand Up @@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error {
}

c.logger.Info("starting raft active node")
raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion)

c.pendingRaftPeers = &sync.Map{}

Expand Down
9 changes: 9 additions & 0 deletions vault/request_forwarding_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) (
}

if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil {
s.core.logger.Trace("forwarding RPC: echo received",
"node_id", in.RaftNodeID,
"applied_index", in.RaftAppliedIndex,
"term", in.RaftTerm,
"desired_suffrage", in.RaftDesiredSuffrage,
"sdk_version", in.SdkVersion,
"upgrade_version", in.RaftUpgradeVersion,
"redundancy_zone", in.RaftRedundancyZone)

s.raftFollowerStates.Update(&raft.EchoRequestUpdate{
NodeID: in.RaftNodeID,
AppliedIndex: in.RaftAppliedIndex,
Expand Down

0 comments on commit b0ea89b

Please sign in to comment.