Skip to content

Commit

Permalink
delay health checks until host has finished rebooting
Browse files Browse the repository at this point in the history
This is implemented by getting the current boot ID, triggering reboot,
and then delaying health checks until the boot ID has changed.

Fixes #47
  • Loading branch information
adamtulinius authored and Johan Thomsen committed Feb 20, 2019
1 parent 26b2238 commit 5f0476d
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 19 deletions.
23 changes: 4 additions & 19 deletions morph.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
"os/exec"
"path/filepath"
"strings"
"syscall"
)

// This is set at build time via -ldflags magic
Expand Down Expand Up @@ -368,25 +367,11 @@ func execDeploy(hosts []nix.Host) (string, error) {
}

if deployReboot {
if cmd, err := sshContext.Cmd(&host, "sudo", "reboot"); cmd != nil {
fmt.Fprint(os.Stderr, "Asking host to reboot ... ")
if err = cmd.Run(); err != nil {
// Here we assume that exit code 255 means: "SSH connection got disconnected",
// which is OK for a reboot - sshd may close active connections before we disconnect after all
if exitErr, ok := err.(*exec.ExitError); ok {
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.ExitStatus() == 255 {
fmt.Fprintln(os.Stderr, "Remote host disconnected.")
err = nil
}
}
}
}

err = host.Reboot(sshContext)
if err != nil {
fmt.Fprintln(os.Stderr, "Reboot failed")
return "", err
}

fmt.Fprintln(os.Stderr)
}

if !skipHealthChecks {
Expand Down Expand Up @@ -578,10 +563,10 @@ func buildHosts(hosts []nix.Host) (resultPath string, err error) {
nixBuildTargets = fmt.Sprintf("{ \"out\" = %s; }", nixBuildTarget)
}

ctx := getNixContext()
ctx := getNixContext()
resultPath, err = ctx.BuildMachines(deploymentPath, hosts, nixBuildArg, nixBuildTargets)

if err != nil {
if err != nil {
return
}

Expand Down
62 changes: 62 additions & 0 deletions nix/nix.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (
"os"
"os/exec"
"path/filepath"
"syscall"
"time"
)

type Host struct {
Expand All @@ -36,6 +38,66 @@ func (host *Host) GetHealthChecks() healthchecks.HealthChecks {
return host.HealthChecks
}

func (host *Host) Reboot(sshContext *ssh.SSHContext) error {

var (
oldBootID string
newBootID string
)

oldBootID, err := sshContext.GetBootID(host)
// If the host doesn't support getting boot ID's for some reason, warn about it, and skip the comparison
skipBootIDComparison := err != nil
if skipBootIDComparison {
fmt.Fprintf(os.Stderr, "Error getting boot ID (this is used to determine when the reboot is complete): %v\n", err)
fmt.Fprintf(os.Stderr, "This makes it impossible to detect when the host has rebooted, so health checks might pass before the host has rebooted.\n")
}

if cmd, err := sshContext.Cmd(host, "sudo", "reboot"); cmd != nil {
fmt.Fprint(os.Stderr, "Asking host to reboot ... ")
if err = cmd.Run(); err != nil {
// Here we assume that exit code 255 means: "SSH connection got disconnected",
// which is OK for a reboot - sshd may close active connections before we disconnect after all
if exitErr, ok := err.(*exec.ExitError); ok {
if status, ok := exitErr.Sys().(syscall.WaitStatus); ok && status.ExitStatus() == 255 {
fmt.Fprintln(os.Stderr, "Remote host disconnected.")
err = nil
}
}
}

if err != nil {
fmt.Fprintln(os.Stderr, "Failed")
return err
}
}

fmt.Fprintln(os.Stderr, "OK")

if !skipBootIDComparison {
fmt.Fprint(os.Stderr, "Waiting for host to come online ")

// Wait for the host to get a new boot ID. These ID's should be unique for each boot,
// meaning a reboot will have been completed when the boot ID has changed.
for {
fmt.Fprint(os.Stderr, ".")

// Ignore errors; there'll be plenty of them since we'll be attempting to connect to an offline host,
// and we know from previously that the host should support boot ID's
newBootID, _ = sshContext.GetBootID(host)

if newBootID != "" && oldBootID != newBootID {
fmt.Fprintln(os.Stderr, " OK")
break
}

time.Sleep(2 * time.Second)
}
}

return nil
}

func (ctx *NixContext) GetMachines(deploymentPath string) (hosts []Host, err error) {

args := []string{"eval",
Expand Down
21 changes: 21 additions & 0 deletions ssh/ssh.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"path/filepath"
"strings"
"syscall"
"time"
)

type Context interface {
Expand Down Expand Up @@ -237,6 +238,26 @@ func (ctx *SSHContext) ActivateConfiguration(host Host, configuration string, ac
return nil
}

func (sshCtx *SSHContext) GetBootID(host Host) (string, error) {
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second)
defer cancel()
cmd, err := sshCtx.CmdContext(ctx, host, "cat", "/proc/sys/kernel/random/boot_id")
if err != nil {
return "", err
}

var stdout bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = os.Stderr

err = cmd.Run()
if err != nil {
return "", err
}

return strings.TrimSpace(stdout.String()), nil
}

func (ctx *SSHContext) MakeTempFile(host Host) (path string, err error) {
cmd, _ := ctx.Cmd(host, "mktemp")

Expand Down

0 comments on commit 5f0476d

Please sign in to comment.