/
status.go
280 lines (242 loc) · 8.98 KB
/
status.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
/*
Copyright 2016 The Rook Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package client
import (
"encoding/json"
"fmt"
"github.com/pkg/errors"
"github.com/rook/rook/pkg/clusterd"
)
// Overall cluster health values. These are plain strings; HealthStatus.Status
// carries the value decoded from the status JSON, and isCephHealthy compares
// that field against the same "HEALTH_OK" / "HEALTH_WARN" text.
const (
// CephHealthOK denotes the status of ceph cluster when healthy.
CephHealthOK = "HEALTH_OK"
// CephHealthWarn denotes the status of ceph cluster when unhealthy but recovering.
CephHealthWarn = "HEALTH_WARN"
// CephHealthErr denotes the status of ceph cluster when unhealthy but usually needs
// manual intervention.
CephHealthErr = "HEALTH_ERR"
)
// Placement group state names that isClusterClean counts as clean. A
// PgStateEntry whose StateName equals any of these contributes its Count to
// the clean total.
const (
// activeClean is the plain clean state.
activeClean = "active+clean"
// activeCleanScrubbing is the clean state while a scrub is in progress.
activeCleanScrubbing = "active+clean+scrubbing"
// activeCleanScrubbingDeep is the clean state while a deep scrub is in progress.
activeCleanScrubbingDeep = "active+clean+scrubbing+deep"
)
// CephStatus is the structure into which Status decodes the JSON output of the
// ceph "status" command. Each field is populated from the JSON key named in
// its struct tag.
type CephStatus struct {
// Health is the overall health section; isCephHealthy inspects Health.Status.
Health HealthStatus `json:"health"`
FSID string `json:"fsid"`
ElectionEpoch int `json:"election_epoch"`
Quorum []int `json:"quorum"`
QuorumNames []string `json:"quorum_names"`
MonMap MonMap `json:"monmap"`
// OsdMap is decoded from an "osdmap" object that itself contains a nested
// "osdmap" object, hence the anonymous wrapper struct around OsdMap.
OsdMap struct {
OsdMap OsdMap `json:"osdmap"`
} `json:"osdmap"`
// PgMap is the placement group summary that isClusterClean evaluates.
PgMap PgMap `json:"pgmap"`
MgrMap MgrMap `json:"mgrmap"`
// Fsmap is the filesystem map consulted by getMDSRank.
Fsmap Fsmap `json:"fsmap"`
}
// HealthStatus is the "health" section of CephStatus.
type HealthStatus struct {
// Status is the overall health string, e.g. "HEALTH_OK" or "HEALTH_WARN"
// (the two values isCephHealthy accepts).
Status string `json:"status"`
// Checks holds the individual check messages decoded from the "checks"
// object. NOTE(review): the map key is presumably the health check name;
// confirm against real command output.
Checks map[string]CheckMessage `json:"checks"`
}
// CheckMessage is a single entry of HealthStatus.Checks.
type CheckMessage struct {
// Severity is decoded from the "severity" key and kept as a raw string.
Severity string `json:"severity"`
// Summary wraps the text found under "summary.message".
Summary struct {
Message string `json:"message"`
} `json:"summary"`
}
// MonMap is the "monmap" section of CephStatus.
type MonMap struct {
Epoch int `json:"epoch"`
FSID string `json:"fsid"`
// CreatedTime and ModifiedTime are kept as the raw strings found under the
// "created" and "modified" keys; they are not parsed into time values here.
CreatedTime string `json:"created"`
ModifiedTime string `json:"modified"`
// Mons lists the entries of the "mons" array (MonMapEntry is declared
// elsewhere in the package).
Mons []MonMapEntry `json:"mons"`
}
// MgrMap is the "mgrmap" section of CephStatus.
type MgrMap struct {
Epoch int `json:"epoch"`
// ActiveGID, ActiveName and ActiveAddr are decoded from the "active_gid",
// "active_name" and "active_addr" keys respectively.
ActiveGID int `json:"active_gid"`
ActiveName string `json:"active_name"`
ActiveAddr string `json:"active_addr"`
Available bool `json:"available"`
// Standbys lists the entries of the "standbys" array.
Standbys []MgrStandby `json:"standbys"`
}
// MgrStandby is one entry of MgrMap.Standbys, carrying the "gid" and "name"
// values of that entry.
type MgrStandby struct {
GID int `json:"gid"`
Name string `json:"name"`
}
// OsdMap is the inner "osdmap" object of CephStatus.OsdMap.
type OsdMap struct {
Epoch int `json:"epoch"`
// NumOsd, NumUpOsd and NumInOsd are decoded from "num_osds", "num_up_osds"
// and "num_in_osds".
NumOsd int `json:"num_osds"`
NumUpOsd int `json:"num_up_osds"`
NumInOsd int `json:"num_in_osds"`
Full bool `json:"full"`
NearFull bool `json:"nearfull"`
NumRemappedPgs int `json:"num_remapped_pgs"`
}
// PgMap is the "pgmap" section of CephStatus.
type PgMap struct {
// PgsByState breaks the placement groups down per state name;
// isClusterClean sums the counts of the clean states from this list.
PgsByState []PgStateEntry `json:"pgs_by_state"`
Version int `json:"version"`
// NumPgs is the total number of placement groups that isClusterClean
// compares the clean count against; zero is treated as clean.
NumPgs int `json:"num_pgs"`
DataBytes uint64 `json:"data_bytes"`
UsedBytes uint64 `json:"bytes_used"`
AvailableBytes uint64 `json:"bytes_avail"`
TotalBytes uint64 `json:"bytes_total"`
ReadBps uint64 `json:"read_bytes_sec"`
WriteBps uint64 `json:"write_bytes_sec"`
ReadOps uint64 `json:"read_op_per_sec"`
WriteOps uint64 `json:"write_op_per_sec"`
RecoveryBps uint64 `json:"recovering_bytes_per_sec"`
RecoveryObjectsPerSec uint64 `json:"recovering_objects_per_sec"`
RecoveryKeysPerSec uint64 `json:"recovering_keys_per_sec"`
CacheFlushBps uint64 `json:"flush_bytes_sec"`
CacheEvictBps uint64 `json:"evict_bytes_sec"`
// NOTE(review): the field name suggests bytes per second, but it is decoded
// from the "promote_op_per_sec" key; confirm which unit is intended.
CachePromoteBps uint64 `json:"promote_op_per_sec"`
}
// PgStateEntry is one entry of PgMap.PgsByState: a state name and the number
// of placement groups reported under it.
type PgStateEntry struct {
// StateName is compared by isClusterClean against the clean state names
// (e.g. "active+clean").
StateName string `json:"state_name"`
// Count is the number of placement groups in StateName.
Count int `json:"count"`
}
// Fsmap is a struct representing the filesystem map, i.e. the "fsmap" section
// of CephStatus.
type Fsmap struct {
Epoch int `json:"epoch"`
ID int `json:"id"`
Up int `json:"up"`
In int `json:"in"`
Max int `json:"max"`
// ByRank lists the entries of the "by_rank" array. getMDSRank searches it by
// Name and returns the matching slice index, which
// MdsActiveOrStandbyReplay then uses to read Status.
ByRank []struct {
FilesystemID int `json:"filesystem_id"`
Rank int `json:"rank"`
Name string `json:"name"`
Status string `json:"status"`
Gid int `json:"gid"`
} `json:"by_rank"`
// UpStandby is decoded from the "up:standby" key. getMDSRank treats a value
// of at least 1 as a sign that an MDS missing from ByRank may be in standby.
UpStandby int `json:"up:standby"`
}
// Status runs the ceph "status" command for the given cluster and decodes its
// JSON output into a CephStatus.
//
// The debug flag is copied onto the command's Debug field before it runs. When
// either running the command or decoding its output fails, a zero-value
// CephStatus is returned together with the wrapped error.
func Status(context *clusterd.Context, clusterName string, debug bool) (CephStatus, error) {
	statusCmd := NewCephCommand(context, clusterName, []string{"status"})
	statusCmd.Debug = debug

	output, err := statusCmd.Run()
	if err != nil {
		return CephStatus{}, errors.Wrap(err, "failed to get status")
	}

	var parsed CephStatus
	err = json.Unmarshal(output, &parsed)
	if err != nil {
		return CephStatus{}, errors.Wrap(err, "failed to unmarshal status response")
	}

	return parsed, nil
}
// IsClusterClean queries the cluster status (with debug enabled) and reports
// whether all placement groups are clean.
//
// It returns, in order:
//   - a message describing the state of the PGs,
//   - true if the cluster is clean,
//   - a non-nil error only if getting the status failed; in that case the
//     message is "unable to get PG health" and the boolean is false.
func IsClusterClean(context *clusterd.Context, clusterName string) (string, bool, error) {
	cephStatus, statusErr := Status(context, clusterName, true)
	if statusErr != nil {
		return "unable to get PG health", false, statusErr
	}

	// The evaluation itself cannot fail, so its verdict is passed through as is.
	description, allClean := isClusterClean(cephStatus)
	return description, allClean, nil
}
// IsClusterCleanError collapses the result of IsClusterClean into a single
// error value: nil when the cluster is fully clean (all placement groups are in
// a clean state), otherwise an error.
//
// A failed status query returns that query's error unchanged; an unclean
// cluster returns a new error carrying the PG state message. Callers that need
// to tell those two situations apart should use IsClusterClean instead.
func IsClusterCleanError(context *clusterd.Context, clusterName string) error {
	description, allClean, queryErr := IsClusterClean(context, clusterName)

	switch {
	case queryErr != nil:
		return queryErr
	case !allClean:
		return errors.New(description)
	default:
		return nil
	}
}
// isClusterClean evaluates an already-fetched status and returns a descriptive
// message plus whether the cluster counts as clean.
//
// A cluster with zero placement groups is considered clean. Otherwise the
// counts of all entries whose state is one of the clean states (active+clean,
// optionally scrubbing or deep scrubbing) are summed, and the cluster is clean
// only when that sum equals the total number of placement groups.
func isClusterClean(status CephStatus) (string, bool) {
	totalPGs := status.PgMap.NumPgs
	if totalPGs == 0 {
		// no PGs exist yet, which is still treated as a clean cluster
		return "cluster has no PGs", true
	}

	var cleanCount int
	for _, entry := range status.PgMap.PgsByState {
		switch entry.StateName {
		case activeClean, activeCleanScrubbing, activeCleanScrubbingDeep:
			cleanCount += entry.Count
		}
	}

	if cleanCount != totalPGs {
		return fmt.Sprintf("cluster is not fully clean. PGs: %+v", status.PgMap.PgsByState), false
	}

	// every PG was accounted for by one of the clean states
	logger.Debugf("all placement groups have reached a clean state: %+v", status.PgMap.PgsByState)
	return "all PGs in cluster are clean", true
}
// getMDSRank locates the MDS named fsName in the fsmap's by_rank list.
//
// The returned integer is the index of the matching entry within
// status.Fsmap.ByRank (if several entries share the name, the last one wins),
// or a negative sentinel when no entry matches. An error is returned only when
// the MDS is absent from the list and the fsmap reports no standby daemon
// either; an absent MDS with at least one standby yields the negative sentinel
// and a nil error. The clusterName parameter is not used.
func getMDSRank(status CephStatus, clusterName, fsName string) (int, error) {
	// placeholder value meaning "not present in by_rank"
	const notFound = -1000

	rankIdx := notFound
	for idx, entry := range status.Fsmap.ByRank {
		if entry.Name == fsName {
			rankIdx = idx
		}
	}

	// An MDS missing from the map may simply be in standby; if no standby is
	// reported either, something else is going wrong.
	listed := rankIdx >= 0
	standbyPresent := status.Fsmap.UpStandby >= 1
	if !listed && !standbyPresent {
		// This is reported as an error even though it may be transient: the
		// MDS can be mid-transition between states when the status was taken.
		return rankIdx, errors.Errorf("mds %s not found in fsmap, this likely means mdss are transitioning between active and standby states", fsName)
	}

	return rankIdx, nil
}
// MdsActiveOrStandbyReplay reports whether the MDS named fsName is in an
// acceptable state, returning nil if so and an error otherwise.
//
// It fetches the cluster status (debug disabled) and looks the MDS up in the
// fsmap. An MDS that is not listed by rank but is covered by a standby is
// accepted without further checks. A listed MDS is accepted when its status is
// "up:active", "up:standby-replay" or "up:standby"; any other status produces
// an error naming the bad state.
func MdsActiveOrStandbyReplay(context *clusterd.Context, clusterName, fsName string) error {
	cephStatus, err := Status(context, clusterName, false)
	if err != nil {
		return errors.Wrap(err, "failed to get ceph status")
	}

	rankIdx, err := getMDSRank(cephStatus, clusterName, fsName)
	if err != nil {
		return errors.Cause(err)
	}

	// a negative index means the MDS is in standby, so there is no state to inspect
	if rankIdx < 0 {
		logger.Infof("mds %s is in standby, nothing to check", fsName)
		return nil
	}

	switch state := cephStatus.Fsmap.ByRank[rankIdx].Status; state {
	case "up:active", "up:standby-replay", "up:standby":
		logger.Infof("mds %s is %s", fsName, state)
		return nil
	default:
		return errors.Errorf("mds %s is %s, bad state", fsName, state)
	}
}
// IsCephHealthy fetches the cluster status (debug disabled) and reports whether
// Ceph is healthy, which is useful as a gate when performing an upgrade.
//
// If the status cannot be retrieved, the failure is logged and false is
// returned.
func IsCephHealthy(context *clusterd.Context, clusterName string) bool {
	current, statusErr := Status(context, clusterName, false)
	if statusErr == nil {
		return isCephHealthy(current)
	}

	logger.Errorf("failed to detect if Ceph is healthy. failed to get ceph status. %v", statusErr)
	return false
}
// isCephHealthy returns true when the overall health string of the given
// status is "HEALTH_WARN" or "HEALTH_OK" (the values of CephHealthWarn and
// CephHealthOK), and false for any other value, including an empty string.
func isCephHealthy(status CephStatus) bool {
	switch status.Health.Status {
	case "HEALTH_WARN", "HEALTH_OK":
		return true
	default:
		return false
	}
}