1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
package main
import (
"fmt"
"strings"
"time"
"go.wit.com/lib/gui/shell"
pb "go.wit.com/lib/protobuf/virtbuf"
"go.wit.com/log"
"google.golang.org/protobuf/types/known/timestamppb"
)
// pollHypervisor fetches the VM list from this hypervisor's HTTP endpoint
// (http://<hostname>:2520/vms) and reconciles it with the known droplets.
//
// Every response line is expected to carry at least two whitespace-separated
// fields: a state followed by a droplet name. Lines with fewer fields (which
// includes empty lines) and lines whose state is "OFF" are ignored. For every
// other line the droplet's last-seen time is recorded in h.lastDroplets, and
// droplets reported as "ON" get their CurrentState, LastPoll and
// CurrentHypervisor updated.
//
// If the fetch returns nil the function returns early without touching
// h.lastpoll or h.killcount.
func (h *HyperT) pollHypervisor() {
	url := "http://" + h.pb.Hostname + ":2520/vms"
	log.Log(POLL, "wget url =", url)
	s := shell.Wget(url)
	if s == nil {
		return
	}

	for _, line := range strings.Split(string(s.Bytes()), "\n") {
		fields := strings.Fields(line)
		if len(fields) < 2 {
			// empty or malformed line
			continue
		}
		state, name := fields[0], fields[1]
		if state == "OFF" {
			// skip locally defined libvirt vms
			continue
		}

		// remember when this droplet was last reported by this hypervisor
		h.lastDroplets[name] = time.Now()

		// try the protobuf
		d := findDroplet(name)
		if d == nil {
			// not sure what to do with an unknown droplet yet
			log.Log(WARN, name, "is unknown on", h.pb.Hostname, "state =", state)
			log.Log(WARN, name, "this vm was probably started by hand using virtsh")
			log.Log(WARN, name, "todo: import vm from libvrit")
			continue
		}
		if state != "ON" {
			continue
		}

		log.Log(POLL, h.pb.Hostname, "STATE:", state, "HOST:", name, "rest:", fields[2:])
		log.Log(INFO, "ALREADY RECORDED", d.Hostname)

		// update the status to ON and set the LastPoll time to now
		d.CurrentState = pb.DropletState_ON
		d.LastPoll = timestamppb.New(time.Now())

		switch {
		case d.CurrentHypervisor == "":
			// the droplet is known but this is the first time it has shown up
			// as running; only the log message depends on whether it is
			// running where the config says it should be
			if d.PreferredHypervisor == h.pb.Hostname {
				log.Log(EVENT, "poll shows new droplet", d.Hostname, "(matches config hypervisor", h.pb.Hostname+")")
			} else {
				log.Log(EVENT, "poll shows new droplet", d.Hostname, "on", h.pb.Hostname, "(in config file without preferred hypervisor)")
			}
			d.CurrentHypervisor = h.pb.Hostname
		case d.CurrentHypervisor != h.pb.Hostname:
			// this means the droplet has moved
			log.Log(EVENT, "droplet", d.Hostname, "moved to", h.pb.Hostname)
			// record the droplet migrated (or booted somewhere else? recording this is a work in progress)
			// NOTE(review): CurrentHypervisor is not updated here; presumably
			// DropletMoved does that — confirm.
			me.cluster.DropletMoved(d, h.pb)
		}
		// otherwise CurrentHypervisor already equals h.pb.Hostname: nothing to do
	}

	// Any droplet this hypervisor has not reported for longer than
	// me.hyperPollDelay is flagged as UNKNOWN.
	// NOTE(review): entries are never removed from h.lastDroplets in this
	// function, so a stale droplet is logged and reset on every poll.
	for name, t := range h.lastDroplets {
		if time.Since(t) <= me.hyperPollDelay {
			continue
		}
		log.Info("droplet has probably powered down", name)
		if d := findDroplet(name); d != nil {
			d.CurrentState = pb.DropletState_UNKNOWN
			log.Info("set state UNKNOWN here", name)
		}
	}

	h.lastpoll = time.Now()
	h.killcount = 0 // poll worked. reset killcount
}
// uptimeCheck checks the state of the cluster and returns whether it looks
// healthy together with a summary string that is intended to be sent to an
// uptime monitor like Kuma.
//
// Only droplets whose StartState is ON are evaluated; every droplet counts
// toward the total. The summary starts with "GOOD=true " or "GOOD=false ",
// followed by the counters in parentheses and one extra line per missing
// droplet. When the check fails, me.unstable is set to the current time.
func uptimeCheck() (bool, string) {
	good := true
	var total, working, failed, unknown int
	var missing []*pb.Droplet
	var unknownList []string

	for _, d := range me.cluster.Droplets {
		total++
		if d.StartState != pb.DropletState_ON {
			continue
		}
		if d.CurrentState == pb.DropletState_UNKNOWN {
			// counted separately; does not by itself mark the cluster as bad
			unknown++
			unknownList = append(unknownList, d.Hostname)
			continue
		}

		dur := time.Since(d.LastPoll.AsTime()) // elapsed time since the last poll
		hname := d.CurrentHypervisor

		if d.CurrentState != pb.DropletState_ON {
			log.Info("BAD STATE", d.StartState, d.Hostname, hname, "CurrentState =", d.CurrentState, shell.FormatDuration(dur))
			good = false
			failed++
			missing = append(missing, d)
			continue
		}

		if dur > me.missingDropletTimeout {
			// state says ON but the droplet has not been polled recently
			log.Info("GOOD STATE MISSING", d.Hostname, hname, shell.FormatDuration(dur))
			good = false
			d.CurrentState = pb.DropletState_UNKNOWN
			failed++
			continue
		}
		if shell.FormatDuration(dur) == "" {
			// listed as missing, but does not flip good to false
			log.Info("DUR IS EMPTY", dur)
			missing = append(missing, d)
			continue
		}
		working++
	}

	summary := fmt.Sprintf("(total = %d working = %d ", total, working)
	if len(missing) > 0 {
		summary += fmt.Sprintf("missing = %d ", len(missing))
	}
	if unknown > 0 {
		// the list needs its own verb; without it fmt appends %!(EXTRA ...)
		summary += fmt.Sprintf("unknown = %d %v ", unknown, unknownList)
	}
	if failed > 0 {
		summary += fmt.Sprintf("failed = %d ", failed)
	}
	summary = strings.TrimSpace(summary) + ")"

	if me.killcount > 0 {
		summary += fmt.Sprintf("(killcount=%d)", me.killcount)
	}

	// me.unstable is the time of the last failed check; report the stable
	// period once it exceeds me.clusterStableDuration
	if last := time.Since(me.unstable); last > me.clusterStableDuration {
		summary += "(stable=" + strings.TrimSpace(shell.FormatDuration(last)) + ")"
	}

	for _, d := range missing {
		summary += fmt.Sprint("\nmissing droplet: ", d.Hostname, " current state ", d.CurrentState)
	}

	if good {
		return good, "GOOD=true " + summary
	}
	me.unstable = time.Now()
	return good, "GOOD=false " + summary
}
|