summaryrefslogtreecommitdiff
path: root/poll.go
blob: 15f8aa46ea804140ea785f1e5a321cced69f9386 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
package main

import (
	"fmt"
	"strings"
	"time"

	"go.wit.com/lib/gui/shell"
	pb "go.wit.com/lib/protobuf/virtbuf"
	"go.wit.com/log"
	"google.golang.org/protobuf/types/known/timestamppb"
)

// findHypervisorByName looks through me.hypers for the entry whose
// pb.Hostname equals name. It returns that entry, or nil when no
// hypervisor matches.
func findHypervisorByName(name string) *HyperT {
	var match *HyperT
	for _, candidate := range me.hypers {
		if candidate.pb.Hostname != name {
			continue
		}
		match = candidate
		break
	}
	return match
}

// pollHypervisor fetches the domain list from the hypervisor's ":2520/vms"
// endpoint and reconciles it with the droplets known to the cluster.
//
// Every non-empty line of the response is split on whitespace and is
// expected to carry at least two fields: the state ("ON" or "OFF") followed
// by the domain name. Shorter lines are logged and skipped. A domain the
// cluster does not know yet is registered with AddDropletLocal(). For a known
// droplet the time it was last seen is stored in h.lastDroplets, and for a
// running one the state, Current.LastPoll and Current.Hypervisor are updated.
//
// Afterwards, entries of h.lastDroplets that were not seen for longer than
// me.hyperPollDelay are aged out (see the second loop).
//
// When the fetch yields nil the function returns immediately and neither
// h.lastpoll nor h.killcount is touched.
func (h *HyperT) pollHypervisor() {
	url := "http://" + h.pb.Hostname + ":2520/vms"
	log.Log(POLL, "wget url =", url)
	s := shell.Wget(url)
	if s == nil {
		return
	}

	for _, line := range strings.Split(string(s.Bytes()), "\n") {
		if line == "" {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 2 {
			log.Log(WARN, "unknown:", h.pb.Hostname, fields)
			continue
		}
		state, name := fields[0], fields[1]

		d := me.cluster.FindDropletByName(name)
		if d == nil {
			// the hypervisor reports a domain the cluster has no record of
			log.Log(WARN, name, "local defined domain")
			log.Log(WARN, name, "local Adding new entry with AddDropletLocal()")
			me.cluster.AddDropletLocal(name, h.pb.Hostname)
			continue
		}
		start := d.SprintHeader()
		h.lastDroplets[name] = time.Now()

		switch state {
		case "OFF":
			if d.Current.Hypervisor == "" {
				d.Current.Hypervisor = h.pb.Hostname
			}
			if d.LocalOnly == "" {
				log.Log(WARN, start, "local domain is a duplicate (need to resolve this)", h.pb.Hostname)
				continue
			}
			log.Log(WARN, start, "local domain ready to import from hypervisor")

		case "ON":
			log.Log(POLL, start, "STATE:", state, "rest:", fields[2:])

			// update the status to ON and set the LastPoll time to now
			d.SetState(pb.DropletState_ON)
			d.Current.LastPoll = timestamppb.New(time.Now())

			if d.Current.Hypervisor == "" {
				// the droplet was in the config file but this is the
				// first time it has shown up as running
				if d.PreferredHypervisor == h.pb.Hostname {
					// it is running where the config file says it
					// probably should be running
					log.Log(EVENT, start, "poll shows new droplet", d.Hostname,
						"(matches config hypervisor", h.pb.Hostname+")")
				} else {
					log.Log(EVENT, start, "poll shows new droplet (in config file without preferred hypervisor)")
				}
				d.Current.Hypervisor = h.pb.Hostname
				continue
			}

			// a different, non-empty hypervisor means the droplet has moved
			if d.Current.Hypervisor != h.pb.Hostname {
				log.Log(EVENT, "droplet", d.Hostname, "moved to", h.pb.Hostname)
				// record the droplet migrated (or booted somewhere else?
				// recording this is a work in progress)
				me.cluster.DropletMoved(d, h.pb)
			}
			// otherwise Current.Hypervisor already equals this host
		}
	}

	// these are the droplets that don't exist anymore on this hypervisor
	// this should mean you ran shutdown within domU
	for name, t := range h.lastDroplets {
		dur := time.Since(t)
		if dur <= me.hyperPollDelay {
			continue
		}

		d := me.cluster.FindDropletByName(name)
		if d == nil {
			// The droplet is gone from the cluster as well. Check for nil
			// before calling any method on d, and drop the stale entry so
			// this is not reported again on every poll (deleting from a
			// map while ranging over it is allowed in Go).
			log.Info("droplet has probably powered down", name, "but findDroplet returned nil")
			delete(h.lastDroplets, name)
			continue
		}
		header := d.SprintHeader()

		if d.Current.State == pb.DropletState_OFF {
			log.Info(header, "droplet timed out and is off. remove from h.lastDroplets[] slice")
			delete(h.lastDroplets, name)
			continue
		}

		// NOTE(review): the original author marked the aging logic below
		// as provisional and in need of a rethink.
		if d.Current.State != pb.DropletState_UNKNOWN {
			d.SetState(pb.DropletState_UNKNOWN)
			log.Info(header, "set state UNKNOWN here", name)
		}
		if d.Current.State == pb.DropletState_UNKNOWN && dur > 2*time.Minute {
			// what this means is the droplet probably wasn't migrated or
			// the migrate failed. where should this be checked? the status
			// needs to be changed to OFF
			log.Info(header, "UNKNOWN state for more than 2 minutes (clearing out ?)", name, pb.FormatDuration(dur))

			// it might be safe to set the status to OFF here. not really.
			// this poll needs to be moved somewhere else. there needs to be
			// a new goroutine not tied to the hypervisor
			d.SetState(pb.DropletState_OFF)
		}
	}
	h.lastpoll = time.Now()
	h.killcount = 0 // poll worked. reset killcount
}

// uptimeCheck inspects every droplet in the cluster and returns whether the
// cluster looks healthy together with a text summary that is intended to be
// sent to an uptime monitor like Kuma.
//
// Every droplet counts towards the total, but only droplets whose StartState
// is ON are evaluated:
//   - UNKNOWN droplets are counted and listed by hostname,
//   - ON droplets count as working unless their last poll is older than
//     me.missingDropletTimeout, in which case they are set to UNKNOWN and
//     counted as failed,
//   - OFF droplets and droplets in any other state are counted as failed.
//
// Any failed droplet makes the returned bool false. The summary starts with
// "GOOD=true " or "GOOD=false " and ends with one line per missing droplet.
func uptimeCheck() (bool, string) {
	good := true
	var total, working, failed, unknown int
	var missing []*pb.Droplet
	var unknownList []string

	loop := me.cluster.DropletsAll() // get the list of droplets
	for loop.Scan() {
		d := loop.Next()
		total++
		if d.StartState != pb.DropletState_ON {
			continue
		}
		dur := time.Since(d.Current.LastPoll.AsTime()) // time elapsed since the last poll
		hname := d.Current.Hypervisor
		switch d.Current.State {
		case pb.DropletState_UNKNOWN:
			unknown++
			unknownList = append(unknownList, d.Hostname)
		case pb.DropletState_ON:
			if dur > me.missingDropletTimeout {
				log.Info("GOOD STATE MISSING", d.Hostname, hname, pb.FormatDuration(dur))
				good = false
				d.SetState(pb.DropletState_UNKNOWN)
				failed++
				continue
			}
			if pb.FormatDuration(dur) == "" {
				log.Info("DUR IS EMPTY", dur)
				missing = append(missing, d)
				continue
			}
			working++
		case pb.DropletState_OFF:
			log.Info("OFF  STATE", d.StartState, d.Hostname, hname, pb.FormatDuration(dur))
			good = false
			failed++
		default:
			log.Info("WTF  STATE", d.StartState, d.Hostname, hname, "Current.State =", d.Current.State, pb.FormatDuration(dur))
			good = false
			failed++
			missing = append(missing, d)
		}
	}

	// Collect the counters as separate parts and join them with a single
	// space so that neighbouring entries can never run into each other.
	parts := []string{
		fmt.Sprintf("total = %d", total),
		fmt.Sprintf("working = %d", working),
	}
	if len(missing) > 0 {
		parts = append(parts, fmt.Sprintf("missing = %d", len(missing)))
	}
	if unknown > 0 {
		parts = append(parts, fmt.Sprintf("unknown = %d %+v", unknown, unknownList))
	}
	if failed > 0 {
		parts = append(parts, fmt.Sprintf("failed = %d", failed))
	}

	var summary strings.Builder
	summary.WriteString("(" + strings.Join(parts, " ") + ")")
	if me.killcount > 0 {
		summary.WriteString(fmt.Sprintf("(killcount=%d)", me.killcount))
	}

	last := time.Since(me.unstable)
	s := strings.TrimSpace(pb.FormatDuration(last))
	if last > me.clusterStableDuration {
		// more than me.clusterStableDuration has passed since me.unstable,
		// so the cluster is reported as stable
		summary.WriteString("(stable=" + s + ")")
	} else {
		summary.WriteString("(unstable=" + s + ")")
	}

	for _, d := range missing {
		fmt.Fprint(&summary, "\nmissing droplet: ", d.Hostname, " current state ", d.Current.State)
	}

	if good {
		return true, "GOOD=true " + summary.String()
	}
	// me.unstable = time.Now()
	return false, "GOOD=false " + summary.String()
}