summaryrefslogtreecommitdiff
path: root/poll.go
blob: bc3ea49a3e2265035f8030d646eac8aaa702deac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package main

import (
	"fmt"
	"strings"
	"time"

	"go.wit.com/lib/gui/shell"
	"go.wit.com/log"
)

// pollHypervisor fetches "http://<Hostname>:2520/vms" from the hypervisor
// and reconciles every droplet it reports as "ON" with me.droplets.
//
// Each line of the response is split on whitespace: the first field is
// treated as the state and the second as the droplet name. Lines with fewer
// than two fields, and lines whose state is not "ON", are ignored.
//
// A droplet that is already known gets its lastpoll and CurrentState
// refreshed and, if it is now seen on a different hypervisor, its hname
// updated. A droplet that is not known yet is appended to me.droplets.
//
// If the fetch returns nil the function returns without touching
// h.lastpoll or h.killcount.
func (h *HyperT) pollHypervisor() {
	url := "http://" + h.Hostname + ":2520/vms"
	log.Log(POLL, "wget url =", url)
	resp := shell.Wget(url)
	if resp == nil {
		return
	}
	body := string(resp.Bytes())
	// fmt.Fprintln(w, body)
	for _, line := range strings.Split(body, "\n") {
		// strings.Fields yields nothing for an empty line, so blank
		// lines are dropped by the same length check.
		fields := strings.Fields(line)
		if len(fields) < 2 {
			continue
		}
		state, name := fields[0], fields[1]
		if state != "ON" {
			continue
		}
		log.Log(POLL, h.Hostname, "STATE:", state, "HOST:", name, "rest:", fields[2:])

		known := findDroplet(name)
		if known == nil {
			// this is a new unknown droplet (not in the config file)
			fresh := &DropletT{
				Hostname:     name,
				hname:        h.Hostname,
				lastpoll:     time.Now(),
				CurrentState: "ON",
			}
			me.droplets = append(me.droplets, fresh)
			log.Log(EVENT, name, "IS NEW. ADDED ON", h.Hostname)
			continue
		}

		log.Log(INFO, "ALREADY RECORDED", known.Hostname)
		known.lastpoll = time.Now()
		known.CurrentState = "ON"
		// log.Info("ALREADY RECORDED", known.Hostname, known.lastpoll)
		if known.hname == "" {
			log.Log(EVENT, "DROPLET", known.Hostname, "PROBABLY WAS NEVER POLLED YET")
		}
		if known.hname != h.Hostname {
			log.Log(EVENT, "DROPLET", known.Hostname, "MOVED FROM", known.hname, "TO", h.Hostname)
			known.hname = h.Hostname
		}
	}
	h.lastpoll = time.Now()
	h.killcount = 0 // poll worked. reset killcount
}

// findDroplet returns the first entry of me.droplets whose Hostname equals
// name, or nil when there is no such entry.
func findDroplet(name string) *DropletT {
	for i := range me.droplets {
		if candidate := me.droplets[i]; candidate.Hostname == name {
			return candidate
		}
	}
	return nil
}

// findHypervisor looks through me.hypers for an entry whose Hostname equals
// name and returns it; the result is nil when nothing matches.
func findHypervisor(name string) *HyperT {
	var match *HyperT
	for _, hyper := range me.hypers {
		if hyper.Hostname != name {
			continue
		}
		match = hyper
		break
	}
	return match
}

// clusterHealthy checks the state of the cluster and returns whether it is
// healthy, together with a one-line summary string that is intended to be
// sent to an uptime monitor like Kuma.
//
// Every entry in me.droplets counts toward the total, but only droplets
// whose State is "ON" are evaluated:
//   - an empty CurrentState (apparently: not polled yet) is counted as
//     unknown and does not make the cluster unhealthy;
//   - a CurrentState other than "ON" is counted as failed;
//   - an "ON" droplet last polled more than a minute ago is marked
//     "MISSING" and counted as failed;
//   - anything else is counted as working.
//
// Side effects: d.CurrentState may be set to "MISSING", and me.unstable is
// reset to the current time whenever the cluster is found unhealthy.
func clusterHealthy() (bool, string) {
	good := true
	var total, working, failed, missing, unknown int
	var unknownList []string

	for _, d := range me.droplets {
		total++
		if d.State != "ON" {
			continue
		}
		if d.CurrentState == "" {
			// log.Info("SKIP. hostname has not been polled yet", d.Hostname, d.hname)
			unknown++
			unknownList = append(unknownList, d.Hostname)
			continue
		}

		dur := time.Since(d.lastpoll) // time elapsed since the droplet was last seen
		if d.CurrentState != "ON" {
			log.Info("BAD  STATE", d.State, d.Hostname, d.hname, "CurrentState =", d.CurrentState, shell.FormatDuration(dur))
			good = false
			failed++
			continue
		}
		if dur > time.Minute {
			log.Info("GOOD STATE MISSING", d.Hostname, d.hname, shell.FormatDuration(dur))
			good = false
			d.CurrentState = "MISSING"
			failed++
			continue
		}
		if l := shell.FormatDuration(dur); l == "" {
			log.Info("DUR IS EMPTY", dur)
			missing++
			continue
		}
		working++
		// log.Info("GOOD STATE ON", d.Hostname, d.hname, "dur =", l)
	}

	summary := "("
	summary += fmt.Sprintf("total = %d ", total)
	summary += fmt.Sprintf("working = %d ", working)
	if missing > 0 {
		summary += fmt.Sprintf("missing = %d ", missing)
	}
	if unknown > 0 {
		// %v prints the hostnames; without a verb for unknownList the
		// output would contain a "%!(EXTRA ...)" formatting error.
		summary += fmt.Sprintf("unknown = %d %v ", unknown, unknownList)
	}
	if failed > 0 {
		summary += fmt.Sprintf("failed = %d ", failed)
	}
	summary = strings.TrimSpace(summary)
	summary += ")"
	if me.killcount > 0 {
		summary += fmt.Sprintf("(killcount=%d)", me.killcount)
	}

	// me.unstable is reset below each time the cluster is found unhealthy,
	// so this is how long the cluster has been stable. It is only reported
	// once that exceeds 133 seconds.
	last := time.Since(me.unstable)
	if last > 133*time.Second {
		s := strings.TrimSpace(shell.FormatDuration(last))
		summary += "(stable=" + s + ")"
	}
	if good {
		return good, "GOOD=true " + summary
	}
	me.unstable = time.Now()
	return good, "GOOD=false " + summary
}