ConsulRegistry: use health checks to select nodes.

Consul sees a healthcheck that is in the warning state as a "failed"
node. This means that when we ask Consul for services that are passing,
it would not return nodes that have warning healthchecks.

In the cache, we only check on critical to skip for nodes. This makes
the cache out of sync with the non-cache implementation.

This patch reworks the non-cache implementation to ask for all nodes
(even unhealthy ones) and does the same check as within the cache, skip
nodes that have critical healthchecks.

We've noticed this issue when we deployed custom healthchecks where the
cache was acting properly, but after 1 minute we saw "None Available"
errors. This is due to the TTL expiry on the cache, which is then
followed by doing a non cached request.
This commit is contained in:
Jelmer Snoeck
2016-08-26 08:36:45 +01:00
parent b8241f3026
commit e59f7a7ace
2 changed files with 198 additions and 1 deletions

View File

@@ -179,7 +179,7 @@ func (c *consulRegistry) Register(s *Service, opts ...RegisterOption) error {
}
func (c *consulRegistry) GetService(name string) ([]*Service, error) {
rsp, _, err := c.Client.Health().Service(name, "", true, nil)
rsp, _, err := c.Client.Health().Service(name, "", false, nil)
if err != nil {
return nil, err
}
@@ -216,6 +216,20 @@ func (c *consulRegistry) GetService(name string) ([]*Service, error) {
serviceMap[key] = svc
}
var del bool
for _, check := range s.Checks {
// delete the node if the status is critical
if check.Status == "critical" {
del = true
break
}
}
// if delete then skip the node
if del {
continue
}
svc.Nodes = append(svc.Nodes, &Node{
Id: id,
Address: address,