// Source: micro/registry/consul_registry.go

package registry
import (
	"crypto/tls"
	"errors"
	"fmt"
	"net"
	"net/http"
	"runtime"
	"sync"
	"time"

	consul "github.com/hashicorp/consul/api"
	hash "github.com/mitchellh/hashstructure"
)
type consulRegistry struct {
2015-02-15 02:00:47 +03:00
Address string
Client *consul.Client
2017-09-28 13:16:56 +03:00
opts Options
2018-08-06 19:12:34 +03:00
// connect enabled
connect bool
queryOptions *consul.QueryOptions
sync.Mutex
register map[string]uint64
// lastChecked tracks when a node was last checked as existing in Consul
lastChecked map[string]time.Time
}
// getDeregisterTTL derives the deregistration timeout for a check from its
// TTL or interval t: t is raised to one minute when shorter, then a five
// second splay is added on top.
func getDeregisterTTL(t time.Duration) time.Duration {
	const (
		// splay slightly for the watcher?
		splay = 5 * time.Second
		// consul has a minimum timeout on deregistration of 1 minute.
		floor = time.Minute
	)

	base := t
	if base < floor {
		base = floor
	}

	return base + splay
}
// newTransport builds the HTTP transport used for TLS connections to Consul.
//
// When config is nil, a TLS configuration with InsecureSkipVerify enabled is
// used, so the server certificate is NOT verified in that case. Pass an
// explicit *tls.Config to control verification. A non-nil config is used
// as-is (not copied).
func newTransport(config *tls.Config) *http.Transport {
	if config == nil {
		config = &tls.Config{
			InsecureSkipVerify: true,
		}
	}

	dialer := &net.Dialer{
		Timeout:   30 * time.Second,
		KeepAlive: 30 * time.Second,
	}

	t := &http.Transport{
		Proxy: http.ProxyFromEnvironment,
		// DialContext replaces the deprecated Dial field so that dials
		// honour request cancellation.
		DialContext:         dialer.DialContext,
		TLSHandshakeTimeout: 10 * time.Second,
		TLSClientConfig:     config,
	}

	// Tie the finalizer to the transport itself. Attaching it to the address
	// of the local pointer variable would make it fire as soon as that
	// temporary is collected, i.e. while the transport is still in use.
	runtime.SetFinalizer(t, func(tr *http.Transport) {
		tr.CloseIdleConnections()
	})

	return t
}
func configure(c *consulRegistry, opts ...Option) {
// set opts
2015-12-19 21:28:08 +03:00
for _, o := range opts {
o(&c.opts)
2015-12-19 21:28:08 +03:00
}
// use default config
config := consul.DefaultConfig()
2018-08-06 19:12:34 +03:00
if c.opts.Context != nil {
// Use the consul config passed in the options, if available
if co, ok := c.opts.Context.Value("consul_config").(*consul.Config); ok {
config = co
}
if cn, ok := c.opts.Context.Value("consul_connect").(bool); ok {
c.connect = cn
2018-08-06 19:12:34 +03:00
}
// Use the consul query options passed in the options, if available
if qo, ok := c.opts.Context.Value("consul_query_options").(*consul.QueryOptions); ok && qo != nil {
c.queryOptions = qo
}
if as, ok := c.opts.Context.Value("consul_allow_stale").(bool); ok {
c.queryOptions.AllowStale = as
}
}
2015-12-19 21:28:08 +03:00
// check if there are any addrs
if len(c.opts.Addrs) > 0 {
addr, port, err := net.SplitHostPort(c.opts.Addrs[0])
if ae, ok := err.(*net.AddrError); ok && ae.Err == "missing port in address" {
port = "8500"
addr = c.opts.Addrs[0]
config.Address = fmt.Sprintf("%s:%s", addr, port)
} else if err == nil {
config.Address = fmt.Sprintf("%s:%s", addr, port)
}
}
2015-12-19 21:28:08 +03:00
2016-01-16 23:25:18 +03:00
// requires secure connection?
if c.opts.Secure || c.opts.TLSConfig != nil {
if config.HttpClient == nil {
config.HttpClient = new(http.Client)
}
2016-01-16 23:25:18 +03:00
config.Scheme = "https"
// We're going to support InsecureSkipVerify
config.HttpClient.Transport = newTransport(c.opts.TLSConfig)
}
// set timeout
if c.opts.Timeout > 0 {
config.HttpClient.Timeout = c.opts.Timeout
2016-01-16 23:25:18 +03:00
}
2015-12-19 21:28:08 +03:00
// create the client
2015-10-26 23:23:57 +03:00
client, _ := consul.NewClient(config)
// set address/client
c.Address = config.Address
c.Client = client
}
// newConsulRegistry returns a Consul-backed Registry with empty bookkeeping
// maps and stale reads allowed by default, then applies opts via configure.
func newConsulRegistry(opts ...Option) Registry {
	registry := new(consulRegistry)
	registry.opts = Options{}
	registry.register = map[string]uint64{}
	registry.lastChecked = map[string]time.Time{}
	// stale reads are the default; configure may override this from options
	registry.queryOptions = &consul.QueryOptions{AllowStale: true}

	configure(registry, opts...)

	return registry
}
// Init applies further options to the registry by re-running configure,
// which also rebuilds the Consul client. It always returns nil.
func (c *consulRegistry) Init(opts ...Option) error {
	// configure reports no error, so there is nothing to propagate
	configure(c, opts...)

	return nil
}
func (c *consulRegistry) Deregister(s *Service) error {
if len(s.Nodes) == 0 {
2015-01-14 02:31:27 +03:00
return errors.New("Require at least one node")
}
2016-01-26 23:44:29 +03:00
// delete our hash and time check of the service
c.Lock()
delete(c.register, s.Name)
delete(c.lastChecked, s.Name)
c.Unlock()
2016-01-26 23:44:29 +03:00
node := s.Nodes[0]
2016-01-27 02:32:27 +03:00
return c.Client.Agent().ServiceDeregister(node.Id)
2015-01-14 02:31:27 +03:00
}
func (c *consulRegistry) Register(s *Service, opts ...RegisterOption) error {
if len(s.Nodes) == 0 {
2015-01-14 02:31:27 +03:00
return errors.New("Require at least one node")
}
var regTCPCheck bool
var regInterval time.Duration
2016-01-27 02:32:27 +03:00
var options RegisterOptions
for _, o := range opts {
o(&options)
}
if c.opts.Context != nil {
if tcpCheckInterval, ok := c.opts.Context.Value("consul_tcp_check").(time.Duration); ok {
regTCPCheck = true
regInterval = tcpCheckInterval
}
}
// create hash of service; uint64
h, err := hash.Hash(s, nil)
if err != nil {
return err
}
// use first node
node := s.Nodes[0]
// get existing hash and last checked time
c.Lock()
v, ok := c.register[s.Name]
lastChecked := c.lastChecked[s.Name]
c.Unlock()
// if it's already registered and matches then just pass the check
if ok && v == h {
2018-10-09 05:40:24 +03:00
if options.TTL == time.Duration(0) {
// ensure that our service hasn't been deregistered by Consul
if time.Since(lastChecked) <= getDeregisterTTL(regInterval) {
return nil
}
services, _, err := c.Client.Health().Checks(s.Name, c.queryOptions)
2018-10-09 05:40:24 +03:00
if err == nil {
for _, v := range services {
if v.ServiceID == node.Id {
return nil
}
}
}
2018-11-03 15:17:11 +03:00
} else {
2018-10-09 05:40:24 +03:00
// if the err is nil we're all good, bail out
// if not, we don't know what the state is, so full re-register
if err := c.Client.Agent().PassTTL("service:"+node.Id, ""); err == nil {
return nil
}
}
}
// encode the tags
2015-05-27 00:39:48 +03:00
tags := encodeMetadata(node.Metadata)
2015-10-11 14:05:20 +03:00
tags = append(tags, encodeEndpoints(s.Endpoints)...)
tags = append(tags, encodeVersion(s.Version)...)
2015-01-14 02:31:27 +03:00
2016-01-27 03:32:16 +03:00
var check *consul.AgentServiceCheck
if regTCPCheck {
deregTTL := getDeregisterTTL(regInterval)
Add option to enable TCP check with Consul registry One disadvantage of using TTL based health check is the high network traffic between Consul agent (either between servers, or between server and client). In order for the services considered alive by Consul, microservices must send an update TTL to Consul every n seconds (currently 30 seconds). Here is the explanation about TTL check from Consul documentation [1] Time to Live (TTL) - These checks retain their last known state for a given TTL. The state of the check must be updated periodically over the HTTP interface. If an external system fails to update the status within a given TTL, the check is set to the failed state. This mechanism, conceptually similar to a dead man's switch, relies on the application to directly report its health. For example, a healthy app can periodically PUT a status update to the HTTP endpoint; if the app fails, the TTL will expire and the health check enters a critical state. The endpoints used to update health information for a given check are the pass endpoint and the fail endpoint. TTL checks also persist their last known status to disk. This allows the Consul agent to restore the last known status of the check across restarts. Persisted check status is valid through the end of the TTL from the time of the last check. Hint: TTL checks also persist their last known status to disk. This allows the Consul agent to restore the last known status of the check across restarts. When microservices update the TTL, Consul will write to disk. Writing to disk means all other slaves need to replicate it, which means master need to inform other standby Consul to pull the new catalog. Hence, the increased traffic. More information about this issue can be viewed at Consul mailing list [2]. [1] https://www.consul.io/docs/agent/checks.html [2] https://groups.google.com/forum/#!topic/consul-tool/84h7qmCCpjg
2018-03-14 14:51:38 +03:00
check = &consul.AgentServiceCheck{
TCP: fmt.Sprintf("%s:%d", node.Address, node.Port),
Interval: fmt.Sprintf("%v", regInterval),
Add option to enable TCP check with Consul registry One disadvantage of using TTL based health check is the high network traffic between Consul agent (either between servers, or between server and client). In order for the services considered alive by Consul, microservices must send an update TTL to Consul every n seconds (currently 30 seconds). Here is the explanation about TTL check from Consul documentation [1] Time to Live (TTL) - These checks retain their last known state for a given TTL. The state of the check must be updated periodically over the HTTP interface. If an external system fails to update the status within a given TTL, the check is set to the failed state. This mechanism, conceptually similar to a dead man's switch, relies on the application to directly report its health. For example, a healthy app can periodically PUT a status update to the HTTP endpoint; if the app fails, the TTL will expire and the health check enters a critical state. The endpoints used to update health information for a given check are the pass endpoint and the fail endpoint. TTL checks also persist their last known status to disk. This allows the Consul agent to restore the last known status of the check across restarts. Persisted check status is valid through the end of the TTL from the time of the last check. Hint: TTL checks also persist their last known status to disk. This allows the Consul agent to restore the last known status of the check across restarts. When microservices update the TTL, Consul will write to disk. Writing to disk means all other slaves need to replicate it, which means master need to inform other standby Consul to pull the new catalog. Hence, the increased traffic. More information about this issue can be viewed at Consul mailing list [2]. [1] https://www.consul.io/docs/agent/checks.html [2] https://groups.google.com/forum/#!topic/consul-tool/84h7qmCCpjg
2018-03-14 14:51:38 +03:00
DeregisterCriticalServiceAfter: fmt.Sprintf("%v", deregTTL),
}
Add option to enable TCP check with Consul registry One disadvantage of using TTL based health check is the high network traffic between Consul agent (either between servers, or between server and client). In order for the services considered alive by Consul, microservices must send an update TTL to Consul every n seconds (currently 30 seconds). Here is the explanation about TTL check from Consul documentation [1] Time to Live (TTL) - These checks retain their last known state for a given TTL. The state of the check must be updated periodically over the HTTP interface. If an external system fails to update the status within a given TTL, the check is set to the failed state. This mechanism, conceptually similar to a dead man's switch, relies on the application to directly report its health. For example, a healthy app can periodically PUT a status update to the HTTP endpoint; if the app fails, the TTL will expire and the health check enters a critical state. The endpoints used to update health information for a given check are the pass endpoint and the fail endpoint. TTL checks also persist their last known status to disk. This allows the Consul agent to restore the last known status of the check across restarts. Persisted check status is valid through the end of the TTL from the time of the last check. Hint: TTL checks also persist their last known status to disk. This allows the Consul agent to restore the last known status of the check across restarts. When microservices update the TTL, Consul will write to disk. Writing to disk means all other slaves need to replicate it, which means master need to inform other standby Consul to pull the new catalog. Hence, the increased traffic. More information about this issue can be viewed at Consul mailing list [2]. [1] https://www.consul.io/docs/agent/checks.html [2] https://groups.google.com/forum/#!topic/consul-tool/84h7qmCCpjg
2018-03-14 14:51:38 +03:00
// if the TTL is greater than 0 create an associated check
} else if options.TTL > time.Duration(0) {
deregTTL := getDeregisterTTL(options.TTL)
2016-01-27 03:32:16 +03:00
check = &consul.AgentServiceCheck{
2018-11-03 15:17:11 +03:00
TTL: fmt.Sprintf("%v", options.TTL),
DeregisterCriticalServiceAfter: fmt.Sprintf("%v", deregTTL),
2016-01-27 03:32:16 +03:00
}
}
// register the service
2018-08-06 19:12:34 +03:00
asr := &consul.AgentServiceRegistration{
2016-01-27 03:15:46 +03:00
ID: node.Id,
Name: s.Name,
Tags: tags,
Port: node.Port,
Address: node.Address,
2016-01-27 03:32:16 +03:00
Check: check,
2018-08-06 19:12:34 +03:00
}
// Specify consul connect
if c.connect {
asr.Connect = &consul.AgentServiceConnect{
Native: true,
}
}
if err := c.Client.Agent().ServiceRegister(asr); err != nil {
return err
}
2015-01-14 02:31:27 +03:00
// save our hash and time check of the service
c.Lock()
c.register[s.Name] = h
c.lastChecked[s.Name] = time.Now()
c.Unlock()
// if the TTL is 0 we don't mess with the checks
2016-01-27 03:32:16 +03:00
if options.TTL == time.Duration(0) {
return nil
}
// pass the healthcheck
2016-01-27 03:15:46 +03:00
return c.Client.Agent().PassTTL("service:"+node.Id, "")
2015-01-14 02:31:27 +03:00
}
func (c *consulRegistry) GetService(name string) ([]*Service, error) {
2018-08-06 19:12:34 +03:00
var rsp []*consul.ServiceEntry
var err error
// if we're connect enabled only get connect services
if c.connect {
rsp, _, err = c.Client.Health().Connect(name, "", false, c.queryOptions)
2018-08-06 19:12:34 +03:00
} else {
rsp, _, err = c.Client.Health().Service(name, "", false, c.queryOptions)
2018-08-06 19:12:34 +03:00
}
2015-01-14 02:31:27 +03:00
if err != nil {
return nil, err
}
2015-11-08 04:48:48 +03:00
serviceMap := map[string]*Service{}
2015-01-14 02:31:27 +03:00
for _, s := range rsp {
2016-01-27 02:32:27 +03:00
if s.Service.Service != name {
2015-01-14 02:31:27 +03:00
continue
}
// version is now a tag
2018-03-01 20:35:13 +03:00
version, _ := decodeVersion(s.Service.Tags)
// service ID is now the node id
2016-01-27 02:32:27 +03:00
id := s.Service.ID
// key is always the version
key := version
2018-03-01 20:35:13 +03:00
// address is service address
2016-01-27 02:32:27 +03:00
address := s.Service.Address
2018-03-01 20:35:13 +03:00
// use node address
if len(address) == 0 {
address = s.Node.Address
2015-11-08 04:48:48 +03:00
}
svc, ok := serviceMap[key]
if !ok {
svc = &Service{
2016-01-27 02:32:27 +03:00
Endpoints: decodeEndpoints(s.Service.Tags),
Name: s.Service.Service,
2015-11-08 04:48:48 +03:00
Version: version,
}
serviceMap[key] = svc
}
var del bool
2018-03-01 20:35:13 +03:00
for _, check := range s.Checks {
// delete the node if the status is critical
if check.Status == "critical" {
del = true
break
}
}
// if delete then skip the node
if del {
continue
}
2015-11-08 04:48:48 +03:00
svc.Nodes = append(svc.Nodes, &Node{
Id: id,
Address: address,
2016-01-27 02:32:27 +03:00
Port: s.Service.Port,
Metadata: decodeMetadata(s.Service.Tags),
2015-01-14 02:31:27 +03:00
})
}
2015-11-08 04:48:48 +03:00
var services []*Service
for _, service := range serviceMap {
services = append(services, service)
}
return services, nil
2015-01-14 02:31:27 +03:00
}
func (c *consulRegistry) ListServices() ([]*Service, error) {
rsp, _, err := c.Client.Catalog().Services(c.queryOptions)
if err != nil {
return nil, err
}
2015-12-05 04:12:29 +03:00
var services []*Service
for service := range rsp {
services = append(services, &Service{Name: service})
}
return services, nil
}
func (c *consulRegistry) Watch(opts ...WatchOption) (Watcher, error) {
return newConsulWatcher(c, opts...)
2015-01-14 02:31:27 +03:00
}
// String returns the name of this registry implementation.
func (c *consulRegistry) String() string {
	const name = "consul"
	return name
}
// Options returns a copy of the options the registry currently holds.
func (c *consulRegistry) Options() Options {
	current := c.opts
	return current
}