rework gossip registry

This commit is contained in:
Asim Aslam 2019-02-13 14:39:20 +00:00
parent c3722877c1
commit 7cb466359f
3 changed files with 272 additions and 256 deletions

View File

@ -1,4 +1,4 @@
// Package Gossip provides a gossip registry based on hashicorp/memberlist
// Package gossip provides a gossip registry based on hashicorp/memberlist
package gossip
import (
@ -8,11 +8,9 @@ import (
"io/ioutil"
"net"
"os"
"os/signal"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/golang/protobuf/proto"
@ -110,7 +108,7 @@ type gossipRegistry struct {
mtu int
addrs []string
members map[string]int32
done chan struct{}
done chan bool
}
type update struct {
@ -119,6 +117,11 @@ type update struct {
sync chan *registry.Service
}
// updates is a lock-protected table of service updates that are being
// tracked for expiry. The embedded RWMutex guards the services map, which
// is keyed by a uint64 (the hashstructure hash of the service, as stored
// by run).
type updates struct {
sync.RWMutex
services map[uint64]*update
}
var (
// You should change this if using secure
DefaultSecret = []byte("micro-gossip-key") // exactly 16 bytes
@ -126,56 +129,6 @@ var (
MaxPacketSize = 512
)
// connect joins the memberlist through the given addresses.
//
// An empty address list is a no-op. The first join is attempted
// immediately; on failure the join is retried on every tick of a one second
// ticker until it succeeds, the options context is done, g.done is closed,
// or the connect timeout fires. Only the timeout path returns an error.
func (g *gossipRegistry) connect(addrs []string) error {
var err error
if len(addrs) == 0 {
return nil
}
// a fresh receive-only channel that nothing sends on: the timeout case
// below can only fire if it is replaced by time.After
timeout := make(<-chan time.Time)
if g.connectTimeout > 0 {
timeout = time.After(g.connectTimeout)
}
ticker := time.NewTicker(1 * time.Second)
defer ticker.Stop()
fn := func() (int, error) {
return g.member.Join(addrs)
}
// don't wait for first try
if _, err = fn(); err == nil {
return nil
}
// wait loop
for {
select {
// context closed
case <-g.options.Context.Done():
return nil
// call close, don't wait anymore
case <-g.done:
return nil
// in case of timeout fail with a timeout error
// NOTE(review): the messages below print g.addrs, not the addrs argument
case <-timeout:
return fmt.Errorf("[gossip]: timedout connect to %v", g.addrs)
// got a tick, try to connect
case <-ticker.C:
if _, err = fn(); err == nil {
log.Logf("[gossip]: success connect to %v", g.addrs)
return nil
} else {
log.Logf("[gossip]: failed connect to %v", g.addrs)
}
}
}
// NOTE(review): not reachable, the loop above has no break
return err
}
func configure(g *gossipRegistry, opts ...registry.Option) error {
// loop through address list and get valid entries
addrs := func(curAddrs []string) []string {
@ -205,9 +158,9 @@ func configure(g *gossipRegistry, opts ...registry.Option) error {
}
// shutdown old member
if g.member != nil {
g.Stop()
}
g.Stop()
// new done chan
g.done = make(chan bool)
// replace addresses
curAddrs = newAddrs
@ -220,16 +173,18 @@ func configure(g *gossipRegistry, opts ...registry.Option) error {
c.PushPullInterval = 0 // disable expensive tcp push/pull
c.ProtocolVersion = 4 // suport latest stable features
if optConfig, ok := g.options.Context.Value(contextConfig{}).(*memberlist.Config); ok && optConfig != nil {
c = optConfig
// set config from options
if config, ok := g.options.Context.Value(configKey{}).(*memberlist.Config); ok && config != nil {
c = config
}
if hostport, ok := g.options.Context.Value(contextAddress{}).(string); ok {
host, port, err := net.SplitHostPort(hostport)
// set address
if address, ok := g.options.Context.Value(addressKey{}).(string); ok {
host, port, err := net.SplitHostPort(address)
if err == nil {
pn, err := strconv.Atoi(port)
p, err := strconv.Atoi(port)
if err == nil {
c.BindPort = pn
c.BindPort = p
}
c.BindAddr = host
}
@ -238,12 +193,13 @@ func configure(g *gossipRegistry, opts ...registry.Option) error {
c.BindPort = 0
}
if hostport, ok := g.options.Context.Value(contextAdvertise{}).(string); ok {
host, port, err := net.SplitHostPort(hostport)
// set the advertise address
if advertise, ok := g.options.Context.Value(advertiseKey{}).(string); ok {
host, port, err := net.SplitHostPort(advertise)
if err == nil {
pn, err := strconv.Atoi(port)
p, err := strconv.Atoi(port)
if err == nil {
c.AdvertisePort = pn
c.AdvertisePort = p
}
c.AdvertiseAddr = host
}
@ -257,7 +213,7 @@ func configure(g *gossipRegistry, opts ...registry.Option) error {
// set a secret key if secure
if g.options.Secure {
k, ok := g.options.Context.Value(contextSecretKey{}).([]byte)
k, ok := g.options.Context.Value(secretKey{}).([]byte)
if !ok {
// use the default secret
k = DefaultSecret
@ -265,10 +221,13 @@ func configure(g *gossipRegistry, opts ...registry.Option) error {
c.SecretKey = k
}
if v, ok := g.options.Context.Value(connectRetry{}).(bool); ok && v {
// set connect retry
if v, ok := g.options.Context.Value(connectRetryKey{}).(bool); ok && v {
g.connectRetry = true
}
if td, ok := g.options.Context.Value(connectTimeout{}).(time.Duration); ok {
// set connect timeout
if td, ok := g.options.Context.Value(connectTimeoutKey{}).(time.Duration); ok {
g.connectTimeout = td
}
@ -300,19 +259,24 @@ func configure(g *gossipRegistry, opts ...registry.Option) error {
// set internals
g.Lock()
if len(curAddrs) > 0 {
for _, addr := range curAddrs {
g.members[addr] = nodeActionUnknown
}
}
g.tcpInterval = c.PushPullInterval
g.addrs = curAddrs
g.queue = queue
g.member = m
g.interval = c.GossipInterval
g.Unlock()
log.Logf("[gossip]: Registry Listening on %s", m.LocalNode().Address())
log.Logf("[gossip] Registry Listening on %s", m.LocalNode().Address())
// try connect
return g.connect(curAddrs)
}
@ -328,7 +292,7 @@ func (b *broadcast) Message() []byte {
return nil
}
if l := len(up); l > MaxPacketSize {
log.Logf("[gossip]: broadcast message size %d bigger then MaxPacketSize %d", l, MaxPacketSize)
log.Logf("[gossip] broadcast message size %d bigger then MaxPacketSize %d", l, MaxPacketSize)
}
return up
}
@ -429,6 +393,55 @@ func (d *delegate) MergeRemoteState(buf []byte, join bool) {
}
}
// connect joins the memberlist through the given addresses.
//
// An empty address list is a no-op. The first join is attempted
// immediately; if it fails, the join is retried once per second until it
// succeeds, the options context is done, the registry is stopped (g.done is
// closed) or, when g.connectTimeout is positive, the timeout elapses.
//
// It returns a non-nil error only when the connect timeout is reached.
func (g *gossipRegistry) connect(addrs []string) error {
	if len(addrs) == 0 {
		return nil
	}

	// a nil channel is never ready, so without a positive connectTimeout
	// the timeout case below is never selected
	var timeout <-chan time.Time
	if g.connectTimeout > 0 {
		timeout = time.After(g.connectTimeout)
	}

	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()

	// don't wait for first try
	if _, err := g.member.Join(addrs); err == nil {
		return nil
	}

	// wait loop; every exit is a return, so nothing follows the loop
	for {
		select {
		// context closed
		case <-g.options.Context.Done():
			return nil
		// call close, don't wait anymore
		case <-g.done:
			return nil
		// in case of timeout fail with a timeout error
		case <-timeout:
			// report the addresses this call tried to join, which may be
			// a subset of the configured g.addrs
			return fmt.Errorf("[gossip] connect timeout %v", addrs)
		// got a tick, try to connect
		case <-ticker.C:
			_, err := g.member.Join(addrs)
			if err == nil {
				log.Logf("[gossip] connect success for %v", addrs)
				return nil
			}
			// surface the join error so the failure can be diagnosed
			log.Logf("[gossip] connect failed for %v: %v", addrs, err)
		}
	}
}
func (g *gossipRegistry) publish(action string, services []*registry.Service) {
g.RLock()
for _, sub := range g.watchers {
@ -462,119 +475,137 @@ func (g *gossipRegistry) subscribe() (chan *registry.Result, chan bool) {
return next, exit
}
// wait blocks until a SIGTERM, SIGINT or SIGQUIT is received or the context
// is done, and then stops the registry.
//
// The context used is g.options.Context, unless that context carries a
// non-nil context.Context value under the contextContext{} key, in which
// case the carried context is used instead.
func (g *gossipRegistry) wait() {
ctx := g.options.Context
if c, ok := ctx.Value(contextContext{}).(context.Context); ok && c != nil {
ctx = c
}
// buffered by one so a signal delivered before the select is not dropped
// NOTE(review): there is no matching signal.Stop call in this function
ch := make(chan os.Signal, 1)
signal.Notify(ch, syscall.SIGTERM, syscall.SIGINT, syscall.SIGQUIT)
select {
// wait on kill signal
case <-ch:
// wait on context cancel
case <-ctx.Done():
}
g.Stop()
}
func (g *gossipRegistry) Stop() error {
g.Lock()
if g.done != nil {
select {
case <-g.done:
return nil
default:
close(g.done)
g.done = nil
g.Lock()
if g.member != nil {
g.member.Leave(g.interval * 2)
g.member.Shutdown()
g.member = nil
}
g.Unlock()
}
if g.member != nil {
g.member.Leave(g.interval * 2)
g.member.Shutdown()
g.member = nil
}
g.Unlock()
return nil
}
// connectLoop attempts to reconnect to members that have left.
//
// Once per second it collects every entry of g.members whose action is
// nodeActionLeave, skipping the local node, and calls connect with them.
// The loop returns when g.done is closed; when the options context is done
// it stops the registry first and then returns.
func (g *gossipRegistry) connectLoop() {
	// try every second
	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-g.done:
			return
		case <-g.options.Context.Done():
			g.Stop()
			return
		case <-ticker.C:
			var addrs []string

			g.RLock()

			// only process if we have a memberlist
			if g.member == nil {
				g.RUnlock()
				continue
			}

			// self
			local := g.member.LocalNode().Address()

			// pick the members that left, never ourselves
			for node, action := range g.members {
				if action == nodeActionLeave && node != local {
					addrs = append(addrs, node)
				}
			}

			g.RUnlock()

			// connect to all the members
			// TODO: only connect to new members
			if len(addrs) == 0 {
				continue
			}

			// connect only fails on timeout; log it instead of dropping
			// it (the error text carries its own "[gossip]" prefix)
			if err := g.connect(addrs); err != nil {
				log.Logf("%v", err)
			}
		}
	}
}
// expiryLoop turns expired service updates into delete updates.
//
// On every ExpiryTick it removes each tracked update whose Expires time
// (compared against the current UnixNano time) has passed, marks it with
// actionTypeDelete and queues it on g.updates. The loop returns when g.done
// is closed.
func (g *gossipRegistry) expiryLoop(updates *updates) {
	ticker := time.NewTicker(ExpiryTick)
	defer ticker.Stop()

	for {
		select {
		case <-g.done:
			return
		case <-ticker.C:
			now := uint64(time.Now().UnixNano())

			// collect the expired entries under the lock
			var expired []*update

			updates.Lock()
			for k, v := range updates.services {
				// check if expiry time has passed
				if v.Update.Expires < now {
					// delete from records
					delete(updates.services, k)
					// set to delete
					v.Update.Action = actionTypeDelete
					expired = append(expired, v)
				}
			}
			updates.Unlock()

			// fire the updates only after releasing the lock: the reader
			// of g.updates takes the same lock while processing, so a
			// blocking send made under the lock could deadlock with it
			for _, v := range expired {
				select {
				case g.updates <- v:
				case <-g.done:
					// shutting down, don't block on a full queue
					return
				}
			}
		}
	}
}
// eventLoop consumes member events from g.events until g.done is closed.
// An event only overwrites the recorded action of a node that is already
// present in g.members; events for unknown nodes are dropped.
func (g *gossipRegistry) eventLoop() {
	for {
		select {
		case e := <-g.events:
			// TODO: nonblocking update
			g.Lock()
			_, known := g.members[e.node]
			if known {
				g.members[e.node] = e.action
			}
			g.Unlock()
		case <-g.done:
			// return when done
			return
		}
	}
}
func (g *gossipRegistry) run() {
var mtx sync.Mutex
updates := map[uint64]*update{}
updates := &updates{
services: make(map[uint64]*update),
}
// expiry loop
go func() {
ticker := time.NewTicker(ExpiryTick)
defer ticker.Stop()
go g.expiryLoop(updates)
for {
select {
case <-g.done:
return
case <-ticker.C:
now := uint64(time.Now().UnixNano())
mtx.Lock()
// process all the updates
for k, v := range updates {
// check if expiry time has passed
if d := (v.Update.Expires); d < now {
// delete from records
delete(updates, k)
// set to delete
v.Update.Action = actionTypeDelete
// fire a new update
g.updates <- v
}
}
mtx.Unlock()
}
}
}()
go func() {
for {
select {
case <-g.done:
return
case ed := <-g.events:
// may be not block all registry?
g.Lock()
if _, ok := g.members[ed.node]; ok {
g.members[ed.node] = ed.action
}
g.Unlock()
}
}
}()
// event loop
go g.eventLoop()
// connect loop
if g.connectRetry {
go func() {
ticker := time.NewTicker(1 * time.Second)
defer ticker.Stop()
for {
select {
case <-g.done:
return
case <-ticker.C:
var addrs []string
g.RLock()
if g.member != nil {
for node, action := range g.members {
if action == nodeActionLeave && g.member.LocalNode().Address() != node {
addrs = append(addrs, node)
}
}
}
g.RUnlock()
if len(addrs) > 0 {
g.connect(addrs)
}
}
}
}()
go g.connectLoop()
}
// process the updates
@ -597,9 +628,9 @@ func (g *gossipRegistry) run() {
if u.Update.Expires > 0 {
// create a hash of this service
if hash, err := hashstructure.Hash(u.Service, nil); err == nil {
mtx.Lock()
updates[hash] = u
mtx.Unlock()
updates.Lock()
updates.services[hash] = u
updates.Unlock()
}
}
case actionTypeDelete:
@ -618,9 +649,9 @@ func (g *gossipRegistry) run() {
// delete from expiry checks
if hash, err := hashstructure.Hash(u.Service, nil); err == nil {
mtx.Lock()
delete(updates, hash)
mtx.Unlock()
updates.Lock()
delete(updates.services, hash)
updates.Unlock()
}
case actionTypeSync:
// no sync channel provided
@ -676,7 +707,7 @@ func (g *gossipRegistry) Register(s *registry.Service, opts ...registry.Register
}
if options.TTL == 0 && g.tcpInterval == 0 {
return fmt.Errorf("must provide registry.RegisterTTL option or set PushPullInterval in *memberlist.Config")
return fmt.Errorf("Require register TTL or interval for memberlist.Config")
}
up := &pb.Update{
@ -766,30 +797,28 @@ func (g *gossipRegistry) String() string {
}
func NewRegistry(opts ...registry.Option) registry.Registry {
gossip := &gossipRegistry{
g := &gossipRegistry{
options: registry.Options{
Context: context.Background(),
},
updates: make(chan *update, 100),
done: make(chan bool),
events: make(chan *event, 100),
updates: make(chan *update, 100),
services: make(map[string][]*registry.Service),
watchers: make(map[string]chan *registry.Result),
done: make(chan struct{}),
members: make(map[string]int32),
}
// run the updater
go gossip.run()
go g.run()
// configure the gossiper
if err := configure(gossip, opts...); err != nil {
log.Fatalf("Error configuring registry: %v", err)
if err := configure(g, opts...); err != nil {
log.Fatalf("[gossip] Error configuring registry: %v", err)
}
// wait for setup
<-time.After(gossip.interval * 2)
<-time.After(g.interval * 2)
go gossip.wait()
return gossip
return g
}

View File

@ -35,7 +35,7 @@ func newRegistry(opts ...registry.Option) registry.Registry {
return r
}
func TestRegistryBroadcast(t *testing.T) {
func TestGossipRegistryBroadcast(t *testing.T) {
mc1 := newMemberlistConfig()
r1 := newRegistry(Config(mc1), Address("127.0.0.1:54321"))
@ -45,65 +45,57 @@ func TestRegistryBroadcast(t *testing.T) {
defer r1.(*gossipRegistry).Stop()
defer r2.(*gossipRegistry).Stop()
svc1 := &registry.Service{Name: "r1-svc", Version: "0.0.0.1"}
svc2 := &registry.Service{Name: "r2-svc", Version: "0.0.0.2"}
svc1 := &registry.Service{Name: "service.1", Version: "0.0.0.1"}
svc2 := &registry.Service{Name: "service.2", Version: "0.0.0.2"}
t.Logf("register service svc1 on r1\n")
if err := r1.Register(svc1, registry.RegisterTTL(10*time.Second)); err != nil {
t.Fatal(err)
}
t.Logf("register service svc2 on r2\n")
if err := r2.Register(svc2, registry.RegisterTTL(10*time.Second)); err != nil {
t.Fatal(err)
}
var found bool
t.Logf("list services on r1\n")
svcs, err := r1.ListServices()
if err != nil {
t.Fatal(err)
}
for _, svc := range svcs {
if svc.Name == "r2-svc" {
if svc.Name == "service.2" {
found = true
}
}
if !found {
t.Fatalf("r2-svc not found in r1, broadcast not work")
} else {
t.Logf("r2-svc found in r1, all ok")
t.Fatalf("[gossip registry] service.2 not found in r1, broadcast not work")
}
found = false
t.Logf("list services on r2\n")
svcs, err = r2.ListServices()
if err != nil {
t.Fatal(err)
}
for _, svc := range svcs {
if svc.Name == "r1-svc" {
if svc.Name == "service.1" {
found = true
}
}
if !found {
t.Fatalf("r1-svc not found in r2, broadcast not work")
} else {
t.Logf("r1-svc found in r1, all ok")
t.Fatalf("[gossip registry] broadcast failed: service.1 not found in r2")
}
t.Logf("deregister service svc1 on r1\n")
if err := r1.Deregister(svc1); err != nil {
t.Fatal(err)
}
t.Logf("deregister service svc1 on r2\n")
if err := r2.Deregister(svc2); err != nil {
t.Fatal(err)
}
}
func TestRegistryRetry(t *testing.T) {
func TestGossipRegistryRetry(t *testing.T) {
mc1 := newMemberlistConfig()
r1 := newRegistry(Config(mc1), Address("127.0.0.1:54321"))
@ -113,8 +105,8 @@ func TestRegistryRetry(t *testing.T) {
defer r1.(*gossipRegistry).Stop()
defer r2.(*gossipRegistry).Stop()
svc1 := &registry.Service{Name: "r1-svc", Version: "0.0.0.1"}
svc2 := &registry.Service{Name: "r2-svc", Version: "0.0.0.2"}
svc1 := &registry.Service{Name: "service.1", Version: "0.0.0.1"}
svc2 := &registry.Service{Name: "service.2", Version: "0.0.0.2"}
var mu sync.Mutex
ch := make(chan struct{})
@ -150,17 +142,17 @@ func TestRegistryRetry(t *testing.T) {
}
for _, svc := range svcs {
if svc.Name == "r1-svc" {
if svc.Name == "service.1" {
found = true
}
}
if !found {
t.Fatalf("r1-svc not found in r2, broadcast not work, retry cant test")
t.Fatalf("[gossip registry] broadcast failed: service.1 not found in r2")
}
t.Logf("stop r1\n")
if err = r1.(*gossipRegistry).Stop(); err != nil {
t.Fatalf("cant stop r1 registry %v", err)
t.Fatalf("[gossip registry] failed to stop registry: %v", err)
}
mu.Lock()
@ -176,26 +168,24 @@ func TestRegistryRetry(t *testing.T) {
}
for _, svc := range svcs {
if svc.Name == "r1-svc" {
if svc.Name == "service.1" {
found = true
}
}
if found {
t.Fatalf("r1-svc found in r2, something wrong")
t.Fatalf("[gossip registry] service.1 found in r2")
}
t.Logf("start r1\n")
if tr := os.Getenv("TRAVIS"); len(tr) > 0 {
t.Logf("[gossip registry] skip test on travis")
t.Skip()
return
}
r1 = newRegistry(Config(mc1), Address("127.0.0.1:54321"))
<-time.After(2 * time.Second)
if tr := os.Getenv("TRAVIS"); len(tr) > 0 {
t.Logf("skip next test part, becasue it not works in travis")
t.Skip()
return
<-time.After(5 * time.Second)
}
found = false
svcs, err = r2.ListServices()
if err != nil {
@ -203,12 +193,13 @@ func TestRegistryRetry(t *testing.T) {
}
for _, svc := range svcs {
if svc.Name == "r1-svc" {
if svc.Name == "service.1" {
found = true
}
}
if !found {
t.Fatalf("r1-svc not found in r2, connect retry not works")
t.Fatalf("[gossip registry] connect retry failed: service.1 not found in r2")
}
if err := r1.Deregister(svc1); err != nil {

View File

@ -8,55 +8,51 @@ import (
"github.com/micro/go-micro/registry"
)
type contextSecretKey struct{}
type secretKey struct{}
type addressKey struct{}
type configKey struct{}
type advertiseKey struct{}
type connectTimeoutKey struct{}
type connectRetryKey struct{}
// setRegistryOption builds a registry.Option that stores v under key k in
// the options context, starting from context.Background() when the options
// carry no context yet.
func setRegistryOption(k, v interface{}) registry.Option {
	return func(o *registry.Options) {
		parent := o.Context
		if parent == nil {
			parent = context.Background()
		}
		o.Context = context.WithValue(parent, k, v)
	}
}
// Secret specifies an encryption key. The value should be either
// 16, 24, or 32 bytes to select AES-128, AES-192, or AES-256.
func Secret(k []byte) registry.Option {
return setRegistryOption(contextSecretKey{}, k)
return setRegistryOption(secretKey{}, k)
}
type contextAddress struct{}
// Address to bind to - host:port
func Address(a string) registry.Option {
return setRegistryOption(contextAddress{}, a)
return setRegistryOption(addressKey{}, a)
}
type contextConfig struct{}
// Config allow to inject a *memberlist.Config struct for configuring gossip
// Config sets *memberlist.Config for configuring gossip
func Config(c *memberlist.Config) registry.Option {
return setRegistryOption(contextConfig{}, c)
return setRegistryOption(configKey{}, c)
}
type contextAdvertise struct{}
// The address to advertise for other gossip members - host:port
// Advertise sets the address to advertise for other gossip members to connect to - host:port
func Advertise(a string) registry.Option {
return setRegistryOption(contextAdvertise{}, a)
return setRegistryOption(advertiseKey{}, a)
}
type contextContext struct{}
// Context specifies a context for the registry.
// Can be used to signal shutdown of the registry.
// Can be used for extra option values.
func Context(ctx context.Context) registry.Option {
return setRegistryOption(contextContext{}, ctx)
}
type connectTimeout struct{}
// ConnectTimeout specify registry connect timeout use -1 to specify infinite
// ConnectTimeout sets the registry connect timeout. Use -1 to specify infinite timeout
func ConnectTimeout(td time.Duration) registry.Option {
return setRegistryOption(connectTimeout{}, td)
return setRegistryOption(connectTimeoutKey{}, td)
}
type connectRetry struct{}
// ConnectRetry enable reconnect to registry then connection closed,
// ConnectRetry enables reconnecting to the registry when the connection is closed.
// Use with ConnectTimeout to specify how long to keep retrying.
func ConnectRetry(v bool) registry.Option {
return setRegistryOption(connectRetry{}, v)
return setRegistryOption(connectRetryKey{}, v)
}