Flap detection vol2 (#915)
* We now purge flapping routes before regular tick processes them * Updated comments * Record the timestamp as soon as you receive the event * Set route Address to routing table test * Fixed a bunch of deadlocks. Added basic Router tests.
This commit is contained in:
parent
f67c5e779f
commit
b84134581c
@ -13,28 +13,25 @@ import (
|
||||
"github.com/micro/go-micro/util/log"
|
||||
)
|
||||
|
||||
const (
|
||||
var (
|
||||
// AdvertiseEventsTick is time interval in which the router advertises route updates
|
||||
AdvertiseEventsTick = 5 * time.Second
|
||||
// AdvertiseTableTick is time interval in which router advertises all routes found in routing table
|
||||
AdvertiseTableTick = 1 * time.Minute
|
||||
// AdvertiseFlushTick is time the yet unconsumed advertisements are flush i.e. discarded
|
||||
AdvertiseFlushTick = 15 * time.Second
|
||||
// DefaultAdvertTTL is default advertisement TTL
|
||||
DefaultAdvertTTL = 1 * time.Minute
|
||||
// AdvertSuppress is advert suppression threshold
|
||||
AdvertSuppress = 200.0
|
||||
// AdvertRecover is advert recovery threshold
|
||||
AdvertRecover = 120.0
|
||||
// DefaultAdvertTTL is default advertisement TTL
|
||||
DefaultAdvertTTL = 1 * time.Minute
|
||||
// Penalty for routes processed multiple times
|
||||
Penalty = 100.0
|
||||
// PenaltyHalfLife is the time the advert penalty decays to half its value
|
||||
PenaltyHalfLife = 30.0
|
||||
// MaxSuppressTime defines time after which the suppressed advert is deleted
|
||||
MaxSuppressTime = 5 * time.Minute
|
||||
)
|
||||
|
||||
var (
|
||||
// PenaltyDecay is a coefficient which controls the speed the advert penalty decays
|
||||
PenaltyDecay = math.Log(2) / PenaltyHalfLife
|
||||
)
|
||||
@ -52,6 +49,7 @@ type router struct {
|
||||
wg *sync.WaitGroup
|
||||
|
||||
// advert subscribers
|
||||
sub sync.RWMutex
|
||||
subscribers map[string]chan *Advert
|
||||
}
|
||||
|
||||
@ -189,7 +187,6 @@ func (r *router) watchRegistry(w registry.Watcher) error {
|
||||
defer func() {
|
||||
// close the exit channel when the go routine finishes
|
||||
close(exit)
|
||||
r.wg.Done()
|
||||
}()
|
||||
|
||||
// wait in the background for the router to stop
|
||||
@ -197,6 +194,7 @@ func (r *router) watchRegistry(w registry.Watcher) error {
|
||||
r.wg.Add(1)
|
||||
go func() {
|
||||
defer w.Stop()
|
||||
defer r.wg.Done()
|
||||
|
||||
select {
|
||||
case <-r.exit:
|
||||
@ -233,7 +231,6 @@ func (r *router) watchTable(w Watcher) error {
|
||||
defer func() {
|
||||
// close the exit channel when the go routine finishes
|
||||
close(exit)
|
||||
r.wg.Done()
|
||||
}()
|
||||
|
||||
// wait in the background for the router to stop
|
||||
@ -241,6 +238,7 @@ func (r *router) watchTable(w Watcher) error {
|
||||
r.wg.Add(1)
|
||||
go func() {
|
||||
defer w.Stop()
|
||||
defer r.wg.Done()
|
||||
|
||||
select {
|
||||
case <-r.exit:
|
||||
@ -276,10 +274,7 @@ func (r *router) watchTable(w Watcher) error {
|
||||
}
|
||||
|
||||
// publishAdvert publishes router advert to advert channel
|
||||
// NOTE: this might cease to be a dedicated method in the future
|
||||
func (r *router) publishAdvert(advType AdvertType, events []*Event) {
|
||||
defer r.advertWg.Done()
|
||||
|
||||
a := &Advert{
|
||||
Id: r.options.Id,
|
||||
Type: advType,
|
||||
@ -288,24 +283,17 @@ func (r *router) publishAdvert(advType AdvertType, events []*Event) {
|
||||
Events: events,
|
||||
}
|
||||
|
||||
log.Debugf("Router publishing advert; %+v", a)
|
||||
r.RLock()
|
||||
r.sub.RLock()
|
||||
for _, sub := range r.subscribers {
|
||||
// check the exit chan first
|
||||
select {
|
||||
case <-r.exit:
|
||||
r.RUnlock()
|
||||
return
|
||||
default:
|
||||
}
|
||||
|
||||
// now send the message
|
||||
select {
|
||||
case sub <- a:
|
||||
default:
|
||||
case <-r.exit:
|
||||
r.sub.RUnlock()
|
||||
return
|
||||
}
|
||||
}
|
||||
r.RUnlock()
|
||||
r.sub.RUnlock()
|
||||
}
|
||||
|
||||
// advertiseTable advertises the whole routing table to the network
|
||||
@ -327,7 +315,10 @@ func (r *router) advertiseTable() error {
|
||||
if len(events) > 0 {
|
||||
log.Debugf("Router flushing table with %d events: %s", len(events), r.options.Id)
|
||||
r.advertWg.Add(1)
|
||||
go r.publishAdvert(RouteUpdate, events)
|
||||
go func() {
|
||||
defer r.advertWg.Done()
|
||||
r.publishAdvert(RouteUpdate, events)
|
||||
}()
|
||||
}
|
||||
case <-r.exit:
|
||||
return nil
|
||||
@ -335,12 +326,12 @@ func (r *router) advertiseTable() error {
|
||||
}
|
||||
}
|
||||
|
||||
// routeAdvert contains a route event to be advertised
|
||||
type routeAdvert struct {
|
||||
// advert contains a route event to be advertised
|
||||
type advert struct {
|
||||
// event received from routing table
|
||||
event *Event
|
||||
// lastUpdate records the time of the last advert update
|
||||
lastUpdate time.Time
|
||||
// lastSeen records the time of the last advert update
|
||||
lastSeen time.Time
|
||||
// penalty is current advert penalty
|
||||
penalty float64
|
||||
// isSuppressed flags the advert suppression
|
||||
@ -349,6 +340,51 @@ type routeAdvert struct {
|
||||
suppressTime time.Time
|
||||
}
|
||||
|
||||
// adverts maintains a map of router adverts
|
||||
type adverts map[uint64]*advert
|
||||
|
||||
// process processes advert
|
||||
// It updates advert timestamp, increments its penalty and
|
||||
// marks upresses or recovers it if it reaches configured thresholds
|
||||
func (m adverts) process(a *advert) error {
|
||||
// lookup advert in adverts
|
||||
hash := a.event.Route.Hash()
|
||||
a, ok := m[hash]
|
||||
if !ok {
|
||||
return fmt.Errorf("advert not found")
|
||||
}
|
||||
|
||||
// decay the event penalty
|
||||
delta := time.Since(a.lastSeen).Seconds()
|
||||
|
||||
// decay advert penalty
|
||||
a.penalty = a.penalty * math.Exp(-delta*PenaltyDecay)
|
||||
service := a.event.Route.Service
|
||||
address := a.event.Route.Address
|
||||
|
||||
// suppress/recover the event based on its penalty level
|
||||
switch {
|
||||
case a.penalty > AdvertSuppress && !a.isSuppressed:
|
||||
log.Debugf("Router suppressing advert %d %.2f for route %s %s", hash, a.penalty, service, address)
|
||||
a.isSuppressed = true
|
||||
a.suppressTime = time.Now()
|
||||
case a.penalty < AdvertRecover && a.isSuppressed:
|
||||
log.Debugf("Router recovering advert %d %.2f for route %s %s", hash, a.penalty, service, address)
|
||||
a.isSuppressed = false
|
||||
}
|
||||
|
||||
// if suppressed, checked how long has it been suppressed for
|
||||
if a.isSuppressed {
|
||||
// max suppression time threshold has been reached, delete the advert
|
||||
if time.Since(a.suppressTime) > MaxSuppressTime {
|
||||
delete(m, hash)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// advertiseEvents advertises routing table events
|
||||
// It suppresses unhealthy flapping events and advertises healthy events upstream.
|
||||
func (r *router) advertiseEvents() error {
|
||||
@ -356,8 +392,8 @@ func (r *router) advertiseEvents() error {
|
||||
ticker := time.NewTicker(AdvertiseEventsTick)
|
||||
defer ticker.Stop()
|
||||
|
||||
// advertMap is a map of advert events
|
||||
advertMap := make(map[uint64]*routeAdvert)
|
||||
// adverts is a map of advert events
|
||||
adverts := make(adverts)
|
||||
|
||||
// routing table watcher
|
||||
tableWatcher, err := r.Watch()
|
||||
@ -379,82 +415,74 @@ func (r *router) advertiseEvents() error {
|
||||
case <-ticker.C:
|
||||
var events []*Event
|
||||
// collect all events which are not flapping
|
||||
for key, advert := range advertMap {
|
||||
// decay the event penalty
|
||||
delta := time.Since(advert.lastUpdate).Seconds()
|
||||
advert.penalty = advert.penalty * math.Exp(-delta*PenaltyDecay)
|
||||
service := advert.event.Route.Service
|
||||
address := advert.event.Route.Address
|
||||
// suppress/recover the event based on its penalty level
|
||||
switch {
|
||||
case advert.penalty > AdvertSuppress && !advert.isSuppressed:
|
||||
log.Debugf("Router suppressing advert %d %.2f for route %s %s", key, advert.penalty, service, address)
|
||||
advert.isSuppressed = true
|
||||
advert.suppressTime = time.Now()
|
||||
case advert.penalty < AdvertRecover && advert.isSuppressed:
|
||||
log.Debugf("Router recovering advert %d %.2f for route %s %s", key, advert.penalty, service, address)
|
||||
advert.isSuppressed = false
|
||||
}
|
||||
|
||||
if advert.isSuppressed {
|
||||
// max suppression time threshold has been reached, delete the advert
|
||||
if time.Since(advert.suppressTime) > MaxSuppressTime {
|
||||
delete(advertMap, key)
|
||||
for key, advert := range adverts {
|
||||
// process the advert
|
||||
if err := adverts.process(advert); err != nil {
|
||||
log.Debugf("Router failed processing advert %d: %v", key, err)
|
||||
continue
|
||||
}
|
||||
// process next advert
|
||||
// if suppressed go to the next advert
|
||||
if advert.isSuppressed {
|
||||
continue
|
||||
}
|
||||
|
||||
// copy the event and append
|
||||
e := new(Event)
|
||||
// this is ok, because router.Event only contains builtin types
|
||||
// and no references so this creates a deep struct copy of Event
|
||||
// and no references so this creates a deep copy of struct Event
|
||||
*e = *(advert.event)
|
||||
events = append(events, e)
|
||||
// delete the advert from the advertMap
|
||||
delete(advertMap, key)
|
||||
// delete the advert from adverts
|
||||
delete(adverts, key)
|
||||
}
|
||||
|
||||
// advertise all Update events to subscribers
|
||||
// advertise events to subscribers
|
||||
if len(events) > 0 {
|
||||
r.advertWg.Add(1)
|
||||
log.Debugf("Router publishing %d events", len(events))
|
||||
go r.publishAdvert(RouteUpdate, events)
|
||||
r.advertWg.Add(1)
|
||||
go func() {
|
||||
defer r.advertWg.Done()
|
||||
r.publishAdvert(RouteUpdate, events)
|
||||
}()
|
||||
}
|
||||
case e := <-r.eventChan:
|
||||
// if event is nil, continue
|
||||
if e == nil {
|
||||
continue
|
||||
}
|
||||
log.Debugf("Router processing table event %s for service %s %s", e.Type, e.Route.Service, e.Route.Address)
|
||||
now := time.Now()
|
||||
|
||||
// determine the event penalty
|
||||
// TODO: should there be any difference in penalty for different event types
|
||||
penalty := Penalty
|
||||
log.Debugf("Router processing table event %s for service %s %s", e.Type, e.Route.Service, e.Route.Address)
|
||||
|
||||
// check if we have already registered the route
|
||||
hash := e.Route.Hash()
|
||||
advert, ok := advertMap[hash]
|
||||
a, ok := adverts[hash]
|
||||
if !ok {
|
||||
advert = &routeAdvert{
|
||||
a = &advert{
|
||||
event: e,
|
||||
penalty: penalty,
|
||||
lastUpdate: time.Now(),
|
||||
penalty: Penalty,
|
||||
lastSeen: now,
|
||||
}
|
||||
advertMap[hash] = advert
|
||||
adverts[hash] = a
|
||||
continue
|
||||
}
|
||||
|
||||
// override the route event only if the last event was different
|
||||
if advert.event.Type != e.Type {
|
||||
advert.event = e
|
||||
// override the route event only if the previous event was different
|
||||
if a.event.Type != e.Type {
|
||||
a.event = e
|
||||
}
|
||||
|
||||
// process the advert
|
||||
if err := adverts.process(a); err != nil {
|
||||
log.Debugf("Router error processing advert %d: %v", hash, err)
|
||||
continue
|
||||
}
|
||||
|
||||
// update event penalty and timestamp
|
||||
advert.lastUpdate = time.Now()
|
||||
advert.penalty += penalty
|
||||
log.Debugf("Router advert %d for route %s %s event penalty: %f", hash, advert.event.Route.Service, advert.event.Route.Address, advert.penalty)
|
||||
a.lastSeen = now
|
||||
// increment the penalty
|
||||
a.penalty += Penalty
|
||||
log.Debugf("Router advert %d for route %s %s event penalty: %f", hash, a.event.Route.Service, a.event.Route.Address, a.penalty)
|
||||
case <-r.exit:
|
||||
// first wait for the advertiser to finish
|
||||
r.advertWg.Wait()
|
||||
@ -474,6 +502,7 @@ func (r *router) close() {
|
||||
for range r.eventChan {
|
||||
}
|
||||
|
||||
r.sub.RLock()
|
||||
// close advert subscribers
|
||||
for id, sub := range r.subscribers {
|
||||
// close the channel
|
||||
@ -482,6 +511,7 @@ func (r *router) close() {
|
||||
// delete the subscriber
|
||||
delete(r.subscribers, id)
|
||||
}
|
||||
r.sub.RUnlock()
|
||||
}
|
||||
|
||||
// mark the router as Stopped and set its Error to nil
|
||||
@ -603,9 +633,16 @@ func (r *router) Advertise() (<-chan *Advert, error) {
|
||||
// create event channels
|
||||
r.eventChan = make(chan *Event)
|
||||
|
||||
// create advert channel
|
||||
advertChan := make(chan *Advert, 128)
|
||||
r.subscribers[uuid.New().String()] = advertChan
|
||||
|
||||
// advertise your presence
|
||||
r.advertWg.Add(1)
|
||||
go r.publishAdvert(Announce, events)
|
||||
go func() {
|
||||
defer r.advertWg.Done()
|
||||
r.publishAdvert(Announce, events)
|
||||
}()
|
||||
|
||||
r.wg.Add(1)
|
||||
go func() {
|
||||
@ -629,10 +666,7 @@ func (r *router) Advertise() (<-chan *Advert, error) {
|
||||
// mark router as Running and set its Error to nil
|
||||
r.status = Status{Code: Advertising, Error: nil}
|
||||
|
||||
// create advert channel
|
||||
advertChan := make(chan *Advert, 128)
|
||||
r.subscribers[uuid.New().String()] = advertChan
|
||||
|
||||
log.Debugf("Router starting to advertise")
|
||||
return advertChan, nil
|
||||
case Stopped:
|
||||
return nil, fmt.Errorf("not running")
|
||||
@ -747,7 +781,10 @@ func (r *router) Solicit() error {
|
||||
|
||||
// advertise the routes
|
||||
r.advertWg.Add(1)
|
||||
go r.publishAdvert(RouteUpdate, events)
|
||||
go func() {
|
||||
defer r.advertWg.Done()
|
||||
r.publishAdvert(RouteUpdate, events)
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
@ -776,19 +813,27 @@ func (r *router) Status() Status {
|
||||
// Stop stops the router
|
||||
func (r *router) Stop() error {
|
||||
r.Lock()
|
||||
defer r.Unlock()
|
||||
|
||||
log.Debugf("Router shutting down")
|
||||
|
||||
switch r.status.Code {
|
||||
case Stopped, Error:
|
||||
r.Unlock()
|
||||
return r.status.Error
|
||||
case Running, Advertising:
|
||||
// close all the channels
|
||||
// NOTE: close marks the router status as Stopped
|
||||
r.close()
|
||||
}
|
||||
r.Unlock()
|
||||
|
||||
log.Debugf("Router waiting for all goroutines to finish")
|
||||
|
||||
// wait for all goroutines to finish
|
||||
r.wg.Wait()
|
||||
|
||||
log.Debugf("Router successfully stopped")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
127
router/default_test.go
Normal file
127
router/default_test.go
Normal file
@ -0,0 +1,127 @@
|
||||
package router
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/micro/go-micro/registry/memory"
|
||||
"github.com/micro/go-micro/util/log"
|
||||
)
|
||||
|
||||
func routerTestSetup() Router {
|
||||
r := memory.NewRegistry()
|
||||
return newRouter(Registry(r))
|
||||
}
|
||||
|
||||
func TestRouterStartStop(t *testing.T) {
|
||||
r := routerTestSetup()
|
||||
|
||||
log.Debugf("TestRouterStartStop STARTING")
|
||||
if err := r.Start(); err != nil {
|
||||
t.Errorf("failed to start router: %v", err)
|
||||
}
|
||||
|
||||
_, err := r.Advertise()
|
||||
if err != nil {
|
||||
t.Errorf("failed to start advertising: %v", err)
|
||||
}
|
||||
|
||||
if err := r.Stop(); err != nil {
|
||||
t.Errorf("failed to stop router: %v", err)
|
||||
}
|
||||
log.Debugf("TestRouterStartStop STOPPED")
|
||||
}
|
||||
|
||||
func TestRouterAdvertise(t *testing.T) {
|
||||
r := routerTestSetup()
|
||||
|
||||
// lower the advertise interval
|
||||
AdvertiseEventsTick = 500 * time.Millisecond
|
||||
AdvertiseTableTick = 1 * time.Second
|
||||
|
||||
if err := r.Start(); err != nil {
|
||||
t.Errorf("failed to start router: %v", err)
|
||||
}
|
||||
|
||||
ch, err := r.Advertise()
|
||||
if err != nil {
|
||||
t.Errorf("failed to start advertising: %v", err)
|
||||
}
|
||||
|
||||
// receive announce event
|
||||
ann := <-ch
|
||||
log.Debugf("received announce advert: %v", ann)
|
||||
|
||||
// Generate random unique routes
|
||||
nrRoutes := 5
|
||||
routes := make([]Route, nrRoutes)
|
||||
route := Route{
|
||||
Service: "dest.svc",
|
||||
Address: "dest.addr",
|
||||
Gateway: "dest.gw",
|
||||
Network: "dest.network",
|
||||
Router: "src.router",
|
||||
Link: "det.link",
|
||||
Metric: 10,
|
||||
}
|
||||
|
||||
for i := 0; i < nrRoutes; i++ {
|
||||
testRoute := route
|
||||
testRoute.Service = fmt.Sprintf("%s-%d", route.Service, i)
|
||||
routes[i] = testRoute
|
||||
}
|
||||
|
||||
var advertErr error
|
||||
|
||||
errChan := make(chan error)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
wg.Done()
|
||||
for _, route := range routes {
|
||||
log.Debugf("Creating route %v", route)
|
||||
if err := r.Table().Create(route); err != nil {
|
||||
log.Debugf("Failed to create route: %v", err)
|
||||
errChan <- err
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var adverts int
|
||||
doneChan := make(chan bool)
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer func() {
|
||||
wg.Done()
|
||||
doneChan <- true
|
||||
}()
|
||||
for advert := range ch {
|
||||
select {
|
||||
case advertErr = <-errChan:
|
||||
t.Errorf("failed advertising events: %v", advertErr)
|
||||
default:
|
||||
// do nothing for now
|
||||
log.Debugf("Router advert received: %v", advert)
|
||||
adverts += len(advert.Events)
|
||||
}
|
||||
return
|
||||
}
|
||||
}()
|
||||
|
||||
<-doneChan
|
||||
|
||||
if adverts != nrRoutes {
|
||||
t.Errorf("Expected %d adverts, received: %d", nrRoutes, adverts)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
if err := r.Stop(); err != nil {
|
||||
t.Errorf("failed to stop router: %v", err)
|
||||
}
|
||||
}
|
@ -7,6 +7,7 @@ func testSetup() (*table, Route) {
|
||||
|
||||
route := Route{
|
||||
Service: "dest.svc",
|
||||
Address: "dest.addr",
|
||||
Gateway: "dest.gw",
|
||||
Network: "dest.network",
|
||||
Router: "src.router",
|
||||
|
Loading…
Reference in New Issue
Block a user