Fixes advert dampening behaviour.

This commit adds the following changes:
* an advert now stores a list of route events as opposed to just the last one
* attempt to dedup route events before appending them to the advert
* enforce a max suppression time threshold for adverts that stay suppressed for a long time
* decay event penalties on every advert tick

Originally we weren't decaying penalties on every advert tick, which
was incorrect behaviour. Furthermore, some events would end up
accumulating, potentially causing memory leaks.

We were also overriding the last received router event, which caused an
incorrect sequence of events to be applied by the receiver: a Create
followed by a Delete would be "squashed" into a Delete only. That is
nonsensical, since the Create event would never be delivered and hence
we would be deleting nonexistent routes.

Not decaying the events on every tick, or not having the max suppression
threshold, could lead to a DoS by growing the router's memory without bound.
This commit is contained in:
Milos Gajdos 2019-07-16 19:00:25 +01:00
parent 8c7e35c3c6
commit 92495d22db
No known key found for this signature in database
GPG Key ID: 8B31058CC55DFD4F

View File

@ -18,17 +18,24 @@ const (
// AdvertiseTableTick is time interval in which router advertises all routes found in routing table // AdvertiseTableTick is time interval in which router advertises all routes found in routing table
AdvertiseTableTick = 1 * time.Minute AdvertiseTableTick = 1 * time.Minute
// AdvertSuppress is advert suppression threshold // AdvertSuppress is advert suppression threshold
AdvertSuppress = 2000 AdvertSuppress = 2000.0
// AdvertRecover is advert recovery threshold // AdvertRecover is advert recovery threshold
AdvertRecover = 750 AdvertRecover = 750.0
// DefaultAdvertTTL is default advertisement TTL // DefaultAdvertTTL is default advertisement TTL
DefaultAdvertTTL = 1 * time.Minute DefaultAdvertTTL = 1 * time.Minute
// PenaltyDecay is the penalty decay // DeletePenalty penalises route deletion
PenaltyDecay = 1.15 DeletePenalty = 1000.0
// Delete penalises route addition and deletion
Delete = 1000
// UpdatePenalty penalises route updates // UpdatePenalty penalises route updates
UpdatePenalty = 500 UpdatePenalty = 500.0
// PenaltyHalfLife is the time the advert penalty decays to half its value
PenaltyHalfLife = 2.0
// MaxSuppressTime defines time after which the suppressed advert is deleted
MaxSuppressTime = 5 * time.Minute
)
var (
// PenaltyDecay is a coefficient which controls the speed the advert penalty decays
PenaltyDecay = math.Log(2) / PenaltyHalfLife
) )
// router provides default router implementation // router provides default router implementation
@ -153,9 +160,9 @@ func (r *router) manageRegistryRoutes(reg registry.Registry, action string) erro
return nil return nil
} }
// watchServices watches services in given registry and updates the routing table accordingly. // watchRegistry watches sregistry and updates the routing table.
// It returns error if the service registry watcher stops or if the routing table can't be updated. // It returns error if either the registry watcher fails with error or if the routing table update fails.
func (r *router) watchServices(w registry.Watcher) error { func (r *router) watchRegistry(w registry.Watcher) error {
// wait in the background for the router to stop // wait in the background for the router to stop
// when the router stops, stop the watcher and exit // when the router stops, stop the watcher and exit
r.wg.Add(1) r.wg.Add(1)
@ -279,30 +286,17 @@ func (r *router) advertiseTable() error {
return nil return nil
} }
// isFlapping detects if the event is flapping based on the current and previous event status. // routeAdvert contains a list of route events to be advertised
func isFlapping(curr, prev *table.Event) bool { type routeAdvert struct {
if curr.Type == table.Update && prev.Type == table.Update { events []*table.Event
return true // lastUpdate records the time of the last advert update
} lastUpdate time.Time
// penalty is current advert penalty
if curr.Type == table.Create && prev.Type == table.Delete || curr.Type == table.Delete && prev.Type == table.Create {
return true
}
return false
}
// advertEvent is a table event enriched with advertisement data
type advertEvent struct {
*table.Event
// timestamp marks the time the event has been received
timestamp time.Time
// penalty is current event penalty
penalty float64 penalty float64
// isSuppressed flags if the event should be considered for flap detection // isSuppressed flags the advert suppression
isSuppressed bool isSuppressed bool
// isFlapping marks the event as flapping event // suppressTime records the time interval the advert has been suppressed for
isFlapping bool suppressTime time.Time
} }
// processEvents processes routing table events. // processEvents processes routing table events.
@ -310,22 +304,44 @@ type advertEvent struct {
func (r *router) processEvents() error { func (r *router) processEvents() error {
// ticker to periodically scan event for advertising // ticker to periodically scan event for advertising
ticker := time.NewTicker(AdvertiseEventsTick) ticker := time.NewTicker(AdvertiseEventsTick)
// eventMap is a map of advert events // advertMap is a map of advert events
eventMap := make(map[uint64]*advertEvent) advertMap := make(map[uint64]*routeAdvert)
for { for {
select { select {
case <-ticker.C: case <-ticker.C:
var events []*table.Event var events []*table.Event
// collect all events which are not flapping // collect all events which are not flapping
// TODO: decay the events and update suppression for key, advert := range advertMap {
for key, event := range eventMap { // decay the event penalty
if !event.isFlapping && !event.isSuppressed { delta := time.Since(advert.lastUpdate).Seconds()
e := new(table.Event) advert.penalty = advert.penalty * math.Exp(-delta*PenaltyDecay)
*e = *event.Event
events = append(events, e) // suppress/recover the event based on its penalty level
// this deletes the advertised event from the map switch {
delete(eventMap, key) case advert.penalty > AdvertSuppress && !advert.isSuppressed:
advert.isSuppressed = true
advert.suppressTime = time.Now()
case advert.penalty < AdvertRecover && advert.isSuppressed:
advert.isSuppressed = false
}
// max suppression time threshold has been reached, delete the advert
if advert.isSuppressed {
if time.Since(advert.suppressTime) > MaxSuppressTime {
delete(advertMap, key)
continue
}
}
if !advert.isSuppressed {
for _, event := range advert.events {
e := new(table.Event)
*e = *event
events = append(events, e)
// delete the advert from the advertMap
delete(advertMap, key)
}
} }
} }
@ -335,8 +351,6 @@ func (r *router) processEvents() error {
go r.advertiseEvents(Update, events) go r.advertiseEvents(Update, events)
} }
case e := <-r.eventChan: case e := <-r.eventChan:
// event timestamp
now := time.Now()
// if event is nil, continue // if event is nil, continue
if e == nil { if e == nil {
continue continue
@ -348,36 +362,36 @@ func (r *router) processEvents() error {
case table.Update: case table.Update:
penalty = UpdatePenalty penalty = UpdatePenalty
case table.Delete: case table.Delete:
penalty = Delete penalty = DeletePenalty
} }
// we use route hash as eventMap key
// check if we have already registered the route
// we use the route hash as advertMap key
hash := e.Route.Hash() hash := e.Route.Hash()
event, ok := eventMap[hash] advert, ok := advertMap[hash]
if !ok { if !ok {
event = &advertEvent{ events := []*table.Event{e}
Event: e, advert = &routeAdvert{
penalty: penalty, events: events,
timestamp: time.Now(), penalty: penalty,
lastUpdate: time.Now(),
} }
eventMap[hash] = event advertMap[hash] = advert
continue continue
} }
// update penalty for existing event: decay existing and add new penalty
delta := time.Since(event.timestamp).Seconds()
event.penalty = event.penalty*math.Exp(-delta) + penalty
event.timestamp = now
// suppress or recover the event based on its current penalty // attempt to squash last two events if possible
if !event.isSuppressed && event.penalty > AdvertSuppress { lastEvent := advert.events[len(advert.events)-1]
event.isSuppressed = true if lastEvent.Type == e.Type {
} else if event.penalty < AdvertRecover { advert.events[len(advert.events)-1] = e
event.isSuppressed = false } else {
} advert.events = append(advert.events, e)
// if not suppressed decide if if its flapping
if !event.isSuppressed {
// detect if its flapping by comparing current and previous event
event.isFlapping = isFlapping(e, event.Event)
} }
// update event penalty and recorded timestamp
advert.lastUpdate = time.Now()
advert.penalty += penalty
case <-r.exit: case <-r.exit:
// first wait for the advertiser to finish // first wait for the advertiser to finish
r.advertWg.Wait() r.advertWg.Wait()
@ -484,8 +498,9 @@ func (r *router) Advertise() (<-chan *Advert, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("failed creating routing table watcher: %v", err) return nil, fmt.Errorf("failed creating routing table watcher: %v", err)
} }
// service registry watcher
svcWatcher, err := r.opts.Registry.Watch() // registry watcher
regWatcher, err := r.opts.Registry.Watch()
if err != nil { if err != nil {
return nil, fmt.Errorf("failed creating service registry watcher: %v", err) return nil, fmt.Errorf("failed creating service registry watcher: %v", err)
} }
@ -497,7 +512,7 @@ func (r *router) Advertise() (<-chan *Advert, error) {
go func() { go func() {
defer r.wg.Done() defer r.wg.Done()
// watch local registry and register routes in routine table // watch local registry and register routes in routine table
errChan <- r.watchServices(svcWatcher) errChan <- r.watchRegistry(regWatcher)
}() }()
r.wg.Add(1) r.wg.Add(1)
@ -594,5 +609,5 @@ func (r *router) Stop() error {
// String prints debugging information about router // String prints debugging information about router
func (r *router) String() string { func (r *router) String() string {
return "router" return "default router"
} }