Fixes advert dampening behaviour.
This commit adds the following changes:
* advert now stores a list of route events as opposed to just the last one
* attempt to dedup route events before appending them to the advert
* have a max suppress threshold for long-time suppressed adverts
* decay events on every advert tick

Originally we weren't decaying penalties on every advert tick. That was incorrect behaviour. Furthermore, some events would end up being accumulated, potentially causing memory leaks.

We were also overriding the last received router event, which was causing an incorrect sequence of events to be applied when received by a receiver: Create, Delete would be "squashed" into Delete only, which would be nonsensical since the Create event would never be delivered; hence we would be deleting nonexistent routes.

Not decaying the events on every tick, or not having the max suppression threshold, could lead to DoS by growing the router memory infinitely.
This commit is contained in:
parent
8c7e35c3c6
commit
92495d22db
@ -18,17 +18,24 @@ const (
|
||||
// AdvertiseTableTick is time interval in which router advertises all routes found in routing table
|
||||
AdvertiseTableTick = 1 * time.Minute
|
||||
// AdvertSuppress is advert suppression threshold
|
||||
AdvertSuppress = 2000
|
||||
AdvertSuppress = 2000.0
|
||||
// AdvertRecover is advert recovery threshold
|
||||
AdvertRecover = 750
|
||||
AdvertRecover = 750.0
|
||||
// DefaultAdvertTTL is default advertisement TTL
|
||||
DefaultAdvertTTL = 1 * time.Minute
|
||||
// PenaltyDecay is the penalty decay
|
||||
PenaltyDecay = 1.15
|
||||
// Delete penalises route addition and deletion
|
||||
Delete = 1000
|
||||
// DeletePenalty penalises route deletion
|
||||
DeletePenalty = 1000.0
|
||||
// UpdatePenalty penalises route updates
|
||||
UpdatePenalty = 500
|
||||
UpdatePenalty = 500.0
|
||||
// PenaltyHalfLife is the time, in seconds, it takes the advert penalty to decay to half its value
|
||||
PenaltyHalfLife = 2.0
|
||||
// MaxSuppressTime defines time after which the suppressed advert is deleted
|
||||
MaxSuppressTime = 5 * time.Minute
|
||||
)
|
||||
|
||||
var (
|
||||
// PenaltyDecay is a coefficient which controls the rate at which the advert penalty decays
|
||||
PenaltyDecay = math.Log(2) / PenaltyHalfLife
|
||||
)
|
||||
|
||||
// router provides default router implementation
|
||||
@ -153,9 +160,9 @@ func (r *router) manageRegistryRoutes(reg registry.Registry, action string) erro
|
||||
return nil
|
||||
}
|
||||
|
||||
// watchServices watches services in given registry and updates the routing table accordingly.
|
||||
// It returns error if the service registry watcher stops or if the routing table can't be updated.
|
||||
func (r *router) watchServices(w registry.Watcher) error {
|
||||
// watchRegistry watches the registry and updates the routing table.
|
||||
// It returns error if either the registry watcher fails with error or if the routing table update fails.
|
||||
func (r *router) watchRegistry(w registry.Watcher) error {
|
||||
// wait in the background for the router to stop
|
||||
// when the router stops, stop the watcher and exit
|
||||
r.wg.Add(1)
|
||||
@ -279,30 +286,17 @@ func (r *router) advertiseTable() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// isFlapping detects if the event is flapping based on the current and previous event status.
|
||||
func isFlapping(curr, prev *table.Event) bool {
|
||||
if curr.Type == table.Update && prev.Type == table.Update {
|
||||
return true
|
||||
}
|
||||
|
||||
if curr.Type == table.Create && prev.Type == table.Delete || curr.Type == table.Delete && prev.Type == table.Create {
|
||||
return true
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// advertEvent is a table event enriched with advertisement data
|
||||
type advertEvent struct {
|
||||
*table.Event
|
||||
// timestamp marks the time the event has been received
|
||||
timestamp time.Time
|
||||
// penalty is current event penalty
|
||||
// routeAdvert contains a list of route events to be advertised
|
||||
type routeAdvert struct {
|
||||
events []*table.Event
|
||||
// lastUpdate records the time of the last advert update
|
||||
lastUpdate time.Time
|
||||
// penalty is current advert penalty
|
||||
penalty float64
|
||||
// isSuppressed flags if the event should be considered for flap detection
|
||||
// isSuppressed flags the advert suppression
|
||||
isSuppressed bool
|
||||
// isFlapping marks the event as flapping event
|
||||
isFlapping bool
|
||||
// suppressTime records the time at which the advert was suppressed
|
||||
suppressTime time.Time
|
||||
}
|
||||
|
||||
// processEvents processes routing table events.
|
||||
@ -310,22 +304,44 @@ type advertEvent struct {
|
||||
func (r *router) processEvents() error {
|
||||
// ticker to periodically scan event for advertising
|
||||
ticker := time.NewTicker(AdvertiseEventsTick)
|
||||
// eventMap is a map of advert events
|
||||
eventMap := make(map[uint64]*advertEvent)
|
||||
// advertMap is a map of route adverts keyed by route hash
|
||||
advertMap := make(map[uint64]*routeAdvert)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
var events []*table.Event
|
||||
// collect all events which are not flapping
|
||||
// TODO: decay the events and update suppression
|
||||
for key, event := range eventMap {
|
||||
if !event.isFlapping && !event.isSuppressed {
|
||||
for key, advert := range advertMap {
|
||||
// decay the event penalty
|
||||
delta := time.Since(advert.lastUpdate).Seconds()
|
||||
advert.penalty = advert.penalty * math.Exp(-delta*PenaltyDecay)
|
||||
|
||||
// suppress/recover the event based on its penalty level
|
||||
switch {
|
||||
case advert.penalty > AdvertSuppress && !advert.isSuppressed:
|
||||
advert.isSuppressed = true
|
||||
advert.suppressTime = time.Now()
|
||||
case advert.penalty < AdvertRecover && advert.isSuppressed:
|
||||
advert.isSuppressed = false
|
||||
}
|
||||
|
||||
// max suppression time threshold has been reached, delete the advert
|
||||
if advert.isSuppressed {
|
||||
if time.Since(advert.suppressTime) > MaxSuppressTime {
|
||||
delete(advertMap, key)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
if !advert.isSuppressed {
|
||||
for _, event := range advert.events {
|
||||
e := new(table.Event)
|
||||
*e = *event.Event
|
||||
*e = *event
|
||||
events = append(events, e)
|
||||
// this deletes the advertised event from the map
|
||||
delete(eventMap, key)
|
||||
// delete the advert from the advertMap
|
||||
delete(advertMap, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -335,8 +351,6 @@ func (r *router) processEvents() error {
|
||||
go r.advertiseEvents(Update, events)
|
||||
}
|
||||
case e := <-r.eventChan:
|
||||
// event timestamp
|
||||
now := time.Now()
|
||||
// if event is nil, continue
|
||||
if e == nil {
|
||||
continue
|
||||
@ -348,36 +362,36 @@ func (r *router) processEvents() error {
|
||||
case table.Update:
|
||||
penalty = UpdatePenalty
|
||||
case table.Delete:
|
||||
penalty = Delete
|
||||
penalty = DeletePenalty
|
||||
}
|
||||
// we use route hash as eventMap key
|
||||
|
||||
// check if we have already registered the route
|
||||
// we use the route hash as advertMap key
|
||||
hash := e.Route.Hash()
|
||||
event, ok := eventMap[hash]
|
||||
advert, ok := advertMap[hash]
|
||||
if !ok {
|
||||
event = &advertEvent{
|
||||
Event: e,
|
||||
events := []*table.Event{e}
|
||||
advert = &routeAdvert{
|
||||
events: events,
|
||||
penalty: penalty,
|
||||
timestamp: time.Now(),
|
||||
lastUpdate: time.Now(),
|
||||
}
|
||||
eventMap[hash] = event
|
||||
advertMap[hash] = advert
|
||||
continue
|
||||
}
|
||||
// update penalty for existing event: decay existing and add new penalty
|
||||
delta := time.Since(event.timestamp).Seconds()
|
||||
event.penalty = event.penalty*math.Exp(-delta) + penalty
|
||||
event.timestamp = now
|
||||
|
||||
// suppress or recover the event based on its current penalty
|
||||
if !event.isSuppressed && event.penalty > AdvertSuppress {
|
||||
event.isSuppressed = true
|
||||
} else if event.penalty < AdvertRecover {
|
||||
event.isSuppressed = false
|
||||
}
|
||||
// if not suppressed, decide if it's flapping
|
||||
if !event.isSuppressed {
|
||||
// detect if its flapping by comparing current and previous event
|
||||
event.isFlapping = isFlapping(e, event.Event)
|
||||
// attempt to squash last two events if possible
|
||||
lastEvent := advert.events[len(advert.events)-1]
|
||||
if lastEvent.Type == e.Type {
|
||||
advert.events[len(advert.events)-1] = e
|
||||
} else {
|
||||
advert.events = append(advert.events, e)
|
||||
}
|
||||
|
||||
// update event penalty and recorded timestamp
|
||||
advert.lastUpdate = time.Now()
|
||||
advert.penalty += penalty
|
||||
|
||||
case <-r.exit:
|
||||
// first wait for the advertiser to finish
|
||||
r.advertWg.Wait()
|
||||
@ -484,8 +498,9 @@ func (r *router) Advertise() (<-chan *Advert, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed creating routing table watcher: %v", err)
|
||||
}
|
||||
// service registry watcher
|
||||
svcWatcher, err := r.opts.Registry.Watch()
|
||||
|
||||
// registry watcher
|
||||
regWatcher, err := r.opts.Registry.Watch()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed creating service registry watcher: %v", err)
|
||||
}
|
||||
@ -497,7 +512,7 @@ func (r *router) Advertise() (<-chan *Advert, error) {
|
||||
go func() {
|
||||
defer r.wg.Done()
|
||||
// watch local registry and register routes in routing table
|
||||
errChan <- r.watchServices(svcWatcher)
|
||||
errChan <- r.watchRegistry(regWatcher)
|
||||
}()
|
||||
|
||||
r.wg.Add(1)
|
||||
@ -594,5 +609,5 @@ func (r *router) Stop() error {
|
||||
|
||||
// String prints debugging information about router
|
||||
func (r *router) String() string {
|
||||
return "router"
|
||||
return "default router"
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user