Fixes advert dampening behaviour.
This commit adds the following changes: * advert now stores a list of route events as opposed to just the last one * attempt to dedup route events before appending them to the advert * have a max suppress threshold for long-time suppressed adverts * decay events on every advert tick. Originally we weren't decaying penalties on every advert tick, which was incorrect behaviour. Furthermore, some events would end up being accumulated, potentially causing memory leaks. We were also overriding the last received router event, which caused an incorrect sequence of events to be applied by the receiver: Create, Delete would be "squashed" into Delete only, which is nonsensical since the Create event would never be delivered, hence we would be deleting nonexistent routes. Not decaying the events on every tick, or not having the max suppression threshold, could lead to DoS by growing the router memory infinitely.
This commit is contained in:
parent
8c7e35c3c6
commit
92495d22db
@ -18,17 +18,24 @@ const (
|
|||||||
// AdvertiseTableTick is time interval in which router advertises all routes found in routing table
|
// AdvertiseTableTick is time interval in which router advertises all routes found in routing table
|
||||||
AdvertiseTableTick = 1 * time.Minute
|
AdvertiseTableTick = 1 * time.Minute
|
||||||
// AdvertSuppress is advert suppression threshold
|
// AdvertSuppress is advert suppression threshold
|
||||||
AdvertSuppress = 2000
|
AdvertSuppress = 2000.0
|
||||||
// AdvertRecover is advert recovery threshold
|
// AdvertRecover is advert recovery threshold
|
||||||
AdvertRecover = 750
|
AdvertRecover = 750.0
|
||||||
// DefaultAdvertTTL is default advertisement TTL
|
// DefaultAdvertTTL is default advertisement TTL
|
||||||
DefaultAdvertTTL = 1 * time.Minute
|
DefaultAdvertTTL = 1 * time.Minute
|
||||||
// PenaltyDecay is the penalty decay
|
// DeletePenalty penalises route deletion
|
||||||
PenaltyDecay = 1.15
|
DeletePenalty = 1000.0
|
||||||
// Delete penalises route addition and deletion
|
|
||||||
Delete = 1000
|
|
||||||
// UpdatePenalty penalises route updates
|
// UpdatePenalty penalises route updates
|
||||||
UpdatePenalty = 500
|
UpdatePenalty = 500.0
|
||||||
|
// PenaltyHalfLife is the time (in seconds) it takes the advert penalty to decay to half its value
|
||||||
|
PenaltyHalfLife = 2.0
|
||||||
|
// MaxSuppressTime defines time after which the suppressed advert is deleted
|
||||||
|
MaxSuppressTime = 5 * time.Minute
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
// PenaltyDecay is a coefficient which controls the speed the advert penalty decays
|
||||||
|
PenaltyDecay = math.Log(2) / PenaltyHalfLife
|
||||||
)
|
)
|
||||||
|
|
||||||
// router provides default router implementation
|
// router provides default router implementation
|
||||||
@ -153,9 +160,9 @@ func (r *router) manageRegistryRoutes(reg registry.Registry, action string) erro
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// watchServices watches services in given registry and updates the routing table accordingly.
|
// watchRegistry watches the registry and updates the routing table.
|
||||||
// It returns error if the service registry watcher stops or if the routing table can't be updated.
|
// It returns error if either the registry watcher fails with error or if the routing table update fails.
|
||||||
func (r *router) watchServices(w registry.Watcher) error {
|
func (r *router) watchRegistry(w registry.Watcher) error {
|
||||||
// wait in the background for the router to stop
|
// wait in the background for the router to stop
|
||||||
// when the router stops, stop the watcher and exit
|
// when the router stops, stop the watcher and exit
|
||||||
r.wg.Add(1)
|
r.wg.Add(1)
|
||||||
@ -279,30 +286,17 @@ func (r *router) advertiseTable() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// isFlapping detects if the event is flapping based on the current and previous event status.
|
// routeAdvert contains a list of route events to be advertised
|
||||||
func isFlapping(curr, prev *table.Event) bool {
|
type routeAdvert struct {
|
||||||
if curr.Type == table.Update && prev.Type == table.Update {
|
events []*table.Event
|
||||||
return true
|
// lastUpdate records the time of the last advert update
|
||||||
}
|
lastUpdate time.Time
|
||||||
|
// penalty is current advert penalty
|
||||||
if curr.Type == table.Create && prev.Type == table.Delete || curr.Type == table.Delete && prev.Type == table.Create {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// advertEvent is a table event enriched with advertisement data
|
|
||||||
type advertEvent struct {
|
|
||||||
*table.Event
|
|
||||||
// timestamp marks the time the event has been received
|
|
||||||
timestamp time.Time
|
|
||||||
// penalty is current event penalty
|
|
||||||
penalty float64
|
penalty float64
|
||||||
// isSuppressed flags if the event should be considered for flap detection
|
// isSuppressed flags the advert suppression
|
||||||
isSuppressed bool
|
isSuppressed bool
|
||||||
// isFlapping marks the event as flapping event
|
// suppressTime records the time at which the advert was suppressed
|
||||||
isFlapping bool
|
suppressTime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
// processEvents processes routing table events.
|
// processEvents processes routing table events.
|
||||||
@ -310,22 +304,44 @@ type advertEvent struct {
|
|||||||
func (r *router) processEvents() error {
|
func (r *router) processEvents() error {
|
||||||
// ticker to periodically scan event for advertising
|
// ticker to periodically scan event for advertising
|
||||||
ticker := time.NewTicker(AdvertiseEventsTick)
|
ticker := time.NewTicker(AdvertiseEventsTick)
|
||||||
// eventMap is a map of advert events
|
// advertMap is a map of route adverts keyed by route hash
|
||||||
eventMap := make(map[uint64]*advertEvent)
|
advertMap := make(map[uint64]*routeAdvert)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
var events []*table.Event
|
var events []*table.Event
|
||||||
// collect all events which are not flapping
|
// collect all events which are not flapping
|
||||||
// TODO: decay the events and update suppression
|
for key, advert := range advertMap {
|
||||||
for key, event := range eventMap {
|
// decay the event penalty
|
||||||
if !event.isFlapping && !event.isSuppressed {
|
delta := time.Since(advert.lastUpdate).Seconds()
|
||||||
e := new(table.Event)
|
advert.penalty = advert.penalty * math.Exp(-delta*PenaltyDecay)
|
||||||
*e = *event.Event
|
|
||||||
events = append(events, e)
|
// suppress/recover the event based on its penalty level
|
||||||
// this deletes the advertised event from the map
|
switch {
|
||||||
delete(eventMap, key)
|
case advert.penalty > AdvertSuppress && !advert.isSuppressed:
|
||||||
|
advert.isSuppressed = true
|
||||||
|
advert.suppressTime = time.Now()
|
||||||
|
case advert.penalty < AdvertRecover && advert.isSuppressed:
|
||||||
|
advert.isSuppressed = false
|
||||||
|
}
|
||||||
|
|
||||||
|
// max suppression time threshold has been reached, delete the advert
|
||||||
|
if advert.isSuppressed {
|
||||||
|
if time.Since(advert.suppressTime) > MaxSuppressTime {
|
||||||
|
delete(advertMap, key)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !advert.isSuppressed {
|
||||||
|
for _, event := range advert.events {
|
||||||
|
e := new(table.Event)
|
||||||
|
*e = *event
|
||||||
|
events = append(events, e)
|
||||||
|
// delete the advert from the advertMap
|
||||||
|
delete(advertMap, key)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -335,8 +351,6 @@ func (r *router) processEvents() error {
|
|||||||
go r.advertiseEvents(Update, events)
|
go r.advertiseEvents(Update, events)
|
||||||
}
|
}
|
||||||
case e := <-r.eventChan:
|
case e := <-r.eventChan:
|
||||||
// event timestamp
|
|
||||||
now := time.Now()
|
|
||||||
// if event is nil, continue
|
// if event is nil, continue
|
||||||
if e == nil {
|
if e == nil {
|
||||||
continue
|
continue
|
||||||
@ -348,36 +362,36 @@ func (r *router) processEvents() error {
|
|||||||
case table.Update:
|
case table.Update:
|
||||||
penalty = UpdatePenalty
|
penalty = UpdatePenalty
|
||||||
case table.Delete:
|
case table.Delete:
|
||||||
penalty = Delete
|
penalty = DeletePenalty
|
||||||
}
|
}
|
||||||
// we use route hash as eventMap key
|
|
||||||
|
// check if we have already registered the route
|
||||||
|
// we use the route hash as advertMap key
|
||||||
hash := e.Route.Hash()
|
hash := e.Route.Hash()
|
||||||
event, ok := eventMap[hash]
|
advert, ok := advertMap[hash]
|
||||||
if !ok {
|
if !ok {
|
||||||
event = &advertEvent{
|
events := []*table.Event{e}
|
||||||
Event: e,
|
advert = &routeAdvert{
|
||||||
penalty: penalty,
|
events: events,
|
||||||
timestamp: time.Now(),
|
penalty: penalty,
|
||||||
|
lastUpdate: time.Now(),
|
||||||
}
|
}
|
||||||
eventMap[hash] = event
|
advertMap[hash] = advert
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// update penalty for existing event: decay existing and add new penalty
|
|
||||||
delta := time.Since(event.timestamp).Seconds()
|
|
||||||
event.penalty = event.penalty*math.Exp(-delta) + penalty
|
|
||||||
event.timestamp = now
|
|
||||||
|
|
||||||
// suppress or recover the event based on its current penalty
|
// attempt to squash last two events if possible
|
||||||
if !event.isSuppressed && event.penalty > AdvertSuppress {
|
lastEvent := advert.events[len(advert.events)-1]
|
||||||
event.isSuppressed = true
|
if lastEvent.Type == e.Type {
|
||||||
} else if event.penalty < AdvertRecover {
|
advert.events[len(advert.events)-1] = e
|
||||||
event.isSuppressed = false
|
} else {
|
||||||
}
|
advert.events = append(advert.events, e)
|
||||||
// if not suppressed decide if if its flapping
|
|
||||||
if !event.isSuppressed {
|
|
||||||
// detect if its flapping by comparing current and previous event
|
|
||||||
event.isFlapping = isFlapping(e, event.Event)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update event penalty and recorded timestamp
|
||||||
|
advert.lastUpdate = time.Now()
|
||||||
|
advert.penalty += penalty
|
||||||
|
|
||||||
case <-r.exit:
|
case <-r.exit:
|
||||||
// first wait for the advertiser to finish
|
// first wait for the advertiser to finish
|
||||||
r.advertWg.Wait()
|
r.advertWg.Wait()
|
||||||
@ -484,8 +498,9 @@ func (r *router) Advertise() (<-chan *Advert, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed creating routing table watcher: %v", err)
|
return nil, fmt.Errorf("failed creating routing table watcher: %v", err)
|
||||||
}
|
}
|
||||||
// service registry watcher
|
|
||||||
svcWatcher, err := r.opts.Registry.Watch()
|
// registry watcher
|
||||||
|
regWatcher, err := r.opts.Registry.Watch()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed creating service registry watcher: %v", err)
|
return nil, fmt.Errorf("failed creating service registry watcher: %v", err)
|
||||||
}
|
}
|
||||||
@ -497,7 +512,7 @@ func (r *router) Advertise() (<-chan *Advert, error) {
|
|||||||
go func() {
|
go func() {
|
||||||
defer r.wg.Done()
|
defer r.wg.Done()
|
||||||
// watch local registry and register routes in routing table
|
// watch local registry and register routes in routing table
|
||||||
errChan <- r.watchServices(svcWatcher)
|
errChan <- r.watchRegistry(regWatcher)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
r.wg.Add(1)
|
r.wg.Add(1)
|
||||||
@ -594,5 +609,5 @@ func (r *router) Stop() error {
|
|||||||
|
|
||||||
// String prints debugging information about router
|
// String prints debugging information about router
|
||||||
func (r *router) String() string {
|
func (r *router) String() string {
|
||||||
return "router"
|
return "default router"
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user