package monitor import ( "context" "errors" "sync" "time" "github.com/micro/go-micro/v2/client" pb "github.com/micro/go-micro/v2/debug/service/proto" "github.com/micro/go-micro/v2/registry" "github.com/micro/go-micro/v2/registry/cache" ) type monitor struct { options Options exit chan bool registry cache.Cache client client.Client sync.RWMutex running bool services map[string]*Status } func (m *monitor) Check(service string) error { status, err := m.check(service) if err != nil { return err } m.Lock() m.services[service] = status m.Unlock() if status.Code != StatusRunning { return errors.New(status.Info) } return nil } // check provides binary running/failed status. // In the event Debug.Health cannot be called on a service we reap the node. func (m *monitor) check(service string) (*Status, error) { services, err := m.registry.GetService(service) if err != nil { return nil, err } // create debug client debug := pb.NewDebugService(service, m.client) var status *Status var gerr error // iterate through multiple versions of a service for _, service := range services { for _, node := range service.Nodes { // TODO: checks that are not just RPC based // TODO: better matching of the protocol // TODO: maybe everything has to be a go-micro service? if node.Metadata["server"] != m.client.String() { continue } // check the transport matches if node.Metadata["transport"] != m.client.Options().Transport.String() { continue } rsp, err := debug.Health( context.Background(), // empty health request &pb.HealthRequest{}, // call this specific node client.WithAddress(node.Address), // retry in the event of failure client.WithRetries(3), ) if err != nil { // save the error gerr = err continue } // expecting ok response status if rsp.Status != "ok" { gerr = errors.New(rsp.Status) continue } // no error set status status = &Status{ Code: StatusRunning, Info: "running", } } } // if we got the success case return it if status != nil { return status, nil } // if gerr is not nil return it if gerr != nil { return &Status{ Code: StatusFailed, Info: "not running", Error: gerr.Error(), }, nil } // otherwise unknown status return &Status{ Code: StatusUnknown, Info: "unknown status", }, nil } func (m *monitor) reap() { services, err := m.registry.ListServices() if err != nil { return } serviceMap := make(map[string]bool) for _, service := range services { serviceMap[service.Name] = true } m.Lock() defer m.Unlock() // range over our watched services for service := range m.services { // check if the service exists in the registry if !serviceMap[service] { // if not, delete it in our status map delete(m.services, service) } } } func (m *monitor) run() { // check the status every tick t := time.NewTicker(time.Minute) defer t.Stop() // reap dead services t2 := time.NewTicker(time.Hour) defer t2.Stop() // list the known services services, _ := m.registry.ListServices() // create a check chan of same length check := make(chan string, len(services)) // front-load the services to watch for _, service := range services { check <- service.Name } for { select { // exit if we're told to case <-m.exit: return // check a service when told to case service := <-check: // check the status status, err := m.check(service) if err != nil { status = &Status{ Code: StatusUnknown, Info: "unknown status", } } // save the status m.Lock() m.services[service] = status m.Unlock() // on the tick interval get all services and issue a check case <-t.C: // create a list of services serviceMap := make(map[string]bool) m.RLock() for service := range m.services { serviceMap[service] = true } m.RUnlock() go func() { // check the status of all watched services for service := range serviceMap { select { case <-m.exit: return case check <- service: default: // barf if we block } } // list services services, _ := m.registry.ListServices() for _, service := range services { // start watching the service if ok := serviceMap[service.Name]; !ok { m.Watch(service.Name) } } }() case <-t2.C: // reap any dead/non-existent services m.reap() } } } func (m *monitor) Reap(service string) error { services, err := m.registry.GetService(service) if err != nil { return nil } m.Lock() defer m.Unlock() delete(m.services, service) for _, service := range services { m.registry.Deregister(service) } return nil } func (m *monitor) Status(service string) (Status, error) { m.RLock() defer m.RUnlock() if status, ok := m.services[service]; ok { return *status, nil } return Status{}, ErrNotWatching } func (m *monitor) Watch(service string) error { m.Lock() defer m.Unlock() // check if we're watching if _, ok := m.services[service]; ok { return nil } // get the status status, err := m.check(service) if err != nil { return err } // set the status m.services[service] = status return nil } func (m *monitor) Run() error { m.Lock() defer m.Unlock() if m.running { return nil } // reset the exit channel m.exit = make(chan bool) // setup a new cache m.registry = cache.New(m.options.Registry) // start running go m.run() // set to running m.running = true return nil } func (m *monitor) Stop() error { m.Lock() defer m.Unlock() if !m.running { return nil } select { case <-m.exit: return nil default: close(m.exit) for s := range m.services { delete(m.services, s) } m.registry.Stop() m.running = false return nil } } func newMonitor(opts ...Option) Monitor { options := Options{ Client: client.DefaultClient, Registry: registry.DefaultRegistry, } for _, o := range opts { o(&options) } return &monitor{ options: options, exit: make(chan bool), client: options.Client, registry: cache.New(options.Registry), services: make(map[string]*Status), } }