Openstatus www.openstatus.dev

feat(checker): implement backoff retry (#570)

Authored by Arthur EICHELBERGER and committed by GitHub
1f6680a6 440079a2

+67 -81
+51 -57
apps/checker/cmd/main.go
··· 7 7 "net/http" 8 8 "os" 9 9 "os/signal" 10 - "strconv" 11 10 "syscall" 12 11 13 12 "github.com/gin-gonic/gin" ··· 16 15 "github.com/openstatushq/openstatus/apps/checker/pkg/tinybird" 17 16 "github.com/openstatushq/openstatus/apps/checker/request" 18 17 "github.com/rs/zerolog/log" 18 + 19 + backoff "github.com/cenkalti/backoff/v4" 19 20 ) 21 + 22 + type statusCode int 23 + 24 + func (s statusCode) IsSuccessful() bool { 25 + return s >= 200 && s < 300 26 + } 20 27 21 28 func main() { 22 29 ctx, cancel := context.WithCancel(context.Background()) ··· 53 60 return 54 61 } 55 62 56 - switch i, err := strconv.Atoi(c.GetHeader("X-CloudTasks-TaskRetryCount")); { 57 - case err != nil: 58 - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid retry count"}) 59 - return 60 - case i > 1: 61 - // Why would that be OK? 62 - c.JSON(http.StatusOK, gin.H{"message": "ok"}) 63 - return 64 - } 65 - 66 63 var req request.CheckerRequest 67 64 if err := c.ShouldBindJSON(&req); err != nil { 68 65 log.Ctx(ctx).Error().Err(err).Msg("failed to decode checker request") ··· 70 67 return 71 68 } 72 69 73 - response, err := checker.Ping(ctx, httpClient, req) 74 - if err != nil { 75 - response, err = checker.Ping(ctx, httpClient, req) 70 + op := func() error { 71 + res, err := checker.Ping(ctx, httpClient, req) 76 72 if err != nil { 77 - if err := tinybirdClient.SendEvent(ctx, checker.PingData{ 78 - URL: req.URL, 79 - Region: flyRegion, 80 - Message: err.Error(), 81 - CronTimestamp: req.CronTimestamp, 82 - Timestamp: req.CronTimestamp, 83 - MonitorID: req.MonitorID, 84 - WorkspaceID: req.WorkspaceID, 85 - }); err != nil { 86 - log.Ctx(ctx).Error().Err(err).Msg("failed to send event to tinybird") 87 - } 88 - 89 - if req.Status == "active" { 90 - checker.UpdateStatus(ctx, checker.UpdateData{ 91 - MonitorId: req.MonitorID, 92 - Status: "error", 93 - Message: err.Error(), 94 - Region: flyRegion, 95 - }) 96 - } 97 - 98 - c.JSON(http.StatusOK, gin.H{"message": "ok"}) 99 - return 73 + return 
fmt.Errorf("unable to ping: %w", err) 100 74 } 101 - } 102 75 103 - if response.StatusCode < 200 || response.StatusCode >= 300 { 104 - // Add one more retry 105 - response, err = checker.Ping(ctx, httpClient, req) 106 - if response.StatusCode < 200 || response.StatusCode >= 300 && req.Status == "active" { 107 - // If the status code is not within the 200 range, we update the status to err 76 + statusCode := statusCode(res.StatusCode) 77 + if !statusCode.IsSuccessful() { 78 + // Q: Why here we do not check if the status was previously active? 108 79 checker.UpdateStatus(ctx, checker.UpdateData{ 109 80 MonitorId: req.MonitorID, 110 81 Status: "error", 111 - StatusCode: response.StatusCode, 82 + StatusCode: res.StatusCode, 112 83 Region: flyRegion, 113 84 }) 85 + } else if req.Status == "error" && statusCode.IsSuccessful() { 86 + // Q: Why here we check the data before updating the status in this scenario? 87 + checker.UpdateStatus(ctx, checker.UpdateData{ 88 + MonitorId: req.MonitorID, 89 + Status: "active", 90 + Region: flyRegion, 91 + StatusCode: res.StatusCode, 92 + }) 114 93 } 115 - } 94 + 95 + if err := tinybirdClient.SendEvent(ctx, res); err != nil { 96 + log.Ctx(ctx).Error().Err(err).Msg("failed to send event to tinybird") 97 + } 116 98 117 - // If the status was error and the status code is within the 200 range, we update the status to active 118 - if req.Status == "error" && response.StatusCode >= 200 && response.StatusCode < 300 { 119 - // If the status was error, we update it to active 120 - checker.UpdateStatus(ctx, checker.UpdateData{ 121 - MonitorId: req.MonitorID, 122 - Status: "active", 123 - Region: flyRegion, 124 - StatusCode: response.StatusCode, 125 - }) 99 + return nil 126 100 } 127 101 128 - if err := tinybirdClient.SendEvent(ctx, response); err != nil { 129 - log.Ctx(ctx).Error().Err(err).Msg("failed to send event to tinybird") 102 + if err := backoff.Retry(op, backoff.WithMaxRetries(backoff.NewExponentialBackOff(), 3)); err != nil { 103 + if 
err := tinybirdClient.SendEvent(ctx, checker.PingData{ 104 + URL: req.URL, 105 + Region: flyRegion, 106 + Message: err.Error(), 107 + CronTimestamp: req.CronTimestamp, 108 + Timestamp: req.CronTimestamp, 109 + MonitorID: req.MonitorID, 110 + WorkspaceID: req.WorkspaceID, 111 + }); err != nil { 112 + log.Ctx(ctx).Error().Err(err).Msg("failed to send event to tinybird") 113 + } 114 + 115 + // If the status was previously active, we update it to error. 116 + // Q: Why not always updating the status? My idea is that the checker should be dumb and only check the status and return it. 117 + if req.Status == "active" { 118 + checker.UpdateStatus(ctx, checker.UpdateData{ 119 + MonitorId: req.MonitorID, 120 + Status: "error", 121 + Message: err.Error(), 122 + Region: flyRegion, 123 + }) 124 + } 130 125 } 131 126 132 127 c.JSON(http.StatusOK, gin.H{"message": "ok"}) 133 - return 134 128 }) 135 129 136 130 router.GET("/ping", func(c *gin.Context) {
+1
apps/checker/go.mod
··· 3 3 go 1.21.4 4 4 5 5 require ( 6 + github.com/cenkalti/backoff/v4 v4.2.1 6 7 github.com/gin-gonic/gin v1.9.1 7 8 github.com/rs/zerolog v1.31.0 8 9 github.com/stretchr/testify v1.8.3
+2
apps/checker/go.sum
··· 1 1 github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM= 2 2 github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s= 3 3 github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U= 4 + github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= 5 + github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= 4 6 github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY= 5 7 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams= 6 8 github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
+13 -24
apps/checker/ping.go
··· 3 3 import ( 4 4 "bytes" 5 5 "context" 6 + "errors" 6 7 "fmt" 7 - "io" 8 8 "net/http" 9 9 "net/url" 10 10 "os" ··· 37 37 } 38 38 39 39 req.Header.Set("User-Agent", "OpenStatus/1.0") 40 - 41 - // Setting headers 42 40 for _, header := range inputData.Headers { 43 - if header.Key != "" && header.Value != "" { 44 - req.Header.Set(header.Key, header.Value) 45 - } 41 + req.Header.Set(header.Key, header.Value) 46 42 } 47 43 48 44 start := time.Now() 49 45 response, err := client.Do(req) 50 46 latency := time.Since(start).Milliseconds() 51 - 52 47 if err != nil { 53 - if urlErr, ok := err.(*url.Error); ok { 54 - if urlErr.Timeout() { 55 - return PingData{ 56 - Latency: latency, 57 - MonitorID: inputData.MonitorID, 58 - Region: region, 59 - WorkspaceID: inputData.WorkspaceID, 60 - Timestamp: time.Now().UTC().UnixMilli(), 61 - URL: inputData.URL, 62 - Message: fmt.Sprintf("Timeout after %d ms", latency), 63 - }, nil 64 - } 48 + var urlErr *url.Error 49 + if errors.As(err, &urlErr) && urlErr.Timeout() { 50 + return PingData{ 51 + Latency: latency, 52 + MonitorID: inputData.MonitorID, 53 + Region: region, 54 + WorkspaceID: inputData.WorkspaceID, 55 + Timestamp: time.Now().UTC().UnixMilli(), 56 + URL: inputData.URL, 57 + Message: fmt.Sprintf("Timeout after %d ms", latency), 58 + }, nil 65 59 } 66 60 67 61 logger.Error().Err(err).Msg("error while pinging") 68 62 return PingData{}, fmt.Errorf("error with monitorURL %s: %w", inputData.URL, err) 69 63 } 70 64 defer response.Body.Close() 71 - 72 - if _, err := io.ReadAll(response.Body); err != nil { 73 - logger.Error().Err(err).Str("monitorURL", inputData.URL).Msg("error while reading body") 74 - return PingData{}, fmt.Errorf("error while reading body from %s: %w", inputData.URL, err) 75 - } 76 65 77 66 return PingData{ 78 67 Latency: latency,