swarm: integrate OpenTracing; propagate ctx to internal APIs (#17169)

* swarm: propagate ctx, enable opentracing

* swarm/tracing: log error when tracing is misconfigured
This commit is contained in:
Anton Evangelatov
2018-07-13 17:40:28 +02:00
committed by Balint Gabor
parent f7d3678c28
commit 7c9314f231
170 changed files with 21762 additions and 249 deletions

View File

@ -0,0 +1,5 @@
An Observer that can be used to emit RPC metrics
================================================
It can be attached to the tracer during tracer construction.
See `ExampleObserver` function in [observer_test.go](./observer_test.go).

View File

@ -0,0 +1,16 @@
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package rpcmetrics implements an Observer that can be used to emit RPC metrics.
package rpcmetrics

View File

@ -0,0 +1,63 @@
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcmetrics
import "sync"
// normalizedEndpoints is a cache for endpointName -> safeName mappings.
type normalizedEndpoints struct {
names map[string]string
maxSize int
defaultName string
normalizer NameNormalizer
mux sync.RWMutex
}
func newNormalizedEndpoints(maxSize int, normalizer NameNormalizer) *normalizedEndpoints {
return &normalizedEndpoints{
maxSize: maxSize,
normalizer: normalizer,
names: make(map[string]string, maxSize),
}
}
// normalize looks up the name in the cache, if not found it uses normalizer
// to convert the name to a safe name. If called with more than maxSize unique
// names it returns "" for all other names beyond those already cached.
func (n *normalizedEndpoints) normalize(name string) string {
n.mux.RLock()
norm, ok := n.names[name]
l := len(n.names)
n.mux.RUnlock()
if ok {
return norm
}
if l >= n.maxSize {
return ""
}
return n.normalizeWithLock(name)
}
func (n *normalizedEndpoints) normalizeWithLock(name string) string {
norm := n.normalizer.Normalize(name)
n.mux.Lock()
defer n.mux.Unlock()
// cache may have grown while we were not holding the lock
if len(n.names) >= n.maxSize {
return ""
}
n.names[name] = norm
return norm
}

View File

@ -0,0 +1,124 @@
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcmetrics
import (
"sync"
"github.com/uber/jaeger-lib/metrics"
)
const (
otherEndpointsPlaceholder = "other"
endpointNameMetricTag = "endpoint"
)
// Metrics is a collection of metrics for an endpoint describing
// throughput, success, errors, and performance.
type Metrics struct {
// RequestCountSuccess is a counter of the total number of successes.
RequestCountSuccess metrics.Counter `metric:"requests" tags:"error=false"`
// RequestCountFailures is a counter of the number of times any failure has been observed.
RequestCountFailures metrics.Counter `metric:"requests" tags:"error=true"`
// RequestLatencySuccess is a latency histogram of succesful requests.
RequestLatencySuccess metrics.Timer `metric:"request_latency" tags:"error=false"`
// RequestLatencyFailures is a latency histogram of failed requests.
RequestLatencyFailures metrics.Timer `metric:"request_latency" tags:"error=true"`
// HTTPStatusCode2xx is a counter of the total number of requests with HTTP status code 200-299
HTTPStatusCode2xx metrics.Counter `metric:"http_requests" tags:"status_code=2xx"`
// HTTPStatusCode3xx is a counter of the total number of requests with HTTP status code 300-399
HTTPStatusCode3xx metrics.Counter `metric:"http_requests" tags:"status_code=3xx"`
// HTTPStatusCode4xx is a counter of the total number of requests with HTTP status code 400-499
HTTPStatusCode4xx metrics.Counter `metric:"http_requests" tags:"status_code=4xx"`
// HTTPStatusCode5xx is a counter of the total number of requests with HTTP status code 500-599
HTTPStatusCode5xx metrics.Counter `metric:"http_requests" tags:"status_code=5xx"`
}
func (m *Metrics) recordHTTPStatusCode(statusCode uint16) {
if statusCode >= 200 && statusCode < 300 {
m.HTTPStatusCode2xx.Inc(1)
} else if statusCode >= 300 && statusCode < 400 {
m.HTTPStatusCode3xx.Inc(1)
} else if statusCode >= 400 && statusCode < 500 {
m.HTTPStatusCode4xx.Inc(1)
} else if statusCode >= 500 && statusCode < 600 {
m.HTTPStatusCode5xx.Inc(1)
}
}
// MetricsByEndpoint is a registry/cache of metrics for each unique endpoint name.
// Only maxNumberOfEndpoints Metrics are stored, all other endpoint names are mapped
// to a generic endpoint name "other".
type MetricsByEndpoint struct {
metricsFactory metrics.Factory
endpoints *normalizedEndpoints
metricsByEndpoint map[string]*Metrics
mux sync.RWMutex
}
func newMetricsByEndpoint(
metricsFactory metrics.Factory,
normalizer NameNormalizer,
maxNumberOfEndpoints int,
) *MetricsByEndpoint {
return &MetricsByEndpoint{
metricsFactory: metricsFactory,
endpoints: newNormalizedEndpoints(maxNumberOfEndpoints, normalizer),
metricsByEndpoint: make(map[string]*Metrics, maxNumberOfEndpoints+1), // +1 for "other"
}
}
func (m *MetricsByEndpoint) get(endpoint string) *Metrics {
safeName := m.endpoints.normalize(endpoint)
if safeName == "" {
safeName = otherEndpointsPlaceholder
}
m.mux.RLock()
met := m.metricsByEndpoint[safeName]
m.mux.RUnlock()
if met != nil {
return met
}
return m.getWithWriteLock(safeName)
}
// split to make easier to test
func (m *MetricsByEndpoint) getWithWriteLock(safeName string) *Metrics {
m.mux.Lock()
defer m.mux.Unlock()
// it is possible that the name has been already registered after we released
// the read lock and before we grabbed the write lock, so check for that.
if met, ok := m.metricsByEndpoint[safeName]; ok {
return met
}
// it would be nice to create the struct before locking, since Init() is somewhat
// expensive, however some metrics backends (e.g. expvar) may not like duplicate metrics.
met := &Metrics{}
tags := map[string]string{endpointNameMetricTag: safeName}
metrics.Init(met, m.metricsFactory, tags)
m.metricsByEndpoint[safeName] = met
return met
}

View File

@ -0,0 +1,101 @@
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcmetrics
// NameNormalizer is used to convert the endpoint names to strings
// that can be safely used as tags in the metrics.
type NameNormalizer interface {
Normalize(name string) string
}
// DefaultNameNormalizer converts endpoint names so that they contain only characters
// from the safe charset [a-zA-Z0-9-./_]. All other characters are replaced with '-'.
var DefaultNameNormalizer = &SimpleNameNormalizer{
SafeSets: []SafeCharacterSet{
&Range{From: 'a', To: 'z'},
&Range{From: 'A', To: 'Z'},
&Range{From: '0', To: '9'},
&Char{'-'},
&Char{'_'},
&Char{'/'},
&Char{'.'},
},
Replacement: '-',
}
// SimpleNameNormalizer uses a set of safe character sets.
type SimpleNameNormalizer struct {
SafeSets []SafeCharacterSet
Replacement byte
}
// SafeCharacterSet determines if the given character is "safe"
type SafeCharacterSet interface {
IsSafe(c byte) bool
}
// Range implements SafeCharacterSet
type Range struct {
From, To byte
}
// IsSafe implements SafeCharacterSet
func (r *Range) IsSafe(c byte) bool {
return c >= r.From && c <= r.To
}
// Char implements SafeCharacterSet
type Char struct {
Val byte
}
// IsSafe implements SafeCharacterSet
func (ch *Char) IsSafe(c byte) bool {
return c == ch.Val
}
// Normalize checks each character in the string against SafeSets,
// and if it's not safe substitutes it with Replacement.
func (n *SimpleNameNormalizer) Normalize(name string) string {
var retMe []byte
nameBytes := []byte(name)
for i, b := range nameBytes {
if n.safeByte(b) {
if retMe != nil {
retMe[i] = b
}
} else {
if retMe == nil {
retMe = make([]byte, len(nameBytes))
copy(retMe[0:i], nameBytes[0:i])
}
retMe[i] = n.Replacement
}
}
if retMe == nil {
return name
}
return string(retMe)
}
// safeByte checks if b against all safe charsets.
func (n *SimpleNameNormalizer) safeByte(b byte) bool {
for i := range n.SafeSets {
if n.SafeSets[i].IsSafe(b) {
return true
}
}
return false
}

View File

@ -0,0 +1,171 @@
// Copyright (c) 2017 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rpcmetrics
import (
"strconv"
"sync"
"time"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/ext"
"github.com/uber/jaeger-lib/metrics"
jaeger "github.com/uber/jaeger-client-go"
)
const defaultMaxNumberOfEndpoints = 200
// Observer is an observer that can emit RPC metrics.
type Observer struct {
metricsByEndpoint *MetricsByEndpoint
}
// NewObserver creates a new observer that can emit RPC metrics.
func NewObserver(metricsFactory metrics.Factory, normalizer NameNormalizer) *Observer {
return &Observer{
metricsByEndpoint: newMetricsByEndpoint(
metricsFactory,
normalizer,
defaultMaxNumberOfEndpoints,
),
}
}
// OnStartSpan creates a new Observer for the span.
func (o *Observer) OnStartSpan(
operationName string,
options opentracing.StartSpanOptions,
) jaeger.SpanObserver {
return NewSpanObserver(o.metricsByEndpoint, operationName, options)
}
// SpanKind identifies the span as inboud, outbound, or internal
type SpanKind int
const (
// Local span kind
Local SpanKind = iota
// Inbound span kind
Inbound
// Outbound span kind
Outbound
)
// SpanObserver collects RPC metrics
type SpanObserver struct {
metricsByEndpoint *MetricsByEndpoint
operationName string
startTime time.Time
mux sync.Mutex
kind SpanKind
httpStatusCode uint16
err bool
}
// NewSpanObserver creates a new SpanObserver that can emit RPC metrics.
func NewSpanObserver(
metricsByEndpoint *MetricsByEndpoint,
operationName string,
options opentracing.StartSpanOptions,
) *SpanObserver {
so := &SpanObserver{
metricsByEndpoint: metricsByEndpoint,
operationName: operationName,
startTime: options.StartTime,
}
for k, v := range options.Tags {
so.handleTagInLock(k, v)
}
return so
}
// handleTags watches for special tags
// - SpanKind
// - HttpStatusCode
// - Error
func (so *SpanObserver) handleTagInLock(key string, value interface{}) {
if key == string(ext.SpanKind) {
if v, ok := value.(ext.SpanKindEnum); ok {
value = string(v)
}
if v, ok := value.(string); ok {
if v == string(ext.SpanKindRPCClientEnum) {
so.kind = Outbound
} else if v == string(ext.SpanKindRPCServerEnum) {
so.kind = Inbound
}
}
return
}
if key == string(ext.HTTPStatusCode) {
if v, ok := value.(uint16); ok {
so.httpStatusCode = v
} else if v, ok := value.(int); ok {
so.httpStatusCode = uint16(v)
} else if v, ok := value.(string); ok {
if vv, err := strconv.Atoi(v); err == nil {
so.httpStatusCode = uint16(vv)
}
}
return
}
if key == string(ext.Error) {
if v, ok := value.(bool); ok {
so.err = v
} else if v, ok := value.(string); ok {
if vv, err := strconv.ParseBool(v); err == nil {
so.err = vv
}
}
return
}
}
// OnFinish emits the RPC metrics. It only has an effect when operation name
// is not blank, and the span kind is an RPC server.
func (so *SpanObserver) OnFinish(options opentracing.FinishOptions) {
so.mux.Lock()
defer so.mux.Unlock()
if so.operationName == "" || so.kind != Inbound {
return
}
mets := so.metricsByEndpoint.get(so.operationName)
latency := options.FinishTime.Sub(so.startTime)
if so.err {
mets.RequestCountFailures.Inc(1)
mets.RequestLatencyFailures.Record(latency)
} else {
mets.RequestCountSuccess.Inc(1)
mets.RequestLatencySuccess.Record(latency)
}
mets.recordHTTPStatusCode(so.httpStatusCode)
}
// OnSetOperationName records new operation name.
func (so *SpanObserver) OnSetOperationName(operationName string) {
so.mux.Lock()
so.operationName = operationName
so.mux.Unlock()
}
// OnSetTag implements SpanObserver
func (so *SpanObserver) OnSetTag(key string, value interface{}) {
so.mux.Lock()
so.handleTagInLock(key, value)
so.mux.Unlock()
}