mirror of
https://github.com/element-hq/synapse.git
synced 2024-11-29 15:39:00 +03:00
Merge pull request #2785 from matrix-org/rav/reorganise_metrics_again
Reorganise request and block metrics
This commit is contained in:
commit
febdca4b37
5 changed files with 209 additions and 47 deletions
|
@ -2,7 +2,10 @@ Unreleased
|
||||||
==========
|
==========
|
||||||
|
|
||||||
synctl no longer starts the main synapse when using ``-a`` option with workers.
|
synctl no longer starts the main synapse when using ``-a`` option with workers.
|
||||||
A new worker file should be added with ``worker_app: synapse.app.homeserver``
|
A new worker file should be added with ``worker_app: synapse.app.homeserver``.
|
||||||
|
|
||||||
|
This release also begins the process of renaming a number of the metrics
|
||||||
|
reported to prometheus. See `docs/metrics-howto.rst <docs/metrics-howto.rst#block-and-response-metrics-renamed-for-0-27-0>`_.
|
||||||
|
|
||||||
|
|
||||||
Changes in synapse v0.26.0 (2018-01-05)
|
Changes in synapse v0.26.0 (2018-01-05)
|
||||||
|
|
|
@ -16,7 +16,7 @@ How to monitor Synapse metrics using Prometheus
|
||||||
metrics_port: 9092
|
metrics_port: 9092
|
||||||
|
|
||||||
Also ensure that ``enable_metrics`` is set to ``True``.
|
Also ensure that ``enable_metrics`` is set to ``True``.
|
||||||
|
|
||||||
Restart synapse.
|
Restart synapse.
|
||||||
|
|
||||||
3. Add a prometheus target for synapse.
|
3. Add a prometheus target for synapse.
|
||||||
|
@ -28,11 +28,58 @@ How to monitor Synapse metrics using Prometheus
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets: ["my.server.here:9092"]
|
- targets: ["my.server.here:9092"]
|
||||||
|
|
||||||
If your prometheus is older than 1.5.2, you will need to replace
|
If your prometheus is older than 1.5.2, you will need to replace
|
||||||
``static_configs`` in the above with ``target_groups``.
|
``static_configs`` in the above with ``target_groups``.
|
||||||
|
|
||||||
Restart prometheus.
|
Restart prometheus.
|
||||||
|
|
||||||
|
|
||||||
|
Block and response metrics renamed for 0.27.0
|
||||||
|
---------------------------------------------
|
||||||
|
|
||||||
|
Synapse 0.27.0 begins the process of rationalising the duplicate ``*:count``
|
||||||
|
metrics reported for the resource tracking for code blocks and HTTP requests.
|
||||||
|
|
||||||
|
At the same time, the corresponding ``*:total`` metrics are being renamed, as
|
||||||
|
the ``:total`` suffix no longer makes sense in the absence of a corresponding
|
||||||
|
``:count`` metric.
|
||||||
|
|
||||||
|
To enable a graceful migration path, this release just adds new names for the
|
||||||
|
metrics being renamed. A future release will remove the old ones.
|
||||||
|
|
||||||
|
The following table shows the new metrics, and the old metrics which they are
|
||||||
|
replacing.
|
||||||
|
|
||||||
|
==================================================== ===================================================
|
||||||
|
New name Old name
|
||||||
|
==================================================== ===================================================
|
||||||
|
synapse_util_metrics_block_count synapse_util_metrics_block_timer:count
|
||||||
|
synapse_util_metrics_block_count synapse_util_metrics_block_ru_utime:count
|
||||||
|
synapse_util_metrics_block_count synapse_util_metrics_block_ru_stime:count
|
||||||
|
synapse_util_metrics_block_count synapse_util_metrics_block_db_txn_count:count
|
||||||
|
synapse_util_metrics_block_count synapse_util_metrics_block_db_txn_duration:count
|
||||||
|
|
||||||
|
synapse_util_metrics_block_time_seconds synapse_util_metrics_block_timer:total
|
||||||
|
synapse_util_metrics_block_ru_utime_seconds synapse_util_metrics_block_ru_utime:total
|
||||||
|
synapse_util_metrics_block_ru_stime_seconds synapse_util_metrics_block_ru_stime:total
|
||||||
|
synapse_util_metrics_block_db_txn_count synapse_util_metrics_block_db_txn_count:total
|
||||||
|
synapse_util_metrics_block_db_txn_duration_seconds synapse_util_metrics_block_db_txn_duration:total
|
||||||
|
|
||||||
|
synapse_http_server_response_count synapse_http_server_requests
|
||||||
|
synapse_http_server_response_count synapse_http_server_response_time:count
|
||||||
|
synapse_http_server_response_count synapse_http_server_response_ru_utime:count
|
||||||
|
synapse_http_server_response_count synapse_http_server_response_ru_stime:count
|
||||||
|
synapse_http_server_response_count synapse_http_server_response_db_txn_count:count
|
||||||
|
synapse_http_server_response_count synapse_http_server_response_db_txn_duration:count
|
||||||
|
|
||||||
|
synapse_http_server_response_time_seconds synapse_http_server_response_time:total
|
||||||
|
synapse_http_server_response_ru_utime_seconds synapse_http_server_response_ru_utime:total
|
||||||
|
synapse_http_server_response_ru_stime_seconds synapse_http_server_response_ru_stime:total
|
||||||
|
synapse_http_server_response_db_txn_count synapse_http_server_response_db_txn_count:total
|
||||||
|
synapse_http_server_response_db_txn_duration_seconds synapse_http_server_response_db_txn_duration:total
|
||||||
|
==================================================== ===================================================
|
||||||
|
|
||||||
|
|
||||||
Standard Metric Names
|
Standard Metric Names
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
@ -42,7 +89,7 @@ have been changed to seconds, from miliseconds.
|
||||||
|
|
||||||
================================== =============================
|
================================== =============================
|
||||||
New name Old name
|
New name Old name
|
||||||
---------------------------------- -----------------------------
|
================================== =============================
|
||||||
process_cpu_user_seconds_total process_resource_utime / 1000
|
process_cpu_user_seconds_total process_resource_utime / 1000
|
||||||
process_cpu_system_seconds_total process_resource_stime / 1000
|
process_cpu_system_seconds_total process_resource_stime / 1000
|
||||||
process_open_fds (no 'type' label) process_fds
|
process_open_fds (no 'type' label) process_fds
|
||||||
|
@ -52,8 +99,8 @@ The python-specific counts of garbage collector performance have been renamed.
|
||||||
|
|
||||||
=========================== ======================
|
=========================== ======================
|
||||||
New name Old name
|
New name Old name
|
||||||
--------------------------- ----------------------
|
=========================== ======================
|
||||||
python_gc_time reactor_gc_time
|
python_gc_time reactor_gc_time
|
||||||
python_gc_unreachable_total reactor_gc_unreachable
|
python_gc_unreachable_total reactor_gc_unreachable
|
||||||
python_gc_counts reactor_gc_counts
|
python_gc_counts reactor_gc_counts
|
||||||
=========================== ======================
|
=========================== ======================
|
||||||
|
@ -62,7 +109,7 @@ The twisted-specific reactor metrics have been renamed.
|
||||||
|
|
||||||
==================================== =====================
|
==================================== =====================
|
||||||
New name Old name
|
New name Old name
|
||||||
------------------------------------ ---------------------
|
==================================== =====================
|
||||||
python_twisted_reactor_pending_calls reactor_pending_calls
|
python_twisted_reactor_pending_calls reactor_pending_calls
|
||||||
python_twisted_reactor_tick_time reactor_tick_time
|
python_twisted_reactor_tick_time reactor_tick_time
|
||||||
==================================== =====================
|
==================================== =====================
|
||||||
|
|
|
@ -42,34 +42,62 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
metrics = synapse.metrics.get_metrics_for(__name__)
|
metrics = synapse.metrics.get_metrics_for(__name__)
|
||||||
|
|
||||||
incoming_requests_counter = metrics.register_counter(
|
# total number of responses served, split by method/servlet/tag
|
||||||
"requests",
|
response_count = metrics.register_counter(
|
||||||
|
"response_count",
|
||||||
labels=["method", "servlet", "tag"],
|
labels=["method", "servlet", "tag"],
|
||||||
|
alternative_names=(
|
||||||
|
# the following are all deprecated aliases for the same metric
|
||||||
|
metrics.name_prefix + x for x in (
|
||||||
|
"_requests",
|
||||||
|
"_response_time:count",
|
||||||
|
"_response_ru_utime:count",
|
||||||
|
"_response_ru_stime:count",
|
||||||
|
"_response_db_txn_count:count",
|
||||||
|
"_response_db_txn_duration:count",
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
outgoing_responses_counter = metrics.register_counter(
|
outgoing_responses_counter = metrics.register_counter(
|
||||||
"responses",
|
"responses",
|
||||||
labels=["method", "code"],
|
labels=["method", "code"],
|
||||||
)
|
)
|
||||||
|
|
||||||
response_timer = metrics.register_distribution(
|
response_timer = metrics.register_counter(
|
||||||
"response_time",
|
"response_time_seconds",
|
||||||
labels=["method", "servlet", "tag"]
|
labels=["method", "servlet", "tag"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_response_time:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
response_ru_utime = metrics.register_distribution(
|
response_ru_utime = metrics.register_counter(
|
||||||
"response_ru_utime", labels=["method", "servlet", "tag"]
|
"response_ru_utime_seconds", labels=["method", "servlet", "tag"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_response_ru_utime:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
response_ru_stime = metrics.register_distribution(
|
response_ru_stime = metrics.register_counter(
|
||||||
"response_ru_stime", labels=["method", "servlet", "tag"]
|
"response_ru_stime_seconds", labels=["method", "servlet", "tag"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_response_ru_stime:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
response_db_txn_count = metrics.register_distribution(
|
response_db_txn_count = metrics.register_counter(
|
||||||
"response_db_txn_count", labels=["method", "servlet", "tag"]
|
"response_db_txn_count", labels=["method", "servlet", "tag"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_response_db_txn_count:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
response_db_txn_duration = metrics.register_distribution(
|
response_db_txn_duration = metrics.register_counter(
|
||||||
"response_db_txn_duration", labels=["method", "servlet", "tag"]
|
"response_db_txn_duration_seconds", labels=["method", "servlet", "tag"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_response_db_txn_duration:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -330,7 +358,7 @@ class RequestMetrics(object):
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
incoming_requests_counter.inc(request.method, self.name, tag)
|
response_count.inc(request.method, self.name, tag)
|
||||||
|
|
||||||
response_timer.inc_by(
|
response_timer.inc_by(
|
||||||
clock.time_msec() - self.start, request.method,
|
clock.time_msec() - self.start, request.method,
|
||||||
|
|
|
@ -17,16 +17,33 @@
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
|
|
||||||
|
|
||||||
# TODO(paul): I can't believe Python doesn't have one of these
|
def flatten(items):
|
||||||
def map_concat(func, items):
|
"""Flatten a list of lists
|
||||||
# flatten a list-of-lists
|
|
||||||
return list(chain.from_iterable(map(func, items)))
|
Args:
|
||||||
|
items: iterable[iterable[X]]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list[X]: flattened list
|
||||||
|
"""
|
||||||
|
return list(chain.from_iterable(items))
|
||||||
|
|
||||||
|
|
||||||
class BaseMetric(object):
|
class BaseMetric(object):
|
||||||
|
"""Base class for metrics which report a single value per label set
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, name, labels=[]):
|
def __init__(self, name, labels=[], alternative_names=[]):
|
||||||
self.name = name
|
"""
|
||||||
|
Args:
|
||||||
|
name (str): principal name for this metric
|
||||||
|
labels (list(str)): names of the labels which will be reported
|
||||||
|
for this metric
|
||||||
|
alternative_names (iterable(str)): list of alternative names for
|
||||||
|
this metric. This can be useful to provide a migration path
|
||||||
|
when renaming metrics.
|
||||||
|
"""
|
||||||
|
self._names = [name] + list(alternative_names)
|
||||||
self.labels = labels # OK not to clone as we never write it
|
self.labels = labels # OK not to clone as we never write it
|
||||||
|
|
||||||
def dimension(self):
|
def dimension(self):
|
||||||
|
@ -36,7 +53,7 @@ class BaseMetric(object):
|
||||||
return not len(self.labels)
|
return not len(self.labels)
|
||||||
|
|
||||||
def _render_labelvalue(self, value):
|
def _render_labelvalue(self, value):
|
||||||
# TODO: some kind of value escape
|
# TODO: escape backslashes, quotes and newlines
|
||||||
return '"%s"' % (value)
|
return '"%s"' % (value)
|
||||||
|
|
||||||
def _render_key(self, values):
|
def _render_key(self, values):
|
||||||
|
@ -47,6 +64,36 @@ class BaseMetric(object):
|
||||||
for k, v in zip(self.labels, values)])
|
for k, v in zip(self.labels, values)])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _render_for_labels(self, label_values, value):
|
||||||
|
"""Render this metric for a single set of labels
|
||||||
|
|
||||||
|
Args:
|
||||||
|
label_values (list[str]): values for each of the labels
|
||||||
|
value: value of the metric at with these labels
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
iterable[str]: rendered metric
|
||||||
|
"""
|
||||||
|
rendered_labels = self._render_key(label_values)
|
||||||
|
return (
|
||||||
|
"%s%s %.12g" % (name, rendered_labels, value)
|
||||||
|
for name in self._names
|
||||||
|
)
|
||||||
|
|
||||||
|
def render(self):
|
||||||
|
"""Render this metric
|
||||||
|
|
||||||
|
Each metric is rendered as:
|
||||||
|
|
||||||
|
name{label1="val1",label2="val2"} value
|
||||||
|
|
||||||
|
https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
iterable[str]: rendered metrics
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
class CounterMetric(BaseMetric):
|
class CounterMetric(BaseMetric):
|
||||||
"""The simplest kind of metric; one that stores a monotonically-increasing
|
"""The simplest kind of metric; one that stores a monotonically-increasing
|
||||||
|
@ -62,6 +109,10 @@ class CounterMetric(BaseMetric):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super(CounterMetric, self).__init__(*args, **kwargs)
|
super(CounterMetric, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
# dict[list[str]]: value for each set of label values. the keys are the
|
||||||
|
# label values, in the same order as the labels in self.labels.
|
||||||
|
#
|
||||||
|
# (if the metric is a scalar, the (single) key is the empty list).
|
||||||
self.counts = {}
|
self.counts = {}
|
||||||
|
|
||||||
# Scalar metrics are never empty
|
# Scalar metrics are never empty
|
||||||
|
@ -84,11 +135,11 @@ class CounterMetric(BaseMetric):
|
||||||
def inc(self, *values):
|
def inc(self, *values):
|
||||||
self.inc_by(1, *values)
|
self.inc_by(1, *values)
|
||||||
|
|
||||||
def render_item(self, k):
|
|
||||||
return ["%s%s %.12g" % (self.name, self._render_key(k), self.counts[k])]
|
|
||||||
|
|
||||||
def render(self):
|
def render(self):
|
||||||
return map_concat(self.render_item, sorted(self.counts.keys()))
|
return flatten(
|
||||||
|
self._render_for_labels(k, self.counts[k])
|
||||||
|
for k in sorted(self.counts.keys())
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CallbackMetric(BaseMetric):
|
class CallbackMetric(BaseMetric):
|
||||||
|
@ -105,10 +156,12 @@ class CallbackMetric(BaseMetric):
|
||||||
value = self.callback()
|
value = self.callback()
|
||||||
|
|
||||||
if self.is_scalar():
|
if self.is_scalar():
|
||||||
return ["%s %.12g" % (self.name, value)]
|
return list(self._render_for_labels([], value))
|
||||||
|
|
||||||
return ["%s%s %.12g" % (self.name, self._render_key(k), value[k])
|
return flatten(
|
||||||
for k in sorted(value.keys())]
|
self._render_for_labels(k, value[k])
|
||||||
|
for k in sorted(value.keys())
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class DistributionMetric(object):
|
class DistributionMetric(object):
|
||||||
|
|
|
@ -27,25 +27,56 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
metrics = synapse.metrics.get_metrics_for(__name__)
|
metrics = synapse.metrics.get_metrics_for(__name__)
|
||||||
|
|
||||||
block_timer = metrics.register_distribution(
|
# total number of times we have hit this block
|
||||||
"block_timer",
|
response_count = metrics.register_counter(
|
||||||
labels=["block_name"]
|
"block_count",
|
||||||
|
labels=["block_name"],
|
||||||
|
alternative_names=(
|
||||||
|
# the following are all deprecated aliases for the same metric
|
||||||
|
metrics.name_prefix + x for x in (
|
||||||
|
"_block_timer:count",
|
||||||
|
"_block_ru_utime:count",
|
||||||
|
"_block_ru_stime:count",
|
||||||
|
"_block_db_txn_count:count",
|
||||||
|
"_block_db_txn_duration:count",
|
||||||
|
)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
block_ru_utime = metrics.register_distribution(
|
block_timer = metrics.register_counter(
|
||||||
"block_ru_utime", labels=["block_name"]
|
"block_time_seconds",
|
||||||
|
labels=["block_name"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_block_timer:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
block_ru_stime = metrics.register_distribution(
|
block_ru_utime = metrics.register_counter(
|
||||||
"block_ru_stime", labels=["block_name"]
|
"block_ru_utime_seconds", labels=["block_name"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_block_ru_utime:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
block_db_txn_count = metrics.register_distribution(
|
block_ru_stime = metrics.register_counter(
|
||||||
"block_db_txn_count", labels=["block_name"]
|
"block_ru_stime_seconds", labels=["block_name"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_block_ru_stime:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
block_db_txn_duration = metrics.register_distribution(
|
block_db_txn_count = metrics.register_counter(
|
||||||
"block_db_txn_duration", labels=["block_name"]
|
"block_db_txn_count", labels=["block_name"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_block_db_txn_count:total",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
block_db_txn_duration = metrics.register_counter(
|
||||||
|
"block_db_txn_duration_seconds", labels=["block_name"],
|
||||||
|
alternative_names=(
|
||||||
|
metrics.name_prefix + "_block_db_txn_count:total",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue