Track why we're evicting from caches (#10829)

So we can see distinguish between "evicting because the cache is too big" and "evicting because the cache entries haven't been recently used".
This commit is contained in:
David Robertson 2021-09-22 10:59:52 +01:00 committed by GitHub
parent 51e2db3598
commit a2d7195e01
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 31 additions and 15 deletions

1
changelog.d/10829.misc Normal file
View file

@ -0,0 +1 @@
Track cache eviction rates more finely in Prometheus' monitoring.

View file

@ -12,8 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import collections
import logging import logging
import typing
from enum import Enum, auto
from sys import intern from sys import intern
from typing import Callable, Dict, Optional, Sized from typing import Callable, Dict, Optional, Sized
@ -34,7 +36,7 @@ collectors_by_name: Dict[str, "CacheMetric"] = {}
cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"]) cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"]) cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name"]) cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"]) cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"]) cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
cache_memory_usage = Gauge( cache_memory_usage = Gauge(
@ -46,11 +48,16 @@ cache_memory_usage = Gauge(
response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"]) response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"]) response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
response_cache_evicted = Gauge( response_cache_evicted = Gauge(
"synapse_util_caches_response_cache:evicted_size", "", ["name"] "synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
) )
response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"]) response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
class EvictionReason(Enum):
size = auto()
time = auto()
@attr.s(slots=True) @attr.s(slots=True)
class CacheMetric: class CacheMetric:
@ -61,7 +68,9 @@ class CacheMetric:
hits = attr.ib(default=0) hits = attr.ib(default=0)
misses = attr.ib(default=0) misses = attr.ib(default=0)
evicted_size = attr.ib(default=0) eviction_size_by_reason: typing.Counter[EvictionReason] = attr.ib(
factory=collections.Counter
)
memory_usage = attr.ib(default=None) memory_usage = attr.ib(default=None)
def inc_hits(self) -> None: def inc_hits(self) -> None:
@ -70,8 +79,8 @@ class CacheMetric:
def inc_misses(self) -> None: def inc_misses(self) -> None:
self.misses += 1 self.misses += 1
def inc_evictions(self, size: int = 1) -> None: def inc_evictions(self, reason: EvictionReason, size: int = 1) -> None:
self.evicted_size += size self.eviction_size_by_reason[reason] += size
def inc_memory_usage(self, memory: int) -> None: def inc_memory_usage(self, memory: int) -> None:
if self.memory_usage is None: if self.memory_usage is None:
@ -94,14 +103,20 @@ class CacheMetric:
if self._cache_type == "response_cache": if self._cache_type == "response_cache":
response_cache_size.labels(self._cache_name).set(len(self._cache)) response_cache_size.labels(self._cache_name).set(len(self._cache))
response_cache_hits.labels(self._cache_name).set(self.hits) response_cache_hits.labels(self._cache_name).set(self.hits)
response_cache_evicted.labels(self._cache_name).set(self.evicted_size) for reason in EvictionReason:
response_cache_evicted.labels(self._cache_name, reason.name).set(
self.eviction_size_by_reason[reason]
)
response_cache_total.labels(self._cache_name).set( response_cache_total.labels(self._cache_name).set(
self.hits + self.misses self.hits + self.misses
) )
else: else:
cache_size.labels(self._cache_name).set(len(self._cache)) cache_size.labels(self._cache_name).set(len(self._cache))
cache_hits.labels(self._cache_name).set(self.hits) cache_hits.labels(self._cache_name).set(self.hits)
cache_evicted.labels(self._cache_name).set(self.evicted_size) for reason in EvictionReason:
cache_evicted.labels(self._cache_name, reason.name).set(
self.eviction_size_by_reason[reason]
)
cache_total.labels(self._cache_name).set(self.hits + self.misses) cache_total.labels(self._cache_name).set(self.hits + self.misses)
if getattr(self._cache, "max_size", None): if getattr(self._cache, "max_size", None):
cache_max_size.labels(self._cache_name).set(self._cache.max_size) cache_max_size.labels(self._cache_name).set(self._cache.max_size)

View file

@ -22,7 +22,7 @@ from typing_extensions import Literal
from synapse.config import cache as cache_config from synapse.config import cache as cache_config
from synapse.metrics.background_process_metrics import run_as_background_process from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.util import Clock from synapse.util import Clock
from synapse.util.caches import register_cache from synapse.util.caches import EvictionReason, register_cache
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -98,9 +98,9 @@ class ExpiringCache(Generic[KT, VT]):
while self._max_size and len(self) > self._max_size: while self._max_size and len(self) > self._max_size:
_key, value = self._cache.popitem(last=False) _key, value = self._cache.popitem(last=False)
if self.iterable: if self.iterable:
self.metrics.inc_evictions(len(value.value)) self.metrics.inc_evictions(EvictionReason.size, len(value.value))
else: else:
self.metrics.inc_evictions() self.metrics.inc_evictions(EvictionReason.size)
def __getitem__(self, key: KT) -> VT: def __getitem__(self, key: KT) -> VT:
try: try:
@ -175,9 +175,9 @@ class ExpiringCache(Generic[KT, VT]):
for k in keys_to_delete: for k in keys_to_delete:
value = self._cache.pop(k) value = self._cache.pop(k)
if self.iterable: if self.iterable:
self.metrics.inc_evictions(len(value.value)) self.metrics.inc_evictions(EvictionReason.time, len(value.value))
else: else:
self.metrics.inc_evictions() self.metrics.inc_evictions(EvictionReason.time)
logger.debug( logger.debug(
"[%s] _prune_cache before: %d, after len: %d", "[%s] _prune_cache before: %d, after len: %d",

View file

@ -40,7 +40,7 @@ from twisted.internet.interfaces import IReactorTime
from synapse.config import cache as cache_config from synapse.config import cache as cache_config
from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.util import Clock, caches from synapse.util import Clock, caches
from synapse.util.caches import CacheMetric, register_cache from synapse.util.caches import CacheMetric, EvictionReason, register_cache
from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry
from synapse.util.linked_list import ListNode from synapse.util.linked_list import ListNode
@ -403,7 +403,7 @@ class LruCache(Generic[KT, VT]):
evicted_len = delete_node(node) evicted_len = delete_node(node)
cache.pop(node.key, None) cache.pop(node.key, None)
if metrics: if metrics:
metrics.inc_evictions(evicted_len) metrics.inc_evictions(EvictionReason.size, evicted_len)
def synchronized(f: FT) -> FT: def synchronized(f: FT) -> FT:
@wraps(f) @wraps(f)