Track why we're evicting from caches (#10829)

So we can see distinguish between "evicting because the cache is too big" and "evicting because the cache entries haven't been recently used".
This commit is contained in:
David Robertson 2021-09-22 10:59:52 +01:00 committed by GitHub
parent 51e2db3598
commit a2d7195e01
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 31 additions and 15 deletions

1
changelog.d/10829.misc Normal file
View file

@ -0,0 +1 @@
Track cache eviction rates more finely in Prometheus' monitoring.

View file

@ -12,8 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import logging
import typing
from enum import Enum, auto
from sys import intern
from typing import Callable, Dict, Optional, Sized
@ -34,7 +36,7 @@ collectors_by_name: Dict[str, "CacheMetric"] = {}
cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"])
cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"])
cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name"])
cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"])
cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"])
cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"])
cache_memory_usage = Gauge(
@ -46,11 +48,16 @@ cache_memory_usage = Gauge(
response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"])
response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"])
response_cache_evicted = Gauge(
"synapse_util_caches_response_cache:evicted_size", "", ["name"]
"synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"]
)
response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"])
class EvictionReason(Enum):
size = auto()
time = auto()
@attr.s(slots=True)
class CacheMetric:
@ -61,7 +68,9 @@ class CacheMetric:
hits = attr.ib(default=0)
misses = attr.ib(default=0)
evicted_size = attr.ib(default=0)
eviction_size_by_reason: typing.Counter[EvictionReason] = attr.ib(
factory=collections.Counter
)
memory_usage = attr.ib(default=None)
def inc_hits(self) -> None:
@ -70,8 +79,8 @@ class CacheMetric:
def inc_misses(self) -> None:
self.misses += 1
def inc_evictions(self, size: int = 1) -> None:
self.evicted_size += size
def inc_evictions(self, reason: EvictionReason, size: int = 1) -> None:
self.eviction_size_by_reason[reason] += size
def inc_memory_usage(self, memory: int) -> None:
if self.memory_usage is None:
@ -94,14 +103,20 @@ class CacheMetric:
if self._cache_type == "response_cache":
response_cache_size.labels(self._cache_name).set(len(self._cache))
response_cache_hits.labels(self._cache_name).set(self.hits)
response_cache_evicted.labels(self._cache_name).set(self.evicted_size)
for reason in EvictionReason:
response_cache_evicted.labels(self._cache_name, reason.name).set(
self.eviction_size_by_reason[reason]
)
response_cache_total.labels(self._cache_name).set(
self.hits + self.misses
)
else:
cache_size.labels(self._cache_name).set(len(self._cache))
cache_hits.labels(self._cache_name).set(self.hits)
cache_evicted.labels(self._cache_name).set(self.evicted_size)
for reason in EvictionReason:
cache_evicted.labels(self._cache_name, reason.name).set(
self.eviction_size_by_reason[reason]
)
cache_total.labels(self._cache_name).set(self.hits + self.misses)
if getattr(self._cache, "max_size", None):
cache_max_size.labels(self._cache_name).set(self._cache.max_size)

View file

@ -22,7 +22,7 @@ from typing_extensions import Literal
from synapse.config import cache as cache_config
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.util import Clock
from synapse.util.caches import register_cache
from synapse.util.caches import EvictionReason, register_cache
logger = logging.getLogger(__name__)
@ -98,9 +98,9 @@ class ExpiringCache(Generic[KT, VT]):
while self._max_size and len(self) > self._max_size:
_key, value = self._cache.popitem(last=False)
if self.iterable:
self.metrics.inc_evictions(len(value.value))
self.metrics.inc_evictions(EvictionReason.size, len(value.value))
else:
self.metrics.inc_evictions()
self.metrics.inc_evictions(EvictionReason.size)
def __getitem__(self, key: KT) -> VT:
try:
@ -175,9 +175,9 @@ class ExpiringCache(Generic[KT, VT]):
for k in keys_to_delete:
value = self._cache.pop(k)
if self.iterable:
self.metrics.inc_evictions(len(value.value))
self.metrics.inc_evictions(EvictionReason.time, len(value.value))
else:
self.metrics.inc_evictions()
self.metrics.inc_evictions(EvictionReason.time)
logger.debug(
"[%s] _prune_cache before: %d, after len: %d",

View file

@ -40,7 +40,7 @@ from twisted.internet.interfaces import IReactorTime
from synapse.config import cache as cache_config
from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.util import Clock, caches
from synapse.util.caches import CacheMetric, register_cache
from synapse.util.caches import CacheMetric, EvictionReason, register_cache
from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry
from synapse.util.linked_list import ListNode
@ -403,7 +403,7 @@ class LruCache(Generic[KT, VT]):
evicted_len = delete_node(node)
cache.pop(node.key, None)
if metrics:
metrics.inc_evictions(evicted_len)
metrics.inc_evictions(EvictionReason.size, evicted_len)
def synchronized(f: FT) -> FT:
@wraps(f)