Improve lock performance when a lot of locks are waiting (#16840)

When a lot of locks are waiting for a single lock, notifying all locks
independently with `call_later` on each release is really costly and
incurs some kind of async contention, where the CPU is spinning a lot
for not much.

The included test is taking around 30s before the change, and 0.5s
after.

It was found following failing tests with
https://github.com/element-hq/synapse/pull/16827.
This commit is contained in:
Mathieu Velten 2024-03-14 14:49:54 +01:00 committed by GitHub
parent a111ba0207
commit cb562d73aa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 73 additions and 6 deletions

1
changelog.d/16840.misc Normal file
View file

@ -0,0 +1 @@
Improve lock performance when a lot of locks are all waiting for a single lock to be released.

View file

@ -182,12 +182,15 @@ class WorkerLocksHandler:
if not locks: if not locks:
return return
def _wake_deferred(deferred: defer.Deferred) -> None: def _wake_all_locks(
if not deferred.called: locks: Collection[Union[WaitingLock, WaitingMultiLock]]
deferred.callback(None) ) -> None:
for lock in locks:
deferred = lock.deferred
if not deferred.called:
deferred.callback(None)
for lock in locks: self._clock.call_later(0, _wake_all_locks, locks)
self._clock.call_later(0, _wake_deferred, lock.deferred)
@wrap_as_background_process("_cleanup_locks") @wrap_as_background_process("_cleanup_locks")
async def _cleanup_locks(self) -> None: async def _cleanup_locks(self) -> None:

View file

@ -27,6 +27,7 @@ from synapse.util import Clock
from tests import unittest from tests import unittest
from tests.replication._base import BaseMultiWorkerStreamTestCase from tests.replication._base import BaseMultiWorkerStreamTestCase
from tests.utils import test_timeout
class WorkerLockTestCase(unittest.HomeserverTestCase): class WorkerLockTestCase(unittest.HomeserverTestCase):
@ -50,6 +51,28 @@ class WorkerLockTestCase(unittest.HomeserverTestCase):
self.get_success(d2) self.get_success(d2)
self.get_success(lock2.__aexit__(None, None, None)) self.get_success(lock2.__aexit__(None, None, None))
def test_lock_contention(self) -> None:
"""Test lock contention when a lot of locks wait on a single worker"""
# It takes around 0.5s on a 5+ years old laptop
with test_timeout(5):
nb_locks = 500
d = self._take_locks(nb_locks)
self.assertEqual(self.get_success(d), nb_locks)
async def _take_locks(self, nb_locks: int) -> int:
locks = [
self.hs.get_worker_locks_handler().acquire_lock("test_lock", "")
for _ in range(nb_locks)
]
nb_locks_taken = 0
for lock in locks:
async with lock:
nb_locks_taken += 1
return nb_locks_taken
class WorkerLockWorkersTestCase(BaseMultiWorkerStreamTestCase): class WorkerLockWorkersTestCase(BaseMultiWorkerStreamTestCase):
def prepare( def prepare(

View file

@ -21,7 +21,20 @@
import atexit import atexit
import os import os
from typing import Any, Callable, Dict, List, Tuple, Type, TypeVar, Union, overload import signal
from types import FrameType, TracebackType
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Tuple,
Type,
TypeVar,
Union,
overload,
)
import attr import attr
from typing_extensions import Literal, ParamSpec from typing_extensions import Literal, ParamSpec
@ -379,3 +392,30 @@ def checked_cast(type: Type[T], x: object) -> T:
""" """
assert isinstance(x, type) assert isinstance(x, type)
return x return x
class TestTimeout(Exception):
pass
class test_timeout:
def __init__(self, seconds: int, error_message: Optional[str] = None) -> None:
if error_message is None:
error_message = "test timed out after {}s.".format(seconds)
self.seconds = seconds
self.error_message = error_message
def handle_timeout(self, signum: int, frame: Optional[FrameType]) -> None:
raise TestTimeout(self.error_message)
def __enter__(self) -> None:
signal.signal(signal.SIGALRM, self.handle_timeout)
signal.alarm(self.seconds)
def __exit__(
self,
exc_type: Optional[Type[BaseException]],
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
signal.alarm(0)