Mirror of https://github.com/element-hq/synapse.git (synced 2024-12-18 08:54:54 +03:00)
Stagger send presence to remotes (#10398)
This helps with performance: trying to connect to thousands of hosts at once can consume a lot of CPU (due to TLS handshakes and similar per-connection overhead).

Co-authored-by: Brendan Abolivier <babolivier@matrix.org>
This commit is contained in:

parent 5ecad4e7a5
commit ac5c221208

4 changed files with 116 additions and 5 deletions
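The staggering idea in miniature, before the diff itself: rather than kicking off a transaction to every destination at once, destinations are drained from an ordered queue with a short sleep between wake-ups, sized so the whole queue empties within a fixed bound. A minimal sketch of that pattern using plain asyncio (illustrative names like drain and wake, not Synapse's actual Clock/background-process machinery):

import asyncio
from collections import OrderedDict
from typing import Callable

MAX_TIME_IN_QUEUE = 30.0  # drain the whole queue within this many seconds
MAX_DELAY = 0.1  # never sleep longer than this between consecutive wake-ups

async def drain(queue: "OrderedDict[str, None]", wake: Callable[[str], None]) -> None:
    """Wake each queued destination, spacing wake-ups so CPU use stays smooth."""
    if not queue:
        return
    # Small enough to finish within MAX_TIME_IN_QUEUE, capped at MAX_DELAY.
    delay = min(MAX_DELAY, MAX_TIME_IN_QUEUE / len(queue))
    while queue:
        destination, _ = queue.popitem(last=False)  # FIFO order
        wake(destination)  # e.g. start the (TLS) connection to this host
        await asyncio.sleep(delay)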
changelog.d/10398.misc (new file, 1 line)

@@ -0,0 +1 @@
+Stagger sending of presence update to remote servers, reducing CPU spikes caused by starting many connections to remote servers at once.
synapse/federation/sender/__init__.py

@@ -14,9 +14,12 @@
 import abc
 import logging
+from collections import OrderedDict
 from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Set, Tuple

+import attr
 from prometheus_client import Counter
+from typing_extensions import Literal

 from twisted.internet import defer
@@ -33,8 +36,12 @@ from synapse.metrics import (
     event_processing_loop_room_count,
     events_processed_counter,
 )
-from synapse.metrics.background_process_metrics import run_as_background_process
+from synapse.metrics.background_process_metrics import (
+    run_as_background_process,
+    wrap_as_background_process,
+)
 from synapse.types import JsonDict, ReadReceipt, RoomStreamToken
+from synapse.util import Clock
 from synapse.util.metrics import Measure

 if TYPE_CHECKING:
@@ -137,6 +144,84 @@ class AbstractFederationSender(metaclass=abc.ABCMeta):
         raise NotImplementedError()


+@attr.s
+class _PresenceQueue:
+    """A queue of destinations that need to be woken up due to new presence
+    updates.
+
+    Staggers waking up of per destination queues to ensure that we don't attempt
+    to start TLS connections with many hosts all at once, leading to pinned CPU.
+    """
+
+    # The maximum duration in seconds between queuing up a destination and it
+    # being woken up.
+    _MAX_TIME_IN_QUEUE = 30.0
+
+    # The maximum duration in seconds between waking up consecutive destination
+    # queues.
+    _MAX_DELAY = 0.1
+
+    sender: "FederationSender" = attr.ib()
+    clock: Clock = attr.ib()
+    queue: "OrderedDict[str, Literal[None]]" = attr.ib(factory=OrderedDict)
+    processing: bool = attr.ib(default=False)
+
+    def add_to_queue(self, destination: str) -> None:
+        """Add a destination to the queue to be woken up."""
+
+        self.queue[destination] = None
+
+        if not self.processing:
+            self._handle()
+
+    @wrap_as_background_process("_PresenceQueue.handle")
+    async def _handle(self) -> None:
+        """Background process to drain the queue."""
+
+        if not self.queue:
+            return
+
+        assert not self.processing
+        self.processing = True
+
+        try:
+            # We start with a delay that should drain the queue quickly enough that
+            # we process all destinations in the queue in _MAX_TIME_IN_QUEUE
+            # seconds.
+            #
+            # We also add an upper bound to the delay, to gracefully handle the
+            # case where the queue only has a few entries in it.
+            current_sleep_seconds = min(
+                self._MAX_DELAY, self._MAX_TIME_IN_QUEUE / len(self.queue)
+            )
+
+            while self.queue:
+                destination, _ = self.queue.popitem(last=False)
+
+                queue = self.sender._get_per_destination_queue(destination)
+
+                if not queue._new_data_to_send:
+                    # The per destination queue has already been woken up.
+                    continue
+
+                queue.attempt_new_transaction()
+
+                await self.clock.sleep(current_sleep_seconds)
+
+                if not self.queue:
+                    break
+
+                # More destinations may have been added to the queue, so we may
+                # need to reduce the delay to ensure everything gets processed
+                # within _MAX_TIME_IN_QUEUE seconds.
+                current_sleep_seconds = min(
+                    current_sleep_seconds, self._MAX_TIME_IN_QUEUE / len(self.queue)
+                )
+
+        finally:
+            self.processing = False
+
+
 class FederationSender(AbstractFederationSender):
     def __init__(self, hs: "HomeServer"):
         self.hs = hs
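Worked numbers for the delay calculation above: with 1,000 queued destinations the initial sleep is min(0.1, 30.0 / 1000) = 0.03 s, so the queue drains in roughly 30 s; with 10 destinations it is min(0.1, 3.0) = 0.1 s, meaning the _MAX_DELAY cap applies and the queue drains in about a second.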
@@ -208,6 +293,8 @@ class FederationSender(AbstractFederationSender):
         self._external_cache = hs.get_external_cache()

+        self._presence_queue = _PresenceQueue(self, self.clock)
+
     def _get_per_destination_queue(self, destination: str) -> PerDestinationQueue:
         """Get or create a PerDestinationQueue for the given destination
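The FederationSender now owns a single _PresenceQueue, so every outgoing presence update across all destinations funnels through one staggered scheduler.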
@@ -517,7 +604,12 @@ class FederationSender(AbstractFederationSender):
                 self._instance_name, destination
             ):
                 continue
-            self._get_per_destination_queue(destination).send_presence(states)
+
+            self._get_per_destination_queue(destination).send_presence(
+                states, start_loop=False
+            )
+
+            self._presence_queue.add_to_queue(destination)

     def build_and_send_edu(
         self,
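This hunk is the behavioural core of the change: presence is still stored on each per-destination queue immediately, but start_loop=False stops the transmission loop from being kicked off inline; the destination is instead handed to _PresenceQueue, which wakes the loops one at a time on the staggered schedule.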
synapse/federation/sender/per_destination_queue.py

@@ -171,13 +171,23 @@ class PerDestinationQueue:
         self.attempt_new_transaction()

-    def send_presence(self, states: Iterable[UserPresenceState]) -> None:
-        """Add presence updates to the queue. Start the transmission loop if necessary.
+    def send_presence(
+        self, states: Iterable[UserPresenceState], start_loop: bool = True
+    ) -> None:
+        """Add presence updates to the queue.
+
+        Args:
+            states: Presence updates to send
+            start_loop: Whether to start the transmission loop if not already
+                running.
+
         Args:
             states: presence to send
         """
         self._pending_presence.update({state.user_id: state for state in states})
+        self._new_data_to_send = True

-        self.attempt_new_transaction()
+        if start_loop:
+            self.attempt_new_transaction()

     def queue_read_receipt(self, receipt: ReadReceipt) -> None:
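Note the new _new_data_to_send flag set next to the pending presence: _PresenceQueue._handle checks it before calling attempt_new_transaction, so a destination whose loop has already picked up the data is skipped rather than woken a second time.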
tests/events/test_presence_router.py

@@ -285,6 +285,10 @@ class PresenceRouterTestCase(FederatingHomeserverTestCase):
         presence_updates, _ = sync_presence(self, self.presence_receiving_user_two_id)
         self.assertEqual(len(presence_updates), 3)

+        # We stagger sending of presence, so we need to wait a bit for them to
+        # get sent out.
+        self.reactor.advance(60)
+
         # Test that sending to a remote user works
         remote_user_id = "@far_away_person:island"
@@ -301,6 +305,10 @@ class PresenceRouterTestCase(FederatingHomeserverTestCase):
             self.module_api.send_local_online_presence_to([remote_user_id])
         )

+        # We stagger sending of presence, so we need to wait a bit for them to
+        # get sent out.
+        self.reactor.advance(60)
+
         # Check that the expected presence updates were sent
         # We explicitly compare using sets as we expect that calling
         # module_api.send_local_online_presence_to will create a presence
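In both tests the 60-second advance comfortably exceeds _PresenceQueue._MAX_TIME_IN_QUEUE (30 s), so every staggered wake-up has fired before the assertions on the sent presence run.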