Stagger send presence to remotes (#10398)

This is to help with performance, where trying to connect to thousands
of hosts at once can consume a lot of CPU (due to TLS etc).

Co-authored-by: Brendan Abolivier <babolivier@matrix.org>
This commit is contained in:
Erik Johnston 2021-07-15 11:52:56 +01:00 committed by GitHub
parent 5ecad4e7a5
commit ac5c221208
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 116 additions and 5 deletions

1
changelog.d/10398.misc Normal file
View file

@ -0,0 +1 @@
Stagger the sending of presence updates to remote servers, reducing CPU spikes caused by starting many connections to remote servers at once.

View file

@ -14,9 +14,12 @@
import abc import abc
import logging import logging
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Set, Tuple from typing import TYPE_CHECKING, Dict, Hashable, Iterable, List, Optional, Set, Tuple
import attr
from prometheus_client import Counter from prometheus_client import Counter
from typing_extensions import Literal
from twisted.internet import defer from twisted.internet import defer
@ -33,8 +36,12 @@ from synapse.metrics import (
event_processing_loop_room_count, event_processing_loop_room_count,
events_processed_counter, events_processed_counter,
) )
from synapse.metrics.background_process_metrics import run_as_background_process from synapse.metrics.background_process_metrics import (
run_as_background_process,
wrap_as_background_process,
)
from synapse.types import JsonDict, ReadReceipt, RoomStreamToken from synapse.types import JsonDict, ReadReceipt, RoomStreamToken
from synapse.util import Clock
from synapse.util.metrics import Measure from synapse.util.metrics import Measure
if TYPE_CHECKING: if TYPE_CHECKING:
@ -137,6 +144,84 @@ class AbstractFederationSender(metaclass=abc.ABCMeta):
raise NotImplementedError() raise NotImplementedError()
@attr.s
class _PresenceQueue:
    """Ordered set of destinations waiting to be woken up to send presence.

    Instead of kicking every per-destination queue at once, destinations are
    woken one at a time with a short pause in between, so that we do not try
    to open TLS connections to many hosts simultaneously and pin the CPU.
    """

    # Upper bound, in seconds, on how long a destination should wait in the
    # queue before it is woken up.
    _MAX_TIME_IN_QUEUE = 30.0

    # Upper bound, in seconds, on the pause between waking up two consecutive
    # destination queues.
    _MAX_DELAY = 0.1

    # The federation sender used to look up per-destination queues.
    sender: "FederationSender" = attr.ib()
    # Clock used to sleep between wake-ups.
    clock: Clock = attr.ib()
    # Pending destinations, in insertion order; only the keys carry meaning.
    queue: "OrderedDict[str, Literal[None]]" = attr.ib(factory=OrderedDict)
    # True while the drain loop in `_handle` is running.
    processing: bool = attr.ib(default=False)

    def add_to_queue(self, destination: str) -> None:
        """Queue `destination` for wake-up, starting the drain loop if idle."""
        self.queue[destination] = None

        if self.processing:
            # A drain loop is already running; it keeps going until the queue
            # is empty, so it will reach this entry too.
            return

        self._handle()

    @wrap_as_background_process("_PresenceQueue.handle")
    async def _handle(self) -> None:
        """Background process that drains the queue one destination at a time."""
        if not self.queue:
            return

        assert not self.processing
        self.processing = True

        try:
            # Pick an initial pause small enough that the current backlog is
            # worked through within _MAX_TIME_IN_QUEUE seconds, but capped at
            # _MAX_DELAY so that a queue holding only a handful of entries
            # does not end up with a very long pause.
            pause_seconds = min(
                self._MAX_DELAY, self._MAX_TIME_IN_QUEUE / len(self.queue)
            )

            while self.queue:
                destination, _ = self.queue.popitem(last=False)
                destination_queue = self.sender._get_per_destination_queue(
                    destination
                )

                # If there is no new data flagged, the per-destination queue
                # has already been woken up: move on without pausing.
                if destination_queue._new_data_to_send:
                    destination_queue.attempt_new_transaction()

                    await self.clock.sleep(pause_seconds)

                    if self.queue:
                        # Destinations may have been queued while we slept, so
                        # shrink the pause if that is needed to still get
                        # through everything within _MAX_TIME_IN_QUEUE seconds.
                        pause_seconds = min(
                            pause_seconds,
                            self._MAX_TIME_IN_QUEUE / len(self.queue),
                        )
        finally:
            self.processing = False
class FederationSender(AbstractFederationSender): class FederationSender(AbstractFederationSender):
def __init__(self, hs: "HomeServer"): def __init__(self, hs: "HomeServer"):
self.hs = hs self.hs = hs
@ -208,6 +293,8 @@ class FederationSender(AbstractFederationSender):
self._external_cache = hs.get_external_cache() self._external_cache = hs.get_external_cache()
self._presence_queue = _PresenceQueue(self, self.clock)
def _get_per_destination_queue(self, destination: str) -> PerDestinationQueue: def _get_per_destination_queue(self, destination: str) -> PerDestinationQueue:
"""Get or create a PerDestinationQueue for the given destination """Get or create a PerDestinationQueue for the given destination
@ -517,7 +604,12 @@ class FederationSender(AbstractFederationSender):
self._instance_name, destination self._instance_name, destination
): ):
continue continue
self._get_per_destination_queue(destination).send_presence(states)
self._get_per_destination_queue(destination).send_presence(
states, start_loop=False
)
self._presence_queue.add_to_queue(destination)
def build_and_send_edu( def build_and_send_edu(
self, self,

View file

@ -171,13 +171,23 @@ class PerDestinationQueue:
self.attempt_new_transaction() self.attempt_new_transaction()
def send_presence(self, states: Iterable[UserPresenceState]) -> None: def send_presence(
"""Add presence updates to the queue. Start the transmission loop if necessary. self, states: Iterable[UserPresenceState], start_loop: bool = True
) -> None:
"""Add presence updates to the queue.
Args:
states: Presence updates to send
start_loop: Whether to start the transmission loop if not already
running.
Args: Args:
states: presence to send states: presence to send
""" """
self._pending_presence.update({state.user_id: state for state in states}) self._pending_presence.update({state.user_id: state for state in states})
self._new_data_to_send = True
if start_loop:
self.attempt_new_transaction() self.attempt_new_transaction()
def queue_read_receipt(self, receipt: ReadReceipt) -> None: def queue_read_receipt(self, receipt: ReadReceipt) -> None:

View file

@ -285,6 +285,10 @@ class PresenceRouterTestCase(FederatingHomeserverTestCase):
presence_updates, _ = sync_presence(self, self.presence_receiving_user_two_id) presence_updates, _ = sync_presence(self, self.presence_receiving_user_two_id)
self.assertEqual(len(presence_updates), 3) self.assertEqual(len(presence_updates), 3)
# We stagger sending of presence, so we need to wait a bit for them to
# get sent out.
self.reactor.advance(60)
# Test that sending to a remote user works # Test that sending to a remote user works
remote_user_id = "@far_away_person:island" remote_user_id = "@far_away_person:island"
@ -301,6 +305,10 @@ class PresenceRouterTestCase(FederatingHomeserverTestCase):
self.module_api.send_local_online_presence_to([remote_user_id]) self.module_api.send_local_online_presence_to([remote_user_id])
) )
# We stagger sending of presence, so we need to wait a bit for them to
# get sent out.
self.reactor.advance(60)
# Check that the expected presence updates were sent # Check that the expected presence updates were sent
# We explicitly compare using sets as we expect that calling # We explicitly compare using sets as we expect that calling
# module_api.send_local_online_presence_to will create a presence # module_api.send_local_online_presence_to will create a presence