mirror of
https://github.com/element-hq/synapse.git
synced 2024-11-22 01:25:44 +03:00
Immediately retry any requests that have backed off when a server comes back online. (#12500)
Otherwise it can take up to a minute for any in-flight `/send` requests to be retried.
This commit is contained in:
parent
ade3008821
commit
8dd3e0e084
6 changed files with 179 additions and 6 deletions
1
changelog.d/12500.misc
Normal file
1
changelog.d/12500.misc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Immediately retry any requests that have backed off when a server comes back online.
|
|
@ -73,7 +73,7 @@ from synapse.logging.context import make_deferred_yieldable, run_in_background
|
||||||
from synapse.logging.opentracing import set_tag, start_active_span, tags
|
from synapse.logging.opentracing import set_tag, start_active_span, tags
|
||||||
from synapse.types import JsonDict
|
from synapse.types import JsonDict
|
||||||
from synapse.util import json_decoder
|
from synapse.util import json_decoder
|
||||||
from synapse.util.async_helpers import timeout_deferred
|
from synapse.util.async_helpers import AwakenableSleeper, timeout_deferred
|
||||||
from synapse.util.metrics import Measure
|
from synapse.util.metrics import Measure
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
@ -353,6 +353,13 @@ class MatrixFederationHttpClient:
|
||||||
|
|
||||||
self._cooperator = Cooperator(scheduler=schedule)
|
self._cooperator = Cooperator(scheduler=schedule)
|
||||||
|
|
||||||
|
self._sleeper = AwakenableSleeper(self.reactor)
|
||||||
|
|
||||||
|
def wake_destination(self, destination: str) -> None:
|
||||||
|
"""Called when the remote server may have come back online."""
|
||||||
|
|
||||||
|
self._sleeper.wake(destination)
|
||||||
|
|
||||||
async def _send_request_with_optional_trailing_slash(
|
async def _send_request_with_optional_trailing_slash(
|
||||||
self,
|
self,
|
||||||
request: MatrixFederationRequest,
|
request: MatrixFederationRequest,
|
||||||
|
@ -474,6 +481,8 @@ class MatrixFederationHttpClient:
|
||||||
self._store,
|
self._store,
|
||||||
backoff_on_404=backoff_on_404,
|
backoff_on_404=backoff_on_404,
|
||||||
ignore_backoff=ignore_backoff,
|
ignore_backoff=ignore_backoff,
|
||||||
|
notifier=self.hs.get_notifier(),
|
||||||
|
replication_client=self.hs.get_replication_command_handler(),
|
||||||
)
|
)
|
||||||
|
|
||||||
method_bytes = request.method.encode("ascii")
|
method_bytes = request.method.encode("ascii")
|
||||||
|
@ -664,7 +673,9 @@ class MatrixFederationHttpClient:
|
||||||
delay,
|
delay,
|
||||||
)
|
)
|
||||||
|
|
||||||
await self.clock.sleep(delay)
|
# Sleep for the calculated delay, or wake up immediately
|
||||||
|
# if we get notified that the server is back up.
|
||||||
|
await self._sleeper.sleep(request.destination, delay * 1000)
|
||||||
retries_left -= 1
|
retries_left -= 1
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
|
@ -228,9 +228,7 @@ class Notifier:
|
||||||
# Called when there are new things to stream over replication
|
# Called when there are new things to stream over replication
|
||||||
self.replication_callbacks: List[Callable[[], None]] = []
|
self.replication_callbacks: List[Callable[[], None]] = []
|
||||||
|
|
||||||
# Called when remote servers have come back online after having been
|
self._federation_client = hs.get_federation_http_client()
|
||||||
# down.
|
|
||||||
self.remote_server_up_callbacks: List[Callable[[str], None]] = []
|
|
||||||
|
|
||||||
self._third_party_rules = hs.get_third_party_event_rules()
|
self._third_party_rules = hs.get_third_party_event_rules()
|
||||||
|
|
||||||
|
@ -731,3 +729,7 @@ class Notifier:
|
||||||
# circular dependencies.
|
# circular dependencies.
|
||||||
if self.federation_sender:
|
if self.federation_sender:
|
||||||
self.federation_sender.wake_destination(server)
|
self.federation_sender.wake_destination(server)
|
||||||
|
|
||||||
|
# Tell the federation client about the fact the server is back up, so
|
||||||
|
# that any in flight requests can be immediately retried.
|
||||||
|
self._federation_client.wake_destination(server)
|
||||||
|
|
|
@ -778,3 +778,60 @@ def delay_cancellation(awaitable: Awaitable[T]) -> Awaitable[T]:
|
||||||
new_deferred: "defer.Deferred[T]" = defer.Deferred(handle_cancel)
|
new_deferred: "defer.Deferred[T]" = defer.Deferred(handle_cancel)
|
||||||
deferred.chainDeferred(new_deferred)
|
deferred.chainDeferred(new_deferred)
|
||||||
return new_deferred
|
return new_deferred
|
||||||
|
|
||||||
|
|
||||||
|
class AwakenableSleeper:
|
||||||
|
"""Allows explicitly waking up deferreds related to an entity that are
|
||||||
|
currently sleeping.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, reactor: IReactorTime) -> None:
|
||||||
|
self._streams: Dict[str, Set[defer.Deferred[None]]] = {}
|
||||||
|
self._reactor = reactor
|
||||||
|
|
||||||
|
def wake(self, name: str) -> None:
|
||||||
|
"""Wake everything related to `name` that is currently sleeping."""
|
||||||
|
stream_set = self._streams.pop(name, set())
|
||||||
|
for deferred in stream_set:
|
||||||
|
try:
|
||||||
|
with PreserveLoggingContext():
|
||||||
|
deferred.callback(None)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def sleep(self, name: str, delay_ms: int) -> None:
|
||||||
|
"""Sleep for the given number of milliseconds, or return if the given
|
||||||
|
`name` is explicitly woken up.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Create a deferred that gets called in N seconds
|
||||||
|
sleep_deferred: "defer.Deferred[None]" = defer.Deferred()
|
||||||
|
call = self._reactor.callLater(delay_ms / 1000, sleep_deferred.callback, None)
|
||||||
|
|
||||||
|
# Create a deferred that will get called if `wake` is called with
|
||||||
|
# the same `name`.
|
||||||
|
stream_set = self._streams.setdefault(name, set())
|
||||||
|
notify_deferred: "defer.Deferred[None]" = defer.Deferred()
|
||||||
|
stream_set.add(notify_deferred)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Wait for either the delay or for `wake` to be called.
|
||||||
|
await make_deferred_yieldable(
|
||||||
|
defer.DeferredList(
|
||||||
|
[sleep_deferred, notify_deferred],
|
||||||
|
fireOnOneCallback=True,
|
||||||
|
fireOnOneErrback=True,
|
||||||
|
consumeErrors=True,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
# Clean up the state
|
||||||
|
curr_stream_set = self._streams.get(name)
|
||||||
|
if curr_stream_set is not None:
|
||||||
|
curr_stream_set.discard(notify_deferred)
|
||||||
|
if len(curr_stream_set) == 0:
|
||||||
|
self._streams.pop(name)
|
||||||
|
|
||||||
|
# Cancel the sleep if we were woken up
|
||||||
|
if call.active():
|
||||||
|
call.cancel()
|
||||||
|
|
|
@ -14,13 +14,17 @@
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
from types import TracebackType
|
from types import TracebackType
|
||||||
from typing import Any, Optional, Type
|
from typing import TYPE_CHECKING, Any, Optional, Type
|
||||||
|
|
||||||
import synapse.logging.context
|
import synapse.logging.context
|
||||||
from synapse.api.errors import CodeMessageException
|
from synapse.api.errors import CodeMessageException
|
||||||
from synapse.storage import DataStore
|
from synapse.storage import DataStore
|
||||||
from synapse.util import Clock
|
from synapse.util import Clock
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from synapse.notifier import Notifier
|
||||||
|
from synapse.replication.tcp.handler import ReplicationCommandHandler
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# the initial backoff, after the first transaction fails
|
# the initial backoff, after the first transaction fails
|
||||||
|
@ -131,6 +135,8 @@ class RetryDestinationLimiter:
|
||||||
retry_interval: int,
|
retry_interval: int,
|
||||||
backoff_on_404: bool = False,
|
backoff_on_404: bool = False,
|
||||||
backoff_on_failure: bool = True,
|
backoff_on_failure: bool = True,
|
||||||
|
notifier: Optional["Notifier"] = None,
|
||||||
|
replication_client: Optional["ReplicationCommandHandler"] = None,
|
||||||
):
|
):
|
||||||
"""Marks the destination as "down" if an exception is thrown in the
|
"""Marks the destination as "down" if an exception is thrown in the
|
||||||
context, except for CodeMessageException with code < 500.
|
context, except for CodeMessageException with code < 500.
|
||||||
|
@ -160,6 +166,9 @@ class RetryDestinationLimiter:
|
||||||
self.backoff_on_404 = backoff_on_404
|
self.backoff_on_404 = backoff_on_404
|
||||||
self.backoff_on_failure = backoff_on_failure
|
self.backoff_on_failure = backoff_on_failure
|
||||||
|
|
||||||
|
self.notifier = notifier
|
||||||
|
self.replication_client = replication_client
|
||||||
|
|
||||||
def __enter__(self) -> None:
|
def __enter__(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -239,6 +248,19 @@ class RetryDestinationLimiter:
|
||||||
retry_last_ts,
|
retry_last_ts,
|
||||||
self.retry_interval,
|
self.retry_interval,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if self.notifier:
|
||||||
|
# Inform the relevant places that the remote server is back up.
|
||||||
|
self.notifier.notify_remote_server_up(self.destination)
|
||||||
|
|
||||||
|
if self.replication_client:
|
||||||
|
# If we're on a worker we try and inform master about this. The
|
||||||
|
# replication client doesn't hook into the notifier to avoid
|
||||||
|
# infinite loops where we send a `REMOTE_SERVER_UP` command to
|
||||||
|
# master, which then echoes it back to us which in turn pokes
|
||||||
|
# the notifier.
|
||||||
|
self.replication_client.send_remote_server_up(self.destination)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Failed to store destination_retry_timings")
|
logger.exception("Failed to store destination_retry_timings")
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,7 @@ from synapse.logging.context import (
|
||||||
make_deferred_yieldable,
|
make_deferred_yieldable,
|
||||||
)
|
)
|
||||||
from synapse.util.async_helpers import (
|
from synapse.util.async_helpers import (
|
||||||
|
AwakenableSleeper,
|
||||||
ObservableDeferred,
|
ObservableDeferred,
|
||||||
concurrently_execute,
|
concurrently_execute,
|
||||||
delay_cancellation,
|
delay_cancellation,
|
||||||
|
@ -35,6 +36,7 @@ from synapse.util.async_helpers import (
|
||||||
timeout_deferred,
|
timeout_deferred,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from tests.server import get_clock
|
||||||
from tests.unittest import TestCase
|
from tests.unittest import TestCase
|
||||||
|
|
||||||
|
|
||||||
|
@ -496,3 +498,81 @@ class DelayCancellationTests(TestCase):
|
||||||
# logging context.
|
# logging context.
|
||||||
blocking_d.callback(None)
|
blocking_d.callback(None)
|
||||||
self.successResultOf(d)
|
self.successResultOf(d)
|
||||||
|
|
||||||
|
|
||||||
|
class AwakenableSleeperTests(TestCase):
|
||||||
|
"Tests AwakenableSleeper"
|
||||||
|
|
||||||
|
def test_sleep(self):
|
||||||
|
reactor, _ = get_clock()
|
||||||
|
sleeper = AwakenableSleeper(reactor)
|
||||||
|
|
||||||
|
d = defer.ensureDeferred(sleeper.sleep("name", 1000))
|
||||||
|
|
||||||
|
reactor.pump([0.0])
|
||||||
|
self.assertFalse(d.called)
|
||||||
|
|
||||||
|
reactor.advance(0.5)
|
||||||
|
self.assertFalse(d.called)
|
||||||
|
|
||||||
|
reactor.advance(0.6)
|
||||||
|
self.assertTrue(d.called)
|
||||||
|
|
||||||
|
def test_explicit_wake(self):
|
||||||
|
reactor, _ = get_clock()
|
||||||
|
sleeper = AwakenableSleeper(reactor)
|
||||||
|
|
||||||
|
d = defer.ensureDeferred(sleeper.sleep("name", 1000))
|
||||||
|
|
||||||
|
reactor.pump([0.0])
|
||||||
|
self.assertFalse(d.called)
|
||||||
|
|
||||||
|
reactor.advance(0.5)
|
||||||
|
self.assertFalse(d.called)
|
||||||
|
|
||||||
|
sleeper.wake("name")
|
||||||
|
self.assertTrue(d.called)
|
||||||
|
|
||||||
|
reactor.advance(0.6)
|
||||||
|
|
||||||
|
def test_multiple_sleepers_timeout(self):
|
||||||
|
reactor, _ = get_clock()
|
||||||
|
sleeper = AwakenableSleeper(reactor)
|
||||||
|
|
||||||
|
d1 = defer.ensureDeferred(sleeper.sleep("name", 1000))
|
||||||
|
|
||||||
|
reactor.advance(0.6)
|
||||||
|
self.assertFalse(d1.called)
|
||||||
|
|
||||||
|
# Add another sleeper
|
||||||
|
d2 = defer.ensureDeferred(sleeper.sleep("name", 1000))
|
||||||
|
|
||||||
|
# Only the first sleep should time out now.
|
||||||
|
reactor.advance(0.6)
|
||||||
|
self.assertTrue(d1.called)
|
||||||
|
self.assertFalse(d2.called)
|
||||||
|
|
||||||
|
reactor.advance(0.6)
|
||||||
|
self.assertTrue(d2.called)
|
||||||
|
|
||||||
|
def test_multiple_sleepers_wake(self):
|
||||||
|
reactor, _ = get_clock()
|
||||||
|
sleeper = AwakenableSleeper(reactor)
|
||||||
|
|
||||||
|
d1 = defer.ensureDeferred(sleeper.sleep("name", 1000))
|
||||||
|
|
||||||
|
reactor.advance(0.5)
|
||||||
|
self.assertFalse(d1.called)
|
||||||
|
|
||||||
|
# Add another sleeper
|
||||||
|
d2 = defer.ensureDeferred(sleeper.sleep("name", 1000))
|
||||||
|
|
||||||
|
# Neither should fire yet
|
||||||
|
reactor.advance(0.3)
|
||||||
|
self.assertFalse(d1.called)
|
||||||
|
self.assertFalse(d2.called)
|
||||||
|
|
||||||
|
# Explicitly waking both up works
|
||||||
|
sleeper.wake("name")
|
||||||
|
self.assertTrue(d1.called)
|
||||||
|
self.assertTrue(d2.called)
|
||||||
|
|
Loading…
Reference in a new issue