mirror of
https://github.com/element-hq/synapse.git
synced 2024-12-19 17:56:19 +03:00
d0fc1e904a
The latest Twisted release changed how they implemented `__await__` on deferreds, which broke the machinery we used to test cancellation. This PR changes things a bit to instead patch the `__await__` method, which is a stable API. This mostly doesn't change the core logic, except for fixing two bugs: - We previously did not intercept all await points - After cancellation we now need to not only unblock currently blocked await points, but also make sure we don't block any future await points. c.f. https://github.com/twisted/twisted/pull/12226 --------- Co-authored-by: Devon Hudson <devon.dmytro@gmail.com>
627 lines
24 KiB
Python
627 lines
24 KiB
Python
#
|
|
# This file is licensed under the Affero General Public License (AGPL) version 3.
|
|
#
|
|
# Copyright 2022 The Matrix.org Foundation C.I.C.
|
|
# Copyright (C) 2023 New Vector, Ltd
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU Affero General Public License as
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
# License, or (at your option) any later version.
|
|
#
|
|
# See the GNU Affero General Public License for more details:
|
|
# <https://www.gnu.org/licenses/agpl-3.0.html>.
|
|
#
|
|
# Originally licensed under the Apache License, Version 2.0:
|
|
# <http://www.apache.org/licenses/LICENSE-2.0>.
|
|
#
|
|
# [This file includes modifications made by New Vector Limited]
|
|
#
|
|
#
|
|
|
|
import inspect
|
|
import itertools
|
|
import logging
|
|
from typing import (
|
|
Any,
|
|
Callable,
|
|
ContextManager,
|
|
Dict,
|
|
Generator,
|
|
List,
|
|
Optional,
|
|
Set,
|
|
Tuple,
|
|
TypeVar,
|
|
Union,
|
|
)
|
|
from unittest import mock
|
|
from unittest.mock import Mock
|
|
|
|
from twisted.internet.defer import Deferred
|
|
from twisted.internet.error import ConnectionDone
|
|
from twisted.python.failure import Failure
|
|
from twisted.test.proto_helpers import MemoryReactorClock
|
|
from twisted.web.server import Site
|
|
|
|
from synapse.http.server import (
|
|
HTTP_STATUS_REQUEST_CANCELLED,
|
|
respond_with_html_bytes,
|
|
respond_with_json,
|
|
)
|
|
from synapse.http.site import SynapseRequest
|
|
from synapse.logging.context import (
|
|
LoggingContext,
|
|
make_deferred_yieldable,
|
|
)
|
|
from synapse.types import JsonDict
|
|
|
|
from tests.server import FakeChannel, make_request
|
|
from tests.unittest import logcontext_clean
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
T = TypeVar("T")
|
|
|
|
|
|
def test_disconnect(
|
|
reactor: MemoryReactorClock,
|
|
channel: FakeChannel,
|
|
expect_cancellation: bool,
|
|
expected_body: Union[bytes, JsonDict],
|
|
expected_code: Optional[int] = None,
|
|
) -> None:
|
|
"""Disconnects an in-flight request and checks the response.
|
|
|
|
Args:
|
|
reactor: The twisted reactor running the request handler.
|
|
channel: The `FakeChannel` for the request.
|
|
expect_cancellation: `True` if request processing is expected to be cancelled,
|
|
`False` if the request should run to completion.
|
|
expected_body: The expected response for the request.
|
|
expected_code: The expected status code for the request. Defaults to `200` or
|
|
`499` depending on `expect_cancellation`.
|
|
"""
|
|
# Determine the expected status code.
|
|
if expected_code is None:
|
|
if expect_cancellation:
|
|
expected_code = HTTP_STATUS_REQUEST_CANCELLED
|
|
else:
|
|
expected_code = 200
|
|
|
|
request = channel.request
|
|
if channel.is_finished():
|
|
raise AssertionError(
|
|
"Request finished before we could disconnect - "
|
|
"ensure `await_result=False` is passed to `make_request`.",
|
|
)
|
|
|
|
# We're about to disconnect the request. This also disconnects the channel, so we
|
|
# have to rely on mocks to extract the response.
|
|
respond_method: Callable[..., Any]
|
|
if isinstance(expected_body, bytes):
|
|
respond_method = respond_with_html_bytes
|
|
else:
|
|
respond_method = respond_with_json
|
|
|
|
with mock.patch(
|
|
f"synapse.http.server.{respond_method.__name__}", wraps=respond_method
|
|
) as respond_mock:
|
|
# Disconnect the request.
|
|
request.connectionLost(reason=ConnectionDone())
|
|
|
|
if expect_cancellation:
|
|
# An immediate cancellation is expected.
|
|
respond_mock.assert_called_once()
|
|
else:
|
|
respond_mock.assert_not_called()
|
|
|
|
# The handler is expected to run to completion.
|
|
reactor.advance(1.0)
|
|
respond_mock.assert_called_once()
|
|
|
|
args, _kwargs = respond_mock.call_args
|
|
code, body = args[1], args[2]
|
|
|
|
if code != expected_code:
|
|
raise AssertionError(
|
|
f"{code} != {expected_code} : "
|
|
"Request did not finish with the expected status code."
|
|
)
|
|
|
|
if request.code != expected_code:
|
|
raise AssertionError(
|
|
f"{request.code} != {expected_code} : "
|
|
"Request did not finish with the expected status code."
|
|
)
|
|
|
|
if body != expected_body:
|
|
raise AssertionError(
|
|
f"{body!r} != {expected_body!r} : "
|
|
"Request did not finish with the expected status code."
|
|
)
|
|
|
|
|
|
@logcontext_clean
|
|
def make_request_with_cancellation_test(
|
|
test_name: str,
|
|
reactor: MemoryReactorClock,
|
|
site: Site,
|
|
method: str,
|
|
path: str,
|
|
content: Union[bytes, str, JsonDict] = b"",
|
|
*,
|
|
token: Optional[str] = None,
|
|
) -> FakeChannel:
|
|
"""Performs a request repeatedly, disconnecting at successive `await`s, until
|
|
one completes.
|
|
|
|
Fails if:
|
|
* A logging context is lost during cancellation.
|
|
* A logging context get restarted after it is marked as finished, eg. if
|
|
a request's logging context is used by some processing started by the
|
|
request, but the request neglects to cancel that processing or wait for it
|
|
to complete.
|
|
|
|
Note that "Re-starting finished log context" errors get raised within the
|
|
request handling code and may or may not get caught. These errors will
|
|
likely manifest as a different logging context error at a later point. When
|
|
debugging logging context failures, setting a breakpoint in
|
|
`logcontext_error` can prove useful.
|
|
* A request gets stuck, possibly due to a previous cancellation.
|
|
* The request does not return a 499 when the client disconnects.
|
|
This implies that a `CancelledError` was swallowed somewhere.
|
|
|
|
It is up to the caller to verify that the request returns the correct data when
|
|
it finally runs to completion.
|
|
|
|
Note that this function can only cover a single code path and does not guarantee
|
|
that an endpoint is compatible with cancellation on every code path.
|
|
To allow inspection of the code path that is being tested, this function will
|
|
log the stack trace at every `await` that gets cancelled. To view these log
|
|
lines, `trial` can be run with the `SYNAPSE_TEST_LOG_LEVEL=INFO` environment
|
|
variable, which will include the log lines in `_trial_temp/test.log`.
|
|
Alternatively, `_log_for_request` can be modified to write to `sys.stdout`.
|
|
|
|
Args:
|
|
test_name: The name of the test, which will be logged.
|
|
reactor: The twisted reactor running the request handler.
|
|
site: The twisted `Site` to use to render the request.
|
|
method: The HTTP request method ("verb").
|
|
path: The HTTP path, suitably URL encoded (e.g. escaped UTF-8 & spaces and
|
|
such).
|
|
content: The body of the request.
|
|
|
|
Returns:
|
|
The `FakeChannel` object which stores the result of the final request that
|
|
runs to completion.
|
|
"""
|
|
# To process a request, a coroutine run is created for the async method handling
|
|
# the request. That method may then start other coroutine runs, wrapped in
|
|
# `Deferred`s.
|
|
#
|
|
# We would like to trigger a cancellation at the first `await`, re-run the
|
|
# request and cancel at the second `await`, and so on. By patching
|
|
# `Deferred.__await__`, we can intercept `await`s, track which ones we have or
|
|
# have not seen, and force them to block when they wouldn't have.
|
|
|
|
# The set of previously seen `await`s.
|
|
# Each element is a stringified stack trace.
|
|
seen_awaits: Set[Tuple[str, ...]] = set()
|
|
|
|
_log_for_request(
|
|
0, f"Running make_request_with_cancellation_test for {test_name}..."
|
|
)
|
|
|
|
for request_number in itertools.count(1):
|
|
deferred_patch = Deferred__await__Patch(seen_awaits, request_number)
|
|
|
|
try:
|
|
with mock.patch(
|
|
"synapse.http.server.respond_with_json", wraps=respond_with_json
|
|
) as respond_mock:
|
|
with deferred_patch.patch():
|
|
# Start the request.
|
|
channel = make_request(
|
|
reactor,
|
|
site,
|
|
method,
|
|
path,
|
|
content,
|
|
await_result=False,
|
|
access_token=token,
|
|
)
|
|
request = channel.request
|
|
|
|
# Run the request until we see a new `await` which we have not
|
|
# yet cancelled at, or it completes.
|
|
while not respond_mock.called and not deferred_patch.new_await_seen:
|
|
previous_awaits_seen = deferred_patch.awaits_seen
|
|
|
|
reactor.advance(0.0)
|
|
|
|
if deferred_patch.awaits_seen == previous_awaits_seen:
|
|
# We didn't see any progress. Try advancing the clock.
|
|
reactor.advance(1.0)
|
|
|
|
if deferred_patch.awaits_seen == previous_awaits_seen:
|
|
# We still didn't see any progress. The request might be
|
|
# stuck.
|
|
raise AssertionError(
|
|
"Request appears to be stuck, possibly due to a "
|
|
"previous cancelled request"
|
|
)
|
|
|
|
if respond_mock.called:
|
|
_log_for_request(request_number, "--- response finished ---")
|
|
|
|
# The request ran to completion and we are done with testing it.
|
|
|
|
# `respond_with_json` writes the response asynchronously, so we
|
|
# might have to give the reactor a kick before the channel gets
|
|
# the response.
|
|
deferred_patch.unblock_awaits()
|
|
channel.await_result()
|
|
|
|
return channel
|
|
|
|
# Disconnect the client and wait for the response.
|
|
request.connectionLost(reason=ConnectionDone())
|
|
|
|
_log_for_request(request_number, "--- disconnected ---")
|
|
|
|
# Advance the reactor just enough to get a response.
|
|
# We don't want to advance the reactor too far, because we can only
|
|
# detect re-starts of finished logging contexts after we set the
|
|
# finished flag below.
|
|
for _ in range(2):
|
|
# We may need to pump the reactor to allow `delay_cancellation`s to
|
|
# finish.
|
|
if not respond_mock.called:
|
|
reactor.advance(0.0)
|
|
|
|
# Try advancing the clock if that didn't work.
|
|
if not respond_mock.called:
|
|
reactor.advance(1.0)
|
|
|
|
# `delay_cancellation`s may be waiting for processing that we've
|
|
# forced to block. Try unblocking them, followed by another round of
|
|
# pumping the reactor.
|
|
if not respond_mock.called:
|
|
deferred_patch.unblock_awaits()
|
|
|
|
# Mark the request's logging context as finished. If it gets
|
|
# activated again, an `AssertionError` will be raised and bubble up
|
|
# through request handling code. This `AssertionError` may or may not be
|
|
# caught. Eventually some other code will deactivate the logging
|
|
# context which will raise a different `AssertionError` because
|
|
# resource usage won't have been correctly tracked.
|
|
if isinstance(request, SynapseRequest) and request.logcontext:
|
|
request.logcontext.finished = True
|
|
|
|
# Check that the request finished with a 499,
|
|
# ie. the `CancelledError` wasn't swallowed.
|
|
respond_mock.assert_called_once()
|
|
|
|
if request.code != HTTP_STATUS_REQUEST_CANCELLED:
|
|
raise AssertionError(
|
|
f"{request.code} != {HTTP_STATUS_REQUEST_CANCELLED} : "
|
|
"Cancelled request did not finish with the correct status code."
|
|
)
|
|
finally:
|
|
# Unblock any processing that might be shared between requests, if we
|
|
# haven't already done so.
|
|
deferred_patch.unblock_awaits()
|
|
|
|
assert False, "unreachable" # noqa: B011
|
|
|
|
|
|
class Deferred__await__Patch:
|
|
"""A `Deferred.__await__` patch that will intercept `await`s and force them
|
|
to block once it sees a new `await`.
|
|
|
|
When done with the patch, `unblock_awaits()` must be called to clean up after any
|
|
`await`s that were forced to block, otherwise processing shared between multiple
|
|
requests, such as database queries started by `@cached`, will become permanently
|
|
stuck.
|
|
|
|
Usage:
|
|
seen_awaits = set()
|
|
deferred_patch = Deferred__await__Patch(seen_awaits, 1)
|
|
try:
|
|
with deferred_patch.patch():
|
|
# do things
|
|
...
|
|
finally:
|
|
deferred_patch.unblock_awaits()
|
|
"""
|
|
|
|
def __init__(self, seen_awaits: Set[Tuple[str, ...]], request_number: int):
|
|
"""
|
|
Args:
|
|
seen_awaits: The set of stack traces of `await`s that have been previously
|
|
seen. When the `Deferred.__await__` patch sees a new `await`, it will add
|
|
it to the set.
|
|
request_number: The request number to log against.
|
|
"""
|
|
self._request_number = request_number
|
|
self._seen_awaits = seen_awaits
|
|
|
|
self._original_Deferred__await__ = Deferred.__await__ # type: ignore[misc,unused-ignore]
|
|
|
|
# The number of `await`s on `Deferred`s we have seen so far.
|
|
self.awaits_seen = 0
|
|
|
|
# Whether we have seen a new `await` not in `seen_awaits`.
|
|
self.new_await_seen = False
|
|
|
|
# Whether to block new await points we see. This gets set to False once
|
|
# we have cancelled the request to allow things to run after
|
|
# cancellation.
|
|
self._block_new_awaits = True
|
|
|
|
# To force `await`s on resolved `Deferred`s to block, we make up a new
|
|
# unresolved `Deferred` and return it out of `Deferred.__await__` /
|
|
# `coroutine.send()`. We have to resolve it later, in case the `await`ing
|
|
# coroutine is part of some shared processing, such as `@cached`.
|
|
self._to_unblock: Dict[Deferred, Union[object, Failure]] = {}
|
|
|
|
# The last stack we logged.
|
|
self._previous_stack: List[inspect.FrameInfo] = []
|
|
|
|
def patch(self) -> ContextManager[Mock]:
|
|
"""Returns a context manager which patches `Deferred.__await__`."""
|
|
|
|
def Deferred___await__(
|
|
deferred: "Deferred[T]",
|
|
) -> Generator["Deferred[T]", None, T]:
|
|
"""Intercepts calls to `__await__`, which returns a generator
|
|
yielding deferreds that we await on.
|
|
|
|
The generator for `__await__` will normally:
|
|
* return `self` if the `Deferred` is unresolved, in which case
|
|
`coroutine.send()` will return the `Deferred`, and
|
|
`_defer.inlineCallbacks` will stop running the coroutine until the
|
|
`Deferred` is resolved.
|
|
* raise a `StopIteration(result)`, containing the result of the `await`.
|
|
* raise another exception, which will come out of the `await`.
|
|
"""
|
|
|
|
# Get the original generator.
|
|
gen = self._original_Deferred__await__(deferred)
|
|
|
|
# Run the generator, handling each iteration to see if we need to
|
|
# block.
|
|
try:
|
|
while True:
|
|
# We've hit a new await point (or the deferred has
|
|
# completed), handle it.
|
|
handle_next_iteration(deferred)
|
|
|
|
# Continue on.
|
|
yield gen.send(None)
|
|
except StopIteration as e:
|
|
# We need to convert `StopIteration` into a normal return.
|
|
return e.value
|
|
|
|
def handle_next_iteration(
|
|
deferred: "Deferred[T]",
|
|
) -> None:
|
|
"""Intercepts `await`s on `Deferred`s and rigs them to block once we have
|
|
seen enough of them.
|
|
|
|
Args:
|
|
deferred: The deferred that we've captured and are intercepting
|
|
`await` calls within.
|
|
"""
|
|
if not self._block_new_awaits:
|
|
# We're no longer blocking awaits points
|
|
return
|
|
|
|
self.awaits_seen += 1
|
|
|
|
stack = _get_stack(
|
|
skip_frames=2 # Ignore this function and `Deferred___await__` in stack trace
|
|
)
|
|
stack_hash = _hash_stack(stack)
|
|
|
|
if stack_hash not in self._seen_awaits:
|
|
# Block at the current `await` onwards.
|
|
self._seen_awaits.add(stack_hash)
|
|
self.new_await_seen = True
|
|
|
|
if not self.new_await_seen:
|
|
# This `await` isn't interesting. Let it proceed normally.
|
|
|
|
_log_await_stack(
|
|
stack,
|
|
self._previous_stack,
|
|
self._request_number,
|
|
"already seen",
|
|
)
|
|
|
|
# Don't log the stack. It's been seen before in a previous run.
|
|
self._previous_stack = stack
|
|
|
|
return
|
|
|
|
# We want to block at the current `await`.
|
|
if deferred.called and not deferred.paused:
|
|
# This `Deferred` already has a result. We chain a new,
|
|
# unresolved, `Deferred` to the end of this Deferred that it
|
|
# will wait on. This blocks the coroutine that did this `await`.
|
|
# We queue it up for unblocking later.
|
|
new_deferred: "Deferred[T]" = Deferred()
|
|
self._to_unblock[new_deferred] = deferred.result
|
|
|
|
deferred.addBoth(lambda _: make_deferred_yieldable(new_deferred))
|
|
|
|
_log_await_stack(
|
|
stack,
|
|
self._previous_stack,
|
|
self._request_number,
|
|
"force-blocked await",
|
|
)
|
|
self._previous_stack = stack
|
|
|
|
# Continue iterating on the deferred now that we've blocked it
|
|
# again.
|
|
return
|
|
|
|
# This `Deferred` does not have a result yet.
|
|
# The `await` will block normally, so we don't have to do anything.
|
|
_log_await_stack(
|
|
stack,
|
|
self._previous_stack,
|
|
self._request_number,
|
|
"blocking await",
|
|
)
|
|
self._previous_stack = stack
|
|
|
|
return
|
|
|
|
return mock.patch.object(Deferred, "__await__", new=Deferred___await__)
|
|
|
|
def unblock_awaits(self) -> None:
|
|
"""Unblocks any shared processing that we forced to block.
|
|
|
|
Must be called when done, otherwise processing shared between multiple requests,
|
|
such as database queries started by `@cached`, will become permanently stuck.
|
|
"""
|
|
# Also disable blocking at future await points
|
|
self._block_new_awaits = False
|
|
|
|
to_unblock = self._to_unblock
|
|
self._to_unblock = {}
|
|
for deferred, result in to_unblock.items():
|
|
deferred.callback(result)
|
|
|
|
|
|
def _log_for_request(request_number: int, message: str) -> None:
|
|
"""Logs a message for an iteration of `make_request_with_cancellation_test`."""
|
|
# We want consistent alignment when logging stack traces, so ensure the logging
|
|
# context has a fixed width name.
|
|
with LoggingContext(name=f"request-{request_number:<2}"):
|
|
logger.info(message)
|
|
|
|
|
|
def _log_await_stack(
|
|
stack: List[inspect.FrameInfo],
|
|
previous_stack: List[inspect.FrameInfo],
|
|
request_number: int,
|
|
note: str,
|
|
) -> None:
|
|
"""Logs the stack for an `await` in `make_request_with_cancellation_test`.
|
|
|
|
Only logs the part of the stack that has changed since the previous call.
|
|
|
|
Example output looks like:
|
|
```
|
|
delay_cancellation:750 (synapse/util/async_helpers.py:750)
|
|
DatabasePool._runInteraction:768 (synapse/storage/database.py:768)
|
|
> *blocked on await* at DatabasePool.runWithConnection:891 (synapse/storage/database.py:891)
|
|
```
|
|
|
|
Args:
|
|
stack: The stack to log, as returned by `_get_stack()`.
|
|
previous_stack: The previous stack logged, with callers appearing before
|
|
callees.
|
|
request_number: The request number to log against.
|
|
note: A note to attach to the last stack frame, eg. "blocked on await".
|
|
"""
|
|
for i, frame_info in enumerate(stack[:-1]):
|
|
# Skip any frames in common with the previous logging.
|
|
if i < len(previous_stack) and frame_info == previous_stack[i]:
|
|
continue
|
|
|
|
frame = _format_stack_frame(frame_info)
|
|
message = f"{' ' * i}{frame}"
|
|
_log_for_request(request_number, message)
|
|
|
|
# Always print the final frame with the `await`.
|
|
# If the frame with the `await` started another coroutine run, we may have already
|
|
# printed a deeper stack which includes our final frame. We want to log where all
|
|
# `await`s happen, so we reprint the frame in this case.
|
|
i = len(stack) - 1
|
|
frame_info = stack[i]
|
|
frame = _format_stack_frame(frame_info)
|
|
message = f"{' ' * i}> *{note}* at {frame}"
|
|
_log_for_request(request_number, message)
|
|
|
|
|
|
def _format_stack_frame(frame_info: inspect.FrameInfo) -> str:
|
|
"""Returns a string representation of a stack frame.
|
|
|
|
Used for debug logging.
|
|
|
|
Returns:
|
|
A string, formatted like
|
|
"JsonResource._async_render:559 (synapse/http/server.py:559)".
|
|
"""
|
|
method_name = _get_stack_frame_method_name(frame_info)
|
|
|
|
return (
|
|
f"{method_name}:{frame_info.lineno} ({frame_info.filename}:{frame_info.lineno})"
|
|
)
|
|
|
|
|
|
def _get_stack(skip_frames: int) -> List[inspect.FrameInfo]:
|
|
"""Captures the stack for a request.
|
|
|
|
Skips any twisted frames and stops at `JsonResource.wrapped_async_request_handler`.
|
|
|
|
Used for debug logging.
|
|
|
|
Returns:
|
|
A list of `inspect.FrameInfo`s, with callers appearing before callees.
|
|
"""
|
|
stack = []
|
|
|
|
skip_frames += 1 # Also skip `get_stack` itself.
|
|
|
|
for frame_info in inspect.stack()[skip_frames:]:
|
|
# Skip any twisted `inlineCallbacks` gunk.
|
|
if "/twisted/" in frame_info.filename:
|
|
continue
|
|
|
|
# Exclude the reactor frame, upwards.
|
|
method_name = _get_stack_frame_method_name(frame_info)
|
|
if method_name == "ThreadedMemoryReactorClock.advance":
|
|
break
|
|
|
|
stack.append(frame_info)
|
|
|
|
# Stop at `JsonResource`'s `wrapped_async_request_handler`, which is the entry
|
|
# point for request handling.
|
|
if frame_info.function == "wrapped_async_request_handler":
|
|
break
|
|
|
|
return stack[::-1]
|
|
|
|
|
|
def _get_stack_frame_method_name(frame_info: inspect.FrameInfo) -> str:
|
|
"""Returns the name of a stack frame's method.
|
|
|
|
eg. "JsonResource._async_render".
|
|
"""
|
|
method_name = frame_info.function
|
|
|
|
# Prefix the class name for instance methods.
|
|
frame_self = frame_info.frame.f_locals.get("self")
|
|
if frame_self:
|
|
method = getattr(frame_self, method_name, None)
|
|
if method:
|
|
method_name = method.__qualname__
|
|
else:
|
|
# We couldn't find the method on `self`.
|
|
# Make something up. It's useful to know which class "contains" a
|
|
# function anyway.
|
|
method_name = f"{type(frame_self).__name__} {method_name}"
|
|
|
|
return method_name
|
|
|
|
|
|
def _hash_stack(stack: List[inspect.FrameInfo]) -> Tuple[str, ...]:
|
|
"""Turns a stack into a hashable value that can be put into a set."""
|
|
return tuple(_format_stack_frame(frame) for frame in stack)
|