diff --git a/changelog.d/17499.bugfix b/changelog.d/17499.bugfix
new file mode 100644
index 0000000000..5cb7b3c30e
--- /dev/null
+++ b/changelog.d/17499.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.110.0 which caused `/keys/query` to return incomplete results, leading to high network activity and CPU usage on Matrix clients.
diff --git a/changelog.d/17501.misc b/changelog.d/17501.misc
new file mode 100644
index 0000000000..ba96472acb
--- /dev/null
+++ b/changelog.d/17501.misc
@@ -0,0 +1 @@
+Add some opentracing tags and logging to the experimental sliding sync implementation.
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index 668cec513b..f78e66ad0a 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -291,13 +291,20 @@ class E2eKeysHandler:

         # Only try and fetch keys for destinations that are not marked as
         # down.
-        filtered_destinations = await filter_destinations_by_retry_limiter(
-            remote_queries_not_in_cache.keys(),
-            self.clock,
-            self.store,
-            # Let's give an arbitrary grace period for those hosts that are
-            # only recently down
-            retry_due_within_ms=60 * 1000,
+        unfiltered_destinations = remote_queries_not_in_cache.keys()
+        filtered_destinations = set(
+            await filter_destinations_by_retry_limiter(
+                unfiltered_destinations,
+                self.clock,
+                self.store,
+                # Let's give an arbitrary grace period for those hosts that are
+                # only recently down
+                retry_due_within_ms=60 * 1000,
+            )
+        )
+        failures.update(
+            (dest, _NOT_READY_FOR_RETRY_FAILURE)
+            for dest in (unfiltered_destinations - filtered_destinations)
         )

         await concurrently_execute(
@@ -1641,6 +1648,9 @@ def _check_device_signature(
         raise SynapseError(400, "Invalid signature", Codes.INVALID_SIGNATURE)


+_NOT_READY_FOR_RETRY_FAILURE = {"status": 503, "message": "Not ready for retry"}
+
+
 def _exception_to_failure(e: Exception) -> JsonDict:
     if isinstance(e, SynapseError):
         return {"status": e.code, "errcode": e.errcode, "message": str(e)}
@@ -1649,7 +1659,7 @@ def _exception_to_failure(e: Exception) -> JsonDict:
         return {"status": e.code, "message": str(e)}

     if isinstance(e, NotRetryingDestination):
-        return {"status": 503, "message": "Not ready for retry"}
+        return _NOT_READY_FOR_RETRY_FAILURE

     # include ConnectionRefused and other errors
     #
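The `synapse/handlers/e2e_keys.py` change above makes `/keys/query` report remote servers that the retry limiter skips under `failures`, instead of silently omitting them from the response. A minimal standalone sketch of that partitioning logic follows; it is not the Synapse code itself, and `is_ready_for_retry` is a hypothetical stand-in for `filter_destinations_by_retry_limiter`, which in Synapse is async and consults stored backoff state.

from typing import Callable, Dict, Iterable, Set, Tuple

# Mirrors _NOT_READY_FOR_RETRY_FAILURE in the diff above.
NOT_READY_FOR_RETRY_FAILURE = {"status": 503, "message": "Not ready for retry"}


def partition_destinations(
    destinations: Iterable[str],
    is_ready_for_retry: Callable[[str], bool],
) -> Tuple[Set[str], Dict[str, dict]]:
    """Split destinations into ones to query now and per-destination failures.

    Destinations that are still backing off get an explicit failure entry
    rather than vanishing from the response.
    """
    unfiltered = set(destinations)
    filtered = {dest for dest in unfiltered if is_ready_for_retry(dest)}
    failures = {dest: NOT_READY_FOR_RETRY_FAILURE for dest in unfiltered - filtered}
    return filtered, failures


if __name__ == "__main__":
    to_query, failures = partition_destinations(
        ["matrix.org", "down.example.com"],
        is_ready_for_retry=lambda dest: dest != "down.example.com",
    )
    assert to_query == {"matrix.org"}
    assert failures == {"down.example.com": NOT_READY_FOR_RETRY_FAILURE}
    print(to_query, failures)

Surfacing the backoff as an explicit per-destination 503 lets clients tell "temporarily unreachable" apart from an empty result, which is the incomplete-results problem described in the changelog entry above.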
start_active_span("sliding_sync.generate_room_entries"): + await concurrently_execute(handle_room, relevant_room_map, 10) extensions = await self.get_extensions_response( sync_config=sync_config, @@ -1161,6 +1163,7 @@ class SlidingSyncHandler: # return None + @trace async def filter_rooms( self, user: UserID, @@ -1284,6 +1287,7 @@ class SlidingSyncHandler: # Assemble a new sync room map but only with the `filtered_room_id_set` return {room_id: sync_room_map[room_id] for room_id in filtered_room_id_set} + @trace async def sort_rooms( self, sync_room_map: Dict[str, _RoomMembershipForUser], @@ -1492,6 +1496,10 @@ class SlidingSyncHandler: else: assert_never(room_status.status) + log_kv({"sliding_sync.room_status": room_status}) + + log_kv({"sliding_sync.from_bound": from_bound, "sliding_sync.initial": initial}) + # Assemble the list of timeline events # # FIXME: It would be nice to make the `rooms` response more uniform regardless of @@ -1890,6 +1898,7 @@ class SlidingSyncHandler: highlight_count=0, ) + @trace async def get_extensions_response( self, sync_config: SlidingSyncConfig, @@ -1942,6 +1951,7 @@ class SlidingSyncHandler: account_data=account_data_response, ) + @trace async def get_to_device_extension_response( self, sync_config: SlidingSyncConfig, @@ -2016,6 +2026,7 @@ class SlidingSyncHandler: events=messages, ) + @trace async def get_e2ee_extension_response( self, sync_config: SlidingSyncConfig, @@ -2066,6 +2077,7 @@ class SlidingSyncHandler: device_unused_fallback_key_types=device_unused_fallback_key_types, ) + @trace async def get_account_data_extension_response( self, sync_config: SlidingSyncConfig, diff --git a/synapse/rest/client/sync.py b/synapse/rest/client/sync.py index bf3ac8d483..ccfce6bd53 100644 --- a/synapse/rest/client/sync.py +++ b/synapse/rest/client/sync.py @@ -52,7 +52,7 @@ from synapse.http.servlet import ( parse_string, ) from synapse.http.site import SynapseRequest -from synapse.logging.opentracing import trace_with_opname +from synapse.logging.opentracing import log_kv, set_tag, trace_with_opname from synapse.rest.admin.experimental_features import ExperimentalFeature from synapse.types import JsonDict, Requester, SlidingSyncStreamToken, StreamToken from synapse.types.rest.client import SlidingSyncBody @@ -897,7 +897,23 @@ class SlidingSyncRestServlet(RestServlet): # by filter ID. For now, we will just prototype with always passing everything # in. body = parse_and_validate_json_object_from_request(request, SlidingSyncBody) - logger.info("Sliding sync request: %r", body) + + # Tag and log useful data to differentiate requests. 
diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py
index 0e6352ff4b..8a3dfdcf75 100644
--- a/tests/handlers/test_e2e_keys.py
+++ b/tests/handlers/test_e2e_keys.py
@@ -43,9 +43,7 @@ from tests.unittest import override_config
 class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
     def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
         self.appservice_api = mock.AsyncMock()
-        return self.setup_test_homeserver(
-            federation_client=mock.Mock(), application_service_api=self.appservice_api
-        )
+        return self.setup_test_homeserver(application_service_api=self.appservice_api)

     def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
         self.handler = hs.get_e2e_keys_handler()
@@ -1224,6 +1222,61 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
             },
         )

+    def test_query_devices_remote_down(self) -> None:
+        """Tests that querying keys for a remote user on an unreachable server returns
+        results in the "failures" property
+        """
+
+        remote_user_id = "@test:other"
+        local_user_id = "@test:test"
+
+        # The backoff code treats time zero as special
+        self.reactor.advance(5)
+
+        self.hs.get_federation_http_client().agent.request = mock.AsyncMock(  # type: ignore[method-assign]
+            side_effect=Exception("boop")
+        )
+
+        e2e_handler = self.hs.get_e2e_keys_handler()
+
+        query_result = self.get_success(
+            e2e_handler.query_devices(
+                {
+                    "device_keys": {remote_user_id: []},
+                },
+                timeout=10,
+                from_user_id=local_user_id,
+                from_device_id="some_device_id",
+            )
+        )
+
+        self.assertEqual(
+            query_result["failures"],
+            {
+                "other": {
+                    "message": "Failed to send request: Exception: boop",
+                    "status": 503,
+                }
+            },
+        )
+
+        # Do it again: we should hit the backoff
+        query_result = self.get_success(
+            e2e_handler.query_devices(
+                {
+                    "device_keys": {remote_user_id: []},
+                },
+                timeout=10,
+                from_user_id=local_user_id,
+                from_device_id="some_device_id",
+            )
+        )
+
+        self.assertEqual(
+            query_result["failures"],
+            {"other": {"message": "Not ready for retry", "status": 503}},
+        )
+
     @parameterized.expand(
         [
             # The remote homeserver's response indicates that this user has 0/1/2 devices.