mirror of
https://github.com/element-hq/synapse.git
synced 2024-11-22 09:35:45 +03:00
Fix sending out of order POSITION
over replication (#16639)
If a worker reconnects to Redis we send out the current positions of all our streams. However, if we're also trying to send out a backlog of RDATA at the same time then we can end up sending a `POSITION` with the current token *before* we've sent all the RDATA before the current token. This doesn't cause actual bugs as the receiving servers see the POSITION, fetch the relevant rows from the DB, and then ignore the old RDATA as they come in. However, this is inefficient so it'd be better if we didn't send out-of-order positions
This commit is contained in:
parent
898655fd12
commit
fef08cbee8
5 changed files with 45 additions and 21 deletions
1
changelog.d/16639.bugfix
Normal file
1
changelog.d/16639.bugfix
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Fix sending out of order `POSITION` over replication, causing additional database load.
|
|
@ -257,6 +257,11 @@ class ReplicationCommandHandler:
|
||||||
if hs.config.redis.redis_enabled:
|
if hs.config.redis.redis_enabled:
|
||||||
self._notifier.add_lock_released_callback(self.on_lock_released)
|
self._notifier.add_lock_released_callback(self.on_lock_released)
|
||||||
|
|
||||||
|
# Marks if we should send POSITION commands for all streams ASAP. This
|
||||||
|
# is checked by the `ReplicationStreamer` which manages sending
|
||||||
|
# RDATA/POSITION commands
|
||||||
|
self._should_announce_positions = True
|
||||||
|
|
||||||
def subscribe_to_channel(self, channel_name: str) -> None:
|
def subscribe_to_channel(self, channel_name: str) -> None:
|
||||||
"""
|
"""
|
||||||
Indicates that we wish to subscribe to a Redis channel by name.
|
Indicates that we wish to subscribe to a Redis channel by name.
|
||||||
|
@ -397,29 +402,23 @@ class ReplicationCommandHandler:
|
||||||
return self._streams_to_replicate
|
return self._streams_to_replicate
|
||||||
|
|
||||||
def on_REPLICATE(self, conn: IReplicationConnection, cmd: ReplicateCommand) -> None:
|
def on_REPLICATE(self, conn: IReplicationConnection, cmd: ReplicateCommand) -> None:
|
||||||
self.send_positions_to_connection(conn)
|
self.send_positions_to_connection()
|
||||||
|
|
||||||
def send_positions_to_connection(self, conn: IReplicationConnection) -> None:
|
def send_positions_to_connection(self) -> None:
|
||||||
"""Send current position of all streams this process is source of to
|
"""Send current position of all streams this process is source of to
|
||||||
the connection.
|
the connection.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# We respond with current position of all streams this instance
|
self._should_announce_positions = True
|
||||||
# replicates.
|
self._notifier.notify_replication()
|
||||||
for stream in self.get_streams_to_replicate():
|
|
||||||
# Note that we use the current token as the prev token here (rather
|
def should_announce_positions(self) -> bool:
|
||||||
# than stream.last_token), as we can't be sure that there have been
|
"""Check if we should send POSITION commands for all streams ASAP."""
|
||||||
# no rows written between last token and the current token (since we
|
return self._should_announce_positions
|
||||||
# might be racing with the replication sending bg process).
|
|
||||||
current_token = stream.current_token(self._instance_name)
|
def will_announce_positions(self) -> None:
|
||||||
self.send_command(
|
"""Mark that we're about to send POSITIONs out for all streams."""
|
||||||
PositionCommand(
|
self._should_announce_positions = False
|
||||||
stream.NAME,
|
|
||||||
self._instance_name,
|
|
||||||
current_token,
|
|
||||||
current_token,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def on_USER_SYNC(
|
def on_USER_SYNC(
|
||||||
self, conn: IReplicationConnection, cmd: UserSyncCommand
|
self, conn: IReplicationConnection, cmd: UserSyncCommand
|
||||||
|
@ -653,8 +652,9 @@ class ReplicationCommandHandler:
|
||||||
# for why this can happen.
|
# for why this can happen.
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"Fetching replication rows for '%s' between %i and %i",
|
"Fetching replication rows for '%s' / %s between %i and %i",
|
||||||
stream_name,
|
stream_name,
|
||||||
|
cmd.instance_name,
|
||||||
current_token,
|
current_token,
|
||||||
cmd.new_token,
|
cmd.new_token,
|
||||||
)
|
)
|
||||||
|
|
|
@ -141,7 +141,7 @@ class RedisSubscriber(SubscriberProtocol):
|
||||||
# We send out our positions when there is a new connection in case the
|
# We send out our positions when there is a new connection in case the
|
||||||
# other side missed updates. We do this for Redis connections as the
|
# other side missed updates. We do this for Redis connections as the
|
||||||
# otherside won't know we've connected and so won't issue a REPLICATE.
|
# otherside won't know we've connected and so won't issue a REPLICATE.
|
||||||
self.synapse_handler.send_positions_to_connection(self)
|
self.synapse_handler.send_positions_to_connection()
|
||||||
|
|
||||||
def messageReceived(self, pattern: str, channel: str, message: str) -> None:
|
def messageReceived(self, pattern: str, channel: str, message: str) -> None:
|
||||||
"""Received a message from redis."""
|
"""Received a message from redis."""
|
||||||
|
|
|
@ -123,7 +123,7 @@ class ReplicationStreamer:
|
||||||
|
|
||||||
# We check up front to see if anything has actually changed, as we get
|
# We check up front to see if anything has actually changed, as we get
|
||||||
# poked because of changes that happened on other instances.
|
# poked because of changes that happened on other instances.
|
||||||
if all(
|
if not self.command_handler.should_announce_positions() and all(
|
||||||
stream.last_token == stream.current_token(self._instance_name)
|
stream.last_token == stream.current_token(self._instance_name)
|
||||||
for stream in self.streams
|
for stream in self.streams
|
||||||
):
|
):
|
||||||
|
@ -158,6 +158,21 @@ class ReplicationStreamer:
|
||||||
all_streams = list(all_streams)
|
all_streams = list(all_streams)
|
||||||
random.shuffle(all_streams)
|
random.shuffle(all_streams)
|
||||||
|
|
||||||
|
if self.command_handler.should_announce_positions():
|
||||||
|
# We need to send out POSITIONs for all streams, usually
|
||||||
|
# because a worker has reconnected.
|
||||||
|
self.command_handler.will_announce_positions()
|
||||||
|
|
||||||
|
for stream in all_streams:
|
||||||
|
self.command_handler.send_command(
|
||||||
|
PositionCommand(
|
||||||
|
stream.NAME,
|
||||||
|
self._instance_name,
|
||||||
|
stream.last_token,
|
||||||
|
stream.last_token,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
for stream in all_streams:
|
for stream in all_streams:
|
||||||
if stream.last_token == stream.current_token(
|
if stream.last_token == stream.current_token(
|
||||||
self._instance_name
|
self._instance_name
|
||||||
|
|
|
@ -35,6 +35,10 @@ class TypingStreamTestCase(BaseStreamTestCase):
|
||||||
typing = self.hs.get_typing_handler()
|
typing = self.hs.get_typing_handler()
|
||||||
assert isinstance(typing, TypingWriterHandler)
|
assert isinstance(typing, TypingWriterHandler)
|
||||||
|
|
||||||
|
# Create a typing update before we reconnect so that there is a missing
|
||||||
|
# update to fetch.
|
||||||
|
typing._push_update(member=RoomMember(ROOM_ID, USER_ID), typing=True)
|
||||||
|
|
||||||
self.reconnect()
|
self.reconnect()
|
||||||
|
|
||||||
typing._push_update(member=RoomMember(ROOM_ID, USER_ID), typing=True)
|
typing._push_update(member=RoomMember(ROOM_ID, USER_ID), typing=True)
|
||||||
|
@ -91,6 +95,10 @@ class TypingStreamTestCase(BaseStreamTestCase):
|
||||||
typing = self.hs.get_typing_handler()
|
typing = self.hs.get_typing_handler()
|
||||||
assert isinstance(typing, TypingWriterHandler)
|
assert isinstance(typing, TypingWriterHandler)
|
||||||
|
|
||||||
|
# Create a typing update before we reconnect so that there is a missing
|
||||||
|
# update to fetch.
|
||||||
|
typing._push_update(member=RoomMember(ROOM_ID, USER_ID), typing=True)
|
||||||
|
|
||||||
self.reconnect()
|
self.reconnect()
|
||||||
|
|
||||||
typing._push_update(member=RoomMember(ROOM_ID, USER_ID), typing=True)
|
typing._push_update(member=RoomMember(ROOM_ID, USER_ID), typing=True)
|
||||||
|
|
Loading…
Reference in a new issue