Refresh remote profiles that have been marked as stale, in order to fill the user directory. [rei:userdirpriv] (#14756)

* Scaffolding for background process to refresh profiles

* Add scaffolding for background process to refresh profiles for a given server

* Implement the code to select servers to refresh from

* Ensure we don't build up multiple looping calls

* Make `get_profile` able to respect backoffs

* Add logic for refreshing users

* When backing off, schedule a refresh when the backoff is over

* Wake up the background processes when we receive an interesting state event

* Add tests

* Newsfile

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>

* Add comment about 1<<62

---------

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>
This commit is contained in:
reivilibre 2023-03-16 11:44:11 +00:00 committed by GitHub
parent 4953cd71df
commit 1f5473465d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 504 additions and 4 deletions

1
changelog.d/14756.bugfix Normal file
View file

@ -0,0 +1 @@
Fix a long-standing bug in which the user directory would assume any remote membership state events represent a profile change.

View file

@ -63,7 +63,7 @@ class ProfileHandler:
self._third_party_rules = hs.get_third_party_event_rules() self._third_party_rules = hs.get_third_party_event_rules()
async def get_profile(self, user_id: str) -> JsonDict: async def get_profile(self, user_id: str, ignore_backoff: bool = True) -> JsonDict:
target_user = UserID.from_string(user_id) target_user = UserID.from_string(user_id)
if self.hs.is_mine(target_user): if self.hs.is_mine(target_user):
@ -81,7 +81,7 @@ class ProfileHandler:
destination=target_user.domain, destination=target_user.domain,
query_type="profile", query_type="profile",
args={"user_id": user_id}, args={"user_id": user_id},
ignore_backoff=True, ignore_backoff=ignore_backoff,
) )
return result return result
except RequestSendFailed as e: except RequestSendFailed as e:

View file

@ -13,15 +13,22 @@
# limitations under the License. # limitations under the License.
import logging import logging
from http import HTTPStatus
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
from twisted.internet.interfaces import IDelayedCall
import synapse.metrics import synapse.metrics
from synapse.api.constants import EventTypes, HistoryVisibility, JoinRules, Membership from synapse.api.constants import EventTypes, HistoryVisibility, JoinRules, Membership
from synapse.api.errors import Codes, SynapseError
from synapse.handlers.state_deltas import MatchChange, StateDeltasHandler from synapse.handlers.state_deltas import MatchChange, StateDeltasHandler
from synapse.metrics.background_process_metrics import run_as_background_process from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage.databases.main.user_directory import SearchResult from synapse.storage.databases.main.user_directory import SearchResult
from synapse.storage.roommember import ProfileInfo from synapse.storage.roommember import ProfileInfo
from synapse.types import UserID
from synapse.util.metrics import Measure from synapse.util.metrics import Measure
from synapse.util.retryutils import NotRetryingDestination
from synapse.util.stringutils import non_null_str_or_none
if TYPE_CHECKING: if TYPE_CHECKING:
from synapse.server import HomeServer from synapse.server import HomeServer
@ -33,6 +40,25 @@ logger = logging.getLogger(__name__)
# then be coalesced such that only one /profile request is made). # then be coalesced such that only one /profile request is made).
USER_DIRECTORY_STALE_REFRESH_TIME_MS = 60 * 1000 USER_DIRECTORY_STALE_REFRESH_TIME_MS = 60 * 1000
# Maximum number of remote servers that we will attempt to refresh profiles for
# in one go.
MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO = 5
# As long as we have servers to refresh (without backoff), keep adding more
# every 15 seconds.
INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES = 15
def calculate_time_of_next_retry(now_ts: int, retry_count: int) -> int:
"""
Calculates the time of a next retry given `now_ts` in ms and the number
of failures encountered thus far.
Currently the sequence goes:
1 min, 5 min, 25 min, 2 hour, 10 hour, 52 hour, 10 day, 7.75 week
"""
return now_ts + 60_000 * (5 ** min(retry_count, 7))
class UserDirectoryHandler(StateDeltasHandler): class UserDirectoryHandler(StateDeltasHandler):
"""Handles queries and updates for the user_directory. """Handles queries and updates for the user_directory.
@ -69,12 +95,24 @@ class UserDirectoryHandler(StateDeltasHandler):
self.update_user_directory = hs.config.worker.should_update_user_directory self.update_user_directory = hs.config.worker.should_update_user_directory
self.search_all_users = hs.config.userdirectory.user_directory_search_all_users self.search_all_users = hs.config.userdirectory.user_directory_search_all_users
self.spam_checker = hs.get_spam_checker() self.spam_checker = hs.get_spam_checker()
self._hs = hs
# The current position in the current_state_delta stream # The current position in the current_state_delta stream
self.pos: Optional[int] = None self.pos: Optional[int] = None
# Guard to ensure we only process deltas one at a time # Guard to ensure we only process deltas one at a time
self._is_processing = False self._is_processing = False
# Guard to ensure we only have one process for refreshing remote profiles
self._is_refreshing_remote_profiles = False
# Handle to cancel the `call_later` of `kick_off_remote_profile_refresh_process`
self._refresh_remote_profiles_call_later: Optional[IDelayedCall] = None
# Guard to ensure we only have one process for refreshing remote profiles
# for the given servers.
# Set of server names.
self._is_refreshing_remote_profiles_for_servers: Set[str] = set()
if self.update_user_directory: if self.update_user_directory:
self.notifier.add_replication_callback(self.notify_new_event) self.notifier.add_replication_callback(self.notify_new_event)
@ -82,6 +120,11 @@ class UserDirectoryHandler(StateDeltasHandler):
# we start populating the user directory # we start populating the user directory
self.clock.call_later(0, self.notify_new_event) self.clock.call_later(0, self.notify_new_event)
# Kick off the profile refresh process on startup
self._refresh_remote_profiles_call_later = self.clock.call_later(
10, self.kick_off_remote_profile_refresh_process
)
async def search_users( async def search_users(
self, user_id: str, search_term: str, limit: int self, user_id: str, search_term: str, limit: int
) -> SearchResult: ) -> SearchResult:
@ -483,6 +526,20 @@ class UserDirectoryHandler(StateDeltasHandler):
next_try_at_ms=now_ts + USER_DIRECTORY_STALE_REFRESH_TIME_MS, next_try_at_ms=now_ts + USER_DIRECTORY_STALE_REFRESH_TIME_MS,
retry_counter=0, retry_counter=0,
) )
# Schedule a wake-up to refresh the user directory for this server.
# We intentionally wake up this server directly because we don't want
# other servers ahead of it in the queue to get in the way of updating
# the profile if the server only just sent us an event.
self.clock.call_later(
USER_DIRECTORY_STALE_REFRESH_TIME_MS // 1000 + 1,
self.kick_off_remote_profile_refresh_process_for_remote_server,
UserID.from_string(user_id).domain,
)
# Schedule a wake-up to handle any backoffs that may occur in the future.
self.clock.call_later(
2 * USER_DIRECTORY_STALE_REFRESH_TIME_MS // 1000 + 1,
self.kick_off_remote_profile_refresh_process,
)
return return
prev_name = prev_event.content.get("displayname") prev_name = prev_event.content.get("displayname")
@ -505,3 +562,188 @@ class UserDirectoryHandler(StateDeltasHandler):
# Only update if something has changed, or we didn't have a previous event # Only update if something has changed, or we didn't have a previous event
# in the first place. # in the first place.
await self.store.update_profile_in_user_dir(user_id, new_name, new_avatar) await self.store.update_profile_in_user_dir(user_id, new_name, new_avatar)
def kick_off_remote_profile_refresh_process(self) -> None:
"""Called when there may be remote users with stale profiles to be refreshed"""
if not self.update_user_directory:
return
if self._is_refreshing_remote_profiles:
return
if self._refresh_remote_profiles_call_later:
if self._refresh_remote_profiles_call_later.active():
self._refresh_remote_profiles_call_later.cancel()
self._refresh_remote_profiles_call_later = None
async def process() -> None:
try:
await self._unsafe_refresh_remote_profiles()
finally:
self._is_refreshing_remote_profiles = False
self._is_refreshing_remote_profiles = True
run_as_background_process("user_directory.refresh_remote_profiles", process)
async def _unsafe_refresh_remote_profiles(self) -> None:
limit = MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO - len(
self._is_refreshing_remote_profiles_for_servers
)
if limit <= 0:
# nothing to do: already refreshing the maximum number of servers
# at once.
# Come back later.
self._refresh_remote_profiles_call_later = self.clock.call_later(
INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES,
self.kick_off_remote_profile_refresh_process,
)
return
servers_to_refresh = (
await self.store.get_remote_servers_with_profiles_to_refresh(
now_ts=self.clock.time_msec(), limit=limit
)
)
if not servers_to_refresh:
# Do we have any backing-off servers that we should try again
# for eventually?
# By setting `now` is a point in the far future, we can ask for
# which server/user is next to be refreshed, even though it is
# not actually refreshable *now*.
end_of_time = 1 << 62
backing_off_servers = (
await self.store.get_remote_servers_with_profiles_to_refresh(
now_ts=end_of_time, limit=1
)
)
if backing_off_servers:
# Find out when the next user is refreshable and schedule a
# refresh then.
backing_off_server_name = backing_off_servers[0]
users = await self.store.get_remote_users_to_refresh_on_server(
backing_off_server_name, now_ts=end_of_time, limit=1
)
if not users:
return
_, _, next_try_at_ts = users[0]
self._refresh_remote_profiles_call_later = self.clock.call_later(
((next_try_at_ts - self.clock.time_msec()) // 1000) + 2,
self.kick_off_remote_profile_refresh_process,
)
return
for server_to_refresh in servers_to_refresh:
self.kick_off_remote_profile_refresh_process_for_remote_server(
server_to_refresh
)
self._refresh_remote_profiles_call_later = self.clock.call_later(
INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES,
self.kick_off_remote_profile_refresh_process,
)
def kick_off_remote_profile_refresh_process_for_remote_server(
self, server_name: str
) -> None:
"""Called when there may be remote users with stale profiles to be refreshed
on the given server."""
if not self.update_user_directory:
return
if server_name in self._is_refreshing_remote_profiles_for_servers:
return
async def process() -> None:
try:
await self._unsafe_refresh_remote_profiles_for_remote_server(
server_name
)
finally:
self._is_refreshing_remote_profiles_for_servers.remove(server_name)
self._is_refreshing_remote_profiles_for_servers.add(server_name)
run_as_background_process(
"user_directory.refresh_remote_profiles_for_remote_server", process
)
async def _unsafe_refresh_remote_profiles_for_remote_server(
self, server_name: str
) -> None:
logger.info("Refreshing profiles in user directory for %s", server_name)
while True:
# Get a handful of users to process.
next_batch = await self.store.get_remote_users_to_refresh_on_server(
server_name, now_ts=self.clock.time_msec(), limit=10
)
if not next_batch:
# Finished for now
return
for user_id, retry_counter, _ in next_batch:
# Request the profile of the user.
try:
profile = await self._hs.get_profile_handler().get_profile(
user_id, ignore_backoff=False
)
except NotRetryingDestination as e:
logger.info(
"Failed to refresh profile for %r because the destination is undergoing backoff",
user_id,
)
# As a special-case, we back off until the destination is no longer
# backed off from.
await self.store.set_remote_user_profile_in_user_dir_stale(
user_id,
e.retry_last_ts + e.retry_interval,
retry_counter=retry_counter + 1,
)
continue
except SynapseError as e:
if e.code == HTTPStatus.NOT_FOUND and e.errcode == Codes.NOT_FOUND:
# The profile doesn't exist.
# TODO Does this mean we should clear it from our user
# directory?
await self.store.clear_remote_user_profile_in_user_dir_stale(
user_id
)
logger.warning(
"Refresh of remote profile %r: not found (%r)",
user_id,
e.msg,
)
continue
logger.warning(
"Failed to refresh profile for %r because %r", user_id, e
)
await self.store.set_remote_user_profile_in_user_dir_stale(
user_id,
calculate_time_of_next_retry(
self.clock.time_msec(), retry_counter + 1
),
retry_counter=retry_counter + 1,
)
continue
except Exception:
logger.error(
"Failed to refresh profile for %r due to unhandled exception",
user_id,
exc_info=True,
)
await self.store.set_remote_user_profile_in_user_dir_stale(
user_id,
calculate_time_of_next_retry(
self.clock.time_msec(), retry_counter + 1
),
retry_counter=retry_counter + 1,
)
continue
await self.store.update_profile_in_user_dir(
user_id,
display_name=non_null_str_or_none(profile.get("displayname")),
avatar_url=non_null_str_or_none(profile.get("avatar_url")),
)

View file

@ -504,6 +504,80 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
desc="set_remote_user_profile_in_user_dir_stale", desc="set_remote_user_profile_in_user_dir_stale",
) )
async def clear_remote_user_profile_in_user_dir_stale(self, user_id: str) -> None:
"""
Marks a remote user as no longer having a possibly-stale user directory profile.
Args:
user_id: the remote user who no longer has a stale profile on this server.
"""
await self.db_pool.simple_delete(
table="user_directory_stale_remote_users",
keyvalues={"user_id": user_id},
desc="clear_remote_user_profile_in_user_dir_stale",
)
async def get_remote_servers_with_profiles_to_refresh(
self, now_ts: int, limit: int
) -> List[str]:
"""
Get a list of up to `limit` server names which have users whose
locally-cached profiles we believe to be stale
and are refreshable given the current time `now_ts` in milliseconds.
"""
def _get_remote_servers_with_refreshable_profiles_txn(
txn: LoggingTransaction,
) -> List[str]:
sql = """
SELECT user_server_name
FROM user_directory_stale_remote_users
WHERE next_try_at_ts < ?
GROUP BY user_server_name
ORDER BY MIN(next_try_at_ts), user_server_name
LIMIT ?
"""
txn.execute(sql, (now_ts, limit))
return [row[0] for row in txn]
return await self.db_pool.runInteraction(
"get_remote_servers_with_profiles_to_refresh",
_get_remote_servers_with_refreshable_profiles_txn,
)
async def get_remote_users_to_refresh_on_server(
self, server_name: str, now_ts: int, limit: int
) -> List[Tuple[str, int, int]]:
"""
Get a list of up to `limit` user IDs from the server `server_name`
whose locally-cached profiles we believe to be stale
and are refreshable given the current time `now_ts` in milliseconds.
Returns:
tuple of:
- User ID
- Retry counter (number of failures so far)
- Time the retry is scheduled for, in milliseconds
"""
def _get_remote_users_to_refresh_on_server_txn(
txn: LoggingTransaction,
) -> List[Tuple[str, int, int]]:
sql = """
SELECT user_id, retry_counter, next_try_at_ts
FROM user_directory_stale_remote_users
WHERE user_server_name = ? AND next_try_at_ts < ?
ORDER BY next_try_at_ts
LIMIT ?
"""
txn.execute(sql, (server_name, now_ts, limit))
return cast(List[Tuple[str, int, int]], txn.fetchall())
return await self.db_pool.runInteraction(
"get_remote_users_to_refresh_on_server",
_get_remote_users_to_refresh_on_server_txn,
)
async def update_profile_in_user_dir( async def update_profile_in_user_dir(
self, user_id: str, display_name: Optional[str], avatar_url: Optional[str] self, user_id: str, display_name: Optional[str], avatar_url: Optional[str]
) -> None: ) -> None:

View file

@ -19,17 +19,18 @@ from twisted.test.proto_helpers import MemoryReactor
import synapse.rest.admin import synapse.rest.admin
from synapse.api.constants import UserTypes from synapse.api.constants import UserTypes
from synapse.api.errors import SynapseError
from synapse.api.room_versions import RoomVersion, RoomVersions from synapse.api.room_versions import RoomVersion, RoomVersions
from synapse.appservice import ApplicationService from synapse.appservice import ApplicationService
from synapse.rest.client import login, register, room, user_directory from synapse.rest.client import login, register, room, user_directory
from synapse.server import HomeServer from synapse.server import HomeServer
from synapse.storage.roommember import ProfileInfo from synapse.storage.roommember import ProfileInfo
from synapse.types import UserProfile, create_requester from synapse.types import JsonDict, UserProfile, create_requester
from synapse.util import Clock from synapse.util import Clock
from tests import unittest from tests import unittest
from tests.storage.test_user_directory import GetUserDirectoryTables from tests.storage.test_user_directory import GetUserDirectoryTables
from tests.test_utils import make_awaitable from tests.test_utils import event_injection, make_awaitable
from tests.test_utils.event_injection import inject_member_event from tests.test_utils.event_injection import inject_member_event
from tests.unittest import override_config from tests.unittest import override_config
@ -1103,3 +1104,185 @@ class TestUserDirSearchDisabled(unittest.HomeserverTestCase):
) )
self.assertEqual(200, channel.code, channel.result) self.assertEqual(200, channel.code, channel.result)
self.assertTrue(len(channel.json_body["results"]) == 0) self.assertTrue(len(channel.json_body["results"]) == 0)
class UserDirectoryRemoteProfileTestCase(unittest.HomeserverTestCase):
servlets = [
login.register_servlets,
synapse.rest.admin.register_servlets,
register.register_servlets,
room.register_servlets,
]
def default_config(self) -> JsonDict:
config = super().default_config()
# Re-enables updating the user directory, as that functionality is needed below.
config["update_user_directory_from_worker"] = None
return config
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main
self.alice = self.register_user("alice", "alice123")
self.alice_tok = self.login("alice", "alice123")
self.user_dir_helper = GetUserDirectoryTables(self.store)
self.user_dir_handler = hs.get_user_directory_handler()
self.profile_handler = hs.get_profile_handler()
# Cancel the startup call: in the steady-state case we can't rely on it anyway.
assert self.user_dir_handler._refresh_remote_profiles_call_later is not None
self.user_dir_handler._refresh_remote_profiles_call_later.cancel()
def test_public_rooms_have_profiles_collected(self) -> None:
"""
In a public room, member state events are treated as reflecting the user's
real profile and they are accepted.
(The main motivation for accepting this is to prevent having to query
*every* single profile change over federation.)
"""
room_id = self.helper.create_room_as(
self.alice, is_public=True, tok=self.alice_tok
)
self.get_success(
event_injection.inject_member_event(
self.hs,
room_id,
"@bruce:remote",
"join",
"@bruce:remote",
extra_content={
"displayname": "Bruce!",
"avatar_url": "mxc://remote/123",
},
)
)
# Sending this event makes the streams move forward after the injection...
self.helper.send(room_id, "Test", tok=self.alice_tok)
self.pump(0.1)
profiles = self.get_success(
self.user_dir_helper.get_profiles_in_user_directory()
)
self.assertEqual(
profiles.get("@bruce:remote"),
ProfileInfo(display_name="Bruce!", avatar_url="mxc://remote/123"),
)
def test_private_rooms_do_not_have_profiles_collected(self) -> None:
"""
In a private room, member state events are not pulled out and used to populate
the user directory.
"""
room_id = self.helper.create_room_as(
self.alice, is_public=False, tok=self.alice_tok
)
self.get_success(
event_injection.inject_member_event(
self.hs,
room_id,
"@bruce:remote",
"join",
"@bruce:remote",
extra_content={
"displayname": "super-duper bruce",
"avatar_url": "mxc://remote/456",
},
)
)
# Sending this event makes the streams move forward after the injection...
self.helper.send(room_id, "Test", tok=self.alice_tok)
self.pump(0.1)
profiles = self.get_success(
self.user_dir_helper.get_profiles_in_user_directory()
)
self.assertNotIn("@bruce:remote", profiles)
def test_private_rooms_have_profiles_requested(self) -> None:
"""
When a name changes in a private room, the homeserver instead requests
the user's global profile over federation.
"""
async def get_remote_profile(
user_id: str, ignore_backoff: bool = True
) -> JsonDict:
if user_id == "@bruce:remote":
return {
"displayname": "Sir Bruce Bruceson",
"avatar_url": "mxc://remote/789",
}
else:
raise ValueError(f"unable to fetch {user_id}")
with patch.object(self.profile_handler, "get_profile", get_remote_profile):
# Continue from the earlier test...
self.test_private_rooms_do_not_have_profiles_collected()
# Advance by a minute
self.reactor.advance(61.0)
profiles = self.get_success(
self.user_dir_helper.get_profiles_in_user_directory()
)
self.assertEqual(
profiles.get("@bruce:remote"),
ProfileInfo(
display_name="Sir Bruce Bruceson", avatar_url="mxc://remote/789"
),
)
def test_profile_requests_are_retried(self) -> None:
"""
When we fail to fetch the user's profile over federation,
we try again later.
"""
has_failed_once = False
async def get_remote_profile(
user_id: str, ignore_backoff: bool = True
) -> JsonDict:
nonlocal has_failed_once
if user_id == "@bruce:remote":
if not has_failed_once:
has_failed_once = True
raise SynapseError(502, "temporary network problem")
return {
"displayname": "Sir Bruce Bruceson",
"avatar_url": "mxc://remote/789",
}
else:
raise ValueError(f"unable to fetch {user_id}")
with patch.object(self.profile_handler, "get_profile", get_remote_profile):
# Continue from the earlier test...
self.test_private_rooms_do_not_have_profiles_collected()
# Advance by a minute
self.reactor.advance(61.0)
# The request has already failed once
self.assertTrue(has_failed_once)
# The profile has yet to be updated.
profiles = self.get_success(
self.user_dir_helper.get_profiles_in_user_directory()
)
self.assertNotIn(
"@bruce:remote",
profiles,
)
# Advance by five minutes, after the backoff has finished
self.reactor.advance(301.0)
# The profile should have been updated now
profiles = self.get_success(
self.user_dir_helper.get_profiles_in_user_directory()
)
self.assertEqual(
profiles.get("@bruce:remote"),
ProfileInfo(
display_name="Sir Bruce Bruceson", avatar_url="mxc://remote/789"
),
)