Handle hyphens in user dir search porperly (#17254)

c.f. #16675
2024-11-22 01:25:44 +03:00 · 2024-06-05 10:40:34 +01:00 · 2024-06-05 10:40:34 +01:00 · aabf577166
commit aabf577166
parent 7d8f0ef351
4 changed files with 104 additions and 6 deletions
--- a/changelog.d/17254.bugfix
+++ b/changelog.d/17254.bugfix
@ -0,0 +1 @@
 Fix searching for users with their exact localpart whose ID includes a hyphen.
--- a/synapse/storage/databases/main/user_directory.py
+++ b/synapse/storage/databases/main/user_directory.py
@ -1281,7 +1281,7 @@ def _parse_words_with_regex(search_term: str) -> List[str]:
    Break down search term into words, when we don't have ICU available.
    See: `_parse_words`
    """
-    return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
+    return re.findall(r"([\w-]+)", search_term, re.UNICODE)
 def _parse_words_with_icu(search_term: str) -> List[str]:
@ -1303,15 +1303,69 @@ def _parse_words_with_icu(search_term: str) -> List[str]:
        if j < 0:
            break
-        result = search_term[i:j]
+        # We want to make sure that we split on `@` and `:` specifically, as
        # they occur in user IDs.
        for result in re.split(r"[@:]+", search_term[i:j]):
            results.append(result.strip())
        i = j
    # libicu will break up words that have punctuation in them, but to handle
    # cases where user IDs have '-', '.' and '_' in them we want to *not* break
    # those into words and instead allow the DB to tokenise them how it wants.
    #
    # In particular, user-71 in postgres gets tokenised to "user, -71", and this
    # will not match a query for "user, 71".
    new_results: List[str] = []
    i = 0
    while i < len(results):
        curr = results[i]
        prev = None
        next = None
        if i > 0:
            prev = results[i - 1]
        if i + 1 < len(results):
            next = results[i + 1]
        i += 1
        # libicu considers spaces and punctuation between words as words, but we don't
        # want to include those in results as they would result in syntax errors in SQL
        # queries (e.g. "foo bar" would result in the search query including "foo &  &
        # bar").
-        if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
+        if not curr:
-            results.append(result)
+            continue
-        i = j
+        if curr in ["-", ".", "_"]:
            prefix = ""
            suffix = ""
-    return results
+            # Check if the next item is a word, and if so use it as the suffix.
            # We check for if its a word as we don't want to concatenate
            # multiple punctuation marks.
            if next is not None and re.match(r"\w", next):
                suffix = next
                i += 1  # We're using next, so we skip it in the outer loop.
            else:
                # We want to avoid creating terms like "user-", as we should
                # strip trailing punctuation.
                continue
            if prev and re.match(r"\w", prev) and new_results:
                prefix = new_results[-1]
                new_results.pop()
            # We might not have a prefix here, but that's fine as we want to
            # ensure that we don't strip preceding punctuation e.g. '-71'
            # shouldn't be converted to '71'.
            new_results.append(f"{prefix}{curr}{suffix}")
            continue
        elif not re.match(r"\w", curr):
            # Ignore other punctuation
            continue
        new_results.append(curr)
    return new_results
--- a/tests/handlers/test_user_directory.py
+++ b/tests/handlers/test_user_directory.py
@ -1061,6 +1061,45 @@ class UserDirectoryTestCase(unittest.HomeserverTestCase):
            {alice: ProfileInfo(display_name=None, avatar_url=MXC_DUMMY)},
        )
    def test_search_punctuation(self) -> None:
        """Test that you can search for a user that includes punctuation"""
        searching_user = self.register_user("searcher", "password")
        searching_user_tok = self.login("searcher", "password")
        room_id = self.helper.create_room_as(
            searching_user,
            room_version=RoomVersions.V1.identifier,
            tok=searching_user_tok,
        )
        # We want to test searching for users of the form e.g. "user-1", with
        # various punctuation. We also test both where the prefix is numeric and
        # alphanumeric, as e.g. postgres tokenises "user-1" as "user" and "-1".
        i = 1
        for char in ["-", ".", "_"]:
            for use_numeric in [False, True]:
                if use_numeric:
                    prefix1 = f"{i}"
                    prefix2 = f"{i+1}"
                else:
                    prefix1 = f"a{i}"
                    prefix2 = f"a{i+1}"
                local_user_1 = self.register_user(f"user{char}{prefix1}", "password")
                local_user_2 = self.register_user(f"user{char}{prefix2}", "password")
                self._add_user_to_room(room_id, RoomVersions.V1, local_user_1)
                self._add_user_to_room(room_id, RoomVersions.V1, local_user_2)
                results = self.get_success(
                    self.handler.search_users(searching_user, local_user_1, 20)
                )["results"]
                received_user_id_ordering = [result["user_id"] for result in results]
                self.assertSequenceEqual(received_user_id_ordering[:1], [local_user_1])
                i += 2
 class TestUserDirSearchDisabled(unittest.HomeserverTestCase):
    servlets = [
--- a/tests/storage/test_user_directory.py
+++ b/tests/storage/test_user_directory.py
@ -711,6 +711,10 @@ class UserDirectoryICUTestCase(HomeserverTestCase):
            ),
        )
        self.assertEqual(_parse_words_with_icu("user-1"), ["user-1"])
        self.assertEqual(_parse_words_with_icu("user-ab"), ["user-ab"])
        self.assertEqual(_parse_words_with_icu("user.--1"), ["user", "-1"])
    def test_regex_word_boundary_punctuation(self) -> None:
        """
        Tests the behaviour of punctuation with the non-ICU tokeniser
		`@ -0,0 +1 @@`
							`Fix searching for users with their exact localpart whose ID includes a hyphen.`