Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. (#15079)

Co-authored-by: David Robertson <davidr@element.io>
This commit is contained in:
reivilibre 2023-02-20 12:00:18 +00:00 committed by GitHub
parent 7ee7f49316
commit 1cbc3f197c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 90 additions and 5 deletions

1
changelog.d/15079.bugfix Normal file
View file

@ -0,0 +1 @@
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error.

View file

@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something We use this so that we can add prefix matching, which isn't something
that is supported by default. that is supported by default.
""" """
results = _parse_words(search_term) escaped_words = []
for word in _parse_words(search_term):
# Postgres tsvector and tsquery quoting rules:
# words potentially containing punctuation should be quoted
# and then existing quotes and backslashes should be doubled
# See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY
both = " & ".join("(%s:* | %s)" % (result, result) for result in results) quoted_word = word.replace("'", "''").replace("\\", "\\\\")
exact = " & ".join("%s" % (result,) for result in results) escaped_words.append(f"'{quoted_word}'")
prefix = " & ".join("%s:*" % (result,) for result in results)
both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words)
exact = " & ".join("%s" % (word,) for word in escaped_words)
prefix = " & ".join("%s:*" % (word,) for word in escaped_words)
return both, exact, prefix return both, exact, prefix
@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]:
if USE_ICU: if USE_ICU:
return _parse_words_with_icu(search_term) return _parse_words_with_icu(search_term)
return _parse_words_with_regex(search_term)
def _parse_words_with_regex(search_term: str) -> List[str]:
"""
Break down search term into words, when we don't have ICU available.
See: `_parse_words`
"""
return re.findall(r"([\w\-]+)", search_term, re.UNICODE) return re.findall(r"([\w\-]+)", search_term, re.UNICODE)

View file

@ -192,6 +192,13 @@ class UserDirectoryTestCase(unittest.HomeserverTestCase):
self.helper.join(room, self.appservice.sender, tok=self.appservice.token) self.helper.join(room, self.appservice.sender, tok=self.appservice.token)
self._check_only_one_user_in_directory(user, room) self._check_only_one_user_in_directory(user, room)
def test_search_term_with_colon_in_it_does_not_raise(self) -> None:
    """
    Regression test: a search term containing a colon must be accepted.

    Only checks that the search completes successfully; the returned
    results are not inspected.
    """
    searching_user = self.register_user("user1", "pass")
    search = self.handler.search_users(
        searching_user, "haha:paamayim-nekudotayim", 10
    )
    self.get_success(search)
def test_user_not_in_users_table(self) -> None: def test_user_not_in_users_table(self) -> None:
"""Unclear how it happens, but on matrix.org we've seen join events """Unclear how it happens, but on matrix.org we've seen join events
for users who aren't in the users table. Test that we don't fall over for users who aren't in the users table. Test that we don't fall over

View file

@ -25,6 +25,11 @@ from synapse.rest.client import login, register, room
from synapse.server import HomeServer from synapse.server import HomeServer
from synapse.storage import DataStore from synapse.storage import DataStore
from synapse.storage.background_updates import _BackgroundUpdateHandler from synapse.storage.background_updates import _BackgroundUpdateHandler
from synapse.storage.databases.main import user_directory
from synapse.storage.databases.main.user_directory import (
_parse_words_with_icu,
_parse_words_with_regex,
)
from synapse.storage.roommember import ProfileInfo from synapse.storage.roommember import ProfileInfo
from synapse.util import Clock from synapse.util import Clock
@ -42,7 +47,7 @@ ALICE = "@alice:a"
BOB = "@bob:b" BOB = "@bob:b"
BOBBY = "@bobby:a" BOBBY = "@bobby:a"
# The localpart isn't 'Bela' on purpose so we can test looking up display names. # The localpart isn't 'Bela' on purpose so we can test looking up display names.
BELA = "@somenickname:a" BELA = "@somenickname:example.org"
class GetUserDirectoryTables: class GetUserDirectoryTables:
@ -423,6 +428,8 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase):
class UserDirectoryStoreTestCase(HomeserverTestCase): class UserDirectoryStoreTestCase(HomeserverTestCase):
use_icu = False
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main self.store = hs.get_datastores().main
@ -434,6 +441,12 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))
self._restore_use_icu = user_directory.USE_ICU
user_directory.USE_ICU = self.use_icu
def tearDown(self) -> None:
    """Restore the module-level `USE_ICU` flag saved before the test ran."""
    # Put the flag back first so later tests see the original value even if
    # base-class cleanup raises.
    user_directory.USE_ICU = self._restore_use_icu
    # Chain up so that any cleanup defined by the base test case still runs.
    super().tearDown()
def test_search_user_dir(self) -> None: def test_search_user_dir(self) -> None:
# normally when alice searches the directory she should just find # normally when alice searches the directory she should just find
# bob because bobby doesn't share a room with her. # bob because bobby doesn't share a room with her.
@ -478,6 +491,26 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
{"user_id": BELA, "display_name": "Bela", "avatar_url": None}, {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
) )
@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_start_of_user_id(self) -> None:
    """Check that searching for the beginning of a user ID finds that user.

    The search term is the localpart plus the first few characters of the
    server name, so it contains a colon.
    """
    expected_entry = {"user_id": BELA, "display_name": "Bela", "avatar_url": None}

    response = self.get_success(
        self.store.search_user_dir(ALICE, "somenickname:exa", 10)
    )

    self.assertFalse(response["limited"])
    matches = response["results"]
    self.assertEqual(1, len(matches))
    self.assertDictEqual(matches[0], expected_entry)
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
    """Variant of `UserDirectoryStoreTestCase` with `use_icu` set to True.

    Marked as skipped when `icu` is falsy (presumably because PyICU could not
    be imported; `icu` is defined outside this view).
    """

    if not icu:
        skip = "Requires PyICU"

    use_icu = True
class UserDirectoryICUTestCase(HomeserverTestCase): class UserDirectoryICUTestCase(HomeserverTestCase):
if not icu: if not icu:
@ -513,3 +546,31 @@ class UserDirectoryICUTestCase(HomeserverTestCase):
r["results"][0], r["results"][0],
{"user_id": ALICE, "display_name": display_name, "avatar_url": None}, {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
) )
def test_icu_word_boundary_punctuation(self) -> None:
    """
    Checks how the ICU tokeniser treats punctuation inside a search term.

    The result seems to depend on the underlying version of ICU, so more
    than one tokenisation is accepted.
    """
    acceptable_tokenisations = (
        # ICU 66 on Ubuntu 20.04
        ["lazy'fox", "jumped", "over", "the", "dog"],
        # ICU 70 on Ubuntu 22.04
        ["lazy'fox", "jumped:over", "the.dog"],
    )

    tokens = _parse_words_with_icu("lazy'fox jumped:over the.dog")

    # Note: either tokenisation is fine, because Postgres actually splits
    # words itself afterwards.
    self.assertIn(tokens, acceptable_tokenisations)
def test_regex_word_boundary_punctuation(self) -> None:
    """
    Checks how the non-ICU (regex) tokeniser treats punctuation: the
    apostrophe, colon and full stop in the input all split words.
    """
    tokens = _parse_words_with_regex("lazy'fox jumped:over the.dog")
    expected_tokens = ["lazy", "fox", "jumped", "over", "the", "dog"]
    self.assertEqual(tokens, expected_tokens)