From 3f9b61ff9504dd88cac17fb3cb097e319babd2a3 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Thu, 7 Nov 2019 11:49:37 +0000 Subject: [PATCH 1/6] Fix the SQL SELECT query in _paginate_room_events_txn Doing a SELECT DISTINCT when paginating is quite expensive, because it requires the engine to do sorting on the entire events table. However, we only need to run it if we're filtering on 2+ labels, so this PR is changing the request so that DISTINCT is only used then. --- synapse/storage/data_stores/main/stream.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/synapse/storage/data_stores/main/stream.py b/synapse/storage/data_stores/main/stream.py index 616ef91d4e..ef0b1426d1 100644 --- a/synapse/storage/data_stores/main/stream.py +++ b/synapse/storage/data_stores/main/stream.py @@ -871,14 +871,25 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): args.append(int(limit)) + # Using DISTINCT in this SELECT query is quite expensive, because it requires the + # engine to sort on the entire (not limited) result set, i.e. the entire events + # table. We only need to use it when we're filtering on more than two labels, + # because that's the only scenario in which we can possibly to get multiple times + # the same event ID in the results. + if event_filter.labels and len(event_filter.labels) > 1: + select_keywords = "SELECT DISTINCT" + + else: + select_keywords = "SELECT" + sql = ( - "SELECT DISTINCT event_id, topological_ordering, stream_ordering" + "%(select_keywords)s event_id, topological_ordering, stream_ordering" " FROM events" " LEFT JOIN event_labels USING (event_id, room_id, topological_ordering)" " WHERE outlier = ? AND room_id = ? AND %(bounds)s" " ORDER BY topological_ordering %(order)s," " stream_ordering %(order)s LIMIT ?" - ) % {"bounds": bounds, "order": order} + ) % {"select_keywords": select_keywords, "bounds": bounds, "order": order} txn.execute(sql, args) From 4f519d556e32ac29a960977df4ed14c42290af5e Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Thu, 7 Nov 2019 11:51:54 +0000 Subject: [PATCH 2/6] Changelog --- changelog.d/6340.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/6340.feature diff --git a/changelog.d/6340.feature b/changelog.d/6340.feature new file mode 100644 index 0000000000..78a187a1dc --- /dev/null +++ b/changelog.d/6340.feature @@ -0,0 +1 @@ +Implement label-based filtering on `/sync` and `/messages` ([MSC2326](https://github.com/matrix-org/matrix-doc/pull/2326)). From 15a1a02e70ca18d8af9a4faaf1bb40427ea6a643 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Thu, 7 Nov 2019 12:04:37 +0000 Subject: [PATCH 3/6] Handle lack of filter --- synapse/storage/data_stores/main/stream.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/synapse/storage/data_stores/main/stream.py b/synapse/storage/data_stores/main/stream.py index ef0b1426d1..bb70a0f38a 100644 --- a/synapse/storage/data_stores/main/stream.py +++ b/synapse/storage/data_stores/main/stream.py @@ -876,11 +876,9 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): # table. We only need to use it when we're filtering on more than two labels, # because that's the only scenario in which we can possibly to get multiple times # the same event ID in the results. - if event_filter.labels and len(event_filter.labels) > 1: - select_keywords = "SELECT DISTINCT" - - else: - select_keywords = "SELECT" + select_keywords = "SELECT" + if event_filter and event_filter.labels and len(event_filter.labels) > 1: + select_keywords += "DISTINCT" sql = ( "%(select_keywords)s event_id, topological_ordering, stream_ordering" From 70804392ae42949109e55e4e51566e2545e1df63 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Thu, 7 Nov 2019 14:55:10 +0000 Subject: [PATCH 4/6] Only join on event_labels if we're filtering on labels --- synapse/storage/data_stores/main/stream.py | 33 ++++++++++++++++------ 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/synapse/storage/data_stores/main/stream.py b/synapse/storage/data_stores/main/stream.py index bb70a0f38a..064f602a65 100644 --- a/synapse/storage/data_stores/main/stream.py +++ b/synapse/storage/data_stores/main/stream.py @@ -871,23 +871,38 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): args.append(int(limit)) - # Using DISTINCT in this SELECT query is quite expensive, because it requires the - # engine to sort on the entire (not limited) result set, i.e. the entire events - # table. We only need to use it when we're filtering on more than two labels, - # because that's the only scenario in which we can possibly to get multiple times - # the same event ID in the results. select_keywords = "SELECT" - if event_filter and event_filter.labels and len(event_filter.labels) > 1: - select_keywords += "DISTINCT" + join_clause = "" + if event_filter and event_filter.labels: + # If we're not filtering on a label, then joining on event_labels will + # return as many row for a single event as the number of labels it has. To + # avoid this, only join if we're filtering on at least one label. + join_clause = ( + "LEFT JOIN event_labels" + " USING (event_id, room_id, topological_ordering)" + ) + if len(event_filter.labels) > 1: + # Using DISTINCT in this SELECT query is quite expensive, because it + # requires the engine to sort on the entire (not limited) result set, + # i.e. the entire events table. We only need to use it when we're + # filtering on more than two labels, because that's the only scenario + # in which we can possibly to get multiple times the same event ID in + # the results. + select_keywords += "DISTINCT" sql = ( "%(select_keywords)s event_id, topological_ordering, stream_ordering" " FROM events" - " LEFT JOIN event_labels USING (event_id, room_id, topological_ordering)" + " %(join_clause)s" " WHERE outlier = ? AND room_id = ? AND %(bounds)s" " ORDER BY topological_ordering %(order)s," " stream_ordering %(order)s LIMIT ?" - ) % {"select_keywords": select_keywords, "bounds": bounds, "order": order} + ) % { + "select_keywords": select_keywords, + "join_clause": join_clause, + "bounds": bounds, + "order": order + } txn.execute(sql, args) From b9cba079628db11951fc5ed30b03e8825eb954d7 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Thu, 7 Nov 2019 14:57:15 +0000 Subject: [PATCH 5/6] Lint --- synapse/storage/data_stores/main/stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/storage/data_stores/main/stream.py b/synapse/storage/data_stores/main/stream.py index 064f602a65..90bb2d8e8f 100644 --- a/synapse/storage/data_stores/main/stream.py +++ b/synapse/storage/data_stores/main/stream.py @@ -876,7 +876,7 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): if event_filter and event_filter.labels: # If we're not filtering on a label, then joining on event_labels will # return as many row for a single event as the number of labels it has. To - # avoid this, only join if we're filtering on at least one label. + # avoid this, only join if we're filtering on at least one label. join_clause = ( "LEFT JOIN event_labels" " USING (event_id, room_id, topological_ordering)" @@ -901,7 +901,7 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): "select_keywords": select_keywords, "join_clause": join_clause, "bounds": bounds, - "order": order + "order": order, } txn.execute(sql, args) From b16fa433860824560c64acf594aa6c891e5f1a0a Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Fri, 8 Nov 2019 10:34:09 +0000 Subject: [PATCH 6/6] Incorporate review --- synapse/storage/data_stores/main/stream.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/synapse/storage/data_stores/main/stream.py b/synapse/storage/data_stores/main/stream.py index 90bb2d8e8f..8780fdd989 100644 --- a/synapse/storage/data_stores/main/stream.py +++ b/synapse/storage/data_stores/main/stream.py @@ -877,10 +877,10 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): # If we're not filtering on a label, then joining on event_labels will # return as many row for a single event as the number of labels it has. To # avoid this, only join if we're filtering on at least one label. - join_clause = ( - "LEFT JOIN event_labels" - " USING (event_id, room_id, topological_ordering)" - ) + join_clause = """ + LEFT JOIN event_labels + USING (event_id, room_id, topological_ordering) + """ if len(event_filter.labels) > 1: # Using DISTINCT in this SELECT query is quite expensive, because it # requires the engine to sort on the entire (not limited) result set, @@ -890,14 +890,14 @@ class StreamWorkerStore(EventsWorkerStore, SQLBaseStore): # the results. select_keywords += "DISTINCT" - sql = ( - "%(select_keywords)s event_id, topological_ordering, stream_ordering" - " FROM events" - " %(join_clause)s" - " WHERE outlier = ? AND room_id = ? AND %(bounds)s" - " ORDER BY topological_ordering %(order)s," - " stream_ordering %(order)s LIMIT ?" - ) % { + sql = """ + %(select_keywords)s event_id, topological_ordering, stream_ordering + FROM events + %(join_clause)s + WHERE outlier = ? AND room_id = ? AND %(bounds)s + ORDER BY topological_ordering %(order)s, + stream_ordering %(order)s LIMIT ? + """ % { "select_keywords": select_keywords, "join_clause": join_clause, "bounds": bounds,