2019-11-12 15:29:07 +03:00
|
|
|
/*
|
|
|
|
Copyright 2019 New Vector Ltd
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
import PlatformPeg from "./PlatformPeg";
|
|
|
|
import MatrixClientPeg from "./MatrixClientPeg";
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Event indexing class that wraps the platform-specific event indexing.
|
|
|
|
*/
|
|
|
|
export default class EventIndexer {
    constructor() {
        // In-memory queue of crawler checkpoints. Each checkpoint is a plain
        // object of the form {roomId, token, direction[, fullCrawl]}.
        // NOTE(review): the property name is misspelled ("Chekpoints"); it is
        // kept as-is in case code outside this class reads it.
        this.crawlerChekpoints = [];
        // The time (passed to setTimeout, so milliseconds) that the crawler
        // will wait between /rooms/{room_id}/messages requests.
        this._crawler_timeout = 3000;
        // Handle of the running crawler (an object exposing cancel()), or
        // null while no crawler is running.
        this._crawlerRef = null;
        // IDs of live events that were still being decrypted when they
        // arrived; they get indexed from onEventDecrypted().
        this.liveEventsForIndex = new Set();
    }

    /**
     * Tell whether events of the given type are ones we know how to index.
     *
     * @param {string} type The event type, as returned by ev.getType().
     * @returns {boolean} true for m.room.message, m.room.name and
     *     m.room.topic, false for everything else.
     */
    _isIndexableType(type) {
        return ["m.room.message", "m.room.name", "m.room.topic"].includes(type);
    }

    /**
     * Initialise the platform's event index.
     *
     * @param userId Passed through unchanged to initEventIndex();
     *     presumably the ID of the logged in user.
     * @returns {Promise<boolean>} false if the platform has no event indexing
     *     manager, true otherwise.
     */
    async init(userId) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return false;
        // NOTE(review): the result of initEventIndex() is not awaited, so
        // true is returned without knowing whether initialisation succeeded.
        indexManager.initEventIndex(userId);
        return true;
    }

    /**
     * Sync state handler. Loads stored checkpoints once the client is
     * prepared, starts the crawler on the first transition to syncing and
     * commits queued live events after every further sync.
     *
     * @param state The new sync state ("PREPARED" and "SYNCING" are acted on).
     * @param prevState The previous sync state, null on the first call.
     * @param data Unused.
     */
    async onSync(state, prevState, data) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return;

        if (prevState === null && state === "PREPARED") {
            // Load our stored checkpoints, if any.
            this.crawlerChekpoints = await indexManager.loadCheckpoints();
            console.log("Seshat: Loaded checkpoints",
                this.crawlerChekpoints);
            return;
        }

        if (prevState === "PREPARED" && state === "SYNCING") {
            const addInitialCheckpoints = async () => {
                const client = MatrixClientPeg.get();
                const rooms = client.getRooms();

                const isRoomEncrypted = (room) => {
                    return client.isRoomEncrypted(room.roomId);
                };

                // We only care to crawl the encrypted rooms, non-encrypted
                // rooms can use the search provided by the Homeserver.
                const encryptedRooms = rooms.filter(isRoomEncrypted);

                console.log("Seshat: Adding initial crawler checkpoints");

                // Gather the prev_batch tokens and create checkpoints for
                // our message crawler.
                await Promise.all(encryptedRooms.map(async (room) => {
                    const timeline = room.getLiveTimeline();
                    const token = timeline.getPaginationToken("b");

                    console.log("Seshat: Got token for indexer",
                        room.roomId, token);

                    // Both checkpoints start from the same token, one for
                    // each crawl direction.
                    const backCheckpoint = {
                        roomId: room.roomId,
                        token: token,
                        direction: "b",
                    };

                    const forwardCheckpoint = {
                        roomId: room.roomId,
                        token: token,
                        direction: "f",
                    };

                    // Persist the checkpoints first, then queue them for the
                    // in-memory crawler.
                    await indexManager.addCrawlerCheckpoint(backCheckpoint);
                    await indexManager.addCrawlerCheckpoint(forwardCheckpoint);
                    this.crawlerChekpoints.push(backCheckpoint);
                    this.crawlerChekpoints.push(forwardCheckpoint);
                }));
            };

            // If our indexer is empty we're most likely running Riot the
            // first time with indexing support or running it with an
            // initial sync. Add checkpoints to crawl our encrypted rooms.
            const eventIndexWasEmpty = await indexManager.isEventIndexEmpty();
            if (eventIndexWasEmpty) await addInitialCheckpoints();

            // Start our crawler.
            this.startCrawler();
            return;
        }

        if (prevState === "SYNCING" && state === "SYNCING") {
            // A sync was done, presumably we queued up some live events,
            // commit them now.
            console.log("Seshat: Committing events");
            await indexManager.commitLiveEvents();
            return;
        }
    }

    /**
     * Room timeline handler. Indexes live events of encrypted rooms, or
     * remembers them for onEventDecrypted() if they are still being
     * decrypted.
     *
     * @param ev The event that was added to the timeline.
     * @param room The room the event belongs to.
     * @param toStartOfTimeline Truthy for paginated (non-live) events, which
     *     are ignored here.
     * @param removed Unused.
     * @param data Only data.liveEvent is read; events without it are ignored.
     */
    async onRoomTimeline(ev, room, toStartOfTimeline, removed, data) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return;

        // We only index encrypted rooms locally.
        if (!MatrixClientPeg.get().isRoomEncrypted(room.roomId)) return;

        // If it isn't a live event or if it's redacted there's nothing to
        // do.
        if (toStartOfTimeline || !data || !data.liveEvent
            || ev.isRedacted()) {
            return;
        }

        // If the event is not yet decrypted mark it for the
        // Event.decrypted callback.
        if (ev.isBeingDecrypted()) {
            const eventId = ev.getId();
            this.liveEventsForIndex.add(eventId);
        } else {
            // If the event is decrypted or is unencrypted add it to the
            // index now.
            await this.addLiveEventToIndex(ev);
        }
    }

    /**
     * Event decryption handler. Indexes an event that onRoomTimeline()
     * marked as pending, unless its decryption failed.
     *
     * @param ev The event whose decryption finished.
     * @param err Truthy if the decryption failed.
     */
    async onEventDecrypted(ev, err) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return;

        const eventId = ev.getId();

        // If the event isn't in our live event set, ignore it. Note that the
        // ID is removed from the set even if the decryption failed.
        if (!this.liveEventsForIndex.delete(eventId)) return;
        if (err) return;
        await this.addLiveEventToIndex(ev);
    }

    /**
     * Add a single live event, together with the sender's profile, to the
     * index. Events of a type we don't index are silently skipped.
     *
     * @param ev The event to index; must not be waiting for decryption.
     */
    async addLiveEventToIndex(ev) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return;

        if (!this._isIndexableType(ev.getType())) return;

        // Only encrypted events carry their content under the "decrypted"
        // key, events that were sent unencrypted are indexed as they are.
        // This mirrors what the crawler does for historic events.
        const jsonEvent = ev.toJSON();
        const e = ev.isEncrypted() ? jsonEvent.decrypted : jsonEvent;

        const profile = {
            displayname: ev.sender.rawDisplayName,
            avatar_url: ev.sender.getMxcAvatarUrl(),
        };

        // NOTE(review): the result is not awaited; if addEventToIndex()
        // returns a promise its rejection is not handled here.
        indexManager.addEventToIndex(e, profile);
    }

    /**
     * The crawler loop. Sleeps for _crawler_timeout between iterations, takes
     * one checkpoint off the queue, fetches up to 100 events for it and hands
     * the indexable ones to the index manager. Runs until cancelled.
     *
     * @param {Object} handle Object that receives a cancel() method which
     *     makes the loop exit at its next cancellation check.
     */
    async crawlerFunc(handle) {
        let cancelled = false;

        // Install the cancel hook before anything below has a chance to
        // throw, so that the handle is always usable by stop().
        handle.cancel = () => {
            cancelled = true;
        };

        // TODO either put this in a better place or find a library provided
        // method that does this.
        const sleep = (ms) => {
            return new Promise((resolve) => setTimeout(resolve, ms));
        };

        console.log("Seshat: Started crawler function");

        const client = MatrixClientPeg.get();
        const indexManager = PlatformPeg.get().getEventIndexingManager();

        while (!cancelled) {
            // This is a low priority task and we don't want to spam our
            // Homeserver with /messages requests so we set a hefty timeout
            // here.
            await sleep(this._crawler_timeout);

            console.log("Seshat: Running the crawler loop.");

            // We may have been cancelled while we were sleeping.
            if (cancelled) {
                console.log("Seshat: Cancelling the crawler.");
                break;
            }

            const checkpoint = this.crawlerChekpoints.shift();

            // There is no checkpoint available currently, one may appear if
            // a sync with limited room timelines happens, so go back to sleep.
            if (checkpoint === undefined) {
                continue;
            }

            console.log("Seshat: crawling using checkpoint", checkpoint);

            // We have a checkpoint, let us fetch some messages, again, very
            // conservatively to not bother our Homeserver too much.
            const eventMapper = client.getEventMapper();
            // TODO we need to ensure to use member lazy loading with this
            // request so we get the correct profiles.
            let res;

            try {
                res = await client._createMessagesRequest(
                    checkpoint.roomId, checkpoint.token, 100,
                    checkpoint.direction);
            } catch (e) {
                console.log("Seshat: Error crawling events:", e);
                // Put the checkpoint back at the end of the queue so the
                // request gets retried later.
                this.crawlerChekpoints.push(checkpoint);
                continue;
            }

            if (res.chunk.length === 0) {
                console.log("Seshat: Done with the checkpoint", checkpoint);
                // We got to the start/end of our timeline, lets just
                // delete our checkpoint and go back to sleep.
                await indexManager.removeCrawlerCheckpoint(checkpoint);
                continue;
            }

            // Convert the plain JSON events into Matrix events so they get
            // decrypted if necessary.
            const matrixEvents = res.chunk.map(eventMapper);
            let stateEvents = [];
            if (res.state !== undefined) {
                stateEvents = res.state.map(eventMapper);
            }

            // Collect the profiles of joined members, keyed by the sender of
            // the state event.
            const profiles = {};

            for (const ev of stateEvents) {
                const content = ev.event.content;
                if (content && content.membership === "join") {
                    profiles[ev.event.sender] = {
                        displayname: content.displayname,
                        avatar_url: content.avatar_url,
                    };
                }
            }

            const decryptionPromises = [];

            for (const ev of matrixEvents) {
                if (ev.isBeingDecrypted() || ev.isDecryptionFailure()) {
                    // TODO the decryption promise is a private property, this
                    // should either be made public or we should convert the
                    // event that gets fired when decryption is done into a
                    // promise using the once event emitter method:
                    // https://nodejs.org/api/events.html#events_events_once_emitter_name
                    decryptionPromises.push(ev._decryptionPromise);
                }
            }

            // Let us wait for all the events to get decrypted.
            await Promise.all(decryptionPromises);

            // We filter out events for which decryption failed, are redacted
            // or aren't of a type that we know how to index.
            const isValidEvent = (value) => {
                return this._isIndexableType(value.getType())
                    && !value.isRedacted() && !value.isDecryptionFailure();
                // TODO do we need to check if the event has all the valid
                // attributes?
            };

            // TODO if there are no events at this point we're missing a lot
            // of decryption keys, do we want to retry this checkpoint at a
            // later stage?
            const filteredEvents = matrixEvents.filter(isValidEvent);

            // Let us convert the events back into a format that Seshat can
            // consume.
            const events = filteredEvents.map((ev) => {
                const jsonEvent = ev.toJSON();
                const e = ev.isEncrypted() ? jsonEvent.decrypted : jsonEvent;

                let profile = {};
                if (e.sender in profiles) profile = profiles[e.sender];

                return {
                    event: e,
                    profile: profile,
                };
            });

            // Create a new checkpoint so we can continue crawling the room for
            // messages.
            const newCheckpoint = {
                roomId: checkpoint.roomId,
                token: res.end,
                fullCrawl: checkpoint.fullCrawl,
                direction: checkpoint.direction,
            };

            // The room is only looked up for logging. Don't let a room that
            // the client doesn't know about take the whole crawler down, fall
            // back to logging the room ID instead.
            const crawledRoom = client.getRoom(checkpoint.roomId);

            console.log(
                "Seshat: Crawled room",
                crawledRoom ? crawledRoom.name : checkpoint.roomId,
                "and fetched", events.length, "events.",
            );

            try {
                const eventsAlreadyAdded = await indexManager.addHistoricEvents(
                    events, newCheckpoint, checkpoint);
                // If all events were already indexed we assume that we caught
                // up with our index and don't need to crawl the room further.
                // Let us delete the checkpoint in that case, otherwise push
                // the new checkpoint to be used by the crawler.
                if (eventsAlreadyAdded === true && newCheckpoint.fullCrawl !== true) {
                    console.log("Seshat: Checkpoint had already all events",
                        "added, stopping the crawl", checkpoint);
                    await indexManager.removeCrawlerCheckpoint(newCheckpoint);
                } else {
                    this.crawlerChekpoints.push(newCheckpoint);
                }
            } catch (e) {
                console.log("Seshat: Error durring a crawl", e);
                // An error occurred, put the checkpoint back so we
                // can retry.
                this.crawlerChekpoints.push(checkpoint);
            }
        }

        console.log("Seshat: Stopping crawler function");
    }

    /**
     * Limited timeline handler. Queues and persists a backwards checkpoint
     * for an encrypted room so the crawler fetches the events that the
     * limited sync left out.
     *
     * @param room The room that got a limited timeline.
     */
    async onLimitedTimeline(room) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return;
        if (!MatrixClientPeg.get().isRoomEncrypted(room.roomId)) return;

        const timeline = room.getLiveTimeline();
        const token = timeline.getPaginationToken("b");

        const backwardsCheckpoint = {
            roomId: room.roomId,
            token: token,
            fullCrawl: false,
            direction: "b",
        };

        console.log("Seshat: Added checkpoint because of a limited timeline",
            backwardsCheckpoint);

        await indexManager.addCrawlerCheckpoint(backwardsCheckpoint);

        this.crawlerChekpoints.push(backwardsCheckpoint);
    }

    /**
     * Stop the crawler, if it is running, and delete the event index.
     * Does nothing if the platform has no event indexing manager.
     */
    async deleteEventIndex() {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        if (indexManager === null) return;

        console.log("Seshat: Deleting event index.");
        // stop() copes with a crawler that was never started.
        this.stop();
        await indexManager.deleteEventIndex();
    }

    /**
     * Start the crawler loop in the background. Does nothing if a crawler is
     * already running.
     */
    startCrawler() {
        // Starting a second crawler would overwrite the handle of the first
        // one and leave it running with no way to cancel it.
        if (this._crawlerRef !== null) return;

        const crawlerHandle = {};
        // The crawler runs until it is cancelled, so it is deliberately not
        // awaited. Log a crash instead of leaving the rejection unhandled and
        // forget the dead handle so the crawler can be started again.
        this.crawlerFunc(crawlerHandle).catch((e) => {
            console.log("Seshat: Crawler function failed", e);
            if (this._crawlerRef === crawlerHandle) this._crawlerRef = null;
        });
        this._crawlerRef = crawlerHandle;
    }

    /**
     * Cancel the running crawler. Does nothing if no crawler is running.
     */
    stop() {
        if (this._crawlerRef === null) return;
        this._crawlerRef.cancel();
        this._crawlerRef = null;
    }

    /**
     * Search the event index.
     *
     * NOTE(review): unlike the other methods this one does not check for a
     * missing event indexing manager and will throw if there is none.
     *
     * @param searchArgs Passed through unchanged to searchEventIndex().
     * @returns Whatever searchEventIndex() resolves to.
     */
    async search(searchArgs) {
        const indexManager = PlatformPeg.get().getEventIndexingManager();
        return indexManager.searchEventIndex(searchArgs);
    }
}
|