element-web/src/voice/VoiceRecorder.ts

/*
Copyright 2021 The Matrix.org Foundation C.I.C.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

import * as Recorder from 'opus-recorder';
import encoderPath from 'opus-recorder/dist/encoderWorker.min.js';
import {MatrixClient} from "matrix-js-sdk/src/client";
import CallMediaHandler from "../CallMediaHandler";
import {SimpleObservable} from "matrix-widget-api";
import {clamp} from "../utils/numbers";

/** Number of audio channels used for capture, processing and encoding (stereo isn't important). */
const CHANNELS = 1;

/** Sample rate, in Hz, handed to the encoder. 48khz is what WebRTC uses; 12khz is where we lose quality. */
const SAMPLE_RATE = 48000;

/** Bitrate, in bits per second, handed to the encoder. 24kbps is pretty high quality for our use case in opus. */
const BITRATE = 24000;

/**
 * A single live update published while a recording is in progress.
 */
export interface IRecordingUpdate {
    /** Waveform samples as floating point numbers between 0 (low) and 1 (high). */
    waveform: number[];

    /** Time position of this update in seconds (floating point). */
    timeSeconds: number;
}

/**
 * Records audio from the user's microphone, encodes it with opus-recorder while
 * the recording is running, publishes live waveform/timing updates, and can
 * upload the finished recording through the supplied Matrix client.
 *
 * An instance handles a single recording: `start()` refuses to run once encoded
 * data or an uploaded MXC URI exists.
 */
export class VoiceRecorder {
    private recorder: Recorder;
    private recorderContext: AudioContext;
    private recorderSource: MediaStreamAudioSourceNode;
    private recorderStream: MediaStream;
    private recorderFFT: AnalyserNode;
    private recorderProcessor: ScriptProcessorNode;
    // Encoded bytes received from the recorder so far.
    private buffer = new Uint8Array(0);
    // Content URI of the uploaded recording, set by upload().
    private mxc: string;
    private recording = false;
    private observable: SimpleObservable<IRecordingUpdate>;

    /**
     * @param client The Matrix client used to upload the finished recording.
     */
    public constructor(private client: MatrixClient) {
    }

    /**
     * Acquires the microphone, builds the Web Audio graph used for live
     * waveform/timing data, and creates the opus-recorder instance which
     * appends encoded data to `buffer` as it becomes available.
     *
     * May reject part-way through (for example if microphone access is
     * refused); callers are responsible for releasing whatever was acquired.
     */
    private async makeRecorder() {
        this.recorderStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                channelCount: CHANNELS,
                noiseSuppression: true, // browsers ignore constraints they can't honour
                deviceId: CallMediaHandler.getAudioInput(),
            },
        });
        this.recorderContext = new AudioContext({
            // latencyHint: "interactive", // we don't want a latency hint (this causes data smoothing)
        });
        this.recorderSource = this.recorderContext.createMediaStreamSource(this.recorderStream);
        this.recorderFFT = this.recorderContext.createAnalyser();

        // Bring the FFT time domain down a bit. The default is 2048, and this must be a power
        // of two. We use 64 points because we happen to know down the line we need less than
        // that, but 32 would be too few. Large numbers are not helpful here and do not add
        // precision: they introduce higher precision outputs of the FFT (frequency data), but
        // it makes the time domain less than helpful.
        this.recorderFFT.fftSize = 64;

        // We use an audio processor to get accurate timing information.
        // The size of the audio buffer largely decides how quickly we push timing/waveform data
        // out of this class. Smaller buffers mean we update more frequently as we can't hold as
        // many bytes. Larger buffers mean slower updates. For scale, 1024 gives us about 30Hz of
        // updates and 2048 gives us about 20Hz. We use 1024 to get as close to perceived realtime
        // as possible. Must be a power of 2.
        this.recorderProcessor = this.recorderContext.createScriptProcessor(1024, CHANNELS, CHANNELS);

        // Connect our inputs and outputs
        this.recorderSource.connect(this.recorderFFT);
        this.recorderSource.connect(this.recorderProcessor);
        this.recorderProcessor.connect(this.recorderContext.destination);

        this.recorder = new Recorder({
            encoderPath, // magic from webpack
            encoderSampleRate: SAMPLE_RATE,
            encoderApplication: 2048, // voice (default is "audio")
            streamPages: true, // this speeds up the encoding process by using CPU over time
            encoderFrameSize: 20, // ms, arbitrary frame size we send to the encoder
            numberOfChannels: CHANNELS,
            sourceNode: this.recorderSource,
            encoderBitRate: BITRATE,

            // We use low values for the following to ease CPU usage - the resulting waveform
            // is indistinguishable for a voice message. Note that the underlying library will
            // pick defaults which prefer the highest possible quality, CPU be damned.
            encoderComplexity: 3, // 0-10, 10 is slow and high quality.
            resampleQuality: 3, // 0-10, 10 is slow and high quality
        });
        this.recorder.ondataavailable = (a: ArrayBuffer) => {
            // Append the newly encoded bytes to everything received so far.
            const buf = new Uint8Array(a);
            const newBuf = new Uint8Array(this.buffer.length + buf.length);
            newBuf.set(this.buffer, 0);
            newBuf.set(buf, this.buffer.length);
            this.buffer = newBuf;
        };
    }

    /**
     * Best-effort release of the capture resources: detaches the live data
     * listener, stops every microphone track so the device is handed back to
     * the user/OS, and closes the audio context if it is not already closed.
     *
     * Safe to call when only some (or none) of the resources were created.
     * Never rejects because of a failing context close: this runs on error
     * paths where the original failure is the one the caller needs to see.
     */
    private async releaseCapture(): Promise<void> {
        if (this.recorderProcessor) {
            this.recorderProcessor.removeEventListener("audioprocess", this.tryUpdateLiveData);
        }
        if (this.recorderStream) {
            this.recorderStream.getTracks().forEach(t => t.stop());
        }
        if (this.recorderContext && this.recorderContext.state !== "closed") {
            try {
                await this.recorderContext.close();
            } catch (e) {
                // Deliberately ignored: there is nothing more we can do with a
                // context that refuses to close, and we must not mask the error
                // which got us here.
            }
        }
    }

    /**
     * The observable which receives waveform/timing updates for the recording.
     *
     * @throws Error if no recording is currently in progress.
     */
    public get liveData(): SimpleObservable<IRecordingUpdate> {
        if (!this.recording) throw new Error("No observable when not recording");
        return this.observable;
    }

    /**
     * Whether opus-recorder reports that recording is supported here.
     */
    public get isSupported(): boolean {
        return !!Recorder.isRecordingSupported();
    }

    /**
     * Whether any encoded audio has been received from the recorder.
     */
    public get hasRecording(): boolean {
        return this.buffer.length > 0;
    }

    /**
     * The content URI of the uploaded recording.
     *
     * @throws Error if the recording has not been uploaded yet.
     */
    public get mxcUri(): string {
        if (!this.mxc) {
            throw new Error("Recording has not been uploaded yet");
        }
        return this.mxc;
    }

    /**
     * "audioprocess" handler: publishes the current waveform and playback time
     * to the observable. Does nothing unless a recording is in progress.
     */
    private tryUpdateLiveData = (ev: AudioProcessingEvent) => {
        if (!this.recording) return;

        // The time domain is the input to the FFT, which means we use an array of the same
        // size. The time domain is also known as the audio waveform. We're ignoring the
        // output of the FFT here (frequency data) because we're not interested in it.
        const data = new Float32Array(this.recorderFFT.fftSize);
        this.recorderFFT.getFloatTimeDomainData(data);

        // Copy the samples into a plain number[] (which is what IRecordingUpdate carries),
        // clamping each one into the 0-1 range so we produce consistent data (it's possible
        // for the array to exceed the specified range with some audio input devices).
        // NOTE(review): negative samples are flattened to 0 by this clamp - confirm that
        // dropping the negative half of the waveform is intended.
        const translatedData: number[] = [];
        for (let i = 0; i < data.length; i++) {
            translatedData.push(clamp(data[i], 0, 1));
        }

        this.observable.update({
            waveform: translatedData,
            timeSeconds: ev.playbackTime,
        });
    };

    /**
     * Starts a new recording.
     *
     * If setting up or starting the recorder fails, everything acquired so far
     * (microphone tracks, audio context, listener) is released before the
     * error is rethrown, so a failed start does not leave the microphone open.
     *
     * @throws Error if a recording was already made/uploaded with this
     * instance, or if a recording is already in progress.
     */
    public async start(): Promise<void> {
        if (this.mxc || this.hasRecording) {
            throw new Error("Recording already prepared");
        }
        if (this.recording) {
            throw new Error("Recording already in progress");
        }
        if (this.observable) {
            this.observable.close();
        }
        this.observable = new SimpleObservable<IRecordingUpdate>();

        try {
            await this.makeRecorder();
            this.recorderProcessor.addEventListener("audioprocess", this.tryUpdateLiveData);
            await this.recorder.start();
        } catch (e) {
            // Don't keep the microphone (or the audio context) alive for a
            // recording which never started.
            await this.releaseCapture();
            throw e;
        }
        this.recording = true;
    }

    /**
     * Stops the recording in progress and releases the capture resources.
     *
     * The microphone tracks are stopped, the listener is detached and the
     * recording flag is cleared even if stopping the recorder or closing the
     * audio context fails; in that case the failure is rethrown afterwards.
     *
     * @returns The encoded audio received from the recorder.
     * @throws Error if no recording is in progress.
     */
    public async stop(): Promise<Uint8Array> {
        if (!this.recording) {
            throw new Error("No recording to stop");
        }

        try {
            // Disconnect the source early to start shutting down resources
            this.recorderSource.disconnect();
            await this.recorder.stop();

            // close the context after the recorder so the recorder doesn't try to
            // connect anything to the context (this would generate a warning)
            await this.recorderContext.close();
        } finally {
            // Runs on success and on failure: stop pushing live data and hand the
            // media tracks back to the user/OS. On the success path the context is
            // already closed, so releaseCapture() will not close it a second time.
            this.recording = false;
            await this.releaseCapture();
        }

        // Finally let the recorder clean up after itself
        await this.recorder.close();

        return this.buffer;
    }

    /**
     * Uploads the recording as "audio/ogg" through the Matrix client, or
     * returns the previously obtained content URI if it was already uploaded.
     *
     * @returns The content URI of the uploaded recording.
     * @throws Error if there is no recorded data to upload.
     */
    public async upload(): Promise<string> {
        if (!this.hasRecording) {
            throw new Error("No recording available to upload");
        }

        if (this.mxc) return this.mxc;

        const response = await this.client.uploadContent(new Blob([this.buffer], {
            type: "audio/ogg",
        }), {
            onlyContentUri: false, // to stop the warnings in the console
        });
        this.mxc = response['content_uri'];
        return this.mxc;
    }
}

// Expose the VoiceRecorder class on the window global as `mxVoiceRecorder`.
window.mxVoiceRecorder = VoiceRecorder;
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`/*`
			`Copyright 2021 The Matrix.org Foundation C.I.C.`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`*/`

			`import * as Recorder from 'opus-recorder';`
			`import encoderPath from 'opus-recorder/dist/encoderWorker.min.js';`
			`import {MatrixClient} from "matrix-js-sdk/src/client";`
			`import CallMediaHandler from "../CallMediaHandler";`
Wire up a simple record button 2021-03-16 07:16:58 +03:00			`import {SimpleObservable} from "matrix-widget-api";`
It helps to use the right function 2021-03-30 21:08:11 +03:00			`import {clamp} from "../utils/numbers";`
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`const CHANNELS = 1; // stereo isn't important`
			`const SAMPLE_RATE = 48000; // 48khz is what WebRTC uses. 12khz is where we lose quality.`
Adjust settings/docs for encoder 2021-03-24 03:24:40 +03:00			`const BITRATE = 24000; // 24kbps is pretty high quality for our use case in opus.`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00
Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`export interface IRecordingUpdate {`
			`waveform: number[]; // floating points between 0 (low) and 1 (high).`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00			`timeSeconds: number; // float`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`}`

Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`export class VoiceRecorder {`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`private recorder: Recorder;`
			`private recorderContext: AudioContext;`
			`private recorderSource: MediaStreamAudioSourceNode;`
			`private recorderStream: MediaStream;`
Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`private recorderFFT: AnalyserNode;`
Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`private recorderProcessor: ScriptProcessorNode;`
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`private buffer = new Uint8Array(0);`
			`private mxc: string;`
			`private recording = false;`
Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`private observable: SimpleObservable<IRecordingUpdate>;`
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00
			`public constructor(private client: MatrixClient) {`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`}`

			`private async makeRecorder() {`
			`this.recorderStream = await navigator.mediaDevices.getUserMedia({`
			`audio: {`
			`channelCount: CHANNELS,`
			`noiseSuppression: true, // browsers ignore constraints they can't honour`
			`deviceId: CallMediaHandler.getAudioInput(),`
			`},`
			`});`
			`this.recorderContext = new AudioContext({`
Early concept for rendering the frequency waveform 2021-03-23 05:54:09 +03:00			`// latencyHint: "interactive", // we don't want a latency hint (this causes data smoothing)`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`});`
			`this.recorderSource = this.recorderContext.createMediaStreamSource(this.recorderStream);`
Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`this.recorderFFT = this.recorderContext.createAnalyser();`

			`// Bring the FFT time domain down a bit. The default is 2048, and this must be a power`
			`// of two. We use 64 points because we happen to know down the line we need less than`
			`// that, but 32 would be too few. Large numbers are not helpful here and do not add`
			`// precision: they introduce higher precision outputs of the FFT (frequency data), but`
			`// it makes the time domain less than helpful.`
			`this.recorderFFT.fftSize = 64;`

Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`// We use an audio processor to get accurate timing information.`
			`// The size of the audio buffer largely decides how quickly we push timing/waveform data`
			`// out of this class. Smaller buffers mean we update more frequently as we can't hold as`
			`// many bytes. Larger buffers mean slower updates. For scale, 1024 gives us about 30Hz of`
			`// updates and 2048 gives us about 20Hz. We use 1024 to get as close to perceived realtime`
			`// as possible. Must be a power of 2.`
			`this.recorderProcessor = this.recorderContext.createScriptProcessor(1024, CHANNELS, CHANNELS);`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00
			`// Connect our inputs and outputs`
Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`this.recorderSource.connect(this.recorderFFT);`
Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`this.recorderSource.connect(this.recorderProcessor);`
			`this.recorderProcessor.connect(this.recorderContext.destination);`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`this.recorder = new Recorder({`
			`encoderPath, // magic from webpack`
			`encoderSampleRate: SAMPLE_RATE,`
			`encoderApplication: 2048, // voice (default is "audio")`
			`streamPages: true, // this speeds up the encoding process by using CPU over time`
			`encoderFrameSize: 20, // ms, arbitrary frame size we send to the encoder`
			`numberOfChannels: CHANNELS,`
			`sourceNode: this.recorderSource,`
			`encoderBitRate: BITRATE,`
Adjust settings/docs for encoder 2021-03-24 03:24:40 +03:00
			`// We use low values for the following to ease CPU usage - the resulting waveform`
			`// is indistinguishable for a voice message. Note that the underlying library will`
			`// pick defaults which prefer the highest possible quality, CPU be damned.`
			`encoderComplexity: 3, // 0-10, 10 is slow and high quality.`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`resampleQuality: 3, // 0-10, 10 is slow and high quality`
			`});`
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`this.recorder.ondataavailable = (a: ArrayBuffer) => {`
			`const buf = new Uint8Array(a);`
			`const newBuf = new Uint8Array(this.buffer.length + buf.length);`
			`newBuf.set(this.buffer, 0);`
			`newBuf.set(buf, this.buffer.length);`
			`this.buffer = newBuf;`
			`};`
			`}`

Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`public get liveData(): SimpleObservable<IRecordingUpdate> {`
Wire up a simple record button 2021-03-16 07:16:58 +03:00			`if (!this.recording) throw new Error("No observable when not recording");`
			`return this.observable;`
			`}`

Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`public get isSupported(): boolean {`
			`return !!Recorder.isRecordingSupported();`
			`}`

			`public get hasRecording(): boolean {`
			`return this.buffer.length > 0;`
			`}`

			`public get mxcUri(): string {`
			`if (!this.mxc) {`
			`throw new Error("Recording has not been uploaded yet");`
			`}`
			`return this.mxc;`
			`}`

Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`private tryUpdateLiveData = (ev: AudioProcessingEvent) => {`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00			`if (!this.recording) return;`

			`// The time domain is the input to the FFT, which means we use an array of the same`
			`// size. The time domain is also known as the audio waveform. We're ignoring the`
			`// output of the FFT here (frequency data) because we're not interested in it.`
Fix float operations to make a little more sense. 2021-03-30 06:25:06 +03:00			`const data = new Float32Array(this.recorderFFT.fftSize);`
			`this.recorderFFT.getFloatTimeDomainData(data);`

			// We can't just `Array.from()` the array because we're dealing with 32bit floats
			`// and the built-in function won't consider that when converting between numbers.`
			`// However, the runtime will convert the float32 to a float64 during the math operations`
			// which is why the loop works below. Note that a `.map()` call also doesn't work
			`// and will instead return a Float32Array still.`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00			`const translatedData: number[] = [];`
			`for (let i = 0; i < data.length; i++) {`
Clarify comment 2021-03-30 21:01:35 +03:00			`// We're clamping the values so we can do that math operation mentioned above,`
			`// and to ensure that we produce consistent data (it's possible for the array`
			`// to exceed the specified range with some audio input devices).`
It helps to use the right function 2021-03-30 21:08:11 +03:00			`translatedData.push(clamp(data[i], 0, 1));`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00			`}`

			`this.observable.update({`
			`waveform: translatedData,`
Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`timeSeconds: ev.playbackTime,`
Hook up a clock and implement proper design 2021-03-26 02:12:26 +03:00			`});`
			`};`

Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`public async start(): Promise<void> {`
			`if (this.mxc \|\| this.hasRecording) {`
			`throw new Error("Recording already prepared");`
			`}`
			`if (this.recording) {`
			`throw new Error("Recording already in progress");`
			`}`
Wire up a simple record button 2021-03-16 07:16:58 +03:00			`if (this.observable) {`
			`this.observable.close();`
			`}`
Actually use a waveform instead of the frequency data 2021-03-25 08:31:02 +03:00			`this.observable = new SimpleObservable<IRecordingUpdate>();`
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`await this.makeRecorder();`
Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`this.recorderProcessor.addEventListener("audioprocess", this.tryUpdateLiveData);`
Clean up promises 2021-03-24 03:26:43 +03:00			`await this.recorder.start();`
			`this.recording = true;`
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`}`

			`public async stop(): Promise<Uint8Array> {`
			`if (!this.recording) {`
			`throw new Error("No recording to stop");`
			`}`
Clean up promises 2021-03-24 03:26:43 +03:00
Run audio through the Web Audio API instead This leads to more reliable frequency/timing information, and involves a whole lot less decoding. We still maintain ongoing encoded frames to avoid having to do one giant encode at the end, as that could take long enough to be disruptive. 2021-03-23 04:32:24 +03:00			`// Disconnect the source early to start shutting down resources`
			`this.recorderSource.disconnect();`
Clean up promises 2021-03-24 03:26:43 +03:00			`await this.recorder.stop();`

			`// close the context after the recorder so the recorder doesn't try to`
			`// connect anything to the context (this would generate a warning)`
			`await this.recorderContext.close();`

			`// Now stop all the media tracks so we can release them back to the user/OS`
			`this.recorderStream.getTracks().forEach(t => t.stop());`

			`// Finally do our post-processing and clean up`
			`this.recording = false;`
Revert "Avoid use of deprecated APIs, instead using an AudioWorklet" This reverts commit 5c685dcf35fbe8c896f22882a6843dc0cc0b835d. 2021-03-30 06:59:31 +03:00			`this.recorderProcessor.removeEventListener("audioprocess", this.tryUpdateLiveData);`
Clean up promises 2021-03-24 03:26:43 +03:00			`await this.recorder.close();`

			`return this.buffer;`
Add an early voice recorder utility class 2021-03-12 08:05:47 +03:00			`}`

			`public async upload(): Promise<string> {`
			`if (!this.hasRecording) {`
			`throw new Error("No recording available to upload");`
			`}`

			`if (this.mxc) return this.mxc;`

			`this.mxc = await this.client.uploadContent(new Blob([this.buffer], {`
			`type: "audio/ogg",`
			`}), {`
			`onlyContentUri: false, // to stop the warnings in the console`
			`}).then(r => r['content_uri']);`
			`return this.mxc;`
			`}`
			`}`

			`window.mxVoiceRecorder = VoiceRecorder;`