// element-web/src/voice/VoiceRecorder.ts

/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import * as Recorder from 'opus-recorder';
import encoderPath from 'opus-recorder/dist/encoderWorker.min.js';
import {MatrixClient} from "matrix-js-sdk/src/client";
import CallMediaHandler from "../CallMediaHandler";
import {SimpleObservable} from "matrix-widget-api";
// Capture/encoder tuning shared by the recorder below.
const CHANNELS = 1; // stereo isn't important
const SAMPLE_RATE = 48000; // 48khz is what WebRTC uses. 12khz is where we lose quality.
const BITRATE = 24000; // 24kbps is pretty high quality for our use case in opus.
const FREQ_SAMPLE_RATE = 10; // Target rate of frequency data (samples / sec). We don't need this super often.
/**
 * Payload pushed through the recorder's live-data observable while a
 * recording is in progress.
 */
export interface IRecordingUpdate {
// Snapshot of the audio waveform (time domain data) at the moment of the update.
waveform: number[]; // floating points between 0 (low) and 1 (high).
// TODO: @@ TravisR: Generalize this for a timing package?
}
/**
 * Records a single voice message from the user's microphone, encodes it with
 * opus-recorder, and can upload the encoded bytes as "audio/ogg" content.
 *
 * An instance is single-use: once encoded data has been buffered (or an upload
 * has completed) `start()` refuses to run again.
 */
export class VoiceRecorder {
    private recorder: Recorder;
    private recorderContext: AudioContext;
    private recorderSource: MediaStreamAudioSourceNode;
    private recorderStream: MediaStream;
    private recorderFFT: AnalyserNode;
    private buffer = new Uint8Array(0);
    private mxc: string;
    private recording = false;
    private observable: SimpleObservable<IRecordingUpdate>;
    // Typed from setInterval itself so no cast is needed in either DOM or Node typings.
    private freqTimerId: ReturnType<typeof setInterval>;

    public constructor(private client: MatrixClient) {
    }

    /**
     * Converts raw analyser bytes (0-255, where 128 is silence) into waveform values.
     *
     * The amplitude is inverted so that lower byte values read as "louder", and
     * the result is clamped at zero so it honours the documented 0..1 range of
     * `IRecordingUpdate.waveform` (bytes above 128 would otherwise go negative).
     *
     * @param data Time domain bytes as produced by `AnalyserNode.getByteTimeDomainData`.
     * @returns One floating point value between 0 and 1 per input byte.
     */
    private static toWaveform(data: Uint8Array): number[] {
        // Array.from (rather than data.map) is deliberate: mapping in place on a
        // Uint8Array would truncate the fractions back to integer bytes.
        return Array.from(data, sample => Math.max(0, 1 - (sample / 128.0)));
    }

    /**
     * Acquires the microphone and wires up the audio graph, analyser and encoder.
     * Populates the recorder* fields; throws if any step fails (the caller is
     * responsible for releasing whatever was acquired).
     */
    private async makeRecorder(): Promise<void> {
        this.recorderStream = await navigator.mediaDevices.getUserMedia({
            audio: {
                // specify some audio settings so we're feeding the recorder with the
                // best possible values. The browser will handle resampling for us.
                sampleRate: SAMPLE_RATE,
                channelCount: CHANNELS,
                noiseSuppression: true, // browsers ignore constraints they can't honour
                deviceId: CallMediaHandler.getAudioInput(),
            },
        });
        this.recorderContext = new AudioContext({
            // latencyHint: "interactive", // we don't want a latency hint (this causes data smoothing)
            sampleRate: SAMPLE_RATE, // once again, the browser will resample for us
        });
        this.recorderSource = this.recorderContext.createMediaStreamSource(this.recorderStream);
        this.recorderFFT = this.recorderContext.createAnalyser();

        // Bring the FFT time domain down a bit. The default is 2048, and this must be a power
        // of two. We use 64 points because we happen to know down the line we need less than
        // that, but 32 would be too few. Large numbers are not helpful here and do not add
        // precision: they introduce higher precision outputs of the FFT (frequency data), but
        // it makes the time domain less than helpful.
        this.recorderFFT.fftSize = 64;
        this.recorderSource.connect(this.recorderFFT);

        this.recorder = new Recorder({
            encoderPath, // magic from webpack
            encoderSampleRate: SAMPLE_RATE,
            encoderApplication: 2048, // voice (default is "audio")
            streamPages: true, // this speeds up the encoding process by using CPU over time
            encoderFrameSize: 20, // ms, arbitrary frame size we send to the encoder
            numberOfChannels: CHANNELS,
            sourceNode: this.recorderSource,
            encoderBitRate: BITRATE,

            // We use low values for the following to ease CPU usage - the resulting waveform
            // is indistinguishable for a voice message. Note that the underlying library will
            // pick defaults which prefer the highest possible quality, CPU be damned.
            encoderComplexity: 3, // 0-10, 10 is slow and high quality.
            resampleQuality: 3, // 0-10, 10 is slow and high quality
        });
        this.recorder.ondataavailable = (a: ArrayBuffer) => {
            // Append the freshly encoded page to everything received so far.
            const page = new Uint8Array(a);
            const combined = new Uint8Array(this.buffer.length + page.length);
            combined.set(this.buffer, 0);
            combined.set(page, this.buffer.length);
            this.buffer = combined;
        };
    }

    /**
     * Samples the analyser and publishes the current waveform to observers.
     * Does nothing until the recording has actually started.
     */
    private emitWaveform(): void {
        if (!this.recording) return;

        // The time domain is the input to the FFT, which means we use an array of the same
        // size. The time domain is also known as the audio waveform. We're ignoring the
        // output of the FFT here (frequency data) because we're not interested in it.
        //
        // We use bytes out of the analyser because floats have weird precision problems
        // and are slightly more difficult to work with.
        const data = new Uint8Array(this.recorderFFT.fftSize);
        this.recorderFFT.getByteTimeDomainData(data);
        this.observable.update({
            waveform: VoiceRecorder.toWaveform(data),
        });
    }

    /**
     * Releases everything the capture pipeline holds: the waveform timer, the
     * audio context and the microphone tracks. Safe to call when only some of
     * them were acquired. Always leaves the recorder flagged as not recording.
     */
    private async releaseCapture(): Promise<void> {
        clearInterval(this.freqTimerId);
        try {
            // Closing an already-closed context rejects, so check the state first.
            if (this.recorderContext && this.recorderContext.state !== "closed") {
                await this.recorderContext.close();
            }
        } finally {
            // Stop all the media tracks so we can release them back to the user/OS,
            // even if closing the context failed.
            if (this.recorderStream) {
                this.recorderStream.getTracks().forEach(t => t.stop());
            }
            this.recording = false;
        }
    }

    /**
     * The observable which emits waveform updates for the in-progress recording.
     * @throws Error if no recording is currently in progress.
     */
    public get liveData(): SimpleObservable<IRecordingUpdate> {
        if (!this.recording) throw new Error("No observable when not recording");
        return this.observable;
    }

    /** Whether the underlying recorder library reports recording as supported. */
    public get isSupported(): boolean {
        return !!Recorder.isRecordingSupported();
    }

    /** Whether any encoded audio has been buffered. */
    public get hasRecording(): boolean {
        return this.buffer.length > 0;
    }

    /**
     * The content URI of the uploaded recording.
     * @throws Error if the recording has not been uploaded yet.
     */
    public get mxcUri(): string {
        if (!this.mxc) {
            throw new Error("Recording has not been uploaded yet");
        }
        return this.mxc;
    }

    /**
     * Starts capturing audio and emitting waveform updates.
     *
     * If setup or the encoder start fails, everything acquired so far (microphone,
     * audio context, timer) is released before the error is rethrown.
     *
     * @throws Error if a recording was already captured/uploaded or is in progress.
     */
    public async start(): Promise<void> {
        if (this.mxc || this.hasRecording) {
            throw new Error("Recording already prepared");
        }
        if (this.recording) {
            throw new Error("Recording already in progress");
        }
        if (this.observable) {
            this.observable.close();
        }
        this.observable = new SimpleObservable<IRecordingUpdate>();

        try {
            await this.makeRecorder();
            this.freqTimerId = setInterval(() => this.emitWaveform(), 1000 / FREQ_SAMPLE_RATE);
            await this.recorder.start();
        } catch (e) {
            // Cleanup is best-effort here: the original failure is the error the
            // caller needs to see, so a secondary cleanup failure is only logged.
            await this.releaseCapture().catch(cleanupError => {
                console.warn("Failed to release audio capture after a failed start", cleanupError);
            });
            throw e;
        }
        this.recording = true;
    }

    /**
     * Stops the recording and releases the capture resources.
     *
     * The microphone, audio context and timer are released even when the
     * encoder fails to stop, so a failure cannot leave the microphone held open.
     *
     * @returns The encoded audio buffered so far.
     * @throws Error if no recording is in progress.
     */
    public async stop(): Promise<Uint8Array> {
        if (!this.recording) {
            throw new Error("No recording to stop");
        }

        try {
            // Disconnect the source early to start shutting down resources
            this.recorderSource.disconnect();
            await this.recorder.stop();
        } finally {
            // The context is closed after the recorder has stopped so the recorder
            // doesn't try to connect anything to it (this would generate a warning).
            await this.releaseCapture();
        }

        // Finally do our post-processing and clean up
        await this.recorder.close();
        return this.buffer;
    }

    /**
     * Uploads the buffered recording, or returns the existing URI if it was
     * already uploaded.
     *
     * @returns The content URI of the uploaded recording.
     * @throws Error if there is no recorded audio to upload.
     */
    public async upload(): Promise<string> {
        if (!this.hasRecording) {
            throw new Error("No recording available to upload");
        }
        if (this.mxc) return this.mxc;

        const audio = new Blob([this.buffer], {
            type: "audio/ogg",
        });
        const response = await this.client.uploadContent(audio, {
            onlyContentUri: false, // to stop the warnings in the console
        });
        this.mxc = response['content_uri'];
        return this.mxc;
    }
}
// Expose the recorder class as a global on `window`.
// NOTE(review): presumably for debugging/console access — confirm this is still wanted.
window.mxVoiceRecorder = VoiceRecorder;