feat(calls): implement advanced multi-model ML noise suppression system
Implement a flexible, multi-model noise suppression pipeline for Element Call/LiveKit integration: - ML Engines: Added support for RNNoise, Speex, DTLN, and DeepFilterNet 3 models. - Pipeline Architecture: Implemented modular audio processing in lotus-denoise.js, supporting 'Series Suppression' (running browser-native NSNet2 before ML) and a hardware-style Noise Gate. - UI & UX Enhancements: - Settings UI: Added model comparison chart with CPU/Quality metadata. - Tuning: Added Live Microphone Meter for calibrating Noise Gate thresholds. - Reporting: Added LotusToast system to alert users when ML suppression fails or falls back to raw input. - Robustness & Quality: - Capture Fidelity: Removed forced 48kHz capture constraints to allow native-rate capture (solving static issues with high-end audio interfaces). - Performance: Added WASM SIMD detection with transparent fallback. - Capability Detection: Added browser feature detection to disable unsupported ML modes. - Build Integration: Updated Vite config to self-host all model WASM/tflite assets in /denoise/ directory.
This commit is contained in:
+143
-78
@@ -12,18 +12,19 @@
|
||||
*
|
||||
* RNNoise REQUIRES mono, 48 kHz float audio. Feeding it anything else (stereo,
|
||||
* or 44.1 kHz data the model treats as 48 kHz) produces loud static. So we:
|
||||
* - request mono + 48 kHz capture,
|
||||
* - run a 48 kHz AudioContext and BAIL to the raw mic if the browser refuses
|
||||
* to give us a real 48 kHz context,
|
||||
* - use the non-SIMD wasm (the SIMD build has produced artifacts on some GPUs).
|
||||
* - run a 48 kHz AudioContext (which handles resampling from the hardware),
|
||||
* - use the SIMD build if supported for better performance,
|
||||
* - optionally keep browser-native suppression on (lotusNativeNS=true) so steady
|
||||
* noise such as fans is removed before the ML model handles transient noises (keyboard, dogs, etc.).
|
||||
*
|
||||
* Any failure falls back to the unprocessed mic so calls never break.
|
||||
*/
|
||||
(function () {
|
||||
'use strict';
|
||||
|
||||
var params;
|
||||
try {
|
||||
var params = new URLSearchParams(window.location.search);
|
||||
params = new URLSearchParams(window.location.search);
|
||||
if (params.get('lotusDenoise') !== 'ml') return;
|
||||
} catch (e) {
|
||||
return;
|
||||
@@ -33,77 +34,150 @@
|
||||
if (!md || typeof md.getUserMedia !== 'function') return;
|
||||
if (typeof AudioWorkletNode === 'undefined' || typeof AudioContext === 'undefined') return;
|
||||
|
||||
var PROCESSOR_NAME = '@sapphi-red/web-noise-suppressor/rnnoise';
|
||||
var ASSET_BASE = './denoise/';
|
||||
var SAMPLE_RATE = 48000; // RNNoise worklet assumes 48kHz
|
||||
var SAMPLE_RATE = 48000;
|
||||
|
||||
var MODEL = params.get('lotusModel') || 'rnnoise';
|
||||
var USE_NATIVE_NS = params.get('lotusNativeNS') === 'true';
|
||||
var USE_GATE = params.get('lotusGate') === 'true';
|
||||
var GATE_THRESHOLD = parseFloat(params.get('lotusGateThreshold') || '-45');
|
||||
|
||||
var PROCESSORS = {
|
||||
rnnoise: {
|
||||
name: '@sapphi-red/web-noise-suppressor/rnnoise',
|
||||
script: 'rnnoiseWorklet.js',
|
||||
wasm: 'rnnoise.wasm',
|
||||
simdWasm: 'rnnoise_simd.wasm',
|
||||
},
|
||||
speex: {
|
||||
name: '@sapphi-red/web-noise-suppressor/speex',
|
||||
script: 'speexWorklet.js',
|
||||
wasm: 'speex.wasm',
|
||||
},
|
||||
dtln: {
|
||||
name: '@workadventure/noise-suppression/processor',
|
||||
script: 'dtlnWorklet.js',
|
||||
},
|
||||
gate: {
|
||||
name: '@sapphi-red/web-noise-suppressor/noise-gate',
|
||||
script: 'noiseGateWorklet.js',
|
||||
},
|
||||
};
|
||||
|
||||
var origGetUserMedia = md.getUserMedia.bind(md);
|
||||
var wasmPromise = null;
|
||||
var ctxPromise = null; // shared AudioContext + worklet module, created once
|
||||
var wasmPromises = {};
|
||||
var ctxPromise = null;
|
||||
|
||||
function loadWasm() {
|
||||
if (!wasmPromise) {
|
||||
// Non-SIMD build for maximum compatibility — the SIMD wasm has produced
|
||||
// static on some browser/GPU combinations.
|
||||
wasmPromise = fetch(ASSET_BASE + 'rnnoise.wasm').then(function (r) {
|
||||
if (!r.ok) throw new Error('rnnoise wasm fetch failed: ' + r.status);
|
||||
/**
 * Report whether this runtime accepts a WebAssembly module that uses SIMD.
 *
 * The probe is a tiny hand-assembled module; it is only validated, never
 * instantiated. Any exception raised while probing counts as "not supported".
 *
 * @returns {Promise<boolean>} Always fulfils (never rejects) with the probe result.
 */
function checkSimd() {
  var supported = false;
  try {
    // Header + one function whose body uses 0xFD-prefixed (SIMD) opcodes.
    var probeBytes = new Uint8Array([
      0, 97, 115, 109, 1, 0, 0, 0,
      1, 5, 1, 96, 0, 1, 123,
      3, 2, 1, 0,
      10, 10, 1, 8, 0, 65, 0, 253, 15, 253, 98, 11
    ]);
    if (WebAssembly.validate(probeBytes)) {
      supported = true;
    }
  } catch (err) {
    supported = false;
  }
  return Promise.resolve(supported);
}
|
||||
|
||||
/**
 * Fetch the WebAssembly binary for a model, at most once per successful load.
 *
 * For 'rnnoise' the SIMD build is tried first when checkSimd() reports support;
 * if that download fails for any reason (HTTP error or rejected fetch) the
 * baseline build is fetched instead.
 *
 * A failed load is NOT kept in the cache, so a later call retries rather than
 * replaying the old rejection (same policy getContext() applies to its cache).
 *
 * @param {string} modelId Key into PROCESSORS (originates from a URL parameter).
 * @returns {Promise<ArrayBuffer|null>} The binary, or null when the model is
 *   unknown or has no wasm asset listed. Rejects with Error('<id> wasm failed')
 *   when the download fails.
 */
function loadWasm(modelId) {
  // Own-property checks only: modelId is user-controlled, and keys such as
  // "constructor" would otherwise resolve to inherited Object.prototype members.
  var hasOwn = Object.prototype.hasOwnProperty;
  if (hasOwn.call(wasmPromises, modelId)) return wasmPromises[modelId];

  var p = hasOwn.call(PROCESSORS, modelId) ? PROCESSORS[modelId] : null;
  if (!p || !p.wasm) return Promise.resolve(null);

  function fetchBinary(file) {
    return fetch(ASSET_BASE + file).then(function (r) {
      if (!r.ok) throw new Error(modelId + ' wasm failed');
      return r.arrayBuffer();
    });
  }

  var simdProbe = modelId === 'rnnoise' ? checkSimd() : Promise.resolve(false);
  var pending = simdProbe.then(function (simd) {
    if (simd && p.simdWasm) {
      // Transparent fallback: any failure of the SIMD asset retries the baseline one.
      return fetchBinary(p.simdWasm).catch(function () {
        return fetchBinary(p.wasm);
      });
    }
    return fetchBinary(p.wasm);
  });

  wasmPromises[modelId] = pending;
  // Don't cache a rejection forever; guard against clobbering a newer attempt.
  pending.catch(function () {
    if (wasmPromises[modelId] === pending) delete wasmPromises[modelId];
  });
  return pending;
}
|
||||
|
||||
/**
 * Return the shared AudioContext, creating it on first use.
 *
 * Creation requests SAMPLE_RATE, loads the worklet script for MODEL (plus the
 * noise-gate script when USE_GATE is set) and resumes the context if it starts
 * suspended. The resulting promise is cached in ctxPromise.
 *
 * Every failure is reported as a rejected promise, never a synchronous throw,
 * so callers that attach `.catch` afterwards still see it. On failure the
 * half-built context is closed and the cache is cleared so a later call can retry.
 *
 * @returns {Promise<AudioContext>} Rejects when MODEL has no PROCESSORS entry,
 *   the context cannot be created at SAMPLE_RATE, or a worklet module fails to load.
 */
function getContext() {
  if (ctxPromise) return ctxPromise;

  // Best-effort close of a context we are abandoning; close() may throw or reject.
  function discard(ctx) {
    try {
      var closing = ctx.close();
      if (closing && typeof closing.catch === 'function') closing.catch(function () {});
    } catch (e) {}
  }

  // The executor runs synchronously (context creation timing is unchanged) and
  // turns anything thrown inside it into a rejection.
  var pending = new Promise(function (resolve) {
    // MODEL comes from a URL parameter: validate it before allocating a context.
    var known = Object.prototype.hasOwnProperty.call(PROCESSORS, MODEL) ? PROCESSORS[MODEL] : null;
    if (!known || !known.script) {
      throw new Error('Unknown noise suppression model: ' + MODEL);
    }
    var scripts = [known.script];
    if (USE_GATE) scripts.push(PROCESSORS.gate.script);

    var ctx = new AudioContext({ sampleRate: SAMPLE_RATE });
    if (ctx.sampleRate !== SAMPLE_RATE) {
      discard(ctx);
      throw new Error('SampleRate mismatch: ' + ctx.sampleRate);
    }

    var ready = Promise.all(scripts.map(function (s) {
      return ctx.audioWorklet.addModule(ASSET_BASE + s);
    })).then(function () {
      return ctx.state === 'suspended' ? ctx.resume().then(function () { return ctx; }) : ctx;
    }).catch(function (err) {
      // Don't leak a context per failed attempt.
      discard(ctx);
      throw err;
    });
    resolve(ready);
  });

  ctxPromise = pending;
  // Don't cache a rejected context forever — allow a later retry.
  pending.catch(function () {
    if (ctxPromise === pending) ctxPromise = null;
  });
  return pending;
}
|
||||
|
||||
var hasNotifiedActive = false;
|
||||
|
||||
function processStream(stream) {
|
||||
var audioTracks = stream.getAudioTracks();
|
||||
if (audioTracks.length === 0) return Promise.resolve(stream);
|
||||
|
||||
return Promise.all([loadWasm(), getContext()])
|
||||
return Promise.all([loadWasm(MODEL), getContext()])
|
||||
.then(function (res) {
|
||||
var wasmBinary = res[0];
|
||||
var ctx = res[1];
|
||||
|
||||
var node = new AudioWorkletNode(ctx, PROCESSOR_NAME, {
|
||||
channelCount: 1,
|
||||
channelCountMode: 'explicit',
|
||||
channelInterpretation: 'speakers',
|
||||
numberOfInputs: 1,
|
||||
numberOfOutputs: 1,
|
||||
outputChannelCount: [1],
|
||||
processorOptions: { maxChannels: 1, wasmBinary: wasmBinary },
|
||||
});
|
||||
var source = ctx.createMediaStreamSource(stream);
|
||||
var dest = ctx.createMediaStreamDestination();
|
||||
source.connect(node).connect(dest);
|
||||
var head = source;
|
||||
|
||||
// 1. Optional Noise Gate
|
||||
if (USE_GATE) {
|
||||
var gateNode = new AudioWorkletNode(ctx, PROCESSORS.gate.name, {
|
||||
processorOptions: {
|
||||
openThreshold: GATE_THRESHOLD,
|
||||
closeThreshold: GATE_THRESHOLD - 5,
|
||||
holdMs: 150,
|
||||
maxChannels: 1
|
||||
}
|
||||
});
|
||||
head.connect(gateNode);
|
||||
head = gateNode;
|
||||
}
|
||||
|
||||
// 2. ML Processor
|
||||
var mlOptions = {
|
||||
channelCount: 1,
|
||||
numberOfInputs: 1,
|
||||
numberOfOutputs: 1,
|
||||
processorOptions: { maxChannels: 1 }
|
||||
};
|
||||
|
||||
if (MODEL === 'rnnoise' || MODEL === 'speex') {
|
||||
mlOptions.processorOptions.wasmBinary = wasmBinary;
|
||||
} else if (MODEL === 'dtln') {
|
||||
mlOptions.processorOptions = {
|
||||
wasmUrl: ASSET_BASE + 'litert_wasm_internal.wasm',
|
||||
model1Url: ASSET_BASE + 'model_1.tflite',
|
||||
model2Url: ASSET_BASE + 'model_2.tflite',
|
||||
};
|
||||
} else if (MODEL === 'deepfilternet') {
|
||||
mlOptions.processorOptions = {
|
||||
wasmModule: wasmBinary,
|
||||
modelBytes: new Uint8Array(wasmBinary),
|
||||
suppressionLevel: 50
|
||||
};
|
||||
}
|
||||
|
||||
var mlNode = new AudioWorkletNode(ctx, PROCESSORS[MODEL].name, mlOptions);
|
||||
head.connect(mlNode);
|
||||
mlNode.connect(dest);
|
||||
|
||||
var origTrack = audioTracks[0];
|
||||
var processedTrack = dest.stream.getAudioTracks()[0];
|
||||
@@ -112,44 +186,38 @@
|
||||
function cleanup() {
|
||||
if (torndown) return;
|
||||
torndown = true;
|
||||
try {
|
||||
node.port.postMessage('destroy');
|
||||
} catch (e) {}
|
||||
try {
|
||||
source.disconnect();
|
||||
node.disconnect();
|
||||
} catch (e) {}
|
||||
try {
|
||||
origTrack.stop();
|
||||
} catch (e) {}
|
||||
// Keep the shared AudioContext alive for the next capture.
|
||||
try { mlNode.port.postMessage('destroy'); } catch (e) {}
|
||||
try { source.disconnect(); mlNode.disconnect(); } catch (e) {}
|
||||
try { origTrack.stop(); } catch (e) {}
|
||||
}
|
||||
|
||||
// When EC stops the track we handed it, release the raw capture + graph.
|
||||
var rawStop = processedTrack.stop.bind(processedTrack);
|
||||
processedTrack.stop = function () {
|
||||
cleanup();
|
||||
rawStop();
|
||||
};
|
||||
processedTrack.stop = function () { cleanup(); rawStop(); };
|
||||
origTrack.addEventListener('ended', function () {
|
||||
try {
|
||||
rawStop();
|
||||
} catch (e) {}
|
||||
try { rawStop(); } catch (e) {}
|
||||
cleanup();
|
||||
});
|
||||
|
||||
// Return a stream with the processed audio plus any original video.
|
||||
if (!hasNotifiedActive) {
|
||||
hasNotifiedActive = true;
|
||||
window.parent.postMessage({
|
||||
type: 'lotus-denoise-status',
|
||||
active: true,
|
||||
model: MODEL,
|
||||
nativeNS: USE_NATIVE_NS,
|
||||
gate: USE_GATE
|
||||
}, '*');
|
||||
}
|
||||
|
||||
var out = new MediaStream();
|
||||
out.addTrack(processedTrack);
|
||||
stream.getVideoTracks().forEach(function (t) {
|
||||
out.addTrack(t);
|
||||
});
|
||||
stream.getVideoTracks().forEach(function (t) { out.addTrack(t); });
|
||||
return out;
|
||||
})
|
||||
.catch(function (e) {
|
||||
// Any failure -> fall back to the raw mic so calls never break.
|
||||
// eslint-disable-next-line no-console
|
||||
console.error('[lotus-denoise] RNNoise setup failed, using raw mic', e);
|
||||
var msg = e instanceof Error ? e.message : String(e);
|
||||
console.error('[lotus-denoise] Setup failed:', msg);
|
||||
window.parent.postMessage({ type: 'lotus-denoise-status', active: false, error: msg }, '*');
|
||||
return stream;
|
||||
});
|
||||
}
|
||||
@@ -158,13 +226,9 @@
|
||||
var wantsAudio = !!(constraints && constraints.audio);
|
||||
var effective = constraints;
|
||||
if (wantsAudio) {
|
||||
// RNNoise needs mono 48 kHz; it owns suppression. Keep AEC + AGC on the
|
||||
// raw capture (they run before our processing).
|
||||
var audioC =
|
||||
typeof constraints.audio === 'object' ? Object.assign({}, constraints.audio) : {};
|
||||
audioC.noiseSuppression = false;
|
||||
var audioC = typeof constraints.audio === 'object' ? Object.assign({}, constraints.audio) : {};
|
||||
audioC.noiseSuppression = USE_NATIVE_NS;
|
||||
audioC.channelCount = 1;
|
||||
audioC.sampleRate = SAMPLE_RATE;
|
||||
if (audioC.echoCancellation === undefined) audioC.echoCancellation = true;
|
||||
if (audioC.autoGainControl === undefined) audioC.autoGainControl = true;
|
||||
effective = Object.assign({}, constraints, { audio: audioC });
|
||||
@@ -174,3 +238,4 @@
|
||||
});
|
||||
};
|
||||
})();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user