// cinny/build/lotus-denoise.js

/*
 * Lotus Chat — client-side ML noise suppression shim for Element Call.
 *
 * Element Call runs as a same-origin iframe widget that captures the mic
 * internally (via livekit-client -> getUserMedia) and publishes it to LiveKit.
 * We can't reach that track from the host. Instead this classic <script> is
 * injected (by the vite `lotus-denoise` plugin) into EC's index.html BEFORE its
 * deferred module entry, so it runs first and monkeypatches getUserMedia. When
 * the "ml" tier is selected (lotusDenoise=ml in the widget URL) we route the
 * captured mic through an RNNoise AudioWorklet (@sapphi-red/web-noise-suppressor)
 * and hand the processed track back to EC/LiveKit.
 *
 * RNNoise REQUIRES mono, 48 kHz float audio. Feeding it anything else (stereo,
 * or 44.1 kHz data the model treats as 48 kHz) produces loud static. So we:
 *   - request mono + 48 kHz capture,
 *   - run a 48 kHz AudioContext and BAIL to the raw mic if the browser refuses
 *     to give us a real 48 kHz context,
 *   - use the non-SIMD wasm (the SIMD build has produced artifacts on some GPUs).
 *
 * Any failure falls back to the unprocessed mic so calls never break.
 */
(function () {
  'use strict';

  try {
    var params = new URLSearchParams(window.location.search);
    if (params.get('lotusDenoise') !== 'ml') return;
  } catch (e) {
    return;
  }

  var md = navigator.mediaDevices;
  if (!md || typeof md.getUserMedia !== 'function') return;
  if (typeof AudioWorkletNode === 'undefined' || typeof AudioContext === 'undefined') return;

  var PROCESSOR_NAME = '@sapphi-red/web-noise-suppressor/rnnoise';
  var ASSET_BASE = './denoise/';
  var SAMPLE_RATE = 48000; // RNNoise worklet assumes 48kHz

  var origGetUserMedia = md.getUserMedia.bind(md);
  var wasmPromise = null;
  var ctxPromise = null; // shared AudioContext + worklet module, created once

  /**
   * Fetch the RNNoise wasm binary once and share it between captures.
   *
   * A successful download is cached for the lifetime of the page. A failed
   * download is NOT cached: the cache slot is cleared so the next capture can
   * retry instead of being stuck on the raw mic after one transient error.
   *
   * @returns {Promise<ArrayBuffer>} resolves with the bytes of `rnnoise.wasm`;
   *   rejects if the request fails or the response status is not ok.
   */
  function loadWasm() {
    if (!wasmPromise) {
      // Non-SIMD build for maximum compatibility — the SIMD wasm has produced
      // static on some browser/GPU combinations.
      var pending = fetch(ASSET_BASE + 'rnnoise.wasm').then(function (r) {
        if (!r.ok) throw new Error('rnnoise wasm fetch failed: ' + r.status);
        return r.arrayBuffer();
      });
      wasmPromise = pending;
      // Don't cache a rejected download forever — allow a later retry. The
      // identity check avoids clobbering a newer attempt that replaced us.
      pending.catch(function () {
        if (wasmPromise === pending) wasmPromise = null;
      });
    }
    return wasmPromise;
  }

  /**
   * Lazily create the shared 48 kHz AudioContext with the RNNoise worklet
   * module loaded, and hand the same instance to every capture.
   *
   * Every failure is reported as a rejected promise (never a synchronous
   * throw), the half-built context is closed, and the cache slot is cleared so
   * a later capture can retry.
   *
   * @returns {Promise<AudioContext>} resolves with a context whose sampleRate
   *   equals SAMPLE_RATE and which has been resumed if it started suspended;
   *   rejects if the context cannot be created, runs at another rate, or the
   *   worklet module / resume step fails.
   */
  function getContext() {
    // Best-effort close: swallow both a synchronous throw and an async
    // rejection, since we are already on a failure path.
    function closeQuietly(ctx) {
      try {
        var closing = ctx.close();
        if (closing && typeof closing.catch === 'function') {
          closing.catch(function () {});
        }
      } catch (e) {}
    }

    function createContext() {
      var ctx;
      try {
        // The constructor throws synchronously when the browser does not
        // support the requested sample rate; turn that into a rejection so
        // callers' promise-based fallback sees it.
        ctx = new AudioContext({ sampleRate: SAMPLE_RATE });
      } catch (e) {
        return Promise.reject(e);
      }
      // If the browser ignored our 48 kHz request, RNNoise would receive
      // wrong-rate data and emit static. Refuse to process in that case.
      if (ctx.sampleRate !== SAMPLE_RATE) {
        closeQuietly(ctx);
        return Promise.reject(
          new Error('AudioContext sampleRate is ' + ctx.sampleRate + ', need ' + SAMPLE_RATE),
        );
      }
      return Promise.resolve()
        .then(function () {
          return ctx.audioWorklet.addModule(ASSET_BASE + 'rnnoiseWorklet.js');
        })
        .then(function () {
          return ctx.state === 'suspended' ? ctx.resume() : undefined;
        })
        .then(function () {
          return ctx;
        })
        .catch(function (e) {
          // Don't leak an open context for every failed attempt.
          closeQuietly(ctx);
          throw e;
        });
    }

    if (!ctxPromise) {
      var pending = createContext();
      ctxPromise = pending;
      // Don't cache a rejected context forever — allow a later retry. The
      // identity check avoids clobbering a newer attempt that replaced us.
      pending.catch(function () {
        if (ctxPromise === pending) ctxPromise = null;
      });
    }
    return ctxPromise;
  }

  /**
   * Route the first audio track of a captured stream through the RNNoise
   * worklet and return a stream carrying the processed audio.
   *
   * @param {MediaStream} stream - the raw capture returned by getUserMedia.
   * @returns {Promise<MediaStream>} resolves with a new stream holding the
   *   processed audio track plus the original video tracks. Resolves with the
   *   untouched input stream when it has no audio track or when any part of
   *   the RNNoise setup fails. Never rejects.
   */
  function processStream(stream) {
    var audioTracks = stream.getAudioTracks();
    if (audioTracks.length === 0) return Promise.resolve(stream);

    // Start from an already-resolved promise so that a *synchronous* throw
    // from loadWasm()/getContext() is routed to the fallback .catch below
    // instead of escaping processStream and breaking the call.
    return Promise.resolve()
      .then(function () {
        return Promise.all([loadWasm(), getContext()]);
      })
      .then(function (res) {
        var wasmBinary = res[0];
        var ctx = res[1];

        // Force a strictly mono node: RNNoise produces static on anything else.
        var node = new AudioWorkletNode(ctx, PROCESSOR_NAME, {
          channelCount: 1,
          channelCountMode: 'explicit',
          channelInterpretation: 'speakers',
          numberOfInputs: 1,
          numberOfOutputs: 1,
          outputChannelCount: [1],
          processorOptions: { maxChannels: 1, wasmBinary: wasmBinary },
        });
        var source = ctx.createMediaStreamSource(stream);
        var dest = ctx.createMediaStreamDestination();
        source.connect(node).connect(dest);

        var origTrack = audioTracks[0];
        var processedTrack = dest.stream.getAudioTracks()[0];

        // Idempotent teardown of the per-capture graph and the raw mic track.
        var torndown = false;
        function cleanup() {
          if (torndown) return;
          torndown = true;
          try {
            node.port.postMessage('destroy');
          } catch (e) {}
          try {
            source.disconnect();
            node.disconnect();
          } catch (e) {}
          try {
            origTrack.stop();
          } catch (e) {}
          // Keep the shared AudioContext alive for the next capture.
        }

        // When EC stops the track we handed it, release the raw capture + graph.
        var rawStop = processedTrack.stop.bind(processedTrack);
        processedTrack.stop = function () {
          cleanup();
          rawStop();
        };
        // When the raw mic ends on its own, end the processed track as well.
        origTrack.addEventListener('ended', function () {
          try {
            rawStop();
          } catch (e) {}
          cleanup();
          // stop() never fires 'ended', so notify whoever holds the processed
          // track explicitly; otherwise they would not learn the mic is gone.
          try {
            processedTrack.dispatchEvent(new Event('ended'));
          } catch (e) {}
        });

        // Return a stream with the processed audio plus any original video.
        var out = new MediaStream();
        out.addTrack(processedTrack);
        stream.getVideoTracks().forEach(function (t) {
          out.addTrack(t);
        });
        return out;
      })
      .catch(function (e) {
        // Any failure -> fall back to the raw mic so calls never break.
        // eslint-disable-next-line no-console
        console.error('[lotus-denoise] RNNoise setup failed, using raw mic', e);
        return stream;
      });
  }

  // Patched getUserMedia. Requests without audio pass straight through to the
  // browser; requests with audio are rewritten for RNNoise and the captured
  // stream is handed to processStream before the caller sees it.
  navigator.mediaDevices.getUserMedia = function (constraints) {
    var requestedAudio = constraints ? constraints.audio : undefined;

    if (!requestedAudio) {
      // Nothing to denoise: forward the caller's constraints unchanged.
      return origGetUserMedia(constraints).then(function (stream) {
        return stream;
      });
    }

    // RNNoise needs mono 48 kHz and owns suppression, so these three settings
    // always override whatever the caller asked for. A bare `audio: true`
    // starts from an empty constraint set.
    var audioC = Object.assign({}, typeof requestedAudio === 'object' ? requestedAudio : {}, {
      noiseSuppression: false,
      channelCount: 1,
      sampleRate: SAMPLE_RATE,
    });

    // Keep AEC + AGC on the raw capture (they run before our processing)
    // unless the caller expressed a preference of their own.
    ['echoCancellation', 'autoGainControl'].forEach(function (key) {
      if (audioC[key] === undefined) audioC[key] = true;
    });

    // Shallow copy so the caller's constraints object is never mutated.
    var effective = Object.assign({}, constraints, { audio: audioC });
    return origGetUserMedia(effective).then(function (stream) {
      return processStream(stream);
    });
  };
})();