// cinny/build/lotus-denoise.js

/*
 * Lotus Chat — client-side ML noise suppression shim for Element Call.
 *
 * Element Call runs as a same-origin iframe widget that captures the mic
 * internally (via livekit-client -> getUserMedia) and publishes it to LiveKit.
 * We can't reach that track from the host. Instead this classic <script> is
 * injected (by the vite `lotus-denoise` plugin) into EC's index.html BEFORE its
 * deferred module entry, so it runs first and monkeypatches getUserMedia. When
 * the "ml" tier is selected (lotusDenoise=ml in the widget URL) we route the
 * captured mic through an RNNoise AudioWorklet (@sapphi-red/web-noise-suppressor)
 * and hand the processed track back to EC/LiveKit.
 *
 * RNNoise REQUIRES mono, 48 kHz float audio. Feeding it anything else (stereo,
 * or 44.1 kHz data the model treats as 48 kHz) produces loud static. So we:
 *   - run a 48 kHz AudioContext (which handles resampling from the hardware),
 *   - use the SIMD build if supported for better performance,
 *   - optionally keep browser-native stationary suppression on
 *     (lotusNativeNS=true; it is forced off otherwise) so fan-like noise is
 *     removed before RNNoise focuses on transient noises (keyboard, dogs, etc.).
 *
 * Any failure falls back to the unprocessed mic so calls never break.
 */
(function () {
  'use strict';

  var params;
  try {
    params = new URLSearchParams(window.location.search);
    if (params.get('lotusDenoise') !== 'ml') return;
  } catch (e) {
    return;
  }

  var md = navigator.mediaDevices;
  if (!md || typeof md.getUserMedia !== 'function') return;
  if (typeof AudioWorkletNode === 'undefined' || typeof AudioContext === 'undefined') return;

  // Base path (relative to the page) for the worklet scripts, WASM binaries
  // and helper modules this shim loads.
  var ASSET_BASE = './denoise/';

  // Selected model id from the widget URL; defaults to RNNoise.
  var MODEL = params.get('lotusModel') || 'rnnoise';
  // DTLN (@workadventure) targets 16 kHz and does not resample internally, so
  // its whole graph runs in a 16 kHz context; RNNoise/Speex (sapphi) and
  // DeepFilterNet 3 are 48 kHz fullband. The processed MediaStreamTrack is
  // published to LiveKit either way (WebRTC/Opus resamples as needed).
  var SAMPLE_RATE = MODEL === 'dtln' ? 16000 : 48000;
  // Both toggles are strictly opt-in: only the literal string 'true' enables them.
  var USE_NATIVE_NS = params.get('lotusNativeNS') === 'true';
  var USE_GATE = params.get('lotusGate') === 'true';
  // Threshold handed to the noise-gate worklet as openThreshold. A missing,
  // empty or malformed lotusGateThreshold falls back to the default instead of
  // leaking NaN into the worklet's open/close thresholds.
  var DEFAULT_GATE_THRESHOLD = -45;
  var parsedGateThreshold = parseFloat(params.get('lotusGateThreshold'));
  var GATE_THRESHOLD = isFinite(parsedGateThreshold) ? parsedGateThreshold : DEFAULT_GATE_THRESHOLD;

  // Per-processor asset table, keyed by model id (plus the optional `gate`).
  //   name     - processor name passed to `new AudioWorkletNode(ctx, name, ...)`
  //   script   - worklet module loaded with audioWorklet.addModule (under ASSET_BASE)
  //   wasm     - WASM binary fetched by loadWasm and handed to the worklet
  //   simdWasm - SIMD build of the same binary, preferred when checkSimd passes
  //   helper / esm - ES modules that buildMlNode dynamic-imports instead of
  //                  instantiating a flat worklet itself
  var PROCESSORS = {
    rnnoise: {
      name: '@sapphi-red/web-noise-suppressor/rnnoise',
      script: 'rnnoiseWorklet.js',
      wasm: 'rnnoise.wasm',
      simdWasm: 'rnnoise_simd.wasm',
    },
    speex: {
      name: '@sapphi-red/web-noise-suppressor/speex',
      script: 'speexWorklet.js',
      wasm: 'speex.wasm',
    },
    dtln: {
      // @workadventure/noise-suppression is a self-contained ES module that
      // resolves its own AudioWorklet processor + LiteRT WASM + TFLite models
      // via import.meta.url. We dynamic-import this helper and let it build the
      // node, rather than addModule-ing a flat worklet ourselves.
      helper: 'workadventure/audio-worklet.js',
    },
    deepfilternet: {
      // deepfilternet3-noise-filter ships an ESM whose AudioWorklet processor +
      // wasm-bindgen glue are INLINED as a string (loaded via a Blob URL — no
      // CDN for the worklet). The only assets it fetches are its single-threaded
      // df_bg.wasm + ONNX model, which we vendor + self-host under
      // deepfilternet/v2/... We dynamic-import the ESM, build a DeepFilterNet3Core
      // pointed at the self-hosted base, and let it create the worklet node.
      esm: 'deepfilternet/index.esm.js',
    },
    gate: {
      // Optional noise gate inserted ahead of the ML node when lotusGate=true.
      name: '@sapphi-red/web-noise-suppressor/noise-gate',
      script: 'noiseGateWorklet.js',
    },
  };

  // The unpatched getUserMedia, bound to mediaDevices so the override installed
  // below can still call through to the real implementation.
  var origGetUserMedia = md.getUserMedia.bind(md);
  // Cache of WASM download promises, keyed by model id (see loadWasm).
  var wasmPromises = {};
  // Lazily-created promise for the shared AudioContext; reset to null when
  // context setup rejects so a later call can retry (see getContext).
  var ctxPromise = null;

  // Report whether the runtime accepts the small probe module below (used to
  // decide between the SIMD and the plain WASM build). Always resolves to a
  // boolean and never rejects: any exception while validating counts as
  // "not supported".
  function checkSimd() {
    var supported = false;
    try {
      var probe = new Uint8Array([
        0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 10, 1, 8, 0, 65, 0,
        253, 15, 253, 98, 11,
      ]);
      if (WebAssembly.validate(probe)) supported = true;
    } catch (e) {
      supported = false;
    }
    return Promise.resolve(supported);
  }

  // Fetch (once) the WASM binary for `modelId` and resolve to its ArrayBuffer.
  //
  // - Resolves to null when the model has no `wasm` entry in PROCESSORS
  //   (e.g. DTLN / DeepFilterNet / gate, or an unknown id).
  // - For RNNoise the SIMD build is tried first when checkSimd() passes; a
  //   non-OK HTTP response for it falls back to the plain build.
  // - Rejects with "<modelId> wasm failed" when no usable binary is served.
  //
  // The promise is cached per model so concurrent and repeated calls share a
  // single download. A rejected download is evicted from the cache (mirroring
  // getContext), so a transient network failure does not disable denoising for
  // the rest of the page's lifetime.
  function loadWasm(modelId) {
    if (wasmPromises[modelId]) return wasmPromises[modelId];
    var p = PROCESSORS[modelId];
    if (!p || !p.wasm) return Promise.resolve(null);

    var pending = (modelId === 'rnnoise' ? checkSimd() : Promise.resolve(false)).then(
      function (simd) {
        var useSimd = !!(simd && p.simdWasm);
        return fetch(ASSET_BASE + (useSimd ? p.simdWasm : p.wasm)).then(function (r) {
          if (r.ok) return r.arrayBuffer();
          if (!useSimd) throw new Error(modelId + ' wasm failed');
          // SIMD asset missing/unavailable: retry with the plain build.
          return fetch(ASSET_BASE + p.wasm).then(function (r2) {
            if (!r2.ok) throw new Error(modelId + ' wasm failed');
            return r2.arrayBuffer();
          });
        });
      },
    );

    wasmPromises[modelId] = pending;
    // Side-branch handler: evicts the failed entry without swallowing the
    // rejection seen by callers holding `pending`.
    pending.catch(function () {
      if (wasmPromises[modelId] === pending) delete wasmPromises[modelId];
    });
    return pending;
  }

  // Lazily create the shared AudioContext at SAMPLE_RATE, load the worklet
  // modules the selected configuration needs, resume the context if it starts
  // suspended, and resolve to the context. The promise is cached so every
  // captured stream shares one context.
  //
  // Never throws synchronously: a failing AudioContext constructor or a
  // missing audioWorklet surfaces as a rejected promise, so callers' .catch
  // fallbacks always run. On any setup failure the context is closed (so
  // retries do not pile up live contexts) and the cache is cleared so a later
  // call can try again.
  function getContext() {
    if (ctxPromise) return ctxPromise;

    var ctx;
    try {
      ctx = new AudioContext({ sampleRate: SAMPLE_RATE });
    } catch (err) {
      return Promise.reject(err);
    }

    // Best-effort close; close() returns a promise whose rejection we ignore.
    function closeQuietly() {
      try {
        var closing = ctx.close();
        if (closing && typeof closing.catch === 'function') closing.catch(function () {});
      } catch (e) {}
    }

    // The models assume SAMPLE_RATE; refuse a context running at another rate.
    if (ctx.sampleRate !== SAMPLE_RATE) {
      closeQuietly();
      return Promise.reject(new Error('SampleRate mismatch: ' + ctx.sampleRate));
    }

    // Load worklet modules. DTLN registers its own processor via the
    // dynamic-imported helper (see buildMlNode), so it needs nothing here.
    var scripts = [];
    if (MODEL === 'rnnoise' || MODEL === 'speex') scripts.push(PROCESSORS[MODEL].script);
    if (USE_GATE) scripts.push(PROCESSORS.gate.script);

    var pending = Promise.resolve()
      .then(function () {
        return Promise.all(
          scripts.map(function (s) {
            return ctx.audioWorklet.addModule(ASSET_BASE + s);
          }),
        );
      })
      .then(function () {
        if (ctx.state !== 'suspended') return ctx;
        return ctx.resume().then(function () {
          return ctx;
        });
      })
      .catch(function (err) {
        closeQuietly();
        throw err;
      });

    ctxPromise = pending;
    // Side-branch handler: clears the cache on failure without swallowing the
    // rejection seen by callers holding `pending`.
    pending.catch(function () {
      if (ctxPromise === pending) ctxPromise = null;
    });
    return pending;
  }

  // Set once the "active: true" status message has been posted to the parent
  // window, so later captures do not re-announce it (see processStream).
  var hasNotifiedActive = false;

  // Build the ML denoise AudioWorkletNode for the selected MODEL.
  //
  // - RNNoise/Speex are flat sapphi worklets we instantiate directly with the
  //   fetched WASM binary (`wasmBinary`, as resolved by loadWasm).
  // - DTLN comes from @workadventure's self-contained helper, which we
  //   dynamic-import; it resolves its own processor + LiteRT WASM + TFLite
  //   models internally. Its return value is passed through untouched —
  //   presumably { node, ready, dispose }; confirm against the helper.
  // - DeepFilterNet 3 is built through its DeepFilterNet3Core after
  //   initialize() resolves.
  //
  // Resolves to { node, ready, dispose }. Always returns a promise: an unknown
  // model id or a synchronous failure while constructing the node becomes a
  // rejection instead of a thrown exception.
  function buildMlNode(ctx, wasmBinary) {
    try {
      if (MODEL === 'dtln') {
        return import(ASSET_BASE + PROCESSORS.dtln.helper).then(function (mod) {
          // bypassUntilReady: pass raw audio through until the model is loaded so
          // the call never has a silent/missing track during init.
          return mod.createNoiseSuppressionAudioWorklet(ctx, { bypassUntilReady: true });
        });
      }
      if (MODEL === 'deepfilternet') {
        // Resolve an absolute self-hosted base so the package's cdnUrl override
        // fetches our vendored df_bg.wasm + ONNX model (never the upstream CDN).
        var dfnBase = new URL(ASSET_BASE + 'deepfilternet', window.location.href).href;
        return import(ASSET_BASE + PROCESSORS.deepfilternet.esm).then(function (mod) {
          var core = new mod.DeepFilterNet3Core({
            sampleRate: SAMPLE_RATE,
            noiseReductionLevel: 80,
            assetConfig: { cdnUrl: dfnBase },
          });
          // initialize() fetches + compiles the wasm and loads the model on the
          // main thread; the worklet node only exists once that resolves, so the
          // graph is connected with a ready model (no half-initialised passthrough).
          return core.initialize().then(function () {
            return core.createAudioWorkletNode(ctx).then(function (node) {
              return {
                node: node,
                ready: Promise.resolve(),
                dispose: function () {
                  try {
                    core.destroy();
                  } catch (e) {}
                },
              };
            });
          });
        });
      }

      // Flat worklet path: the model must be a known entry with a processor
      // name, otherwise fail with a message that names the offending id.
      if (!Object.prototype.hasOwnProperty.call(PROCESSORS, MODEL) || !PROCESSORS[MODEL].name) {
        throw new Error('Unknown denoise model: ' + MODEL);
      }
      var node = new AudioWorkletNode(ctx, PROCESSORS[MODEL].name, {
        channelCount: 1,
        numberOfInputs: 1,
        numberOfOutputs: 1,
        processorOptions: { maxChannels: 1, wasmBinary: wasmBinary },
      });
      return Promise.resolve({
        node: node,
        ready: Promise.resolve(),
        dispose: function () {
          // Ask the worklet processor to release its WASM state.
          try {
            node.port.postMessage('destroy');
          } catch (e) {}
        },
      });
    } catch (err) {
      return Promise.reject(err);
    }
  }

  // Route the first audio track of `stream` through the denoise graph
  //   source -> [noise gate] -> ML node -> MediaStreamDestination
  // and resolve to a new MediaStream holding the processed audio track plus the
  // original video tracks. A stream without audio tracks is returned as-is.
  //
  // Never rejects and never throws after the audio-track check: any setup
  // failure (including a synchronous throw from context creation) is logged,
  // reported to the parent window as { active: false, error }, the partially
  // built graph is torn down, and the ORIGINAL stream is returned so the call
  // keeps working.
  //
  // Stopping the processed track (or the original track ending) disposes the ML
  // node, disconnects every graph node created here (gate included) and stops
  // the original mic track.
  function processStream(stream) {
    var audioTracks = stream.getAudioTracks();
    if (audioTracks.length === 0) return Promise.resolve(stream);

    // Everything created for this stream, so teardown and the failure path can
    // release it regardless of how far setup got.
    var graphNodes = [];
    var ml = null;

    function disconnectGraph() {
      graphNodes.forEach(function (n) {
        try {
          n.disconnect();
        } catch (e) {}
      });
      graphNodes = [];
    }

    function disposeMl() {
      if (!ml) return;
      try {
        ml.dispose();
      } catch (e) {}
    }

    // Start from a resolved promise so a synchronous throw in loadWasm /
    // getContext lands in the .catch fallback below instead of escaping.
    return Promise.resolve()
      .then(function () {
        return Promise.all([loadWasm(MODEL), getContext()]);
      })
      .then(function (res) {
        var wasmBinary = res[0];
        var ctx = res[1];

        var source = ctx.createMediaStreamSource(stream);
        graphNodes.push(source);
        var dest = ctx.createMediaStreamDestination();
        var head = source;

        // 1. Optional Noise Gate
        if (USE_GATE) {
          var gateNode = new AudioWorkletNode(ctx, PROCESSORS.gate.name, {
            processorOptions: {
              openThreshold: GATE_THRESHOLD,
              closeThreshold: GATE_THRESHOLD - 5,
              holdMs: 150,
              maxChannels: 1,
            },
          });
          graphNodes.push(gateNode);
          head.connect(gateNode);
          head = gateNode;
        }

        // 2. ML Processor
        return buildMlNode(ctx, wasmBinary).then(function (built) {
          ml = built;
          var mlNode = ml.node;
          graphNodes.push(mlNode);
          head.connect(mlNode);
          mlNode.connect(dest);

          // Surface async init failures (e.g. DTLN model load) without blocking
          // the track handoff — audio flows via bypassUntilReady meanwhile.
          if (ml.ready && typeof ml.ready.then === 'function') {
            ml.ready.catch(function (err) {
              var m = err instanceof Error ? err.message : String(err);
              console.error('[lotus-denoise] ' + MODEL + ' init failed:', m);
            });
          }

          var origTrack = audioTracks[0];
          var processedTrack = dest.stream.getAudioTracks()[0];

          // Idempotent teardown shared by both stop paths below.
          var torndown = false;
          function cleanup() {
            if (torndown) return;
            torndown = true;
            disposeMl();
            disconnectGraph();
            try {
              origTrack.stop();
            } catch (e) {}
          }

          // The consumer only holds the processed track; stopping it must also
          // release the graph and the real microphone.
          var rawStop = processedTrack.stop.bind(processedTrack);
          processedTrack.stop = function () {
            cleanup();
            rawStop();
          };
          // If the mic itself ends (e.g. device unplugged), end the processed
          // track too.
          origTrack.addEventListener('ended', function () {
            try {
              rawStop();
            } catch (e) {}
            cleanup();
          });

          if (!hasNotifiedActive) {
            hasNotifiedActive = true;
            window.parent.postMessage(
              {
                type: 'lotus-denoise-status',
                active: true,
                model: MODEL,
                nativeNS: USE_NATIVE_NS,
                gate: USE_GATE,
              },
              '*',
            );
          }

          var out = new MediaStream();
          out.addTrack(processedTrack);
          stream.getVideoTracks().forEach(function (t) {
            out.addTrack(t);
          });
          return out;
        });
      })
      .catch(function (e) {
        // Release whatever was built before the failure; the original track is
        // left running because the raw stream is what we hand back.
        disposeMl();
        disconnectGraph();
        var msg = e instanceof Error ? e.message : String(e);
        console.error('[lotus-denoise] Setup failed:', msg);
        window.parent.postMessage({ type: 'lotus-denoise-status', active: false, error: msg }, '*');
        return stream;
      });
  }

  // Replacement getUserMedia. Requests that include audio get their audio
  // constraints rewritten (mono capture, native noise suppression forced to
  // USE_NATIVE_NS, echo cancellation / auto gain defaulted to on when the
  // caller left them unset) and the resulting stream is passed through
  // processStream. Requests without audio go to the original implementation
  // unchanged. The caller's constraints object is never mutated.
  navigator.mediaDevices.getUserMedia = function (constraints) {
    var wantsAudio = Boolean(constraints && constraints.audio);
    var effective = constraints;

    if (wantsAudio) {
      // `audio: true` carries no settings of its own; start from an empty set.
      var requested = typeof constraints.audio === 'object' ? constraints.audio : {};
      var audioC = Object.assign({}, requested);
      audioC.noiseSuppression = USE_NATIVE_NS;
      audioC.channelCount = 1;
      ['echoCancellation', 'autoGainControl'].forEach(function (key) {
        if (audioC[key] === undefined) audioC[key] = true;
      });
      effective = Object.assign({}, constraints, { audio: audioC });
    }

    return origGetUserMedia(effective).then(function (stream) {
      if (!wantsAudio) return stream;
      return processStream(stream);
    });
  };
})();