fix(calls): DTLN at 16kHz + raw-capture A/B; explains weak/robotic results

Two issues found from real testing of the in-app tester:

1. Raw ≈ RNNoise ≈ Speex sounded identical in Record & compare because the clip was captured with browser noise suppression ON (the user's native-NS setting), so "Raw" was already cleaned and the models had nothing left to remove. Record & compare now captures fully raw audio (noiseSuppression / autoGainControl / echoCancellation off) so each model's effect on real noise is audible. (Friends still heard differences in calls — the models work; the test was feeding them pre-cleaned audio.)

2. DTLN was robotic/choppy/quiet because @workadventure/noise-suppression targets 16 kHz (AUDIO_CONFIG.sampleRate) and does NOT resample internally, while we ran it at 48 kHz. Run DTLN's whole graph in a 16 kHz context:
   - denoisePipeline: add sampleRateFor(model) (16k for dtln, 48k otherwise); tester live-monitor + playback contexts use it (bufferSource resamples the 48k clip down for DTLN).
   - shim (build/lotus-denoise.js): SAMPLE_RATE is now model-aware, so DTLN is correct in real calls too (it was previously broken at 48 kHz). The 16 kHz processed track is still published to LiveKit (WebRTC/Opus resamples).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-17 17:27:15 -04:00
parent 14cfa021c5
commit abb7f743b8
3 changed files with 37 additions and 11 deletions
@@ -35,9 +35,13 @@
  if (typeof AudioWorkletNode === 'undefined' || typeof AudioContext === 'undefined') return;

  var ASSET_BASE = './denoise/';
-  var SAMPLE_RATE = 48000;

  var MODEL = params.get('lotusModel') || 'rnnoise';
+  // DTLN (@workadventure) targets 16 kHz and does not resample internally, so
+  // its whole graph runs in a 16 kHz context; RNNoise/Speex (sapphi) need
+  // 48 kHz. The processed MediaStreamTrack is published to LiveKit either way
+  // (WebRTC/Opus resamples as needed).
+  var SAMPLE_RATE = MODEL === 'dtln' ? 16000 : 48000;
  var USE_NATIVE_NS = params.get('lotusNativeNS') === 'true';
  var USE_GATE = params.get('lotusGate') === 'true';
  var GATE_THRESHOLD = parseFloat(params.get('lotusGateThreshold') || '-45');
@@ -2,15 +2,16 @@ import React, { useCallback, useEffect, useRef, useState } from 'react';
 import { Box, Button, Text } from 'folds';
 import { DenoiseModelId } from '../../../state/settings';
 import {
-  DENOISE_SAMPLE_RATE,
  DenoiseNode,
  buildGateNode,
  buildModelNode,
  readDb,
+  sampleRateFor,
 } from '../../../utils/denoisePipeline';

 const MAX_RECORD_MS = 6000;

+// Live monitor mirrors the call's capture (respects the user's native-NS choice).
 const MIC_CONSTRAINTS = (nativeNS: boolean): MediaStreamConstraints => ({
  audio: {
    noiseSuppression: nativeNS,
@@ -20,6 +21,19 @@ const MIC_CONSTRAINTS = (nativeNS: boolean): MediaStreamConstraints => ({
  },
 });

+// Record & compare captures fully RAW audio (no browser noise suppression / AGC
+// / echo cancel) so each model's effect on real background noise is audible.
+// Capturing with native NS on would pre-clean the clip and make Raw/RNNoise/
+// Speex sound identical.
+const RAW_CONSTRAINTS: MediaStreamConstraints = {
+  audio: {
+    noiseSuppression: false,
+    echoCancellation: false,
+    autoGainControl: false,
+    channelCount: 1,
+  },
+};
+
 /** A -100..0 dBFS bar with optional threshold marker. */
 function DbMeter({ label, db, threshold }: { label: string; db: number; threshold?: number }) {
  const pct = Math.max(0, Math.min(100, db + 100));
@@ -112,7 +126,7 @@ export function DenoiseTester({ model, useGate, gateThreshold, nativeNS }: Denoi

  const startLive = async () => {
    try {
-      const ctx = new AudioContext({ sampleRate: DENOISE_SAMPLE_RATE });
+      const ctx = new AudioContext({ sampleRate: sampleRateFor(model) });
      const stream = await navigator.mediaDevices.getUserMedia(MIC_CONSTRAINTS(nativeNS));
      const source = ctx.createMediaStreamSource(stream);
      const inAnalyser = ctx.createAnalyser();
@@ -192,8 +206,8 @@ export function DenoiseTester({ model, useGate, gateThreshold, nativeNS }: Denoi

  const startRecord = async () => {
    try {
-      const stream = await navigator.mediaDevices.getUserMedia(MIC_CONSTRAINTS(nativeNS));
-      const ctx = new AudioContext({ sampleRate: DENOISE_SAMPLE_RATE });
+      const stream = await navigator.mediaDevices.getUserMedia(RAW_CONSTRAINTS);
+      const ctx = new AudioContext();
      const source = ctx.createMediaStreamSource(stream);
      const analyser = ctx.createAnalyser();
      analyser.fftSize = 1024;
@@ -209,7 +223,7 @@ export function DenoiseTester({ model, useGate, gateThreshold, nativeNS }: Denoi
        teardownRecorder();
        setRecDb(-100);
        try {
-          const decodeCtx = new AudioContext({ sampleRate: DENOISE_SAMPLE_RATE });
+          const decodeCtx = new AudioContext({ sampleRate: 48000 });
          clipRef.current = await decodeCtx.decodeAudioData(await blob.arrayBuffer());
          decodeCtx.close().catch(() => undefined);
          setHasClip(true);
@@ -254,7 +268,9 @@ export function DenoiseTester({ model, useGate, gateThreshold, nativeNS }: Denoi
    const clip = clipRef.current;
    if (!clip) return;
    try {
-      const ctx = new AudioContext({ sampleRate: DENOISE_SAMPLE_RATE });
+      // bufferSource auto-resamples the 48 kHz clip to the context rate, so DTLN
+      // gets the 16 kHz it needs while raw/RNNoise/Speex stay at 48 kHz.
+      const ctx = new AudioContext({ sampleRate: sampleRateFor(playModel ?? 'rnnoise') });
      const source = ctx.createBufferSource();
      source.buffer = clip;
      if (playModel) {
@@ -327,8 +343,9 @@ export function DenoiseTester({ model, useGate, gateThreshold, nativeNS }: Denoi
        <Text size="T300">Record &amp; compare</Text>
        <Text size="T200" priority="300">
          Record up to {MAX_RECORD_MS / 1000}s of yourself with your usual background noise, then
-          play the same clip back raw vs through each model to A/B them. (Uses the gate when it's
-          enabled above.)
+          play the same clip back raw vs through each model to A/B them. Captured fully raw (browser
+          noise suppression off) so each model&apos;s effect is audible; uses the gate when enabled
+          above.
        </Text>
        <Box direction="Row" gap="200" alignItems="Center">
          <Button
@@ -13,8 +13,13 @@ import { DenoiseModelId } from '../state/settings';
 // Mirror CallEmbed's widget-base resolution so assets resolve under any base.
 const BASE = `${import.meta.env.BASE_URL.replace(/\/+$/, '')}/public/element-call/denoise/`;

-/** RNNoise/Speex/DTLN all assume mono 48 kHz, matching the call pipeline. */
-export const DENOISE_SAMPLE_RATE = 48000;
+/**
+ * Required AudioContext sample rate per model. RNNoise/Speex (sapphi) assume
+ * 48 kHz. DTLN (@workadventure) targets 16 kHz and does NOT resample internally
+ * — running it at 48 kHz produces robotic/choppy/quiet output, so its whole
+ * graph must run in a 16 kHz context.
+ */
+export const sampleRateFor = (model: DenoiseModelId): number => (model === 'dtln' ? 16000 : 48000);

 export type DenoiseNode = {
  node: AudioWorkletNode;