diff --git a/src/app/hooks/useCallSpeakers.ts b/src/app/hooks/useCallSpeakers.ts
index 64c97a1c6..de36929ba 100644
--- a/src/app/hooks/useCallSpeakers.ts
+++ b/src/app/hooks/useCallSpeakers.ts
@@ -1,60 +1,134 @@
-import { useCallback, useEffect, useMemo, useState } from 'react';
+import { useEffect, useState } from 'react';
 import { CallEmbed } from '../plugins/call';
-import { useMutationObserver } from './useMutationObserver';
 import { isUserId } from '../utils/matrix';
 import { useCallMembers, useCallSession } from './useCall';
 import { useCallJoined } from './useCallEmbed';
 
+/**
+ * Returns the set of Matrix user IDs currently speaking in the Element Call
+ * iframe.
+ *
+ * EC renders each participant's video tile with a `[data-video-fit]` wrapper.
+ * When a participant is speaking, EC draws a speaking indicator via the tile's
+ * `::before` pseudo-element `background-image` (anything other than `none`).
+ * The participant's Matrix user ID is exposed on the first descendant carrying
+ * an `aria-label`.
+ *
+ * We watch the whole iframe document so tiles added/removed mid-call are picked
+ * up automatically, and on every relevant mutation we re-scan ALL `[data-video-fit]`
+ * tiles and rebuild the set from the full current DOM state (rather than just the
+ * tiles in the mutation batch). The returned `Set` keeps its identity while the
+ * speakers are unchanged, so unrelated DOM churn does not re-render consumers.
+ */
 export const useCallSpeakers = (callEmbed: CallEmbed): Set<string> => {
   const [speakers, setSpeakers] = useState(new Set<string>());
   const callSession = useCallSession(callEmbed.room);
   const callMembers = useCallMembers(callSession);
   const joined = useCallJoined(callEmbed);
 
-  const videoContainers = useMemo(() => {
-    if (callMembers && joined) return callEmbed.document?.querySelectorAll('[data-video-fit]');
-    return undefined;
-  }, [callEmbed, callMembers, joined]);
-
-  const mutationObserver = useMutationObserver(
-    useCallback(
-      (mutations) => {
-        const s = new Set<string>();
-
-        mutations.forEach((mutation) => {
-          if (mutation.type !== 'attributes') return;
-          const el = mutation.target as HTMLElement;
-
-          const style = callEmbed.iframe.contentWindow?.getComputedStyle(el, '::before');
-          if (!style) return;
-          const tileBackgroundImage = style.getPropertyValue('background-image');
-          const speaking = tileBackgroundImage !== 'none';
-          if (!speaking) return;
-
-          const speakerId = el.querySelector('[aria-label]')?.getAttribute('aria-label');
-          if (speakerId && isUserId(speakerId)) {
-            s.add(speakerId);
-          }
-        });
-
-        setSpeakers(s);
-      },
-      [callEmbed],
-    ),
-  );
-
   useEffect(() => {
-    videoContainers?.forEach((element) => {
-      mutationObserver.observe(element, {
+    // Publish a scan result, keeping the previous Set (and skipping the
+    // re-render) when the speakers are unchanged.
+    const publish = (next: Set<string>): void => {
+      setSpeakers((prev) =>
+        prev.size === next.size && Array.from(next).every((id) => prev.has(id)) ? prev : next,
+      );
+    };
+
+    if (!callMembers || !joined) {
+      publish(new Set<string>());
+      return undefined;
+    }
+
+    const getDoc = (): Document | undefined =>
+      callEmbed.iframe.contentDocument ?? callEmbed.iframe.contentWindow?.document ?? undefined;
+
+    const syncState = (): void => {
+      const doc = getDoc();
+      if (!doc) {
+        publish(new Set<string>());
+        return;
+      }
+      const s = new Set<string>();
+      // Re-scan every tile on each mutation and build the set from the full
+      // current DOM state, not just the tiles that mutated this batch.
+      const tiles = doc.querySelectorAll<HTMLElement>('[data-video-fit]');
+      tiles.forEach((el) => {
+        const style = callEmbed.iframe.contentWindow?.getComputedStyle(el, '::before');
+        if (!style) return;
+        const tileBackgroundImage = style.getPropertyValue('background-image');
+        const speaking = tileBackgroundImage !== 'none';
+        if (!speaking) return;
+
+        const speakerId = el.querySelector('[aria-label]')?.getAttribute('aria-label');
+        if (speakerId && isUserId(speakerId)) {
+          s.add(speakerId);
+        }
+      });
+      publish(s);
+    };
+
+    // True when `node` is a tile or contains one. Nodes from the iframe live in
+    // another realm, so `instanceof Element` against this window's constructor
+    // is always false for them; compare `nodeType` instead.
+    const containsTile = (node: Node): boolean => {
+      if (node.nodeType !== Node.ELEMENT_NODE) return false;
+      const el = node as Element;
+      return el.matches('[data-video-fit]') || el.querySelector('[data-video-fit]') !== null;
+    };
+
+    const isRelevant = (m: MutationRecord): boolean =>
+      m.type === 'attributes' ||
+      (m.type === 'childList' &&
+        [m.addedNodes, m.removedNodes].some((nodes) => Array.from(nodes).some(containsTile)));
+
+    let tileObserver: MutationObserver | undefined;
+
+    // Starts observing the iframe body; returns false when there is no body yet.
+    const attachObserver = (): boolean => {
+      // `body` is null until the iframe document has parsed its <body>, and
+      // `MutationObserver.observe` throws on a non-Node target.
+      const body = getDoc()?.body;
+      if (!body) return false;
+      tileObserver?.disconnect();
+      // Watch the whole document for attribute changes on tiles (which carry
+      // the speaking indicator) and for new tiles being added/removed.
+      tileObserver = new MutationObserver((mutations) => {
+        if (mutations.some(isRelevant)) syncState();
+      });
+      tileObserver.observe(body, {
+        subtree: true,
+        childList: true,
         attributes: true,
         attributeFilter: ['class', 'style'],
       });
-    });
+      syncState();
+      return true;
+    };
+
+    // If the iframe has no body yet, wait for one to appear. `<body>` is inserted
+    // under `<html>`, not directly under the document, hence `subtree: true`.
+    let bodyWatcher: MutationObserver | undefined;
+    if (!attachObserver()) {
+      // Nothing can be scanned yet, so make sure no stale speakers linger.
+      publish(new Set<string>());
+      const doc = getDoc();
+      if (doc) {
+        bodyWatcher = new MutationObserver(() => {
+          if (attachObserver()) {
+            bodyWatcher?.disconnect();
+            bodyWatcher = undefined;
+          }
+        });
+        bodyWatcher.observe(doc, { childList: true, subtree: true });
+      }
+    }
 
     return () => {
-      mutationObserver.disconnect();
+      tileObserver?.disconnect();
+      bodyWatcher?.disconnect();
     };
-  }, [videoContainers, mutationObserver]);
+  }, [callEmbed, callMembers, joined]);
 
   return speakers;
 };