diff --git a/LOTUS_E2EE_INVESTIGATION.md b/LOTUS_E2EE_INVESTIGATION.md new file mode 100644 index 000000000..19719d78b --- /dev/null +++ b/LOTUS_E2EE_INVESTIGATION.md @@ -0,0 +1,402 @@ +# Lotus Chat — E2EE Investigation Runbook (KE-1 → KE-4) + +> **Scope:** evidence-gathering only. Do **not** apply fixes from this document +> without a cross-system planning session (client rust-crypto ↔ Synapse ↔ +> Element Call MatrixRTC). Symptom source: `LOTUS_BUGS.md` §"Encryption / E2EE" +> (KE-1..KE-4), observed live 2026-06-30 on `chat.lotusguild.org` during a +> 2-person Element Call. +> +> **Client:** Lotus Cinny fork, `matrix-js-sdk@41.6.0-rc.0`, rust-crypto. +> **Server:** Synapse `1.155.0` on **LXC 151** (`10.10.10.29`), PostgreSQL 17.9 +> on **LXC 109** (`10.10.10.44`). Facts below are copy-pasteable against that +> deployment (paths/IPs from `/root/code/matrix/README.md`). + +--- + +## 0. Deployment facts used by this runbook + +From the matrix infra README (`/root/code/matrix/README.md`): + +| Thing | Value | +|-------|-------| +| Synapse host | LXC **151**, `10.10.10.29` (Synapse 1.155.0) | +| Synapse log | `/var/log/matrix-synapse/homeserver.log` | +| Synapse config | `/etc/matrix-synapse/homeserver.yaml` (+ `conf.d/`) | +| Synapse HTTP | `10.10.10.29:8008` | +| PostgreSQL host | LXC **109**, `10.10.10.44` (PG 17.9), db `synapse` | +| synapse-admin UI | `http://10.10.10.29:8080` | +| LiveKit / lk-jwt / guard | LXC 151: LiveKit `:7880/:7881`, guard `:8070`, lk-jwt `:8071` | +| SSH path to Synapse | `ssh root@10.10.10.4` then `pct enter 151` | +| SSH path to PG | `ssh root@10.10.10.4` then `pct enter 109` | + +**Getting a psql shell** (run on LXC 109, or from 151 over the network): + +```bash +# On LXC 109: +sudo -u postgres psql synapse +# From LXC 151 (pg_hba allows 10.10.10.29): +psql "host=10.10.10.44 user=synapse dbname=synapse" +``` + +**Tailing Synapse during a call** (on LXC 151): + +```bash +tail -F /var/log/matrix-synapse/homeserver.log | tee 
/tmp/lotus-call-$(date +%s).log +``` + +Synapse E2EE/to-device logging is chatty at `INFO`; if a category is silent, +temporarily raise it in `/etc/matrix-synapse/conf.d/log.yaml` (or the +`log_config` file referenced by `homeserver.yaml`): + +```yaml +loggers: + synapse.rest.client.keys: { level: DEBUG } + synapse.handlers.e2e_keys: { level: DEBUG } + synapse.storage.databases.main.end_to_end_keys: { level: DEBUG } + synapse.handlers.devicemessage: { level: DEBUG } # to-device +``` + +Then `systemctl reload matrix-synapse` (reload re-reads log config without a +full restart). **Revert to `INFO` after the capture** — DEBUG is very verbose. + +--- + +## 1. Per-KE evidence matrix + +Client greps assume Chrome/Firefox DevTools console (filter box or, better, +"Preserve log" + save-as). The **Crypto Diagnostics** card (Settings → +Developer Tools) auto-captures every signature below into a downloadable JSON — +use it as the primary client artifact and DevTools as the raw backup. + +### KE-1 — OTK upload conflict storm (root-cause candidate) + +- **Console signature (grep):** + - `already exists` + - full: `POST /_matrix/client/v3/keys/upload … 400 M_UNKNOWN: One time key signed_curve25519: already exists. Old key: {…} new key: {…}` +- **Capture client-side:** + - Timestamp (first occurrence + rate — "N/sec"), **device id**, **user id**. + - DevTools → **Network** → filter `keys/upload`: for a failing call save the + **request body** (the `one_time_keys` map — note the exact `signed_curve25519:`) + and the **response body** (the `Old key` / `new key` JSON). This diff is the + smoking gun: same key-id, different value ⇒ store vs server divergence. + - Whether it self-heals or loops forever (KE-1 loops). 
+- **Synapse log grep (LXC 151):** + ```bash + grep -E "keys/upload|One time key .* already exists|OneTimeKey" \ + /var/log/matrix-synapse/homeserver.log | grep "<device_id>" + ``` +- **Synapse SQL (LXC 109) — what the server thinks it holds:** + ```sql + -- Current OTK inventory for the device (compare key_id set against the + -- request body the client keeps retrying). + SELECT algorithm, key_id, ts_added_ms + FROM e2e_one_time_keys_json + WHERE user_id = '@user:matrix.lotusguild.org' + AND device_id = '<device_id>' + ORDER BY algorithm, key_id; + + -- Server's advertised counts (this is what /sync tells the client it has, + -- and drives whether the client decides to upload more). + SELECT algorithm, count(*) FROM e2e_one_time_keys_json + WHERE user_id = '@user:matrix.lotusguild.org' AND device_id = '<device_id>' + GROUP BY algorithm; + + -- Fallback key state (used when OTKs are exhausted). + SELECT algorithm, key_id, used, ts_added_ms + FROM e2e_fallback_keys_json + WHERE user_id = '@user:matrix.lotusguild.org' AND device_id = '<device_id>'; + ``` + > Table names are Synapse 1.155 (`e2e_one_time_keys_json`, + > `e2e_fallback_keys_json`). If a name is absent, list with `\dt e2e*` in psql. +- **Confirms:** if the offending `key_id` (from the 400) is **present** in + `e2e_one_time_keys_json` with a **different** stored value than the client's + request body → OTK state has diverged (rust-crypto store vs Synapse). That is + the KE-1 root condition. + +### KE-2 — EC media keys not arriving/decrypting (audio/video cutouts) + +- **Console signature (grep):** + - `MissingKey` + - `missing key at index` (e.g. `MissingKey: missing key at index N for participant @user`) + - `key set not found` + - `io.element.call.encryption_keys` (rust-crypto: `WARN … Received an unexpected encrypted to-device event … event_type="io.element.call.encryption_keys"`) +- **Capture client-side:** + - Timestamp windows where a participant's audio/video cut out, and the + `@participant` + `index N` from the message. 
+ - The `io.element.call.encryption_keys` warnings (these are the media-key + to-device events failing to decrypt) with their timestamps. + - Own device id + user id (to correlate with the sender's Olm session). +- **Synapse log grep (LXC 151) — to-device delivery of the media keys:** + ```bash + grep -E "io.element.call.encryption_keys|m.room.encrypted|/sendToDevice|to_device" \ + /var/log/matrix-synapse/homeserver.log | grep -E "<device_id>|<user_id>" + ``` +- **Synapse SQL (LXC 109) — undelivered / queued to-device events:** + ```sql + -- Backlog of to-device messages queued for the affected device. A growing + -- count here = the HS has the media-key events but the device isn't draining + -- them via /sync (or they were sent to a stale device id). + SELECT user_id, device_id, count(*) AS pending + FROM device_inbox + WHERE user_id = '@user:matrix.lotusguild.org' + GROUP BY user_id, device_id; + + -- Cross-check the device id the sender is targeting actually exists / is current. + SELECT device_id, display_name, last_seen, ts + FROM devices WHERE user_id = '@user:matrix.lotusguild.org'; + ``` +- **Confirms:** to-device events present but undecryptable (client shows the + `io.element.call.encryption_keys` "unexpected encrypted" warning) ⇒ there is + **no valid Olm session** to decrypt them — the expected downstream of KE-1. + +### KE-3 — Timeline decryption error: missing `algorithm` field + +- **Console signature (grep):** + - `DecryptionError` + - full: `Error decrypting event (… type=m.room.encrypted …): DecryptionError[msg: missing field 'algorithm' at line 1 column 138 …]` +- **Capture client-side:** + - The **event id** (`$SASBBzoqj…` was one) and the **room id**. + - Pull the raw event JSON via DevTools or the Developer Tools account-data/event + viewer, or directly: + ``` + GET https://matrix.lotusguild.org/_matrix/client/v3/rooms/<room_id>/event/<event_id> + ``` + Inspect `content` — confirm whether `algorithm` (should be + `m.megolm.v1.aes-sha2`) is truly absent vs a serialization mismatch. 
+- **Synapse log grep (LXC 151):** + ```bash + grep -E "<event_id>" /var/log/matrix-synapse/homeserver.log + ``` +- **Synapse SQL (LXC 109) — the stored event content as the HS holds it:** + ```sql + SELECT ej.event_id, e.type, e.sender, e.origin_server_ts, + (ej.json::json -> 'content' -> 'algorithm') AS algorithm + FROM event_json ej + JOIN events e USING (event_id) + WHERE ej.event_id = '$SASBBzoqj...'; + ``` +- **Confirms:** if the stored `content.algorithm` is **NULL/absent** on the HS → + a malformed/legacy event was persisted (sender-side or federation). If it is + **present** on the HS but the client throws → an RC-SDK deserialization bug. + This distinction decides whether KE-3 is a data problem or a client problem. + +### KE-4 — MatrixRTC delayed-event / membership timeouts + +- **Console signature (grep):** + - `update_delayed_event` (`org.matrix.msc4157.update_delayed_event`) + - `delayed event` / `Restart delayed event timed out` + - full: `[MembershipManager] Network local timeout error while sending event, immediate retry … AbortError: Restart delayed event timed out before the HS responded` +- **Capture client-side:** + - Timestamps of each timeout; whether they correlate with call join/leave or + with general sync slowness. + - DevTools → Network: the `…/delayed_events…` / `update_delayed_event` + requests — their **HTTP status and latency** (timed-out vs slow-200). +- **Synapse log grep (LXC 151):** + ```bash + grep -E "delayed_event|msc4140|msc4157|update_delayed" \ + /var/log/matrix-synapse/homeserver.log | grep "<user_id>" + # HS responsiveness in the same window (KE-4 may be pure latency): + grep -E "Processed request|/sync" /var/log/matrix-synapse/homeserver.log | tail -50 + ``` +- **Server-side corroboration (Grafana, `dashboard.lotusguild.org`):** Synapse + p99 response time (excl. `/sync`), event-processing lag, DB query latency for + the call window. High latency here ⇒ KE-4 is (partly) homeserver + responsiveness, not a client bug. 
+- **Confirms:** timeouts that line up with HS latency spikes → reliability/load; + timeouts with a healthy HS → client MembershipManager retry logic. + +--- + +## 2. Causality hypothesis + +``` +KE-1 OTK upload conflict storm + (rust-crypto store ↔ Synapse OTK state DIVERGED; server rejects re-uploads) + │ no fresh OTKs can be published/claimed + ▼ + No new Olm (1:1) sessions can be established with this device + │ + ▼ +KE-2 EC media-key to-device events (io.element.call.encryption_keys) + arrive but cannot be decrypted ⇒ MissingKey at index N + ⇒ friend's audio/video cuts out +``` + +KE-3 (missing `algorithm`) and KE-4 (delayed-event timeouts) are **likely +independent** of the KE-1→KE-2 chain: KE-3 is a decode/serialization path, +KE-4 is a MatrixRTC-vs-HS reliability path. Confirm/refute independence with the +decision tree below. + +### Decision tree — which capture confirms/refutes each link + +``` +Q1. Does the KE-1 offending key_id from the 400 response exist in + e2e_one_time_keys_json with a DIFFERENT value than the client request body? + ├─ YES → OTK divergence CONFIRMED (KE-1 root). Go to Q2. + └─ NO → Not divergence. Check: are OTK counts at 0 with fallback key `used=true`? + ├─ YES → OTK exhaustion, not divergence — different remediation. + └─ NO → Suspect RC-SDK 41.6.0-rc.0 upload-loop regression (see §3). + +Q2. During the same call, are io.element.call.encryption_keys to-device events + present in device_inbox / Synapse to-device logs for our device id? + ├─ YES + client shows "unexpected encrypted"/MissingKey + │ → KE-1 ⇒ KE-2 LINK CONFIRMED (events delivered, no Olm session to open them). + ├─ YES + client decrypts fine, but LiveKit still silent + │ → KE-2 is downstream of LiveKit/SFU, NOT KE-1. Decouple from crypto. + └─ NO (nothing queued/targeted our device) + → media keys never sent to us: stale device id / membership (see KE-4) + → KE-2 is a device-targeting problem, weakly linked to KE-1. + +Q3. 
KE-3: is content.algorithm NULL in event_json on the HS? + ├─ YES → malformed persisted event (sender/federation). Independent of KE-1. + └─ NO → client-side RC-SDK deserialization bug. Independent of KE-1. + +Q4. KE-4: do delayed-event timeouts coincide with Synapse p99 latency spikes + (Grafana) in the same minute? + ├─ YES → homeserver responsiveness/load. Independent of KE-1..KE-3. + └─ NO → client MembershipManager retry behavior. Independent. +``` + +--- + +## 3. Ranked remediation options (with blast radius) + +> Ordered least-destructive → most-destructive. **Do not run any of these as a +> "fix" before the planning session** — they are listed so evidence collection +> can be paired with a recovery plan. Confirm the root condition (Q1/Q2) first. + +1. **Per-device logout + re-login of the affected device** *(lowest blast radius)* + - **What:** log the one glitching device out and back in. Forces a fresh + device id, fresh device keys, and a clean OTK batch — sidesteps a diverged + OTK store without touching other sessions. + - **Blast radius:** that device only. Other sessions/devices untouched. + - **Cost:** the new device must be re-verified (cross-signing) and will need + to restore room keys from **key backup** to read old encrypted history. + - **Confirms/uses:** if KE-1 stops after this, OTK-store divergence (Q1) was + the cause. + +2. **Client crypto-store reset (`clearLoginData` path)** *(medium)* + - **What:** `clearLoginData()` in `src/client/initMatrix.ts` (coordinator's + file — do not edit) **deletes ALL IndexedDB databases** (incl. + `web-sync-store` and the rust-crypto store `crypto-store`), **unregisters + service workers**, **clears all Cache Storage**, and **`localStorage.clear()`**, + then reloads. `clearCacheAndReload()` is lighter — it only calls + `mx.store.deleteAllData()` (sync cache) and does **not** wipe crypto. 
+ - **Blast radius:** this browser profile only, but total: you are logged out, + lose all cached sync state, drafts, settings, and **the local + megolm/room-key store**. + - **⚠️ Message-history / backup implication:** wiping `crypto-store` destroys + locally-held **room keys (megolm inbound sessions)**. Any history **not + backed up to server-side Key Backup** becomes **permanently undecryptable + on this device**. Before doing this: verify Key Backup is enabled and the + recovery key / passphrase is available (Settings → Security), or the user + loses readable history. Cross-signing must be re-established too. + - **Use when:** the rust-crypto store itself is corrupt/diverged and option 1 + didn't clear it. + +3. **SDK pin change off the RC** *(medium — codebase change, needs rebuild)* + - **Current pin:** `package.json` → `"matrix-js-sdk": "41.6.0-rc.0"` (a + release candidate). + - **Finding (npm / GitHub changelog, checked 2026-07):** stable **`41.6.0`** + was released **2026-05-26**. Its only changelog line is *"Throw sane error + on completeLoginOnNewDevice IdP rejection"* — **no OTK / keys-upload / Olm / + to-device fix** relative to the RC. Later stable lines exist + (`41.7.0`, `41.8.0`; `41.7.0-rc.3` / `41.9.0-rc.0` seen as pre-releases). + Nearby crypto-relevant entries: `41.5.0` *"Enable encrypted history sharing + by default"*; `41.4.0` key-backup handling. **No changelog entry directly + addresses the KE-1 OTK-conflict symptom** in the immediate range — so + moving RC→`41.6.0` stable is a low-risk hygiene step but is **not expected + to fix KE-1 by itself**. Before pinning, re-read the CHANGELOG for any + `41.7.x`/`41.8.x` OTK/one-time-key/olm entry that post-dates this note. + - **Blast radius:** all users after the next `cinny-build.sh` deploy. Test the + rust-crypto IndexedDB schema — a downgrade triggers the `IDB_VERSION_CONFLICT` + path in `initMatrix.ts`. + +4. 
**Synapse-side OTK row surgery** *(LAST RESORT — highest danger)* + - **What:** deleting/rewriting rows in `e2e_one_time_keys_json` (and/or + `e2e_fallback_keys_json`, `device_inbox`) for the affected device to force + the client to re-upload a clean batch. + - **⚠️ Danger:** direct writes to Synapse crypto tables can **desync every + device of that user**, break Olm sessions **for everyone who has claimed one + of those keys**, and are easy to get wrong (wrong `key_id`, cache not + invalidated). Synapse caches OTK counts — a raw DELETE without a restart can + leave the advertised count wrong, **worsening** the KE-1 loop. + - **Guardrails if ever done (planning session + HS owner only):** full + `pg_dump` of `synapse` first; do it during **zero active calls**; delete only + the exact diverged `key_id` for the exact `device_id`; `systemctl restart + matrix-synapse` to flush caches; then log the device out/in (option 1) so it + republishes. **Never** run this speculatively. + +--- + +## 4. "Capture session" checklist (run during the next call) + +Do these **in order**. Aim to have client + server capturing the **same call**. + +1. **Prep server tail (LXC 151):** SSH in, start + `tail -F /var/log/matrix-synapse/homeserver.log | tee /tmp/lotus-call-$(date +%s).log`. + (Optionally raise the `synapse.rest.client.keys` / `handlers.e2e_keys` / + `handlers.devicemessage` loggers to DEBUG per §0 and `systemctl reload + matrix-synapse` — remember to revert after.) +2. **Prep client:** open Lotus Chat → Settings → Developer Tools → **enable + Developer Tools** so the **Crypto Diagnostics** card is visible; note its + entry count starts at (or reset by reload to) 0. +3. **Open DevTools** (F12) → Console: enable **Preserve log**; Network tab: + enable **Preserve log** + **Record**. Note your **device id** and **user id** + (Settings → Devices / Developer Tools → Copy access token page shows ids). +4. **Note wall-clock start time** (ISO/UTC) on both machines so logs align. +5. 
**Join the Element Call** with the second participant; reproduce the fault + (wait for the audio/video cutouts and let KE-1 storm run ~30–60s). +6. **When a fault occurs, note the wall-clock timestamp** and which symptom + (audio cut / video freeze / etc.) — this bounds the log window. +7. **Client artifacts:** in the Crypto Diagnostics card click **Download report** + (`lotus-crypto-diag-.json`); in DevTools Network, save the failing + `keys/upload` request+response (right-click → Save/Copy), and the raw HAR + (Network → Save all as HAR) for the call window. +8. **Grab KE-3 event id / KE-2 participant+index** from the console (or the + diag JSON `entries[]`) for the SQL lookups. +9. **Server artifacts:** stop the tail; run the per-KE greps and SQL from §1 + against the noted device id / user id / event id, saving output alongside the + client JSON. Screenshot the Grafana Synapse latency panels for the window + (for KE-4). +10. **Bundle & label:** put client JSON + HAR + server log slice + SQL output in + one folder named with the call's UTC start time. Revert any DEBUG log config + (`systemctl reload matrix-synapse`). Hand off to the planning session — **do + not apply §3 remediations yet.** + +--- + +## 5. Client diagnostics helper (this kit) + +- **`src/app/utils/cryptoDiagLog.ts`** — capture-only console instrumentation. + - `installCryptoDiagLog()` — idempotent; wraps `console.warn`/`console.error` + with pass-through wrappers (originals always called) that ring-buffer (max + **200**) any line matching the KE signatures. No network, no timers. + - `getCryptoDiagEntries()` — snapshot copy of the buffer (`{ ts, level, ke, + signature, message }`, most-recent-last). + - `buildCryptoDiagReport(mx)` — JSON string: SDK version, device id, user id, + sync state, `cryptoReady` (`mx.getCrypto()` presence), per-KE counts, and the + entry buffer. No tokens/PII beyond those ids; captured log lines are retained + verbatim as evidence. 
+ - **Signatures → KE mapping:** `already exists`→KE-1; `missing key at index` / + `io.element.call.encryption_keys` / `MissingKey`→KE-2; `DecryptionError`→KE-3; + `update_delayed_event` / `delayed event`→KE-4. +- **`src/app/features/settings/developer/CryptoDiagnostics.tsx`** — a folds + `SequenceCard`/`SettingTile` card (mirrors `developer-tools/DevelopTools.tsx`) + showing the live matched-entry count (Badge) and a **Download report** button + (Blob → `lotus-crypto-diag-<timestamp>.json`, same download idiom as + `room-settings/ExportRoomHistory.tsx`). + +### Recommended mount points (coordinator) + +- **Install call:** call `installCryptoDiagLog()` **as early as possible during + boot** so it captures crypto errors from first sync — ideally at the top of + the client entry module or inside `ClientRoot` before/around `initClient` + (e.g. `src/app/pages/client/ClientRoot.tsx`). It is idempotent, side-effect + only, and needs no `mx`, so a module-scope call at app entry is safe. (Do + **not** put it in `initMatrix.ts` — that file is off-limits.) +- **Settings card:** render `<CryptoDiagnostics />` inside the Developer Tools + page — in `src/app/features/settings/developer-tools/DevelopTools.tsx`, add it + to the `Box direction="Column" gap="700"` list (guarded by the existing + `developerTools` flag), right after the "Access Token" card. It pulls `mx` + from `useMatrixClient()` itself, so it just needs to be placed in the tree. 
diff --git a/src/app/features/settings/developer-tools/DevelopTools.tsx b/src/app/features/settings/developer-tools/DevelopTools.tsx index 00e6ecb09..9ed034fde 100644 --- a/src/app/features/settings/developer-tools/DevelopTools.tsx +++ b/src/app/features/settings/developer-tools/DevelopTools.tsx @@ -13,6 +13,7 @@ import { } from '../../../components/AccountDataEditor'; import { copyToClipboard } from '../../../utils/dom'; import { AccountData } from './AccountData'; +import { CryptoDiagnostics } from '../developer/CryptoDiagnostics'; type DeveloperToolsProps = { requestClose: () => void; @@ -109,6 +110,7 @@ export function DeveloperTools({ requestClose }: DeveloperToolsProps) { /> )} + {developerTools && } {developerTools && ( { + const report = buildCryptoDiagReport(mx); + const blob = new Blob([report], { type: 'application/json' }); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `lotus-crypto-diag-${new Date().toISOString().replace(/[:.]/g, '-')}.json`; + a.click(); + URL.revokeObjectURL(url); + }, [mx]); + + return ( + + Crypto Diagnostics + + + 0 ? 
'Critical' : 'Secondary'} fill="Solid" radii="Pill"> + + {count} + + + + + } + /> + + + ); +} diff --git a/src/app/pages/client/ClientRoot.tsx b/src/app/pages/client/ClientRoot.tsx index 52cf00973..0529e3c05 100644 --- a/src/app/pages/client/ClientRoot.tsx +++ b/src/app/pages/client/ClientRoot.tsx @@ -43,8 +43,15 @@ import { stopPropagation } from '../../utils/keyboard'; import { SyncStatus } from './SyncStatus'; import { AuthMetadataProvider } from '../../hooks/useAuthMetadata'; import { getFallbackSession, removeFallbackSession } from '../../state/sessions'; +import { useSessionSync } from '../../hooks/useSessionSync'; +import { installCryptoDiagLog } from '../../utils/cryptoDiagLog'; import { AutoDiscovery } from './AutoDiscovery'; +// Capture-only E2EE diagnostics ring buffer (KE-1→4 signatures) — installed at +// module load so it sees crypto warnings from the very first sync. Idempotent; +// report download lives in Settings → Developer Tools → Crypto Diagnostics. +installCryptoDiagLog(); + function ClientRootLoading() { return ( @@ -178,6 +185,9 @@ export function ClientRoot({ children }: ClientRootProps) { ); useLogoutListener(mx); + // Cross-tab session sync: another tab logging out / in (access token changed + // in localStorage) reloads this tab so it never runs with stale credentials. + useSessionSync(); useEffect(() => { if (loadState.status === AsyncStatus.Idle) { diff --git a/src/app/utils/cryptoDiagLog.ts b/src/app/utils/cryptoDiagLog.ts new file mode 100644 index 000000000..6f100f2de --- /dev/null +++ b/src/app/utils/cryptoDiagLog.ts @@ -0,0 +1,151 @@ +import type { MatrixClient } from 'matrix-js-sdk'; +import pkg from '../../../package.json'; + +// Lotus E2EE investigation kit — capture-only console diagnostics. +// +// Installs pass-through wrappers around `console.warn` / `console.error` that +// ring-buffer any log line matching the KE-1..KE-4 bug-cluster signatures +// (see LOTUS_E2EE_INVESTIGATION.md). 
// It NEVER swallows a log call — the original console method is always
// invoked, even when capturing the line fails — and it performs NO network I/O.
// The report metadata is limited to SDK version / device id / user id / sync
// state; the captured log lines themselves are intentional evidence and may
// contain event ids or matrix ids exactly as the SDK logged them.

export type CryptoDiagLevel = 'warn' | 'error';

export type CryptoDiagEntry = {
  /** ISO-8601 UTC timestamp of when the line was captured. */
  ts: string;
  level: CryptoDiagLevel;
  /** Which KE bucket the signature belongs to, e.g. `KE-1`. */
  ke: string;
  /** Human-readable label of the matched signature. */
  signature: string;
  /** The serialized console line (best-effort). */
  message: string;
};

type Signature = {
  ke: string;
  label: string;
  re: RegExp;
};

// First match wins, so order matters: entries are grouped by KE bucket, and
// within a bucket the narrower pattern precedes the broader one (e.g.
// `missing key at index` before `MissingKey`) so the recorded label is the
// tightest match for that bucket.
const SIGNATURES: Signature[] = [
  { ke: 'KE-1', label: 'already exists', re: /already exists/i },
  { ke: 'KE-2', label: 'missing key at index', re: /missing key at index/i },
  {
    ke: 'KE-2',
    label: 'io.element.call.encryption_keys',
    re: /io\.element\.call\.encryption_keys/,
  },
  { ke: 'KE-2', label: 'MissingKey', re: /MissingKey/ },
  { ke: 'KE-3', label: 'DecryptionError', re: /DecryptionError/ },
  { ke: 'KE-4', label: 'update_delayed_event', re: /update_delayed_event/ },
  { ke: 'KE-4', label: 'delayed event', re: /delayed event/i },
];

/** Upper bound of the ring buffer; older entries are dropped first. */
const MAX_ENTRIES = 200;

/** Module-level capture buffer, oldest first. */
const entries: CryptoDiagEntry[] = [];

let installed = false;
let originalWarn: ((...data: unknown[]) => void) | undefined;
let originalError: ((...data: unknown[]) => void) | undefined;

/**
 * Best-effort conversion of one console argument to text. Never throws.
 *
 * Strings pass through, Errors become `name: message`, everything else is
 * JSON-serialized with `String()` as a fallback.
 */
const stringifyArg = (arg: unknown): string => {
  if (typeof arg === 'string') return arg;
  if (arg instanceof Error) return `${arg.name}: ${arg.message}`;
  try {
    // JSON.stringify returns `undefined` (not a string) for undefined,
    // functions and symbols — fall back to String() so the declared return
    // type holds and the value stays visible in the captured line.
    return JSON.stringify(arg) ?? String(arg);
  } catch {
    // Circular structure, BigInt, throwing toJSON/getter, …
    try {
      return String(arg);
    } catch {
      // Even string coercion can throw (e.g. a throwing toString).
      return '[unserializable]';
    }
  }
};

/**
 * Serialize a console call and, if it matches a KE signature, append it to
 * the ring buffer. Lines that match no signature are ignored.
 */
const capture = (level: CryptoDiagLevel, args: unknown[]): void => {
  const message = args.map(stringifyArg).join(' ');
  const sig = SIGNATURES.find((s) => s.re.test(message));
  if (!sig) return;

  entries.push({
    ts: new Date().toISOString(),
    level,
    ke: sig.ke,
    signature: sig.label,
    message,
  });
  // Ring-buffer: keep only the most recent MAX_ENTRIES.
  while (entries.length > MAX_ENTRIES) {
    entries.shift();
  }
};

/**
 * Run `capture` without ever letting a failure escape into the console call.
 * Errors are dropped on purpose: reporting them through `console` would
 * re-enter the wrappers, and diagnostics must never break application logging.
 */
const safeCapture = (level: CryptoDiagLevel, args: unknown[]): void => {
  try {
    capture(level, args);
  } catch {
    // Intentionally ignored — see the function comment.
  }
};

/**
 * Install the capture-only console wrappers. Idempotent — calling it more than
 * once is a no-op. Safe to call as early as possible during app boot.
 *
 * Side effect: replaces `console.warn` and `console.error` with pass-through
 * wrappers that always forward to the methods that were installed at call time.
 */
export const installCryptoDiagLog = (): void => {
  if (installed) return;
  installed = true;

  originalWarn = console.warn.bind(console);
  originalError = console.error.bind(console);

  console.warn = (...args: unknown[]): void => {
    safeCapture('warn', args);
    originalWarn?.(...args);
  };
  console.error = (...args: unknown[]): void => {
    safeCapture('error', args);
    originalError?.(...args);
  };
};

/** A snapshot copy of the current capture buffer (most-recent-last). */
export const getCryptoDiagEntries = (): CryptoDiagEntry[] => entries.slice();

/**
 * Resolve the SDK version for the report: the running client's
 * `getSdkVersion()` if it exposes one, otherwise the `matrix-js-sdk` pin
 * declared in package.json, otherwise `'unknown'`.
 */
const readSdkVersion = (mx?: MatrixClient): string => {
  // Prefer the value the running client reports; fall back to the declared pin.
  const declared = (pkg.dependencies as Record<string, string> | undefined)?.['matrix-js-sdk'];
  const clientVersion = (mx as unknown as { getSdkVersion?: () => string } | undefined)
    ?.getSdkVersion;
  if (typeof clientVersion === 'function') {
    try {
      return clientVersion.call(mx) || declared || 'unknown';
    } catch {
      // fall through to the declared pin
    }
  }
  return declared ?? 'unknown';
};

/**
 * Build a self-contained JSON diagnostic report string.
 *
 * The metadata is limited to the SDK version, device id, user id, sync state,
 * crypto readiness and per-KE counts. The `entries` array carries the captured
 * console lines VERBATIM, so it can contain whatever the SDK logged on those
 * lines (event ids, matrix ids, key ids). Review the file before sharing it
 * outside the investigation.
 *
 * @param mx Optional client; when omitted the id/state fields are `null` and
 *   `cryptoReady` is `false`.
 * @returns Pretty-printed (2-space) JSON.
 */
export const buildCryptoDiagReport = (mx?: MatrixClient): string => {
  const buffer = getCryptoDiagEntries();
  const countsByKe: Record<string, number> = {};
  buffer.forEach((entry) => {
    countsByKe[entry.ke] = (countsByKe[entry.ke] ?? 0) + 1;
  });

  const report = {
    kind: 'lotus-crypto-diag',
    generatedAt: new Date().toISOString(),
    sdkVersion: readSdkVersion(mx),
    deviceId: mx?.getDeviceId() ?? null,
    userId: mx?.getUserId() ?? null,
    syncState: mx?.getSyncState() ?? null,
    cryptoReady: Boolean(mx?.getCrypto()),
    entryCount: buffer.length,
    maxEntries: MAX_ENTRIES,
    countsByKe,
    entries: buffer,
  };

  return JSON.stringify(report, null, 2);
};