/**
 * Lightweight syntax tokenizer for code blocks.
 *
 * Returns an array of {text, type} tokens that can be rendered as
 * coloured elements using TDS (Lotus Terminal Design System)
 * CSS custom properties via inline styles.
 *
 * Supported token types:
 *   'kw'    → keywords        → var(--lt-accent-cyan)
 *   'str'   → strings         → var(--lt-accent-green)
 *   'num'   → numbers         → var(--lt-accent-orange)
 *   'cmt'   → comments        → opacity 0.5, fontStyle italic
 *   'fn'    → function names  → var(--lt-accent-purple)
 *   'plain' → everything else → inherit
 *
 * Supported languages: javascript / typescript / python / rust (and aliases).
 *
 * This is a single-pass heuristic scanner, not a parser: it has no notion of
 * nesting (e.g. `${…}` inside template literals) or of regex literals.
 */

import type { CSSProperties } from 'react';

export type SyntaxToken = {
  text: string;
  type: 'kw' | 'str' | 'num' | 'cmt' | 'fn' | 'plain';
};

// ── Language keyword sets ──────────────────────────────────────────────────

const JS_KEYWORDS = new Set([
  'break', 'case', 'catch', 'class', 'const', 'continue', 'debugger',
  'default', 'delete', 'do', 'else', 'export', 'extends', 'false',
  'finally', 'for', 'from', 'function', 'if', 'import', 'in', 'instanceof',
  'let', 'new', 'null', 'of', 'return', 'static', 'super', 'switch', 'this',
  'throw', 'true', 'try', 'typeof', 'undefined', 'var', 'void', 'while',
  'with', 'yield', 'async', 'await', 'type', 'interface', 'enum', 'declare',
  'abstract', 'as', 'namespace', 'module', 'readonly',
]);

const PYTHON_KEYWORDS = new Set([
  'False', 'None', 'True', 'and', 'as', 'assert', 'async', 'await', 'break',
  'class', 'continue', 'def', 'del', 'elif', 'else', 'except', 'finally',
  'for', 'from', 'global', 'if', 'import', 'in', 'is', 'lambda', 'nonlocal',
  'not', 'or', 'pass', 'raise', 'return', 'try', 'while', 'with', 'yield',
]);

const RUST_KEYWORDS = new Set([
  'as', 'async', 'await', 'break', 'const', 'continue', 'crate', 'dyn',
  'else', 'enum', 'extern', 'false', 'fn', 'for', 'if', 'impl', 'in', 'let',
  'loop', 'match', 'mod', 'move', 'mut', 'pub', 'ref', 'return', 'self',
  'Self', 'static', 'struct', 'super', 'trait', 'true', 'type', 'union',
  'unsafe', 'use', 'where', 'while',
]);

// ── Language aliases ───────────────────────────────────────────────────────

type LanguageFamily = 'js' | 'python' | 'rust';

/** Maps every accepted (lower-cased) language tag to its lexical family. */
const LANGUAGE_FAMILIES: ReadonlyMap<string, LanguageFamily> = new Map<
  string,
  LanguageFamily
>([
  ['js', 'js'],
  ['javascript', 'js'],
  ['ts', 'js'],
  ['typescript', 'js'],
  ['jsx', 'js'],
  ['tsx', 'js'],
  ['py', 'python'],
  ['python', 'python'],
  ['rs', 'rust'],
  ['rust', 'rust'],
]);

/**
 * Returns the keyword set for `lang` (case-insensitive). Python and Rust
 * aliases select their own sets; every other tag gets the JS/TS set.
 */
function getKeywords(lang: string): ReadonlySet<string> {
  switch (LANGUAGE_FAMILIES.get(lang.toLowerCase())) {
    case 'python':
      return PYTHON_KEYWORDS;
    case 'rust':
      return RUST_KEYWORDS;
    default:
      // js / ts / jsx / tsx and friends
      return JS_KEYWORDS;
  }
}

// ── Scanner helpers ────────────────────────────────────────────────────────

// Patterns are hoisted so the main loop does not rebuild them per character.
const DIGIT = /\d/;
const NON_WORD = /\W/;
const IDENT_START = /[a-zA-Z_$]/;
const IDENT_PART = /[a-zA-Z0-9_$]/;

/**
 * Sticky matcher for a numeric literal: hex / binary / octal with prefix, or
 * a decimal with optional fraction and (signed) exponent. The `(?!\.)` guard
 * keeps range operators such as Rust's `0..10` out of the number.
 */
const NUMBER_LITERAL =
  /0[xX][\da-fA-F_]+|0[bB][01_]+|0[oO][0-7_]+|\d[\d_]*(?:\.(?!\.)[\d_]*)?(?:[eE][+-]?\d[\d_]*)?/y;

/**
 * Sticky matcher for a Rust character literal: either an escape sequence
 * (`'\n'`, `'\u{1F600}'`, `'\''`) or exactly one code point (`'x'`).
 * Anything else starting with `'` is treated as a lifetime.
 */
const RUST_CHAR_LITERAL = /'(?:\\[^\n][^'\n]*|[^\\'\n])'/uy;

/**
 * Runs a sticky `pattern` against `code` at `start`.
 * Returns the index just past the match, or -1 when it does not match there.
 */
function matchEnd(pattern: RegExp, code: string, start: number): number {
  pattern.lastIndex = start;
  return pattern.test(code) ? pattern.lastIndex : -1;
}

/** Returns the index of the next newline at or after `from`, or end of input. */
function endOfLine(code: string, from: number): number {
  const nl = code.indexOf('\n', from);
  return nl === -1 ? code.length : nl;
}

/**
 * Scans a string literal that opens with `delimiter` at `start` and returns
 * the index just past its end. Backslash escapes skip the following char.
 * When `multiline` is false an unterminated literal stops at the newline
 * (the newline itself is not consumed).
 */
function scanString(
  code: string,
  start: number,
  delimiter: string,
  multiline: boolean,
): number {
  const len = code.length;
  let j = start + delimiter.length;
  while (j < len) {
    const ch = code[j];
    if (ch === '\\') {
      j += 2; // skip escaped char
    } else if (code.startsWith(delimiter, j)) {
      return j + delimiter.length;
    } else if (!multiline && ch === '\n') {
      return j; // unterminated — stop at newline
    } else {
      j += 1;
    }
  }
  // A trailing backslash can push j one past the end; clamp it.
  return Math.min(j, len);
}

// ── Tokenizer ──────────────────────────────────────────────────────────────

/**
 * Tokenises `code` for the given `lang` and returns an array of SyntaxToken
 * objects whose `text` fields concatenate back to `code`.
 *
 * Falls back to a single 'plain' token when the language is not recognised
 * or when `lang` is empty.
 *
 * @param code - Source text to tokenise.
 * @param lang - Language tag; case-insensitive, an optional `language-`
 *   prefix (as found in markdown class names) is stripped.
 * @returns Tokens in source order.
 */
export function tokenize(code: string, lang: string): SyntaxToken[] {
  const normalised = lang.toLowerCase().replace(/^language-/, '');
  const family = LANGUAGE_FAMILIES.get(normalised);
  if (family === undefined) {
    return [{ text: code, type: 'plain' }];
  }

  const keywords = getKeywords(normalised);
  const isPython = family === 'python';
  const isRust = family === 'rust';
  const tokens: SyntaxToken[] = [];
  const len = code.length;
  let i = 0;

  while (i < len) {
    const ch = code.charAt(i);
    const next = code.charAt(i + 1);

    // ── Block comment /* … */ (not Python) ─────────────────────────────────
    if (!isPython && ch === '/' && next === '*') {
      const close = code.indexOf('*/', i + 2);
      const end = close === -1 ? len : close + 2;
      tokens.push({ text: code.slice(i, end), type: 'cmt' });
      i = end;
      continue;
    }

    // ── Line comment // … (not Python, where `//` is floor division) ───────
    if (!isPython && ch === '/' && next === '/') {
      const end = endOfLine(code, i);
      tokens.push({ text: code.slice(i, end), type: 'cmt' });
      i = end;
      continue;
    }

    // ── Python line comment # … ────────────────────────────────────────────
    // Safe at any column: string literals are consumed whole further down,
    // so a `#` seen here is never inside a string.
    if (isPython && ch === '#') {
      const end = endOfLine(code, i);
      tokens.push({ text: code.slice(i, end), type: 'cmt' });
      i = end;
      continue;
    }

    // ── String literals (single, double, backtick) ─────────────────────────
    if (ch === '"' || ch === "'" || ch === '`') {
      let end: number;
      if (isRust && ch === "'") {
        end = matchEnd(RUST_CHAR_LITERAL, code, i);
        if (end === -1) {
          // Not a char literal → lifetime such as 'a or 'static.
          let j = i + 1;
          while (j < len && IDENT_PART.test(code.charAt(j))) j++;
          tokens.push({ text: code.slice(i, j), type: 'plain' });
          i = j;
          continue;
        }
      } else if (isPython && ch !== '`' && code.startsWith(ch.repeat(3), i)) {
        // Triple-quoted strings may span lines.
        end = scanString(code, i, ch.repeat(3), true);
      } else {
        // Only backtick (template) strings may contain raw newlines.
        end = scanString(code, i, ch, ch === '`');
      }
      tokens.push({ text: code.slice(i, end), type: 'str' });
      i = end;
      continue;
    }

    // ── Numbers ────────────────────────────────────────────────────────────
    // Only at a word boundary; digits inside identifiers are handled below.
    if (DIGIT.test(ch) && (i === 0 || NON_WORD.test(code.charAt(i - 1)))) {
      const matched = matchEnd(NUMBER_LITERAL, code, i);
      const end = matched === -1 ? i + 1 : matched;
      tokens.push({ text: code.slice(i, end), type: 'num' });
      i = end;
      continue;
    }

    // ── Identifiers (keywords, function names, plain words) ────────────────
    if (IDENT_START.test(ch)) {
      let j = i;
      while (j < len && IDENT_PART.test(code.charAt(j))) j++;
      const word = code.slice(i, j);

      // Look ahead past spaces/tabs for `(` to detect calls / definitions.
      let k = j;
      while (k < len && (code[k] === ' ' || code[k] === '\t')) k++;
      const isFunctionCall = code[k] === '(';

      let type: SyntaxToken['type'] = 'plain';
      if (keywords.has(word)) {
        type = 'kw'; // keywords win over `if (`-style false positives
      } else if (isFunctionCall) {
        type = 'fn';
      }
      tokens.push({ text: word, type });
      i = j;
      continue;
    }

    // ── Everything else — collect a run of non-special chars ───────────────
    const start = i;
    while (
      i < len &&
      code[i] !== '/' &&
      code[i] !== '#' &&
      code[i] !== '"' &&
      code[i] !== "'" &&
      code[i] !== '`' &&
      !IDENT_PART.test(code.charAt(i))
    ) {
      i++;
    }
    if (i === start) i++; // safety: always advance
    tokens.push({ text: code.slice(start, i), type: 'plain' });
  }

  return tokens;
}

// ── Inline style helpers ───────────────────────────────────────────────────

/**
 * Returns the React inline-style object for a given SyntaxToken type.
 * A fresh object is returned on every call; 'plain' yields an empty object.
 * Each colour carries a hard-coded fallback for when the CSS custom
 * property is not defined.
 */
export function tokenStyle(type: SyntaxToken['type']): CSSProperties {
  switch (type) {
    case 'kw':
      return { color: 'var(--lt-accent-cyan, #66d9ef)' };
    case 'str':
      return { color: 'var(--lt-accent-green, #a6e22e)' };
    case 'num':
      return { color: 'var(--lt-accent-orange, #fd971f)' };
    case 'cmt':
      return { opacity: 0.5, fontStyle: 'italic' };
    case 'fn':
      return { color: 'var(--lt-accent-purple, #ae81ff)' };
    default:
      return {};
  }
}