Reading aloud streamed text (GPT style)

I wanted my ChatGPT responses to be read aloud immediately, as they appear on the screen. This was problematic for two reasons:

ChatGPT sends partial text responses (e.g. 'he', 'llo', ', I', 'am S', 'hy'). This isn't readable and should be accumulated.
The say command interrupts any text currently being spoken and starts speaking the new text, so we cannot simply call it every time we receive new text.

This is a rough start, but it works well for my current needs. This util class maintains a queue, detects certain delimiters (e.g. `.` or `,`) and starts speaking only when it detects that a complete phrase has probably been accumulated.

https://gist.github.com/shyagamzo/749b7535aa8876ec2ce09f39aaef6a80


import '@johnlindquist/kit';

/**
 * Singleton that turns a stream of partial text fragments into spoken phrases.
 *
 * Fragments are appended to a buffer (`feed`). Whenever the buffer contains a
 * delimiter, the text up to and including that delimiter is moved to a queue.
 * Queued phrases are spoken one at a time via `sayIt`; since no completion
 * signal is used, the end of each phrase is estimated from its word count.
 *
 * Config:
 * - `waitForDelimiter`: debounce wait handed to `_.debounce`; once no new text
 *   has arrived for this long, whatever is left in the buffer is spoken even
 *   without a trailing delimiter.
 * - `estimatedWordsPerMinute`: speaking rate used to estimate how long a
 *   phrase takes to speak.
 */
const speechStream = new (class SpeechStream
{
    /** Phrases waiting to be spoken, in arrival order. */
    private textQueue: string[] = [];
    /** True while a phrase is (estimated to be) being spoken. */
    private isSpeaking: boolean = false;
    /** Text received so far that has not yet been cut into a phrase. */
    private feed: string = '';
    /** Debounced `finalizeFeed`; flushes the buffer once the stream goes quiet. */
    private finalizeFeedDebounced: () => void;

    constructor(private readonly config: { waitForDelimiter: number, estimatedWordsPerMinute: number })
    {
        this.finalizeFeedDebounced = _.debounce(this.finalizeFeed.bind(this), config.waitForDelimiter);

        onExit(() =>
        {
            // Drop everything still pending so nothing new gets spoken.
            this.textQueue = [];
            this.feed = '';

            // NOTE(review): presumably replaces (and thereby silences) the phrase
            // currently being spoken — confirm against the `say` implementation.
            sayIt('');
        });
    }

    /**
     * Appends a streamed fragment to the buffer, queues every phrase that is
     * now complete, and (re)arms the debounced flush of the remainder.
     *
     * @param text Partial text as received from the stream.
     */
    public addText(text: string): void
    {
        this.feed += text;
        this.processAccumulatedText();
        this.finalizeFeedDebounced();
    }

    /**
     * Moves every delimiter-terminated phrase from the buffer to the queue and
     * kicks the queue.
     */
    private processAccumulatedText(): void
    {
        const delimiters = /[.,;:!?\n]/;

        // A single fragment may contain several delimiters; drain all of them so
        // later phrases do not sit in the buffer until the next fragment or the
        // debounce timeout.
        for (let index = this.feed.search(delimiters); index !== -1; index = this.feed.search(delimiters))
        {
            this.enqueue(this.feed.slice(0, index + 1));
            this.feed = this.feed.slice(index + 1);
        }

        this.processQueue();
    }

    /** Flushes whatever is left in the buffer as a final phrase. */
    private finalizeFeed(): void
    {
        if (!this.feed) return;

        this.enqueue(this.feed);
        this.feed = '';
        this.processQueue();
    }

    /**
     * Queues a trimmed phrase. Phrases that are empty after trimming (e.g. a
     * lone newline following a full stop) are dropped: speaking them would only
     * issue an empty `sayIt` call and stall the queue for a bogus delay.
     */
    private enqueue(text: string): void
    {
        const phrase = text.trim();

        if (phrase) this.textQueue.push(phrase);
    }

    /** Starts speaking the next queued phrase unless one is already in progress. */
    private processQueue(): void
    {
        if (this.isSpeaking) return;

        const textToSpeak = this.textQueue.shift();

        if (textToSpeak === undefined) return;

        this.isSpeaking = true;

        this.waitForSpeechEnd(textToSpeak);
        sayIt(textToSpeak);
    }

    /**
     * Schedules the transition to the next phrase after the estimated speaking
     * time of `text` has elapsed.
     */
    private waitForSpeechEnd(text: string): void
    {
        const estimatedSpeechDuration = this.estimateSpeechDuration(text);

        setTimeout(() =>
        {
            this.isSpeaking = false;
            this.processQueue();
        }, estimatedSpeechDuration);
    }

    /**
     * Estimates how long `text` takes to speak, based on its whitespace-separated
     * word count and the configured words-per-minute.
     *
     * NOTE(review): the estimate does not factor in the `rate` passed to `say`
     * by `sayIt`; tune `estimatedWordsPerMinute` accordingly.
     *
     * @returns Estimated duration in milliseconds.
     */
    private estimateSpeechDuration(text: string): number
    {
        const wordsPerMinute = this.config.estimatedWordsPerMinute; // Average speaking rate
        const words = text.trim().split(/\s+/).length;
        const minutes = words / wordsPerMinute;

        return minutes * 60 * 1000; // Convert to milliseconds
    }
})({
    waitForDelimiter: 4000,
    estimatedWordsPerMinute: 200
});

/**
 * Speaks `text` through the `say` helper using the fixed voice and rate
 * configured below.
 *
 * @param text Text to speak.
 * @returns Whatever `say` returns for this call.
 */
export function sayIt(text: string): ReturnType<typeof say>
{
    const voiceOptions = { name: 'Microsoft Zira - English (United States)', rate: 1.3 };

    return say(text, voiceOptions);
}

/**
 * Public entry point: hands a streamed text fragment to the shared speech
 * stream, which accumulates fragments and speaks them phrase by phrase.
 *
 * @param text Partial text as received from the stream.
 */
export function queueSpeech(text: string): void
{
    speechStream.addText(text);
}

To use it, simply import and call queueSpeech:


import { queueSpeech } from '../lib/speech-queue';

/**
 * Usage example: a handler for streamed GPT text that forwards each received
 * fragment to the speech queue.
 */
function handleGPTText(text: string)
{
    // ...
    queueSpeech(text); // hand the fragment over to be spoken
}