"""
Text-to-speech conversion module for podcast generation.
This module handles the conversion of text scripts into natural-sounding speech using
multiple TTS providers (Google Cloud TTS and ElevenLabs). It includes functionality for:
- Rate limiting API requests to stay within provider quotas
- Exponential backoff retry logic for API resilience
- Processing individual conversation lines with appropriate voices
- Merging multiple audio segments into a complete podcast
- Managing temporary audio file storage and cleanup
The module supports different voices for interviewer/interviewee to create natural
conversational flow and allows configuration of voice settings and audio effects
through the PodcastConfig system.
Typical usage:
    config = PodcastConfig()
    convert_to_speech(
        config,
        conversation_script,
        'output.mp3',
        '.temp_audio/',
        'mp3'
    )
"""
import logging
import os
from io import BytesIO
from pathlib import Path
from typing import List
from elevenlabs import client as elevenlabs_client
from google.cloud import texttospeech
from google.cloud import texttospeech_v1beta1
from pydub import AudioSegment
from podcast_llm.config import PodcastConfig
from podcast_llm.utils.rate_limits import (
rate_limit_per_minute,
retry_with_exponential_backoff
)
logger = logging.getLogger(__name__)
def clean_text_for_tts(lines: List[dict]) -> List[dict]:
"""
Clean text lines for text-to-speech processing by removing special characters.
Takes a list of dictionaries containing speaker and text information and removes
characters that may interfere with text-to-speech synthesis, such as asterisks,
underscores, and em dashes.
Args:
lines (List[dict]): List of dictionaries with structure:
{
'speaker': str, # Speaker identifier
'text': str # Text to be cleaned
}
Returns:
List[dict]: List of dictionaries with cleaned text and same structure as input
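    Example:
        >>> clean_text_for_tts([{'speaker': 'Interviewer', 'text': '*Hello*, world'}])
        [{'speaker': 'Interviewer', 'text': 'Hello, world'}]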
"""
    cleaned = []
    for line in lines:
        cleaned.append({
            'speaker': line['speaker'],
            'text': line['text'].replace('*', '').replace('_', '').replace('—', '')
        })
    return cleaned
def merge_audio_files(audio_files: List[str], output_file: str, audio_format: str) -> None:
"""
Merge multiple audio files into a single output file.
Takes a list of audio files and combines them in the provided order into a single output
file. Handles any audio format supported by pydub.
Args:
audio_files (list): List of paths to audio files to merge
output_file (str): Path where merged audio file should be saved
audio_format (str): Format of input/output audio files (e.g. 'mp3', 'wav')
Returns:
None
Raises:
Exception: If there are any errors during the merging process
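    Example (file paths are illustrative):
        >>> merge_audio_files(['000.mp3', '001.mp3'], 'podcast.mp3', 'mp3')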
"""
logger.info("Merging audio files...")
try:
combined = AudioSegment.empty()
for filename in audio_files:
audio = AudioSegment.from_file(filename)
combined += audio
combined.export(output_file, format=audio_format)
    except Exception as e:
        logger.error(f"Error merging audio files: {e}")
        raise
@retry_with_exponential_backoff(max_retries=10, base_delay=2.0)
@rate_limit_per_minute(max_requests_per_minute=20)
def process_line_google(config: PodcastConfig, text: str, speaker: str):
"""
Process a single line of text using Google Text-to-Speech API.
Takes a line of text and speaker identifier and generates synthesized speech using
Google's TTS service. Uses different voices based on the speaker to create natural
conversation flow.
    Args:
        config (PodcastConfig): Configuration object containing API keys and settings
        text (str): The text content to convert to speech
        speaker (str): Speaker identifier to determine voice selection
Returns:
bytes: Raw audio data in bytes format containing the synthesized speech
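    Example (illustrative; assumes ``config`` holds a valid Google API key):
        >>> audio = process_line_google(config, 'Welcome to the show.', 'Interviewer')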
"""
client = texttospeech.TextToSpeechClient(client_options={'api_key': config.google_api_key})
tts_settings = config.tts_settings['google']
interviewer_voice = texttospeech.VoiceSelectionParams(
language_code=tts_settings['language_code'],
name=tts_settings['voice_mapping']['Interviewer'],
ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
)
interviewee_voice = texttospeech.VoiceSelectionParams(
language_code=tts_settings['language_code'],
name=tts_settings['voice_mapping']['Interviewee'],
ssml_gender=texttospeech.SsmlVoiceGender.MALE
)
synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = interviewer_voice if speaker == 'Interviewer' else interviewee_voice
# Select the type of audio file you want returned
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3,
effects_profile_id=tts_settings['effects_profile_id']
)
# Perform the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
)
return response.audio_content
@retry_with_exponential_backoff(max_retries=10, base_delay=2.0)
@rate_limit_per_minute(max_requests_per_minute=20)
def process_line_elevenlabs(config: PodcastConfig, text: str, speaker: str):
"""
Process a line of text into speech using ElevenLabs TTS service.
Takes a line of text and speaker identifier and generates synthesized speech using
ElevenLabs' TTS service. Uses different voices based on the speaker to create natural
conversation flow.
Args:
config (PodcastConfig): Configuration object containing API keys and settings
text (str): The text content to convert to speech
speaker (str): Speaker identifier to determine voice selection
Returns:
bytes: Raw audio data in bytes format containing the synthesized speech
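    Example (illustrative; assumes ``config`` holds a valid ElevenLabs API key):
        >>> audio = process_line_elevenlabs(config, 'Thanks for having me.', 'Interviewee')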
"""
client = elevenlabs_client.ElevenLabs(api_key=config.elevenlabs_api_key)
tts_settings = config.tts_settings['elevenlabs']
audio = client.generate(
text=text,
voice=tts_settings['voice_mapping'][speaker],
model=tts_settings['model']
)
# Convert audio iterator to bytes that can be written to disk
audio_bytes = BytesIO()
for chunk in audio:
audio_bytes.write(chunk)
return audio_bytes.getvalue()
def combine_consecutive_speaker_chunks(chunks: List[dict]) -> List[dict]:
"""
Combine consecutive chunks from the same speaker into single chunks.
Args:
chunks (List[dict]): List of dictionaries containing conversation chunks with structure:
{
'speaker': str, # Speaker identifier
'text': str # Text content
}
Returns:
List[dict]: List of combined chunks where consecutive entries from the same speaker
are merged into single chunks
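    Example:
        >>> combine_consecutive_speaker_chunks([
        ...     {'speaker': 'Interviewer', 'text': 'Hi.'},
        ...     {'speaker': 'Interviewer', 'text': 'Welcome.'},
        ...     {'speaker': 'Interviewee', 'text': 'Thanks.'},
        ... ])
        [{'speaker': 'Interviewer', 'text': 'Hi. Welcome.'}, {'speaker': 'Interviewee', 'text': 'Thanks.'}]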
"""
combined_chunks = []
current_chunk = None
for chunk in chunks:
if current_chunk is None:
current_chunk = chunk.copy()
elif current_chunk['speaker'] == chunk['speaker']:
current_chunk['text'] += ' ' + chunk['text']
else:
combined_chunks.append(current_chunk)
current_chunk = chunk.copy()
if current_chunk is not None:
combined_chunks.append(current_chunk)
return combined_chunks
@retry_with_exponential_backoff(max_retries=10, base_delay=2.0)
@rate_limit_per_minute(max_requests_per_minute=20)
def process_lines_google_multispeaker(config: PodcastConfig, chunks: List[dict]):
"""
Process multiple lines of text into speech using Google's multi-speaker TTS service.
    Takes a chunk of conversation lines and generates synthesized speech using Google's
    multi-speaker TTS service. Synthesizing several turns in a single request produces a
    more natural conversational flow than processing line by line.
Args:
config (PodcastConfig): Configuration object containing API keys and settings
chunks (List): List of dictionaries containing conversation lines with structure:
{
'speaker': str, # Speaker identifier
'text': str # Line content to convert to speech
}
Returns:
bytes: Raw audio data in bytes format containing the synthesized speech
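    Example (illustrative; assumes ``config`` holds a valid Google API key):
        >>> chunk = [
        ...     {'speaker': 'Interviewer', 'text': 'Welcome to the show.'},
        ...     {'speaker': 'Interviewee', 'text': 'Thanks for having me.'},
        ... ]
        >>> audio = process_lines_google_multispeaker(config, chunk)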
"""
client = texttospeech_v1beta1.TextToSpeechClient(client_options={'api_key': config.google_api_key})
tts_settings = config.tts_settings['google_multispeaker']
# Combine consecutive lines from same speaker
chunks = combine_consecutive_speaker_chunks(chunks)
# Create multi-speaker markup
multi_speaker_markup = texttospeech_v1beta1.MultiSpeakerMarkup()
# Add each line as a conversation turn
for line in chunks:
turn = texttospeech_v1beta1.MultiSpeakerMarkup.Turn()
turn.text = line['text']
turn.speaker = tts_settings['voice_mapping'][line['speaker']]
multi_speaker_markup.turns.append(turn)
# Configure synthesis input with multi-speaker markup
synthesis_input = texttospeech_v1beta1.SynthesisInput(
multi_speaker_markup=multi_speaker_markup
)
# Configure voice parameters
voice = texttospeech_v1beta1.VoiceSelectionParams(
language_code=tts_settings['language_code'],
name='en-US-Studio-MultiSpeaker'
)
# Configure audio output
audio_config = texttospeech_v1beta1.AudioConfig(
audio_encoding=texttospeech_v1beta1.AudioEncoding.MP3_64_KBPS,
effects_profile_id=tts_settings['effects_profile_id']
)
# Generate speech
response = client.synthesize_speech(
input=synthesis_input,
voice=voice,
audio_config=audio_config
)
return response.audio_content
def convert_to_speech(
config: PodcastConfig,
    conversation: List[dict],
output_file: str,
temp_audio_dir: str,
audio_format: str) -> None:
"""
    Convert a conversation script to speech audio using the configured TTS provider.
    Takes a conversation script consisting of speaker/text pairs and generates audio for
    each line (or chunk of lines, for the multi-speaker provider) using the configured
    TTS service. The individual audio files are then merged into a single output file.
    Uses different voices for different speakers to create a natural conversational feel.
    Args:
        config (PodcastConfig): Configuration object containing API keys and settings
        conversation (List[dict]): List of dictionaries containing conversation lines with structure:
{
'speaker': str, # Speaker identifier ('Interviewer' or 'Interviewee')
'text': str # Line content to convert to speech
}
output_file (str): Path where the final merged audio file should be saved
temp_audio_dir (str): Directory path for temporary audio file storage
audio_format (str): Format of the audio files (e.g. 'mp3')
Raises:
Exception: If any errors occur during TTS conversion or file operations
"""
tts_audio_formats = {
'elevenlabs': 'mp3',
'google': 'mp3',
'google_multispeaker': 'mp3'
}
try:
logger.info(f"Generating audio files for {len(conversation)} lines...")
audio_files = []
counter = 0
if config.tts_provider == 'google_multispeaker':
            # Instead of a line-by-line strategy, process the
            # conversation in chunks of 4 lines per request.
for chunk_start in range(0, len(conversation), 4):
chunk = conversation[chunk_start:chunk_start + 4]
logger.info(f"Processing chunk {counter} with {len(chunk)} lines...")
audio = process_lines_google_multispeaker(config, chunk)
file_name = os.path.join(temp_audio_dir, f"{counter:03d}.{tts_audio_formats[config.tts_provider]}")
with open(file_name, "wb") as out:
out.write(audio)
audio_files.append(file_name)
counter += 1
else:
for line in conversation:
logger.info(f"Generating audio for line {counter}...")
                if config.tts_provider == 'google':
                    audio = process_line_google(config, line['text'], line['speaker'])
                elif config.tts_provider == 'elevenlabs':
                    audio = process_line_elevenlabs(config, line['text'], line['speaker'])
                else:
                    raise ValueError(f"Unsupported TTS provider: {config.tts_provider}")
logger.info(f"Saving audio chunk {counter}...")
file_name = os.path.join(temp_audio_dir, f"{counter:03d}.{tts_audio_formats[config.tts_provider]}")
with open(file_name, "wb") as out:
out.write(audio)
audio_files.append(file_name)
counter += 1
# Merge all audio files and save the result
merge_audio_files(audio_files, output_file, audio_format)
# Clean up individual audio files
for file in audio_files:
os.remove(file)
    except Exception as e:
        logger.error(f"Error generating audio: {e}")
        raise
def generate_audio(config: PodcastConfig, final_script: list, output_file: str) -> str:
"""
Generate audio from a podcast script using text-to-speech.
    Takes a final script consisting of speaker/text pairs and generates a single audio file
    using the configured text-to-speech provider. The script is first cleaned and processed
    to be TTS-friendly, then converted to speech with different voices for different speakers.
    Args:
        config (PodcastConfig): Configuration object containing API keys and settings
        final_script (list): List of dictionaries containing script lines with structure:
{
'speaker': str, # Speaker identifier ('Interviewer' or 'Interviewee')
'text': str # Line content to convert to speech
}
output_file (str): Path where the final audio file should be saved
Returns:
str: Path to the generated audio file
Raises:
Exception: If any errors occur during TTS conversion or file operations
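    Example (illustrative; ``config`` and ``final_script`` are assumed to be defined):
        >>> generate_audio(config, final_script, 'episode.mp3')
        'episode.mp3'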
"""
cleaned_script = clean_text_for_tts(final_script)
temp_audio_dir = Path(config.temp_audio_dir)
temp_audio_dir.mkdir(parents=True, exist_ok=True)
convert_to_speech(config, cleaned_script, output_file, config.temp_audio_dir, config.output_format)
return output_file