"""
Text-to-speech conversion module for podcast generation.
This module handles the conversion of text scripts into natural-sounding speech using
multiple TTS providers (Google Cloud TTS and ElevenLabs). It includes functionality for:
- Rate limiting API requests to stay within provider quotas
- Exponential backoff retry logic for API resilience
- Processing individual conversation lines with appropriate voices
- Merging multiple audio segments into a complete podcast
- Managing temporary audio file storage and cleanup
The module supports different voices for interviewer/interviewee to create natural
conversational flow and allows configuration of voice settings and audio effects
through the PodcastConfig system.
Typical usage:
    config = PodcastConfig()
    convert_to_speech(
        config,
        conversation_script,
        'output.mp3',
        '.temp_audio/',
        'mp3'
    )
"""
import logging
import os
from io import BytesIO
from pathlib import Path
from typing import List
from elevenlabs import client as elevenlabs_client
from google.cloud import texttospeech
from google.cloud import texttospeech_v1beta1
from pydub import AudioSegment
from podcast_llm.config import PodcastConfig
from podcast_llm.utils.rate_limits import (
rate_limit_per_minute,
retry_with_exponential_backoff
)
logger = logging.getLogger(__name__)
def clean_text_for_tts(lines: List[dict]) -> List[dict]:
"""
Clean text lines for text-to-speech processing by removing special characters.
Takes a list of dictionaries containing speaker and text information and removes
characters that may interfere with text-to-speech synthesis, such as asterisks,
underscores, and em dashes.
Args:
lines (List[dict]): List of dictionaries with structure:
{
'speaker': str, # Speaker identifier
'text': str # Text to be cleaned
}
Returns:
List[dict]: List of dictionaries with cleaned text and same structure as input
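    Example:
        >>> clean_text_for_tts([{'speaker': 'Interviewer', 'text': '*Hello*, world'}])
        [{'speaker': 'Interviewer', 'text': 'Hello, world'}]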
"""
    cleaned = []
    for line in lines:
        cleaned.append({
            'speaker': line['speaker'],
            'text': line['text'].replace('*', '').replace('_', '').replace('—', '')
        })
    return cleaned
def merge_audio_files(audio_files: List[str], output_file: str, audio_format: str) -> None:
"""
Merge multiple audio files into a single output file.
Takes a list of audio files and combines them in the provided order into a single output
file. Handles any audio format supported by pydub.
Args:
audio_files (list): List of paths to audio files to merge
output_file (str): Path where merged audio file should be saved
audio_format (str): Format of input/output audio files (e.g. 'mp3', 'wav')
Returns:
None
Raises:
Exception: If there are any errors during the merging process
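    Example (file paths are illustrative):
        >>> merge_audio_files(['000.mp3', '001.mp3'], 'podcast.mp3', 'mp3')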
"""
logger.info("Merging audio files...")
try:
combined = AudioSegment.empty()
for filename in audio_files:
audio = AudioSegment.from_file(filename)
combined += audio
combined.export(output_file, format=audio_format)
    except Exception as e:
        logger.error(f"Error merging audio files: {e}")
        raise
@retry_with_exponential_backoff(max_retries=10, base_delay=2.0)
@rate_limit_per_minute(max_requests_per_minute=20)
def process_line_google(config: PodcastConfig, text: str, speaker: str):
"""
Process a single line of text using Google Text-to-Speech API.
Takes a line of text and speaker identifier and generates synthesized speech using
Google's TTS service. Uses different voices based on the speaker to create natural
conversation flow.
    Args:
        config (PodcastConfig): Configuration object containing API keys and settings
        text (str): The text content to convert to speech
        speaker (str): Speaker identifier to determine voice selection
Returns:
bytes: Raw audio data in bytes format containing the synthesized speech
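    Example (illustrative; assumes ``config`` holds a valid Google API key):
        >>> audio = process_line_google(config, 'Welcome to the show.', 'Interviewer')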
"""
client = texttospeech.TextToSpeechClient(client_options={'api_key': config.google_api_key})
tts_settings = config.tts_settings['google']
interviewer_voice = texttospeech.VoiceSelectionParams(
language_code=tts_settings['language_code'],
name=tts_settings['voice_mapping']['Interviewer'],
ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
)
interviewee_voice = texttospeech.VoiceSelectionParams(
language_code=tts_settings['language_code'],
name=tts_settings['voice_mapping']['Interviewee'],
ssml_gender=texttospeech.SsmlVoiceGender.MALE
)
synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = interviewer_voice if speaker == 'Interviewer' else interviewee_voice
# Select the type of audio file you want returned
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3,
effects_profile_id=tts_settings['effects_profile_id']
)
# Perform the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
)
return response.audio_content
@retry_with_exponential_backoff(max_retries=10, base_delay=2.0)
@rate_limit_per_minute(max_requests_per_minute=20)
def process_line_elevenlabs(config: PodcastConfig, text: str, speaker: str):
"""
Process a line of text into speech using ElevenLabs TTS service.
Takes a line of text and speaker identifier and generates synthesized speech using
ElevenLabs' TTS service. Uses different voices based on the speaker to create natural
conversation flow.
Args:
config (PodcastConfig): Configuration object containing API keys and settings
text (str): The text content to convert to speech
speaker (str): Speaker identifier to determine voice selection
Returns:
bytes: Raw audio data in bytes format containing the synthesized speech
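    Example (illustrative; assumes ``config`` holds a valid ElevenLabs API key):
        >>> audio = process_line_elevenlabs(config, 'Thanks for having me.', 'Interviewee')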
"""
client = elevenlabs_client.ElevenLabs(api_key=config.elevenlabs_api_key)
tts_settings = config.tts_settings['elevenlabs']
audio = client.generate(
text=text,
voice=tts_settings['voice_mapping'][speaker],
model=tts_settings['model']
)
# Convert audio iterator to bytes that can be written to disk
audio_bytes = BytesIO()
for chunk in audio:
audio_bytes.write(chunk)
return audio_bytes.getvalue()
def combine_consecutive_speaker_chunks(chunks: List[dict]) -> List[dict]:
"""
Combine consecutive chunks from the same speaker into single chunks.
Args:
chunks (List[dict]): List of dictionaries containing conversation chunks with structure:
{
'speaker': str, # Speaker identifier
'text': str # Text content
}
Returns:
List[dict]: List of combined chunks where consecutive entries from the same speaker
are merged into single chunks
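    Example:
        >>> combine_consecutive_speaker_chunks([
        ...     {'speaker': 'Interviewer', 'text': 'Hi.'},
        ...     {'speaker': 'Interviewer', 'text': 'Welcome.'},
        ...     {'speaker': 'Interviewee', 'text': 'Thanks.'},
        ... ])
        [{'speaker': 'Interviewer', 'text': 'Hi. Welcome.'}, {'speaker': 'Interviewee', 'text': 'Thanks.'}]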
"""
combined_chunks = []
current_chunk = None
for chunk in chunks:
if current_chunk is None:
current_chunk = chunk.copy()
elif current_chunk['speaker'] == chunk['speaker']:
current_chunk['text'] += ' ' + chunk['text']
else:
combined_chunks.append(current_chunk)
current_chunk = chunk.copy()
if current_chunk is not None:
combined_chunks.append(current_chunk)
return combined_chunks
@retry_with_exponential_backoff(max_retries=10, base_delay=2.0)
@rate_limit_per_minute(max_requests_per_minute=20)
def process_lines_google_multispeaker(config: PodcastConfig, chunks: List[dict]):
"""
Process multiple lines of text into speech using Google's multi-speaker TTS service.
    Takes a chunk of conversation lines and generates synthesized speech using Google's
    multi-speaker TTS service. Synthesizing several turns in a single request produces a
    more natural conversational flow than processing line by line.
Args:
config (PodcastConfig): Configuration object containing API keys and settings
chunks (List): List of dictionaries containing conversation lines with structure:
{
'speaker': str, # Speaker identifier
'text': str # Line content to convert to speech
}
Returns:
bytes: Raw audio data in bytes format containing the synthesized speech
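    Example (illustrative; assumes ``config`` holds a valid Google API key):
        >>> chunk = [
        ...     {'speaker': 'Interviewer', 'text': 'Welcome to the show.'},
        ...     {'speaker': 'Interviewee', 'text': 'Thanks for having me.'},
        ... ]
        >>> audio = process_lines_google_multispeaker(config, chunk)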
"""
client = texttospeech_v1beta1.TextToSpeechClient(client_options={'api_key': config.google_api_key})
tts_settings = config.tts_settings['google_multispeaker']
# Combine consecutive lines from same speaker
chunks = combine_consecutive_speaker_chunks(chunks)
# Create multi-speaker markup
multi_speaker_markup = texttospeech_v1beta1.MultiSpeakerMarkup()
# Add each line as a conversation turn
for line in chunks:
turn = texttospeech_v1beta1.MultiSpeakerMarkup.Turn()
turn.text = line['text']
turn.speaker = tts_settings['voice_mapping'][line['speaker']]
multi_speaker_markup.turns.append(turn)
# Configure synthesis input with multi-speaker markup
synthesis_input = texttospeech_v1beta1.SynthesisInput(
multi_speaker_markup=multi_speaker_markup
)
# Configure voice parameters
voice = texttospeech_v1beta1.VoiceSelectionParams(
language_code=tts_settings['language_code'],
name='en-US-Studio-MultiSpeaker'
)
# Configure audio output
audio_config = texttospeech_v1beta1.AudioConfig(
audio_encoding=texttospeech_v1beta1.AudioEncoding.MP3_64_KBPS,
effects_profile_id=tts_settings['effects_profile_id']
)
# Generate speech
response = client.synthesize_speech(
input=synthesis_input,
voice=voice,
audio_config=audio_config
)
return response.audio_content
def convert_to_speech(
config: PodcastConfig,
    conversation: List[dict],
output_file: str,
temp_audio_dir: str,
audio_format: str) -> None:
"""
    Convert a conversation script to speech audio using the configured TTS provider.
    Takes a conversation script consisting of speaker/text pairs and generates audio for
    each line (or chunk of lines, for the multi-speaker provider) using the configured
    TTS service. The individual audio files are then merged into a single output file.
    Uses different voices for different speakers to create a natural conversational feel.
    Args:
        config (PodcastConfig): Configuration object containing API keys and settings
        conversation (List[dict]): List of dictionaries containing conversation lines with structure:
{
'speaker': str, # Speaker identifier ('Interviewer' or 'Interviewee')
'text': str # Line content to convert to speech
}
output_file (str): Path where the final merged audio file should be saved
temp_audio_dir (str): Directory path for temporary audio file storage
audio_format (str): Format of the audio files (e.g. 'mp3')
Raises:
Exception: If any errors occur during TTS conversion or file operations
"""
tts_audio_formats = {
'elevenlabs': 'mp3',
'google': 'mp3',
'google_multispeaker': 'mp3'
}
try:
logger.info(f"Generating audio files for {len(conversation)} lines...")
audio_files = []
counter = 0
if config.tts_provider == 'google_multispeaker':
            # Instead of a line-by-line strategy, process the
            # conversation in chunks of 4 lines per request.
for chunk_start in range(0, len(conversation), 4):
chunk = conversation[chunk_start:chunk_start + 4]
logger.info(f"Processing chunk {counter} with {len(chunk)} lines...")
audio = process_lines_google_multispeaker(config, chunk)
file_name = os.path.join(temp_audio_dir, f"{counter:03d}.{tts_audio_formats[config.tts_provider]}")
with open(file_name, "wb") as out:
out.write(audio)
audio_files.append(file_name)
counter += 1
else:
for line in conversation:
logger.info(f"Generating audio for line {counter}...")
                if config.tts_provider == 'google':
                    audio = process_line_google(config, line['text'], line['speaker'])
                elif config.tts_provider == 'elevenlabs':
                    audio = process_line_elevenlabs(config, line['text'], line['speaker'])
                else:
                    raise ValueError(f"Unsupported TTS provider: {config.tts_provider}")
logger.info(f"Saving audio chunk {counter}...")
file_name = os.path.join(temp_audio_dir, f"{counter:03d}.{tts_audio_formats[config.tts_provider]}")
with open(file_name, "wb") as out:
out.write(audio)
audio_files.append(file_name)
counter += 1
# Merge all audio files and save the result
merge_audio_files(audio_files, output_file, audio_format)
# Clean up individual audio files
for file in audio_files:
os.remove(file)
    except Exception as e:
        logger.error(f"Error generating audio: {e}")
        raise
def generate_audio(config: PodcastConfig, final_script: list, output_file: str) -> str:
"""
Generate audio from a podcast script using text-to-speech.
    Takes a final script consisting of speaker/text pairs and generates a single audio file
    using the configured text-to-speech provider. The script is first cleaned and processed
    to be TTS-friendly, then converted to speech with different voices for different speakers.
    Args:
        config (PodcastConfig): Configuration object containing API keys and settings
        final_script (list): List of dictionaries containing script lines with structure:
{
'speaker': str, # Speaker identifier ('Interviewer' or 'Interviewee')
'text': str # Line content to convert to speech
}
output_file (str): Path where the final audio file should be saved
Returns:
str: Path to the generated audio file
Raises:
Exception: If any errors occur during TTS conversion or file operations
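    Example (illustrative; ``config`` and ``final_script`` are assumed to be defined):
        >>> generate_audio(config, final_script, 'episode.mp3')
        'episode.mp3'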
"""
cleaned_script = clean_text_for_tts(final_script)
temp_audio_dir = Path(config.temp_audio_dir)
temp_audio_dir.mkdir(parents=True, exist_ok=True)
convert_to_speech(config, cleaned_script, output_file, config.temp_audio_dir, config.output_format)
return output_file