Source code for podcast_llm.extractors.audio

"""
Audio file extraction module.

This module provides functionality for extracting text content from audio files
by transcribing them using OpenAI's Whisper model. It handles various audio formats
(mp3, wav, m4a, ogg) and manages splitting long audio files into chunks that stay
within API limits.

The module includes:
- AudioSourceDocument class for handling audio file extraction
- Audio file splitting into fixed-length 10-minute segments
- Transcription using OpenAI Whisper API
- Temporary file management for processing

Example:
    >>> from podcast_llm.extractors.audio import AudioSourceDocument
    >>> extractor = AudioSourceDocument('podcast.mp3')
    >>> extractor.extract()
    >>> print(extractor.content)
    'Transcribed text from audio file...'

The extraction process:
1. Loads the audio file using pydub
2. Splits it into 10-minute segments to stay within API limits (see the sketch below)
3. Saves segments to temporary files
4. Transcribes each segment using Whisper
5. Combines transcriptions into final content
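
The segment count follows from ceiling division on the audio length. As a
quick sketch (the 25-minute duration here is purely illustrative):

    >>> import math
    >>> duration_ms = 25 * 60 * 1000   # 25-minute file, in milliseconds
    >>> segment_ms = 10 * 60 * 1000    # 10-minute segment limit
    >>> math.ceil(duration_ms / segment_ms)
    3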

The module handles errors gracefully and cleans up temporary files after processing.
"""


import logging
import math
import os
import tempfile
from typing import Optional

import openai
import pydub
from pydub import AudioSegment

from podcast_llm.extractors.base import BaseSourceDocument


logger = logging.getLogger(__name__)


class AudioSourceDocument(BaseSourceDocument):
    """
    A document extractor for audio files.

    This class handles extracting text content from audio files (mp3, wav, m4a, ogg)
    by splitting them into manageable segments and transcribing them using OpenAI's
    Whisper model. The audio is first split into 10-minute chunks to stay within
    API limits.

    Attributes:
        src (str): Path to the source audio file
        src_type (str): Type of source document ('Audio File')
        title (str): Title combining source type and filename
        content (Optional[str]): Extracted text content after transcription

    Example:
        >>> extractor = AudioSourceDocument('podcast.mp3')
        >>> extractor.extract()
        >>> print(extractor.content)
        'Transcribed text from audio file...'
    """
    def __init__(self, source: str) -> None:
        """
        Initialize the audio extractor.

        Args:
            source: Path to the audio file to extract text from
        """
        self.src = source
        self.src_type = 'Audio File'
        self.title = f"{self.src_type}: {source}"
        self.content: Optional[str] = None

    def _split_audio(self, filename: str, temp_dir: str) -> list:
        """
        Split an audio file into 10-minute segments to stay within API limits.

        Takes an audio file and splits it into segments of 10 minutes or less to
        comply with OpenAI Whisper API limits. The segments are saved as separate
        MP3 files in a temporary directory.

        Args:
            filename (str): Path to the input audio file
            temp_dir (str): Path of the directory to store temporary segment files

        Returns:
            list: List of paths to the generated audio segment files

        Example:
            >>> with tempfile.TemporaryDirectory() as temp_dir:
            ...     segments = self._split_audio('podcast.mp3', temp_dir)
            ...     print(len(segments))
            3
        """
        logger.info(f"Splitting audio file {filename} into segments.")
        try:
            audio = AudioSegment.from_file(filename, format="mp3")
        except pydub.exceptions.CouldntDecodeError:
            audio = AudioSegment.from_file(filename, format="mp4")

        # Duration of each segment in milliseconds (10 minutes = 600,000 ms)
        segment_duration = 10 * 60 * 1000

        # Calculate the number of segments needed
        num_segments = math.ceil(len(audio) / segment_duration)

        segments = []
        for i in range(num_segments):
            # Calculate the start and end of each segment
            start_time = i * segment_duration
            end_time = min((i + 1) * segment_duration, len(audio))  # Make sure not to exceed audio length

            # Extract the segment
            segment = audio[start_time:end_time]

            # Export the segment to a new MP3 file
            segment_filename = os.path.join(temp_dir, f"segment_{i + 1:03d}.mp3")
            segment.export(segment_filename, format="mp3")
            segments.append(segment_filename)

            logger.info(f"Exported segment {i + 1} from {start_time / 1000} to {end_time / 1000} seconds.")

        return segments
    def extract(self) -> str:
        """
        Extract text content from an audio file using OpenAI's Whisper API.

        This method takes an audio file, splits it into 10-minute segments to comply
        with API limits, and transcribes each segment using OpenAI's Whisper
        speech-to-text model. The transcribed segments are then combined into a
        single text document.

        Returns:
            str: The complete transcribed text from the audio file

        Raises:
            openai.OpenAIError: If there is an error calling the Whisper API
            IOError: If there is an error reading the audio file

        Example:
            >>> extractor = AudioSourceDocument('podcast.mp3')
            >>> text = extractor.extract()
            >>> print(text[:100])
            Welcome to today's episode where we'll be discussing...
        """
        logger.info(f"Loading audio from file: {self.src}")
        client = openai.OpenAI()

        # Process each chunk through the Whisper API
        transcribed_texts = []
        with tempfile.TemporaryDirectory() as temp_dir:
            chunks = self._split_audio(self.src, temp_dir)
            for i, chunk in enumerate(chunks):
                with open(chunk, 'rb') as audio_file:
                    logger.info(f"Transcribing chunk {i + 1}...")
                    transcript = client.audio.transcriptions.create(
                        file=audio_file,
                        model="whisper-1",
                        response_format="text"
                    )
                    logger.info(f"Got transcript:\n{transcript[:200]}...")
                    transcribed_texts.append(transcript)

        logger.info("Transcription complete. Combining transcripts...")
        self.content = ' '.join(transcribed_texts)
        return self.content
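
# A minimal usage sketch, not part of the library: it assumes the
# OPENAI_API_KEY environment variable is set, that ffmpeg is installed for
# pydub, and that 'podcast.mp3' is a placeholder path to a real audio file.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    extractor = AudioSourceDocument('podcast.mp3')
    extractor.extract()

    # The transcription is cached on the instance after extract() runs
    print(extractor.content[:500])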