# Source code for podcast_llm.extractors.plaintext

"""
Plaintext file extraction module.

This module provides functionality for extracting text content from plaintext files
like Markdown documents. It handles loading text files, preserving formatting, and
converting content into a standardized document format.

The module includes:
- MarkdownSourceDocument class for handling Markdown file extraction
- TextSourceDocument class for handling plain text file extraction
- Raw text content preservation including formatting
- Conversion to standardized document format
- UTF-8 encoding support

Example:
    >>> from podcast_llm.extractors.plaintext import MarkdownSourceDocument
    >>> extractor = MarkdownSourceDocument('script.md')
    >>> extractor.extract()
    >>> print(extractor.content)
    '# Title\n\nMarkdown content...'

The extraction process:
1. Opens the text file with UTF-8 encoding
2. Reads the complete file content
3. Preserves original formatting and structure
4. Returns the raw text content

The module integrates with the BaseSourceDocument interface to provide consistent
handling of plaintext files alongside other source types like PDFs and web content.
"""



from podcast_llm.extractors.base import BaseSourceDocument
from typing import Optional



class MarkdownSourceDocument(BaseSourceDocument):
    """
    A document extractor for Markdown files.

    The file at ``src`` is read as UTF-8 text and stored without any further
    processing, so the Markdown markup stays available to later stages.

    Attributes:
        src (str): Path to the source Markdown file
        src_type (str): Type of source document ('Markdown File')
        title (str): Title combining the source type and the source path
        content (Optional[str]): Extracted text; None until extract() has run

    Example:
        >>> extractor = MarkdownSourceDocument('script.md')
        >>> text = extractor.extract()
        >>> text == extractor.content
        True
    """

    def __init__(self, source: str) -> None:
        """
        Initialize the Markdown extractor.

        Only records the path and metadata; the file is not opened until
        extract() is called.

        Args:
            source: Path to the Markdown file to extract text from
        """
        # NOTE(review): BaseSourceDocument.__init__ is not invoked here --
        # confirm that is intentional.
        self.src = source
        self.src_type = 'Markdown File'
        self.title = f"{self.src_type}: {source}"
        self.content: Optional[str] = None

    def extract(self) -> str:
        """
        Extract text content from the Markdown file.

        Reads the whole file into memory, stores it on ``self.content`` and
        returns the same string.

        Returns:
            The extracted text content as a string

        Raises:
            OSError: If the file cannot be opened (for example
                FileNotFoundError when the path does not exist)
            UnicodeDecodeError: If the file is not valid UTF-8
        """
        # Text mode with the default newline handling translates '\r\n' and
        # '\r' line endings to '\n' while reading.
        with open(self.src, 'r', encoding='utf-8') as file:
            self.content = file.read()
        return self.content





class TextSourceDocument(BaseSourceDocument):
    """
    A document extractor for plain text files.

    The file at ``src`` is read as UTF-8 text and stored as-is; no special
    formatting or processing is applied to the content.

    Attributes:
        src (str): Path to the source text file
        src_type (str): Type of source document ('Text File')
        title (str): Title combining the source type and the source path
        content (Optional[str]): Extracted text; None until extract() has run

    Example:
        >>> extractor = TextSourceDocument('document.txt')
        >>> text = extractor.extract()
        >>> text == extractor.content
        True
    """

    def __init__(self, source: str) -> None:
        """
        Initialize the plain text extractor.

        Only records the path and metadata; the file is not opened until
        extract() is called.

        Args:
            source: Path to the text file to extract text from
        """
        # NOTE(review): BaseSourceDocument.__init__ is not invoked here --
        # confirm that is intentional.
        self.src = source
        self.src_type = 'Text File'
        self.title = f"{self.src_type}: {source}"
        self.content: Optional[str] = None

    def extract(self) -> str:
        """
        Extract text content from the text file.

        Reads the whole file into memory, stores it on ``self.content`` and
        returns the same string.

        Returns:
            The extracted text content as a string

        Raises:
            OSError: If the file cannot be opened (for example
                FileNotFoundError when the path does not exist)
            UnicodeDecodeError: If the file is not valid UTF-8
        """
        # Text mode with the default newline handling translates '\r\n' and
        # '\r' line endings to '\n' while reading.
        with open(self.src, 'r', encoding='utf-8') as file:
            self.content = file.read()
        return self.content