"""
Research module for podcast generation.
This module provides functionality to gather background research and information
for podcast episode generation. It handles retrieving content from various sources
like Wikipedia and search engines.

Example:
    >>> from podcast_llm.config import PodcastConfig
    >>> from podcast_llm.research import suggest_wikipedia_articles
    >>> from podcast_llm.models import WikipediaPages
    >>> config = PodcastConfig()
    >>> articles: WikipediaPages = suggest_wikipedia_articles(config, "Artificial Intelligence")
    >>> articles.pages[0].name
    'Artificial intelligence'

The research process includes:

- Suggesting relevant Wikipedia articles via LangChain and the configured LLM
- Downloading Wikipedia article content
- Performing targeted web searches with Tavily
- Extracting key information from web articles
- Organizing research into structured formats using Pydantic models
The module uses various APIs and services to gather comprehensive background
information while maintaining rate limits and handling errors gracefully.
"""
import logging
from typing import List

from langchain import hub
from langchain_community.retrievers import WikipediaRetriever
from langchain_core.documents import Document
from tavily import TavilyClient

from podcast_llm.config import PodcastConfig
from podcast_llm.extractors.web import WebSourceDocument
from podcast_llm.models import (
    SearchQueries,
    WikipediaPages
)
from podcast_llm.outline import PodcastOutline
from podcast_llm.utils.llm import get_fast_llm

logger = logging.getLogger(__name__)


def suggest_wikipedia_articles(config: PodcastConfig, topic: str) -> WikipediaPages:
"""
Suggest relevant Wikipedia articles for a given topic.
Uses LangChain and GPT-4 to intelligently suggest Wikipedia articles that would provide good
background research for a podcast episode on the given topic.
Args:
topic (str): The podcast topic to research
Returns:
WikipediaPages: A structured list of suggested Wikipedia article titles
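
    Example:
        >>> # Illustrative only; requires valid LLM API credentials in the config
        >>> config = PodcastConfig()
        >>> pages = suggest_wikipedia_articles(config, "Quantum Computing")
        >>> isinstance(pages, WikipediaPages)
        True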
"""
    logger.info(f'Suggesting Wikipedia articles for topic: {topic}')

    prompthub_path = "evandempsey/podcast_wikipedia_suggestions:58c92df4"
    wikipedia_prompt = hub.pull(prompthub_path)
    logger.info(f"Got prompt from hub: {prompthub_path}")

    fast_llm = get_fast_llm(config)
    wikipedia_chain = wikipedia_prompt | fast_llm.with_structured_output(
        WikipediaPages
    )

    result = wikipedia_chain.invoke({"topic": topic})
    logger.info(f'Found {len(result.pages)} suggested Wikipedia articles')
    return result


def download_wikipedia_articles(suggestions: WikipediaPages) -> list:
"""
Download Wikipedia articles based on suggested page titles.
Takes a structured list of Wikipedia page suggestions and downloads the full content
of each article using the WikipediaRetriever. Handles errors gracefully if any
articles fail to download.
Args:
suggestions (WikipediaPages): Structured list of suggested Wikipedia page titles
Returns:
list: List of retrieved Wikipedia document objects containing page content and metadata
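
    Example:
        >>> # Illustrative only; requires network access to Wikipedia
        >>> suggestions = suggest_wikipedia_articles(config, "Quantum Computing")
        >>> docs = download_wikipedia_articles(suggestions)
        >>> len(docs) <= len(suggestions.pages)  # failed downloads are skipped
        True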
"""
    logger.info('Starting Wikipedia article download')

    retriever = WikipediaRetriever()
    wikipedia_documents = []
    for page in suggestions.pages:
        logger.info(f'Retrieving article: {page.name}')
        try:
            wikipedia_documents.append(retriever.invoke(page.name)[0])
            logger.debug(f'Successfully retrieved article: {page.name}')
        except Exception as e:
            logger.error(f'Failed to retrieve article {page.name}: {str(e)}')

    logger.info(f'Downloaded {len(wikipedia_documents)} Wikipedia articles')
    return wikipedia_documents


def research_background_info(config: PodcastConfig, topic: str) -> list:
"""
Research background information for a podcast topic.
Coordinates the research process by first suggesting relevant Wikipedia articles
based on the topic, then downloading the full content of those articles. Acts as
the main orchestration function for gathering background research material.
Args:
topic (str): The podcast topic to research
Returns:
dict: List of retrieved Wikipedia document objects containing article content and metadata
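
    Example:
        >>> # Illustrative only; wraps suggestion and download in one call
        >>> docs = research_background_info(config, "Artificial Intelligence")
        >>> all(hasattr(doc, 'page_content') for doc in docs)
        True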
"""
    logger.info(f'Starting research for topic: {topic}')

    suggestions = suggest_wikipedia_articles(config, topic)
    wikipedia_content = download_wikipedia_articles(suggestions)

    logger.info('Research completed successfully')
    return wikipedia_content


def download_page_content(urls: List[str]) -> List[Document]:
"""
Download and parse content from a list of URLs.
Uses the newspaper3k library to download and extract clean text content from web pages.
Handles errors gracefully and logs success/failure for each URL. Filters out articles
with no text content.
Args:
urls (list): List of URLs to download and parse
Returns:
list: List of dictionaries containing the downloaded articles with structure:
{
'url': str, # Original URL
'title': str, # Article title
'text': str # Cleaned article text content
}
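
    Example:
        >>> # Illustrative only; the URL is a placeholder
        >>> docs = download_page_content(['https://example.com/article'])
        >>> all(isinstance(doc, Document) for doc in docs)
        True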
"""
    logger.info('Downloading page content from URLs.')

    downloaded_articles = []
    for url in urls:
        try:
            web_source_doc = WebSourceDocument(url)
            web_source_doc.extract()
            downloaded_articles.append(web_source_doc.as_langchain_document())
        except Exception as e:
            logger.error(f'Unexpected error downloading {url}: {str(e)}')

    logger.info(f'Successfully downloaded {len(downloaded_articles)} articles')
    return downloaded_articles


def research_discussion_topics(config: PodcastConfig, topic: str, outline: PodcastOutline) -> list:
"""
Research in-depth content for podcast discussion topics.
Takes a podcast topic and outline, then uses LangChain and GPT-4 to generate targeted
search queries. These queries are used to find relevant articles via Tavily search.
The articles are then downloaded and processed to provide detailed research material
for each section of the podcast.
Args:
topic (str): The main topic for the podcast episode
outline (PodcastOutline): Structured outline containing sections and subsections
Returns:
list: List of dictionaries containing downloaded article content with structure:
{
'url': str, # Source URL
'title': str, # Article title
'text': str # Article content
}
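
    Example:
        >>> # Illustrative only; assumes an outline built earlier in the pipeline
        >>> docs = research_discussion_topics(config, "Artificial Intelligence", outline)
        >>> all(isinstance(doc, Document) for doc in docs)
        True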
"""
    logger.info('Suggesting search queries based on podcast outline')

    prompthub_path = "evandempsey/podcast_research_queries:561acf5f"
    search_queries_prompt = hub.pull(prompthub_path)
    logger.info(f"Got prompt from hub: {prompthub_path}")

    fast_llm = get_fast_llm(config)
    search_queries_chain = search_queries_prompt | fast_llm.with_structured_output(
        SearchQueries
    )

    queries = search_queries_chain.invoke({"topic": topic, "podcast_outline": outline.as_str})
    logger.info(f'Got {len(queries.queries)} suggested search queries')

    urls_to_scrape = perform_tavily_queries(config, queries)
    page_content = download_page_content(urls_to_scrape)
    return page_content
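
# `perform_tavily_queries` (called above) is expected to be defined elsewhere in
# this module. A rough sketch of the behavior the call site implies, not the
# actual implementation: it runs each suggested query through Tavily search and
# collects the result URLs. The `tavily_api_key` config attribute, `query.query`
# field, and `max_results` value are assumptions for illustration.
#
# def perform_tavily_queries(config: PodcastConfig, queries: SearchQueries) -> List[str]:
#     client = TavilyClient(api_key=config.tavily_api_key)
#     urls = []
#     for query in queries.queries:
#         response = client.search(query.query, max_results=3)
#         urls.extend(result['url'] for result in response.get('results', []))
#     return urls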