
Command Line Interface

Once you have installed the blueprint, you can use it from the CLI.

You can either provide the path to a configuration file:

document-to-podcast --from_config "example_data/config.yaml"

Or provide values to the arguments directly:

document-to-podcast \
--input_file "example_data/Mozilla-Trustworthy_AI.pdf" \
--output_folder "example_data"
--text_to_text_model "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"

Note that you can also stop the podcast generation early (before the whole podcast is created) by pressing Ctrl+C in the terminal. The application will stop generating, but it will still save the partial result (script & audio) produced up to that point to disk.
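
If you prefer to create the configuration file programmatically, the sketch below writes a config.yaml that mirrors the Config model documented further down, reusing the library defaults for the prompt and speakers. The paths and model ids are illustrative; adjust them to your setup.

import yaml

from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS

# Sketch: write a config.yaml suitable for --from_config.
# Paths and model ids are illustrative; adjust them to your setup.
config = {
    "input_file": "example_data/Mozilla-Trustworthy_AI.pdf",
    "output_folder": "example_data",
    "text_to_text_model": "bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf",
    "text_to_text_prompt": DEFAULT_PROMPT,
    "text_to_speech_model": "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
    "speakers": DEFAULT_SPEAKERS,
    "outetts_language": "en",
}

with open("example_data/config.yaml", "w") as f:
    yaml.safe_dump(config, f)

The resulting file can then be passed with --from_config as shown above.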


document_to_podcast.cli.document_to_podcast(input_file=None, output_folder=None, text_to_text_model='bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf', text_to_text_prompt=DEFAULT_PROMPT, text_to_speech_model='OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf', speakers=None, outetts_language='en', from_config=None)

Generate a podcast from a document.

Parameters:

input_file (str, default: None)

    The path to the input file. Supported extensions:

    - .pdf
    - .html
    - .txt
    - .docx
    - .md

output_folder (str, default: None)

    The path to the output folder. Two files will be created:

    - {output_folder}/podcast.txt
    - {output_folder}/podcast.wav

text_to_text_model (str, default: 'bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf')

    The text-to-text model_id. Needs to be formatted as owner/repo/file and must point to a gguf file.

text_to_text_prompt (str, default: DEFAULT_PROMPT)

    The prompt for the text-to-text model.

text_to_speech_model (str, default: 'OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf')

    The text-to-speech model_id.

speakers (list[Speaker] | None, default: None)

    The speakers for the podcast. Defaults to DEFAULT_SPEAKERS when not provided.

outetts_language (str, default: 'en')

    For OuteTTS models, the language to use. Supported languages in 0.2-500M: en, zh, ja, ko. More info: https://github.com/edwko/OuteTTS

from_config (str, default: None)

    The path to the config file. If provided, all other arguments will be ignored.
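The function can also be called directly from Python. Below is a minimal sketch using only the required arguments; the paths assume the bundled example data and are otherwise illustrative.

from document_to_podcast.cli import document_to_podcast

# Sketch: programmatic equivalent of the CLI invocation shown above.
# All other arguments keep their documented defaults.
document_to_podcast(
    input_file="example_data/Mozilla-Trustworthy_AI.pdf",
    output_folder="example_data",
)
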
Source code in src/document_to_podcast/cli.py
@logger.catch(reraise=True)
def document_to_podcast(
    input_file: str | None = None,
    output_folder: str | None = None,
    text_to_text_model: str = "bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf",
    text_to_text_prompt: str = DEFAULT_PROMPT,
    text_to_speech_model: str = "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
    speakers: list[Speaker] | None = None,
    outetts_language: str = "en",  # Only applicable to OuteTTS models
    from_config: str | None = None,
):
    """
    Generate a podcast from a document.

    Args:
        input_file (str): The path to the input file.
            Supported extensions:

                - .pdf
                - .html
                - .txt
                - .docx
                - .md

        output_folder (str): The path to the output folder.
            Two files will be created:

                - {output_folder}/podcast.txt
                - {output_folder}/podcast.wav

        text_to_text_model (str, optional): The text-to-text model_id.

            Need to be formatted as `owner/repo/file`.

            Need to be a gguf file.

            Defaults to `bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf`.

        text_to_text_prompt (str, optional): The prompt for the text-to-text model.
            Defaults to DEFAULT_PROMPT.

        text_to_speech_model (str, optional): The text-to-speech model_id.
            Defaults to `OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf`.

        speakers (list[Speaker] | None, optional): The speakers for the podcast.
            Defaults to DEFAULT_SPEAKERS.

        outetts_language (str): For OuteTTS models we need to specify which language to use.
            Supported languages in 0.2-500M: en, zh, ja, ko. More info: https://github.com/edwko/OuteTTS

        from_config (str, optional): The path to the config file. Defaults to None.

            If provided, all other arguments will be ignored.
    """
    if from_config:
        config = Config.model_validate(yaml.safe_load(Path(from_config).read_text()))
    else:
        speakers = speakers or DEFAULT_SPEAKERS
        config = Config(
            input_file=input_file,
            output_folder=output_folder,
            text_to_text_model=text_to_text_model,
            text_to_text_prompt=text_to_text_prompt,
            text_to_speech_model=text_to_speech_model,
            speakers=[Speaker.model_validate(speaker) for speaker in speakers],
            outetts_language=outetts_language,
        )

    output_folder = Path(config.output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    data_loader = DATA_LOADERS[Path(config.input_file).suffix]
    logger.info(f"Loading {config.input_file}")
    raw_text = data_loader(config.input_file)
    logger.debug(f"Loaded {len(raw_text)} characters")

    data_cleaner = DATA_CLEANERS[Path(config.input_file).suffix]
    logger.info(f"Cleaning {config.input_file}")
    clean_text = data_cleaner(raw_text)
    logger.debug(f"Cleaned {len(raw_text) - len(clean_text)} characters")
    logger.debug(f"Length of cleaned text: {len(clean_text)}")

    logger.info(f"Loading {config.text_to_text_model}")
    text_model = load_llama_cpp_model(model_id=config.text_to_text_model)

    logger.info(f"Loading {config.text_to_speech_model}")
    speech_model = load_tts_model(
        model_id=config.text_to_speech_model, outetts_language=outetts_language
    )

    # ~4 characters per token is considered a reasonable default.
    max_characters = text_model.n_ctx() * 4
    if len(clean_text) > max_characters:
        logger.warning(
            f"Input text is too big ({len(clean_text)})."
            f" Using only a subset of it ({max_characters})."
        )
    clean_text = clean_text[:max_characters]

    logger.info("Generating Podcast...")
    podcast_script = ""
    text = ""
    podcast_audio = []
    system_prompt = config.text_to_text_prompt.strip()
    system_prompt = system_prompt.replace(
        "{SPEAKERS}", "\n".join(str(speaker) for speaker in config.speakers)
    )
    try:
        for chunk in text_to_text_stream(
            clean_text, text_model, system_prompt=system_prompt
        ):
            text += chunk
            podcast_script += chunk
            if text.endswith("\n") and "Speaker" in text:
                logger.debug(text)
                speaker_id = re.search(r"Speaker (\d+)", text).group(1)
                voice_profile = next(
                    speaker.voice_profile
                    for speaker in config.speakers
                    if speaker.id == int(speaker_id)
                )
                speech = text_to_speech(
                    text.split(f'"Speaker {speaker_id}":')[-1],
                    speech_model,
                    voice_profile,
                )
                podcast_audio.append(speech)
                text = ""

    except KeyboardInterrupt:
        logger.warning("Podcast generation stopped by user.")
    logger.info("Saving Podcast...")
    complete_audio = stack_audio_segments(
        podcast_audio, sample_rate=speech_model.sample_rate, silence_pad=1.0
    )

    sf.write(
        str(output_folder / "podcast.wav"),
        complete_audio,
        samplerate=speech_model.sample_rate,
    )
    (output_folder / "podcast.txt").write_text(podcast_script)
    logger.success("Done!")

document_to_podcast.config.Config

Bases: BaseModel

Source code in src/document_to_podcast/config.py
class Config(BaseModel):
    input_file: Annotated[FilePath, AfterValidator(validate_input_file)]
    output_folder: str
    text_to_text_model: Annotated[str, AfterValidator(validate_text_to_text_model)]
    text_to_text_prompt: Annotated[str, AfterValidator(validate_text_to_text_prompt)]
    text_to_speech_model: Annotated[str, AfterValidator(validate_text_to_speech_model)]
    speakers: list[Speaker]
    outetts_language: str = "en"

document_to_podcast.config.Speaker

Bases: BaseModel

Source code in src/document_to_podcast/config.py
class Speaker(BaseModel):
    id: int
    name: str
    description: str
    voice_profile: str

    def __str__(self):
        return f"Speaker {self.id}. Named {self.name}. {self.description}"

document_to_podcast.config.DEFAULT_PROMPT = '\nYou are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.\nThe script features the following speakers:\n{SPEAKERS}\nInstructions:\n- Write dynamic, easy-to-follow dialogue.\n- Include natural interruptions and interjections.\n- Avoid repetitive phrasing between speakers.\n- Format output as a JSON conversation.\nExample:\n{\n "Speaker 1": "Welcome to our podcast! Today, we\'re exploring...",\n "Speaker 2": "Hi! I\'m excited to hear about this. Can you explain...",\n "Speaker 1": "Sure! Imagine it like this...",\n "Speaker 2": "Oh, that\'s cool! But how does..."\n}\n' module-attribute

document_to_podcast.config.DEFAULT_SPEAKERS = [{'id': 1, 'name': 'Laura', 'description': 'The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.', 'voice_profile': 'female_1'}, {'id': 2, 'name': 'Jon', 'description': 'The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.', 'voice_profile': 'male_1'}] module-attribute
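
A custom prompt can be supplied via text_to_text_prompt. It should keep the {SPEAKERS} placeholder, which the CLI fills in with the configured speakers, and ask for JSON output with "Speaker N" keys, which the generation loop parses. A minimal sketch, with illustrative prompt wording:

from document_to_podcast.cli import document_to_podcast

# Sketch: a shorter custom prompt. Keep the {SPEAKERS} placeholder and the
# "Speaker N" JSON keys; both are relied on by the CLI's generation loop.
CUSTOM_PROMPT = """
You are a podcast scriptwriter generating concise, factual conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Keep each line short and conversational.
- Format output as a JSON conversation with "Speaker 1" and "Speaker 2" keys.
"""

document_to_podcast(
    input_file="example_data/Mozilla-Trustworthy_AI.pdf",
    output_folder="example_data",
    text_to_text_prompt=CUSTOM_PROMPT,
)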