Command Line Interface
Once you have installed the blueprint, you can use it from the CLI.
You can either provide the path to a configuration file:
document-to-podcast --from_config "example_data/config.yaml"
Or provide values to the arguments directly:
document-to-podcast \
--input_file "example_data/Mozilla-Trustworthy_AI.pdf" \
--output_folder "example_data" \
--text_to_text_model "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
You can also stop podcast generation early (before the whole podcast is created) by pressing Ctrl+C in the terminal. The application stops generating but still saves the result produced up to that point (script & audio) to disk.
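The configuration file passed via --from_config mirrors the fields of the Config and Speaker models documented below. The following is a minimal sketch, not the contents of the shipped example_data/config.yaml: the paths, prompt, and speaker descriptions are placeholders, and it is validated the same way the --from_config code path does.

import yaml

from document_to_podcast.config import Config

# Illustrative YAML only: input_file must point to an existing, supported file
# and the model id must follow the owner/repo/file GGUF format, because the
# Config validators run when the model is constructed.
CONFIG_YAML = """
input_file: example_data/Mozilla-Trustworthy_AI.pdf
output_folder: example_data
text_to_text_model: bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf
text_to_text_prompt: |
  You are a podcast scriptwriter.
  The script features the following speakers:
  {SPEAKERS}
  Format output as a JSON conversation.
text_to_speech_model: hexgrad/Kokoro-82M
speakers:
  - id: 1
    name: Sarah
    description: The main host.
    voice_profile: af_sarah
  - id: 2
    name: Michael
    description: The co-host.
    voice_profile: am_michael
"""

# This mirrors what --from_config does internally: parse the YAML and validate
# it against the Config model.
config = Config.model_validate(yaml.safe_load(CONFIG_YAML))
print(config.speakers[0])  # "Speaker 1. Named Sarah. The main host."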
document_to_podcast.cli.document_to_podcast(input_file=None, output_folder=None, text_to_text_model='bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf', text_to_text_prompt=DEFAULT_PROMPT, text_to_speech_model='hexgrad/Kokoro-82M', speakers=None, from_config=None)
Generate a podcast from a document.
Parameters:

Name | Type | Description | Default
--- | --- | --- | ---
input_file | str | The path to the input file. Supported extensions: .pdf, .html, .txt, .docx, .md. | None
output_folder | str | The path to the output folder. Two files will be created: {output_folder}/podcast.txt and {output_folder}/podcast.wav. | None
text_to_text_model | str | The text-to-text model_id. Must be formatted as owner/repo/file and point to a .gguf file. | 'bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf'
text_to_text_prompt | str | The prompt for the text-to-text model. | DEFAULT_PROMPT
text_to_speech_model | str | The text-to-speech model_id. | 'hexgrad/Kokoro-82M'
speakers | list[Speaker] or None | The speakers for the podcast. Defaults to DEFAULT_SPEAKERS. | None
from_config | str | The path to the config file. If provided, all other arguments are ignored. | None
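The same entry point can also be called from Python. A minimal sketch, assuming the package is installed and the input file exists locally; the path and model id simply reuse the CLI example above:

from document_to_podcast.cli import document_to_podcast

document_to_podcast(
    input_file="example_data/Mozilla-Trustworthy_AI.pdf",
    output_folder="example_data",
    text_to_text_model="Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf",
)
# Writes example_data/podcast.txt and example_data/podcast.wav, as described above.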
Source code in src/document_to_podcast/cli.py
@logger.catch(reraise=True)
def document_to_podcast(
    input_file: str | None = None,
    output_folder: str | None = None,
    text_to_text_model: str = "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf",
    text_to_text_prompt: str = DEFAULT_PROMPT,
    text_to_speech_model: str = "hexgrad/Kokoro-82M",
    speakers: list[Speaker] | None = None,
    from_config: str | None = None,
):
    """
    Generate a podcast from a document.

    Args:
        input_file (str): The path to the input file.
            Supported extensions:
                - .pdf
                - .html
                - .txt
                - .docx
                - .md
        output_folder (str): The path to the output folder.
            Two files will be created:
                - {output_folder}/podcast.txt
                - {output_folder}/podcast.wav
        text_to_text_model (str, optional): The text-to-text model_id.
            Needs to be formatted as `owner/repo/file`.
            Needs to be a gguf file.
            Defaults to `bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf`.
        text_to_text_prompt (str, optional): The prompt for the text-to-text model.
            Defaults to DEFAULT_PROMPT.
        text_to_speech_model (str, optional): The text-to-speech model_id.
            Defaults to `hexgrad/Kokoro-82M`.
        speakers (list[Speaker] | None, optional): The speakers for the podcast.
            Defaults to DEFAULT_SPEAKERS.
        from_config (str, optional): The path to the config file. Defaults to None.
            If provided, all other arguments will be ignored.
    """
    if from_config:
        config = Config.model_validate(yaml.safe_load(Path(from_config).read_text()))
    else:
        speakers = speakers or DEFAULT_SPEAKERS
        config = Config(
            input_file=input_file,
            output_folder=output_folder,
            text_to_text_model=text_to_text_model,
            text_to_text_prompt=text_to_text_prompt,
            text_to_speech_model=text_to_speech_model,
            speakers=[Speaker.model_validate(speaker) for speaker in speakers],
        )

    output_folder = Path(config.output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    data_loader = DATA_LOADERS[Path(config.input_file).suffix]
    logger.info(f"Loading {config.input_file}")
    raw_text = data_loader(config.input_file)
    logger.debug(f"Loaded {len(raw_text)} characters")

    data_cleaner = DATA_CLEANERS[Path(config.input_file).suffix]
    logger.info(f"Cleaning {config.input_file}")
    clean_text = data_cleaner(raw_text)
    logger.debug(f"Cleaned {len(raw_text) - len(clean_text)} characters")
    logger.debug(f"Length of cleaned text: {len(clean_text)}")

    logger.info(f"Loading {config.text_to_text_model}")
    text_model = load_llama_cpp_model(model_id=config.text_to_text_model)

    logger.info(f"Loading {config.text_to_speech_model}")
    if config.speakers[0].voice_profile[0] != config.speakers[1].voice_profile[0]:
        raise ValueError(
            "Both Kokoro speakers need to have the same language code. "
            "More info here https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md"
        )
    speech_model = load_tts_model(
        model_id=config.text_to_speech_model,
        **{"lang_code": config.speakers[0].voice_profile[0]},
    )

    # ~4 characters per token is considered a reasonable default.
    max_characters = text_model.n_ctx() * 4
    if len(clean_text) > max_characters:
        logger.warning(
            f"Input text is too big ({len(clean_text)})."
            f" Using only a subset of it ({max_characters})."
        )
        clean_text = clean_text[:max_characters]

    logger.info("Generating Podcast...")
    podcast_script = ""
    text = ""
    podcast_audio = []
    system_prompt = config.text_to_text_prompt.strip()
    system_prompt = system_prompt.replace(
        "{SPEAKERS}", "\n".join(str(speaker) for speaker in config.speakers)
    )
    try:
        for chunk in text_to_text_stream(
            clean_text, text_model, system_prompt=system_prompt
        ):
            text += chunk
            podcast_script += chunk
            if text.endswith("\n") and "Speaker" in text:
                logger.debug(text)
                speaker_id = re.search(r"Speaker (\d+)", text).group(1)
                voice_profile = next(
                    speaker.voice_profile
                    for speaker in config.speakers
                    if speaker.id == int(speaker_id)
                )
                speech = text_to_speech(
                    text.split(f'"Speaker {speaker_id}":')[-1],
                    speech_model,
                    voice_profile,
                )
                podcast_audio.append(speech)
                text = ""
    except KeyboardInterrupt:
        logger.warning("Podcast generation stopped by user.")

    logger.info("Saving Podcast...")
    complete_audio = stack_audio_segments(
        podcast_audio, sample_rate=speech_model.sample_rate, silence_pad=1.0
    )
    sf.write(
        str(output_folder / "podcast.wav"),
        complete_audio,
        samplerate=speech_model.sample_rate,
    )
    (output_folder / "podcast.txt").write_text(podcast_script)
    logger.success("Done!")
document_to_podcast.config.Config
Bases: BaseModel
Source code in src/document_to_podcast/config.py
class Config(BaseModel):
    input_file: Annotated[FilePath, AfterValidator(validate_input_file)]
    output_folder: str
    text_to_text_model: Annotated[str, AfterValidator(validate_text_to_text_model)]
    text_to_text_prompt: Annotated[str, AfterValidator(validate_text_to_text_prompt)]
    text_to_speech_model: Annotated[str, AfterValidator(validate_text_to_speech_model)]
    speakers: list[Speaker]
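A Config can also be built directly in Python. A minimal sketch, mirroring how the CLI assembles one when no config file is given; the input path is a placeholder and must point to an existing, supported file because the validators run at construction time:

from document_to_podcast.config import (
    DEFAULT_PROMPT,
    DEFAULT_SPEAKERS,
    Config,
    Speaker,
)

config = Config(
    input_file="example_data/Mozilla-Trustworthy_AI.pdf",  # placeholder path
    output_folder="example_data",
    text_to_text_model="bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf",
    text_to_text_prompt=DEFAULT_PROMPT,
    text_to_speech_model="hexgrad/Kokoro-82M",
    # Same coercion the CLI performs for the default speakers:
    speakers=[Speaker.model_validate(s) for s in DEFAULT_SPEAKERS],
)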
document_to_podcast.config.Speaker
Bases: BaseModel
Source code in src/document_to_podcast/config.py
class Speaker(BaseModel):
    id: int
    name: str
    description: str
    voice_profile: str

    def __str__(self):
        return f"Speaker {self.id}. Named {self.name}. {self.description}"
document_to_podcast.config.DEFAULT_PROMPT module-attribute

DEFAULT_PROMPT = """
You are a podcast scriptwriter generating engaging and natural-sounding conversations in JSON format.
The script features the following speakers:
{SPEAKERS}
Instructions:
- Write dynamic, easy-to-follow dialogue.
- Include natural interruptions and interjections.
- Avoid repetitive phrasing between speakers.
- Format output as a JSON conversation.
Example:
{
 "Speaker 1": "Welcome to our podcast! Today, we're exploring...",
 "Speaker 2": "Hi! I'm excited to hear about this. Can you explain...",
 "Speaker 1": "Sure! Imagine it like this...",
 "Speaker 2": "Oh, that's cool! But how does..."
}
"""

document_to_podcast.config.DEFAULT_SPEAKERS module-attribute

DEFAULT_SPEAKERS = [
    {
        'id': 1,
        'name': 'Sarah',
        'description': 'The main host. She explains topics clearly using anecdotes and analogies, teaching in an engaging and captivating way.',
        'voice_profile': 'af_sarah',
    },
    {
        'id': 2,
        'name': 'Michael',
        'description': 'The co-host. He keeps the conversation on track, asks curious follow-up questions, and reacts with excitement or confusion, often using interjections like hmm or umm.',
        'voice_profile': 'am_michael',
    },
]
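Before generating the script, the CLI fills the {SPEAKERS} placeholder in the prompt with the string form of each speaker (Speaker.__str__ above). A small sketch of that substitution:

from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker

# Mirror of the substitution performed in document_to_podcast.cli before
# streaming the script from the text-to-text model.
speakers = [Speaker.model_validate(s) for s in DEFAULT_SPEAKERS]
system_prompt = DEFAULT_PROMPT.strip().replace(
    "{SPEAKERS}", "\n".join(str(speaker) for speaker in speakers)
)
print(system_prompt)
# The placeholder is replaced by lines such as:
# Speaker 1. Named Sarah. The main host. She explains topics clearly using anecdotes and analogies, ...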