Skip to content

API Reference

document_to_podcast.preprocessing.data_cleaners

clean_html(text)

Clean HTML text.

This function removes
  • scripts
  • styles
  • links
  • meta tags

In addition, it calls clean_with_regex.

Examples:

>>> clean_html("<html><body><p>Hello,  world!  </p></body></html>")
"Hello, world!"

Parameters:

Name Type Description Default
text str

The HTML text to clean.

required

Returns:

Name Type Description
str str

The cleaned text.

Source code in src/document_to_podcast/preprocessing/data_cleaners.py
def clean_html(text: str) -> str:
    """Strip the markup from HTML text and return normalized plain text.

    Before the text is extracted, these tags are dropped from the parsed
    document:
        - scripts
        - styles
        - links
        - meta tags

    The extracted text is then passed through
    [clean_with_regex][document_to_podcast.preprocessing.data_cleaners.clean_with_regex].

    Examples:
        >>> clean_html("<html><body><p>Hello,  world!  </p></body></html>")
        "Hello, world!"

    Args:
        text (str): The HTML text to clean.

    Returns:
        str: The cleaned text.
    """
    document = BeautifulSoup(text, "html.parser")
    # Collect every unwanted tag first, then remove each one from the tree.
    unwanted_tags = document.find_all(["script", "style", "link", "meta"])
    for element in unwanted_tags:
        element.decompose()
    return clean_with_regex(document.get_text())

clean_markdown(text)

Clean Markdown text.

This function removes
  • markdown images

In addition, it calls clean_with_regex.

Examples:

>>> clean_markdown('# Title   with image ![alt text](image.jpg "Image Title")')
"Title with image"

Parameters:

Name Type Description Default
text str

The Markdown text to clean.

required

Returns:

Name Type Description
str str

The cleaned text.

Source code in src/document_to_podcast/preprocessing/data_cleaners.py
def clean_markdown(text: str) -> str:
    """Remove Markdown image references and return normalized text.

    This function removes:
        - markdown images (`![alt](target "optional title")`)

    The remaining text is then passed through
    [clean_with_regex][document_to_podcast.preprocessing.data_cleaners.clean_with_regex].

    Examples:
        >>> clean_markdown('# Title   with image ![alt text](image.jpg "Image Title")')
        "Title with image"

    Args:
        text (str): The Markdown text to clean.

    Returns:
        str: The cleaned text.
    """
    # Matches `![alt](target)` with an optional double-quoted title.
    image_pattern = r'!\[.*?\]\(.*?(".*?")?\)'
    without_images = re.sub(image_pattern, "", text)
    return clean_with_regex(without_images)

clean_with_regex(text)

Clean text using regular expressions.

This function removes
  • URLs
  • emails
  • special characters
  • extra spaces

Examples:

>>> clean_with_regex(" Hello,   world! http://example.com")
"Hello, world!"

Parameters:

Name Type Description Default
text str

The text to clean.

required

Returns:

Name Type Description
str str

The cleaned text.

Source code in src/document_to_podcast/preprocessing/data_cleaners.py
def clean_with_regex(text: str) -> str:
    """
    Normalize free text with a fixed sequence of regular expressions.

    This function removes, in order:
        - URLs (`http://` and `https://`)
        - emails
        - special characters (anything that is not an ASCII letter, a digit,
          whitespace or one of `.,!?;:"'`)
        - extra spaces (whitespace runs collapse to one space; ends are trimmed)

    Examples:
        >>> clean_with_regex("\xa0Hello,   world! http://example.com")
        "Hello, world!"

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Order matters: URLs and emails must go before the special-character
    # pass strips the punctuation that makes them recognizable.
    removal_patterns = (
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        r"[\w\.-]+@[\w\.-]+\.[\w]+",
        r'[^a-zA-Z0-9\s.,!?;:"\']',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)

    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

document_to_podcast.inference.model_loaders

load_llama_cpp_model(model_id)

Loads the given model_id using Llama.from_pretrained.

Examples:

>>> model = load_llama_cpp_model(
    "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")

Parameters:

Name Type Description Default
model_id str

The model id to load. Format is expected to be {org}/{repo}/{filename}.

required

Returns:

Name Type Description
Llama Llama

The loaded model.

Source code in src/document_to_podcast/inference/model_loaders.py
def load_llama_cpp_model(
    model_id: str,
) -> Llama:
    """
    Loads the given model_id using Llama.from_pretrained.

    Examples:
        >>> model = load_llama_cpp_model(
            "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")

    Args:
        model_id (str): The model id to load.
            Format is expected to be `{org}/{repo}/{filename}`. Everything
            after the second `/` is treated as the filename, so it may
            itself contain `/` (e.g. a file inside a repo subfolder).

    Returns:
        Llama: The loaded model.

    Raises:
        ValueError: If `model_id` does not contain a non-empty org, repo and
            filename.
    """
    # maxsplit=2 keeps any further "/" inside the filename instead of
    # failing with an opaque "too many values to unpack" error.
    parts = model_id.split("/", 2)
    if len(parts) != 3 or not all(parts):
        raise ValueError(
            f"Invalid model_id {model_id!r}: "
            "expected format is '{org}/{repo}/{filename}'."
        )
    org, repo, filename = parts
    return Llama.from_pretrained(
        repo_id=f"{org}/{repo}",
        filename=filename,
        # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
        n_ctx=0,
    )

load_parler_tts_model_and_tokenizer(model_id, device='cpu')

Loads the given model_id using parler_tts.from_pretrained.

Examples:

>>> model, tokenizer = load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")

Parameters:

Name Type Description Default
model_id str

The model id to load. Format is expected to be {org}/{repo}, such as "parler-tts/parler-tts-mini-v1".

required
device str

The device to load the model on, such as "cuda:0" or "cpu".

'cpu'

Returns:

Name Type Description
PreTrainedModel Tuple[PreTrainedModel, PreTrainedTokenizerBase]

The loaded model and its tokenizer.

Source code in src/document_to_podcast/inference/model_loaders.py
def load_parler_tts_model_and_tokenizer(
    model_id: str, device: str = "cpu"
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
    """
    Loads a ParlerTTS model and its matching tokenizer for the given model_id.

    Examples:
        >>> model, tokenizer = load_parler_tts_model_and_tokenizer("parler-tts/parler-tts-mini-v1", "cpu")

    Args:
        model_id (str): The model id to load,
            such as `parler-tts/parler-tts-mini-v1`.
        device (str): The device to load the model on, such as "cuda:0" or "cpu".

    Returns:
        Tuple[PreTrainedModel, PreTrainedTokenizerBase]: The loaded model
            (already moved to `device`) and the tokenizer loaded from the
            same `model_id`.
    """
    tts_model = ParlerTTSForConditionalGeneration.from_pretrained(model_id)
    tts_model = tts_model.to(device)
    text_tokenizer = AutoTokenizer.from_pretrained(model_id)
    return tts_model, text_tokenizer

document_to_podcast.inference.text_to_text

text_to_text(input_text, model, system_prompt, return_json=True, stop=None)

Transforms input_text using the given model and system prompt.

Parameters:

Name Type Description Default
input_text str

The text to be transformed.

required
model Llama

The model to use for conversion.

required
system_prompt str

The system prompt to use for conversion.

required
return_json bool

Whether to return the response as JSON. Defaults to True.

True
stop str | list[str] | None

The stop token(s).

None

Returns:

Name Type Description
str str

The full transformed text.

Source code in src/document_to_podcast/inference/text_to_text.py
def text_to_text(
    input_text: str,
    model: Llama,
    system_prompt: str,
    return_json: bool = True,
    stop: str | list[str] | None = None,
) -> str:
    """
    Runs a single, non-streaming chat completion over input_text and returns its content.

    Args:
        input_text (str): The text to be transformed.
        model (Llama): The model to use for conversion.
        system_prompt (str): The system prompt to use for conversion.
        return_json (bool, optional): Whether to return the response as JSON.
            Defaults to True.
        stop (str | list[str] | None, optional): The stop token(s).

    Returns:
        str: The full transformed text (message content of the first choice).
    """
    completion = chat_completion(
        input_text, model, system_prompt, return_json, stop=stop, stream=False
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

text_to_text_stream(input_text, model, system_prompt, return_json=True, stop=None)

Transforms input_text using the given model and system prompt.

Parameters:

Name Type Description Default
input_text str

The text to be transformed.

required
model Llama

The model to use for conversion.

required
system_prompt str

The system prompt to use for conversion.

required
return_json bool

Whether to return the response as JSON. Defaults to True.

True
stop str | list[str] | None

The stop token(s).

None

Yields:

Name Type Description
str str

Chunks of the transformed text as they are available.

Source code in src/document_to_podcast/inference/text_to_text.py
def text_to_text_stream(
    input_text: str,
    model: Llama,
    system_prompt: str,
    return_json: bool = True,
    stop: str | list[str] | None = None,
) -> Iterator[str]:
    """
    Streams the transformation of input_text using the given model and system prompt.

    Args:
        input_text (str): The text to be transformed.
        model (Llama): The model to use for conversion.
        system_prompt (str): The system prompt to use for conversion.
        return_json (bool, optional): Whether to return the response as JSON.
            Defaults to True.
        stop (str | list[str] | None, optional): The stop token(s).

    Yields:
        str: Chunks of the transformed text as they are available. Chunks
            without a non-empty `delta.content` are skipped.
    """
    chunks = chat_completion(
        input_text, model, system_prompt, return_json, stop=stop, stream=True
    )
    for chunk in chunks:
        # Some chunks carry no "delta" or no "content"; skip those and empty strings.
        content = chunk["choices"][0].get("delta", {}).get("content", None)
        if content:
            yield content

document_to_podcast.inference.text_to_speech

text_to_speech(input_text, model, tokenizer, speaker_profile)

Generates a speech waveform using the input_text, a model and a speaker profile to define a distinct voice pattern.

Examples:

>>> waveform = text_to_speech(input_text="Welcome to our amazing podcast", model=model, tokenizer=tokenizer, speaker_profile="Laura's voice is exciting and fast in delivery with very clear audio and no background noise.")

Parameters:

Name Type Description Default
input_text str

The text to convert to speech.

required
model PreTrainedModel

The model used for generating the waveform.

required
tokenizer PreTrainedTokenizerBase

The tokenizer used for tokenizing the text in order to send to the model.

required
speaker_profile str

A description used by the ParlerTTS model to configure the speaker profile.

required

Returns: numpy array: The waveform of the speech as a 2D numpy array

Source code in src/document_to_podcast/inference/text_to_speech.py
def text_to_speech(
    input_text: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizerBase,
    speaker_profile: str,
) -> np.ndarray:
    """
    Generates a speech waveform from input_text, dispatching on the model's name.

    Only models whose `config.name_or_path` contains "parler" are supported.

    Examples:
        >>> waveform = text_to_speech(input_text="Welcome to our amazing podcast", model=model, tokenizer=tokenizer, speaker_profile="Laura's voice is exciting and fast in delivery with very clear audio and no background noise.")

    Args:
        input_text (str): The text to convert to speech.
        model (PreTrainedModel): The model used for generating the waveform.
        tokenizer (PreTrainedTokenizerBase): The tokenizer used for tokenizing the text in order to send to the model.
        speaker_profile (str): A description used by the ParlerTTS model to configure the speaker profile.

    Returns:
        np.ndarray: The waveform of the generated speech.

    Raises:
        NotImplementedError: If the model is not a supported TTS model.
    """
    model_id = model.config.name_or_path
    # Guard clause: reject anything that is not a ParlerTTS checkpoint.
    if "parler" not in model_id:
        raise NotImplementedError(f"Model {model_id} not yet implemented for TTS")
    return _speech_generation_parler(input_text, model, tokenizer, speaker_profile)

document_to_podcast.podcast_maker.script_to_audio

parse_script_to_waveform(script, podcast_config)

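Given a script with speaker identifiers (such as "Speaker 1"), parse it so that each speaker has its own unique voice, and concatenate all the voices in sequence to form the complete podcast.

Args: script: the podcast script text containing the speaker identifiers. podcast_config: the configuration holding the voice settings of each speaker.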

Returns: A 2D numpy array containing the whole podcast in waveform format.

Source code in src/document_to_podcast/podcast_maker/script_to_audio.py
def parse_script_to_waveform(
    script: str, podcast_config: PodcastConfig
) -> np.ndarray:
    """
    Given a script with speaker identifiers (such as "Speaker 1") parse it so that each speaker has its own unique
    voice and concatenate all the voices in a sequence to form the complete podcast.

    The script is split on the literal "Speaker " marker. In each resulting
    part, the text before the first ":" is the speaker id (used as the key
    into `podcast_config.speakers`) and the text after it is what gets spoken.
    Double quotes are stripped from each part, and parts without a ":" are
    ignored.

    Args:
        script: The podcast script, e.g. 'Speaker 1: Hello! Speaker 2: Hi.'
        podcast_config: Configuration whose `speakers` mapping provides the
            model, tokenizer and speaker description for every speaker id
            that appears in the script.

    Returns: A numpy array containing the whole podcast in waveform format.

    Raises:
        ValueError: If the script contains no "Speaker <id>:" segment.
        KeyError: If a speaker id from the script is missing in
            `podcast_config.speakers`.
    """
    speaker_waveforms = []
    for part in script.split("Speaker "):
        if ":" not in part:
            continue
        # Split on the FIRST colon only: the spoken text may itself contain
        # colons (e.g. "It is 10:30"), which must not break the parsing.
        speaker_id, _, speaker_text = part.replace('"', "").partition(":")
        speaker = podcast_config.speakers[speaker_id]
        speaker_waveforms.append(
            text_to_speech(
                speaker_text,
                speaker.model,
                speaker.tokenizer,
                speaker.speaker_description,
            )
        )

    if not speaker_waveforms:
        # np.concatenate([]) would fail with an unrelated-looking message.
        raise ValueError(
            'No "Speaker <id>:" segments found in script; nothing to synthesize.'
        )
    return np.concatenate(speaker_waveforms)

save_waveform_as_file(waveform, sampling_rate, filename)

Save the output of the TTS (a numpy waveform) to a .wav file using the soundfile library.

Parameters:

Name Type Description Default
waveform ndarray

2D numpy array of a waveform

required
sampling_rate int

Usually 44100 Hz (44.1 kHz), but check the specifications of the TTS model you are using.

required
filename str

the destination filename to save the audio

required
Source code in src/document_to_podcast/podcast_maker/script_to_audio.py
def save_waveform_as_file(
    waveform: np.ndarray, sampling_rate: int, filename: str
) -> None:
    """
    Write a TTS waveform (a numpy array) to an audio file using the soundfile library.

    Args:
        waveform: numpy array of a waveform, as produced by the TTS step
        sampling_rate: Usually 44100 Hz, but check the specifications of the TTS model you are using.
        filename: the destination filename to save the audio (e.g. a .wav file)

    """
    sf.write(file=filename, data=waveform, samplerate=sampling_rate)

document_to_podcast.podcast_maker.config

PodcastConfig

Bases: BaseModel

Pydantic model that stores configuration of all the speakers for the TTS model. This allows different speakers to use different models and configurations.

Source code in src/document_to_podcast/podcast_maker/config.py
class PodcastConfig(BaseModel):
    """
    Pydantic model that stores the configuration of all the speakers for the TTS model.

    Holding one `SpeakerConfig` per speaker allows different speakers to use
    different models and configurations.
    """

    # Maps a speaker id (a string key) to that speaker's configuration.
    speakers: Dict[str, SpeakerConfig]
    # Audio sampling rate in samples per second; defaults to 44100.
    # NOTE(review): presumably this is the rate used when saving the podcast — confirm against callers.
    sampling_rate: int = 44_100

SpeakerConfig

Bases: BaseModel

Pydantic model that stores configuration of an individual speaker for the TTS model.

Source code in src/document_to_podcast/podcast_maker/config.py
class SpeakerConfig(BaseModel):
    """
    Pydantic model describing a single speaker: the TTS model to use plus its optional, model-specific settings.
    """

    # Lets pydantic accept the model/tokenizer classes below as field types.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    model: PreTrainedModel
    speaker_id: str

    # ParlerTTS specific configuration
    tokenizer: Optional[PreTrainedTokenizerBase] = None
    # This description is used by the ParlerTTS model to configure the speaker profile
    speaker_description: Optional[str] = None