Skip to content

API Reference

structured_qa.preprocessing

document_to_sections_dir(input_file, output_dir)

Convert a document to a directory of sections.

Uses pymupdf4llm to convert input_file to markdown. Then uses langchain_text_splitters to split the markdown into sections based on the headers.

Parameters:

Name Type Description Default
input_file str

Path to the input document.

required
output_dir str

Path to the output directory. Structure of the output directory:

output_dir/
    section_1.txt
    section_2.txt
    ...
required

Returns:

Type Description
list[str]

List of section names.

Source code in src/structured_qa/preprocessing.py
@logger.catch(reraise=True)
def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
    """
    Split a document into per-section text files.

    The document is first turned into markdown with
    [pymupdf4llm](https://pypi.org/project/pymupdf4llm/), and the markdown is then
    cut at its `#`, `##` and `###` headers with
    [langchain_text_splitters](https://pypi.org/project/langchain-text-splitters/).

    Each chunk is named after the last value in its header metadata, lowercased.
    Chunks that carry no header metadata are skipped.

    Args:
        input_file: Path to the input document.
        output_dir: Path to the output directory. It is created (including
            parents) when missing.
            Structure of the output directory:

            ```
            output_dir/
                section_1.txt
                section_2.txt
                ...
            ```

    Returns:
        List of section names, in the order the sections were found.
        Note that the file name on disk has every `/` replaced by `_`,
        while the returned name keeps the `/`.
    """
    logger.info(f"Converting {input_file}")
    markdown = pymupdf4llm.to_markdown(input_file)
    logger.success("Converted")

    logger.info("Extracting sections")
    header_levels = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    chunks = MarkdownHeaderTextSplitter(
        headers_to_split_on=header_levels
    ).split_text(markdown)
    logger.success(f"Found {len(chunks)} sections")

    logger.info(f"Writing sections to {output_dir}")
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    names = []
    for chunk in chunks:
        headers = chunk.metadata
        if headers:
            # The last metadata value names the chunk
            # (presumably the deepest header it sits under).
            name = list(headers.values())[-1].lower()
            names.append(name)
            # A "/" in the header would be read as a path separator.
            file_name = name.replace("/", "_") + ".txt"
            (target_dir / file_name).write_text(chunk.page_content)
    logger.success("Done")

    return names

structured_qa.model_loaders

load_llama_cpp_model(model_id)

Loads the given model_id using Llama.from_pretrained.

Examples:

>>> model = load_llama_cpp_model("allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")

Parameters:

Name Type Description Default
model_id str

The model id to load. Format is expected to be {org}/{repo}/{filename}.

required

Returns:

Name Type Description
Llama Llama

The loaded model.

Source code in src/structured_qa/model_loaders.py
def load_llama_cpp_model(model_id: str) -> Llama:
    """
    Loads the given model_id using Llama.from_pretrained.

    Examples:
        >>> model = load_llama_cpp_model("allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")

    Args:
        model_id (str): The model id to load.
            Format is expected to be `{org}/{repo}/{filename}`.
            Everything after the second `/` is passed on unchanged as the
            filename, so it may itself contain `/`.

    Returns:
        Llama: The loaded model.

    Raises:
        ValueError: If `model_id` does not contain a non-empty org, repo
            and filename separated by `/`.
    """
    # maxsplit=2: only the first two slashes separate org and repo; any
    # further slash belongs to the filename part.
    parts = model_id.split("/", 2)
    if len(parts) != 3 or not all(parts):
        raise ValueError(
            f"Invalid model_id {model_id!r}: "
            "expected format is {org}/{repo}/{filename}"
        )
    org, repo, filename = parts

    model = Llama.from_pretrained(
        repo_id=f"{org}/{repo}",
        filename=filename,
        n_ctx=0,  # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
        verbose=False,
        # -1 asks for every layer on the GPU, 0 for none.
        n_gpu_layers=-1 if gpu_available() else 0,
    )
    return model

structured_qa.workflow

find_retrieve_answer(question, model, sections_dir, find_prompt, answer_prompt)

Workflow to find the relevant section, retrieve the information, and answer the question.

Parameters:

Name Type Description Default
question str

The question to answer.

required
model Llama

The Llama model to use for generating completions.

required
sections_dir str

The directory containing the sections. See document_to_sections_dir. Structure of the sections directory:

sections_dir/
    section_1.txt
    section_2.txt
    ...
required
find_prompt str

The prompt for finding the section.

See FIND_PROMPT.

required
answer_prompt str

The prompt for answering the question.

See ANSWER_PROMPT.

required

Returns:

Type Description
tuple[str, list[str]] | tuple[None, list[str]]

tuple[str, list[str]] | tuple[None, list[str]]:

If the answer is found, the tuple contains the answer and the sections checked. If the answer is not found, the tuple contains None and the sections checked.

Source code in src/structured_qa/workflow.py
def find_retrieve_answer(
    question: str,
    model: Llama,
    sections_dir: str,
    find_prompt: str,
    answer_prompt: str,
) -> tuple[str, list[str]] | tuple[None, list[str]]:
    """
    Workflow to find the relevant section, retrieve the information, and answer the question.

    The workflow alternates between two phases until it can return:

    1. *Find*: the model is shown the names of the sections not yet ruled out
       and must reply with exactly one of them.
    2. *Answer*: the model is shown the content of that section and must either
       answer the question or reply `I need more info.`, in which case the
       section is ruled out and the workflow goes back to phase 1.

    Args:
        question (str): The question to answer.
        model (Llama): The Llama model to use for generating completions.
        sections_dir (str): The directory containing the sections.
            See [`document_to_sections_dir`][structured_qa.preprocessing.document_to_sections_dir].
            Structure of the sections directory:

            ```
            sections_dir/
                section_1.txt
                section_2.txt
                ...
            ```
        find_prompt (str): The prompt for finding the section.
            It is formatted with a `SECTIONS` field.

            See [`FIND_PROMPT`][structured_qa.config.FIND_PROMPT].
        answer_prompt (str): The prompt for answering the question.
            It is formatted with a `CURRENT_INFO` field.

            See [`ANSWER_PROMPT`][structured_qa.config.ANSWER_PROMPT].

    Returns:
        tuple[str, list[str]] | tuple[None, list[str]]:

            If the answer is found, the tuple contains the answer and the sections checked.
            If the answer is not found (the model named an unknown section, or
            every section was ruled out), the tuple contains None and the sections checked.
    """
    sections_dir = Path(sections_dir)
    sections_names = [section.stem for section in sections_dir.glob("*.txt")]
    current_info = None
    current_section = None

    sections_checked = []
    while True:
        logger.debug(f"Current information available: {current_info}")
        # The phase is tracked through `current_section`, not the truthiness of
        # `current_info`: an empty section file would otherwise look like
        # "nothing retrieved yet" and the same section could be picked forever.
        finding_section = current_section is None
        if finding_section:
            if not sections_names:
                # Every section was ruled out (or none existed); asking the
                # model to choose from an empty list cannot succeed.
                logger.error("No sections left to check")
                return None, sections_checked
            logger.debug("Finding section")
            messages = [
                {
                    "role": "system",
                    "content": find_prompt.format(SECTIONS="\n".join(sections_names)),
                },
                {"role": "user", "content": question},
            ]
        else:
            logger.debug("Answering question")
            messages = [
                {
                    "role": "system",
                    "content": answer_prompt.format(CURRENT_INFO=current_info),
                },
                {"role": "user", "content": question},
            ]

        result = model.create_chat_completion(messages)
        result = result["choices"][0]["message"]["content"]

        logger.debug(f"Result: {result}")

        if finding_section:
            result = result.strip()
            logger.info(f"Retrieving section: {result}")
            if result not in sections_names:
                logger.error(f"Unknown section: {result}")
                return None, sections_checked
            current_section = result
            current_info = (sections_dir / f"{result}.txt").read_text()
            sections_checked.append(result)
            continue

        # Compare a normalized copy so that surrounding whitespace or an echoed
        # code fence does not turn the sentinel into a bogus final answer.
        # The reply itself is returned untouched when it is a real answer.
        if result.strip().strip("`").strip() == "I need more info.":
            sections_names.remove(current_section)
            current_info = None
            current_section = None
            continue

        return result, sections_checked

structured_qa.config.FIND_PROMPT = '\nYou are given two pieces of information:\n1. A user question.\n2. A list of valid section names.\n\nYour task is to:\n- Identify exactly one `section_name` from the provided list that seems related to the user question.\n- Return the `section_name` exactly as it appears in the list.\n- Do NOT return any additional text, explanation, or formatting.\n- Do NOT combine multiple section names into a single response.\n\nHere is the list of valid `section_names`:\n\n```\n{SECTIONS}\n```\n\nNow, based on the input question, return the single most relevant `section_name` from the list.\n' module-attribute

structured_qa.config.ANSWER_PROMPT = '\nYou are a rigorous assistant answering questions.\nYou only answer based on the current information available.\n\nThe current information available is:\n\n```\n{CURRENT_INFO}\n```\n\nIf the current information available not enough to answer the question,\nyou must return the following message and nothing else:\n\n```\nI need more info.\n```\n' module-attribute