Skip to content

API Reference

structured_qa.preprocessing

document_to_sections_dir(input_file, output_dir)

Convert a document to a directory of sections.

Uses pymupdf4llm to convert input_file to markdown. Then uses split_markdown_by_headings to split the markdown into sections based on the headers.

Parameters:

Name Type Description Default
input_file str

Path to the input document.

required
output_dir str

Path to the output directory. Structure of the output directory:

output_dir/
    section_1.txt
    section_2.txt
    ...
required

Returns:

Type Description
list[str]

List of section names.

Source code in src/structured_qa/preprocessing.py
@logger.catch(reraise=True)
def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
    """
    Convert a document to a directory of sections.

    Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) to convert input_file to markdown.
    Then uses [`split_markdown_by_headings`][structured_qa.preprocessing.split_markdown_by_headings] to split the markdown into sections based on the headers.

    Args:
        input_file: Path to the input document.
        output_dir: Path to the output directory.
            Structure of the output directory:

            ```
            output_dir/
                section_1.txt
                section_2.txt
                ...
            ```

    Returns:
        List of section names.
    """

    logger.info(f"Converting {input_file}")
    md_text = pymupdf4llm.to_markdown(input_file)
    logger.success("Converted")

    logger.info("Extracting sections")
    sections = split_markdown_by_headings(
        md_text,
    )
    logger.success(f"Found {len(sections)} sections")

    logger.info(f"Writing sections to {output_dir}")
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True, parents=True)

    for section_name, section_content in sections.items():
        # "/" in a heading would be treated as a path separator, so sanitize it.
        (output_path / f"{section_name.replace('/', '_')}.txt").write_text(
            section_content
        )
    logger.success("Done")

    # Materialize as a list to honor the annotated list[str] return type
    # (a dict_keys view would leak mutability/ordering details of the dict).
    return list(sections)

split_markdown_by_headings(markdown_text, heading_patterns=None)

Splits a markdown document into sections based on specified heading patterns.

Parameters:

Name Type Description Default
markdown_text str

The markdown document as a single string.

required
heading_patterns list[str] | None

A list of regex patterns representing heading markers in the markdown document. Defaults to None. If None, the default patterns are used.

None

Returns:

Type Description
dict[str, str]

dict[str, str]: A dictionary where the keys are the section names and the values are the section contents.

Source code in src/structured_qa/preprocessing.py
def split_markdown_by_headings(
    markdown_text, heading_patterns: list[str] | None = None
) -> dict[str, str]:
    """Splits a markdown document into sections based on specified heading patterns.

    Args:
        markdown_text (str): The markdown document as a single string.
        heading_patterns (str, optional): A list of regex patterns representing heading markers
            in the markdown document.
            Defaults to None.
            If None, the default patterns are used.

    Returns:
        dict[str, str]: A dictionary where the keys are the section names and the values are the section contents.
    """
    if heading_patterns is None:
        heading_patterns = [
            r"^#\s+(.+)$",
            r"^##\s+(.+)$",
            r"^###\s+(.+)$",
            r"^####\s+(.+)$",
            r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
        ]

    sections = defaultdict(str)

    heading_text = "INTRO"
    for line in markdown_text.splitlines():
        line = line.strip()
        if not line:
            continue
        for pattern in heading_patterns:
            match = re.match(pattern, line)
            if match:
                heading_text = match.group(1)[:100]
                break
        sections[heading_text] += f"{line}\n"

    return sections

structured_qa.model_loaders

load_llama_cpp_model(model_id)

Loads the given model_id using Llama.from_pretrained.

Examples:

>>> model = load_llama_cpp_model("allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")

Parameters:

Name Type Description Default
model_id str

The model id to load. Format is expected to be {org}/{repo}/{filename}.

required

Returns:

Name Type Description
Llama LlamaModel

The loaded model.

Source code in src/structured_qa/model_loaders.py
def load_llama_cpp_model(model_id: str) -> LlamaModel:
    """
    Loads the given model_id using Llama.from_pretrained.

    Examples:
        >>> model = load_llama_cpp_model("allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")

    Args:
        model_id (str): The model id to load.
            Format is expected to be `{org}/{repo}/{filename}`.

    Raises:
        ValueError: If model_id does not contain exactly three "/"-separated parts.

    Returns:
        Llama: The loaded model.
    """
    # Imported lazily so llama_cpp is only required when this loader is used.
    from llama_cpp import Llama

    org, repo, filename = model_id.split("/")
    model = Llama.from_pretrained(
        repo_id=f"{org}/{repo}",
        filename=filename,
        n_ctx=0,  # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
        verbose=False,
        # Offload all layers to the GPU when one is available; CPU-only otherwise.
        n_gpu_layers=-1 if gpu_available() else 0,
    )
    return LlamaModel(model=model)

structured_qa.workflow

find_retrieve_answer(question, model, sections_dir, find_prompt, answer_prompt, max_sections_to_check=None)

Workflow to find the relevant section, retrieve the information, and answer the question.

Parameters:

Name Type Description Default
question str

The question to answer.

required
model LlamaModel

The model to use for generating completions.

required
sections_dir str

The directory containing the sections. See document_to_sections_dir. Structure of the sections directory:

sections_dir/
    section_1.txt
    section_2.txt
    ...
required
find_prompt str

The prompt for finding the section.

See FIND_PROMPT.

required
answer_prompt str

The prompt for answering the question.

See ANSWER_PROMPT.

required
max_sections_to_check int

The maximum number of sections to check before giving up. Defaults to None. If None, it will check up to a maximum of 20 sections until it finds the answer.

None

Returns:

Type Description
tuple[str, list[str]] | tuple[None, list[str]]

tuple[str, list[str]] | tuple[None, list[str]]:

If the answer is found, the tuple contains the answer and the sections checked. If the answer is not found, the tuple contains None and the sections checked.

Source code in src/structured_qa/workflow.py
def find_retrieve_answer(
    question: str,
    model: LlamaModel,
    sections_dir: str,
    find_prompt: str,
    answer_prompt: str,
    max_sections_to_check: int | None = None,
) -> tuple[str, list[str]] | tuple[None, list[str]]:
    """
    Workflow to find the relevant section, retrieve the information, and answer the question.

    Args:
        question (str): The question to answer.
        model (LlamaModel): The model to use for generating completions.
        sections_dir (str): The directory containing the sections.
            See [`document_to_sections_dir`][structured_qa.preprocessing.document_to_sections_dir].
            Structure of the sections directory:

            ```
            sections_dir/
                section_1.txt
                section_2.txt
                ...
            ```
        find_prompt (str): The prompt for finding the section.

            See [`FIND_PROMPT`][structured_qa.config.FIND_PROMPT].
        answer_prompt (str): The prompt for answering the question.

            See [`ANSWER_PROMPT`][structured_qa.config.ANSWER_PROMPT].
        max_sections_to_check (int, optional): The maximum number of sections to check before giving up.
            Defaults to None.
            If None, it will check up to a maximum of 20 sections until it finds the answer.

    Returns:
        tuple[str, list[str]] | tuple[None, list[str]]:

            If the answer is found, the tuple contains the answer and the sections checked.
            If the answer is not found, the tuple contains None and the sections checked.
    """
    sections_dir = Path(sections_dir)
    sections_names = [section.stem for section in sections_dir.glob("*.txt")]
    current_info = None
    current_section = None

    if max_sections_to_check is None:
        max_sections_to_check = min(20, len(sections_names))

    sections_checked = []
    # NOTE(review): `<=` allows the answer step to run after the final section
    # retrieval, but it also permits retrieving one section beyond the stated
    # maximum — confirm whether that off-by-one is intended.
    while len(sections_checked) <= max_sections_to_check:
        logger.debug(f"Current information available: {current_info}")
        if not current_info:
            # No section content loaded yet: ask the model to pick a section.
            logger.debug("Finding section")
            finding_section = True
            # Only the text before the first "?" is used to locate a section.
            question_part = question.split("?")[0]
            messages = [
                {
                    "role": "system",
                    "content": find_prompt.format(SECTIONS="\n".join(sections_names)),
                },
                {"role": "user", "content": question_part},
            ]
        else:
            # A section is loaded: ask the model to answer from it.
            logger.debug("Answering question")
            finding_section = False
            messages = [
                {
                    "role": "system",
                    "content": answer_prompt.format(CURRENT_INFO=current_info),
                },
                {"role": "user", "content": question},
            ]

        try:
            response = model.get_response(messages)
        except Exception as e:
            logger.error(f"Failed to generate completion: {e}")
            return "Generation Error", sections_checked

        if finding_section:
            response = response.strip()
            if not sections_names:
                # Every candidate section has been exhausted.
                return "NOT FOUND", sections_checked
            section_name = get_matching_section(response, sections_names)
            logger.debug(f"Retrieving section: {section_name}")
            section_content = (sections_dir / f"{section_name}.txt").read_text()
            current_section = section_name
            current_info = section_content
            sections_checked.append(section_name)

        else:
            if "MORE INFO" in response.upper():
                # The model could not answer from this section: drop it and retry.
                current_info = None
                sections_names.remove(current_section)
                continue
            else:
                return response, sections_checked

    return "NOT FOUND", sections_checked

get_matching_section(response, section_names)

Use string similarity to find the most similar section_name.

Source code in src/structured_qa/workflow.py
def get_matching_section(response, section_names):
    """
    Use string similarity to find the most similar section_name.
    """
    best_match, *_rest = process.extractOne(response, section_names)
    return best_match

structured_qa.config.FIND_PROMPT = '\nYou are given two pieces of information:\n1. A list of valid section names.\n2. A user question.\n\nYour task is to:\n- Identify exactly one `section_name` from the provided list that seems related to the user question.\n- Return the `section_name` exactly as it appears in the list.\n- Do NOT answer the question.\n- Do NOT return any additional text, explanation, or formatting.\n- Do NOT combine multiple section names into a single response.\n\nHere is the list of valid section names:\n\n```\n{SECTIONS}\n```\n\nNow, based on the following question, return the single most relevant `section_name` from the list.\n' module-attribute

structured_qa.config.ANSWER_PROMPT = '\nYou are a rigorous assistant answering questions.\nYou must only answer based on the current information available which is:\n\n```\n{CURRENT_INFO}\n```\n\nIf the current information available not enough to answer the question,\nyou must return "I need more info" and nothing else.\n' module-attribute