Skip to content

Harm Guard

any_guardrail.guardrails.harm_guard.harm_guard

HarmGuard

Bases: StandardGuardrail

Safety and jailbreak detection model based on DeBERTa-v3-large.

HarmAug-Guard classifies the safety of LLM conversations and detects jailbreak attempts. It can evaluate either a single prompt or a prompt + response pair.

For more information, please see the model card:

- [HarmAug-Guard](https://huggingface.co/hbseong/HarmAug-Guard).
Source code in src/any_guardrail/guardrails/harm_guard/harm_guard.py
class HarmGuard(StandardGuardrail):
    """DeBERTa-v3-large based guardrail for safety and jailbreak detection.

    Wraps the HarmAug-Guard classifier, which scores LLM conversations for
    harmful content and jailbreak attempts. Either a lone prompt or a
    prompt + response pair can be evaluated.

    For more information, please see the model card:

    - [HarmAug-Guard](https://huggingface.co/hbseong/HarmAug-Guard).
    """

    SUPPORTED_MODELS: ClassVar = ["hbseong/HarmAug-Guard"]

    def __init__(
        self,
        model_id: str | None = None,
        threshold: float = HARMGUARD_DEFAULT_THRESHOLD,
        provider: StandardProvider | None = None,
    ) -> None:
        """Resolve the model id, pick a provider, and load the model."""
        self.model_id = default(model_id, self.SUPPORTED_MODELS)
        self.threshold = threshold
        # Fall back to the HuggingFace provider when none is supplied.
        self.provider = provider if provider else HuggingFaceProvider()
        self.provider.load_model(self.model_id)

    def validate(self, input_text: str, output_text: str | None = None) -> BinaryScoreOutput:  # type: ignore[override]
        """Classify the safety of a prompt, optionally paired with a response.

        Args:
            input_text: The prompt/input text to evaluate.
            output_text: Optional response/output text. When provided, the
                response is judged in the context of the prompt.

        Returns:
            GuardrailOutput with valid=True if safe, valid=False if harmful.
            The score is the unsafe probability (0.0 = safe, 1.0 = unsafe).

        """
        preprocessed = self._pre_processing(input_text, output_text)
        return self._post_processing(self._inference(preprocessed))

    def _pre_processing(self, input_text: str, output_text: str | None = None) -> StandardPreprocessOutput:
        """Tokenize the prompt, pairing it with the response when one is given.

        A (prompt, response) text pair lets the tokenizer build the two-segment
        input the model expects for response-safety evaluation.
        """
        segments = (input_text,) if output_text is None else (input_text, output_text)
        encoded = self.provider.tokenizer(*segments, return_tensors="pt")  # type: ignore[attr-defined]
        return GuardrailPreprocessOutput(data=encoded)

    def _inference(self, model_inputs: StandardPreprocessOutput) -> StandardInferenceOutput:
        """Run the loaded model on the tokenized inputs."""
        return self.provider.infer(model_inputs)

    def _post_processing(self, model_outputs: StandardInferenceOutput) -> BinaryScoreOutput:
        """Turn raw logits into a binary safety verdict with an unsafe score."""
        probabilities = _softmax(model_outputs.data["logits"][0].numpy())  # type: ignore[no-untyped-call]
        unsafe_probability = float(probabilities[1])  # index 1 holds the unsafe class
        return GuardrailOutput(valid=unsafe_probability < self.threshold, score=unsafe_probability)
__init__(model_id=None, threshold=HARMGUARD_DEFAULT_THRESHOLD, provider=None)

Initialize the HarmGuard guardrail.

Source code in src/any_guardrail/guardrails/harm_guard/harm_guard.py
def __init__(
    self,
    model_id: str | None = None,
    threshold: float = HARMGUARD_DEFAULT_THRESHOLD,
    provider: StandardProvider | None = None,
) -> None:
    """Resolve the model id, pick a provider, and load the model."""
    self.model_id = default(model_id, self.SUPPORTED_MODELS)
    self.threshold = threshold
    # Fall back to the HuggingFace provider when none is supplied.
    self.provider = provider if provider else HuggingFaceProvider()
    self.provider.load_model(self.model_id)
validate(input_text, output_text=None)

Validate whether the input (and optionally output) text is safe.

Parameters:

Name Type Description Default
input_text str

The prompt/input text to evaluate.

required
output_text str | None

Optional response/output text. When provided, evaluates the safety of the response in context of the input.

None

Returns:

Type Description
BinaryScoreOutput

GuardrailOutput with valid=True if safe, valid=False if harmful.

BinaryScoreOutput

The score represents the unsafe probability (0.0 = safe, 1.0 = unsafe).

Source code in src/any_guardrail/guardrails/harm_guard/harm_guard.py
def validate(self, input_text: str, output_text: str | None = None) -> BinaryScoreOutput:  # type: ignore[override]
    """Classify the safety of a prompt, optionally paired with a response.

    Args:
        input_text: The prompt/input text to evaluate.
        output_text: Optional response/output text. When provided, the
            response is judged in the context of the prompt.

    Returns:
        GuardrailOutput with valid=True if safe, valid=False if harmful.
        The score is the unsafe probability (0.0 = safe, 1.0 = unsafe).

    """
    preprocessed = self._pre_processing(input_text, output_text)
    return self._post_processing(self._inference(preprocessed))