Bases: HuggingFace
Prompt injection detection encoder based model.
For more information, please see the model card:
Source code in src/any_guardrail/guardrails/harm_guard/harm_guard.py
| class HarmGuard(HuggingFace):
"""Prompt injection detection encoder based model.
For more information, please see the model card:
- [HarmGuard](https://huggingface.co/hbseong/HarmAug-Guard).
"""
SUPPORTED_MODELS: ClassVar = ["hbseong/HarmAug-Guard"]
def __init__(self, model_id: str | None = None, threshold: float = HARMGUARD_DEFAULT_THRESHOLD) -> None:
"""Initialize the HarmGuard guardrail."""
super().__init__(model_id)
self.threshold = threshold
def _post_processing(self, model_outputs: dict[str, Any]) -> GuardrailOutput:
logits = model_outputs["logits"][0].numpy()
scores = _softmax(logits) # type: ignore[no-untyped-call]
final_score = float(scores[1])
return GuardrailOutput(valid=final_score < self.threshold, score=final_score)
|
__init__(model_id=None, threshold=HARMGUARD_DEFAULT_THRESHOLD)
Initialize the HarmGuard guardrail.
Source code in src/any_guardrail/guardrails/harm_guard/harm_guard.py
| def __init__(self, model_id: str | None = None, threshold: float = HARMGUARD_DEFAULT_THRESHOLD) -> None:
"""Initialize the HarmGuard guardrail."""
super().__init__(model_id)
self.threshold = threshold
|