# Source code for tropt.recipe_hub.UAT

"""Universal Adversarial Triggers (UAT) for classifier evasion.

Adapts the UAT framework (Wallace et al., 2019) using GCG-style optimization
instead of the original HotFlip variant. Optimizes a single universal trigger
that causes misclassification across many inputs via batch sampling.

Reference: https://arxiv.org/abs/1908.07125
"""

# The docstring must be the first statement to become ``__doc__``; a module
# docstring is one of the few things allowed to precede a ``__future__`` import.
from __future__ import annotations

from typing import List, Optional

from tropt.common import OPTIMIZED_TRIGGER_PLACEHOLDER, Targets
from tropt.loss import MisclassCELoss
from tropt.model.huggingface.classifier import ClassifierHFModel
from tropt.optimizer import OptimizerResult
from tropt.optimizer.gcgplus_optimizer import GCGPlusOptimizer
from tropt.optimizer.utils.token_constraints import TokenConstraints
from tropt.optimizer.utils.token_initializers import get_printable_random_trigger
from tropt.tracker import BaseTracker

# Token constraints shared by the UAT recipe in this module: the same object is
# handed to the random trigger initializer and to the optimizer, so the initial
# trigger and every candidate replacement are drawn under identical settings.
_UAT_TOKEN_CONSTRAINTS = TokenConstraints(
    disallow_non_ascii=True,
    disallow_special_tokens=True,
)


def uat_classifier(
    templates: List[str],
    target_class_idx: int,
    model_name: Optional[str] = None,
    model_obj: Optional[ClassifierHFModel] = None,
    tracker: Optional[BaseTracker] = None,
    seed: int = 42,
    trigger_len: int = 5,
    num_steps: int = 500,
    template_batch_size: int = 5,
    n_candidates: int = 256,
) -> OptimizerResult:
    """Optimize one universal trigger against a classifier with GCG.

    A single trigger is optimized jointly over all ``templates`` so that the
    classifier's prediction is pushed toward ``target_class_idx`` (targeted
    attack).

    Args:
        templates: Input texts with ``{{OPTIMIZED_TRIGGER}}`` placeholder.
        target_class_idx: Class index to steer predictions toward; the same
            index is used as the target for every template.
        model_name: Name passed to ``ClassifierHFModel`` when ``model_obj``
            is not supplied.
        model_obj: Pre-loaded ClassifierHFModel. Takes precedence over
            ``model_name`` when given.
        tracker: Optional tracker forwarded to the optimizer.
        seed: Seed forwarded to the optimizer.
        trigger_len: Number of trigger tokens to optimize.
        num_steps: Number of optimization steps forwarded to the optimizer.
        template_batch_size: Templates sampled per optimization step
            (UAT-style).
        n_candidates: Candidate count forwarded to the optimizer.

    Returns:
        The ``OptimizerResult`` returned by ``GCGPlusOptimizer.optimize_trigger``.
    """
    # Prefer the caller's pre-loaded model; only build one when none was given.
    classifier = (
        model_obj
        if model_obj is not None
        else ClassifierHFModel(model_name=model_name)
    )

    targeted_loss = MisclassCELoss(targeted=True)

    # The starting trigger is drawn before the optimizer is constructed, under
    # the same token constraints the optimizer will use for replacements.
    start_trigger = get_printable_random_trigger(
        trigger_len=trigger_len,
        tokenizer=classifier.tokenizer,
        token_constraints=_UAT_TOKEN_CONSTRAINTS,
    )

    gcg = GCGPlusOptimizer(
        model=classifier,
        loss=targeted_loss,
        tracker=tracker,
        seed=seed,
        num_steps=num_steps,
        n_candidates=n_candidates,
        sample_topk=256,
        sample_n_replace=1,
        token_constraints=_UAT_TOKEN_CONSTRAINTS,
        use_retokenize=False,
        template_batch_size=template_batch_size,
    )

    # One target entry per template, all pointing at the same class.
    per_template_targets = Targets(
        target_class_idx=[target_class_idx] * len(templates)
    )
    return gcg.optimize_trigger(
        templates=templates,
        targets=per_template_targets,
        initial_trigger=start_trigger,
    )


def uat_prompt_injection(
    model_name: str = "meta-llama/Llama-Prompt-Guard-2-86M",
    n_samples: int = 50,
    trigger_len: int = 5,
    num_steps: int = 500,
    template_batch_size: int = 5,
    tracker: Optional[BaseTracker] = None,
    seed: int = 42,
) -> dict:
    """UAT for prompt-injection evasion on Llama Prompt Guard 2.

    Loads the classifier and the rogue-security/prompt-injections-benchmark
    dataset, optimizes a universal trigger to make injection prompts be
    classified as BENIGN, then evaluates on held-in, held-out, and benign
    splits.

    Args:
        model_name: Model name passed to ``ClassifierHFModel``.
        n_samples: Number of shuffled injection prompts used for optimization
            (the held-in split); the remainder forms the held-out split.
            Must be at least 1.
        trigger_len: Number of trigger tokens to optimize.
        num_steps: Number of optimization steps.
        template_batch_size: Templates sampled per optimization step.
        tracker: Optional tracker forwarded to ``uat_classifier``.
        seed: Seed for both the dataset shuffle and the optimizer.

    Returns:
        A dict with keys ``"result"`` (the optimization result), ``"trigger"``
        (the best trigger string) and four evaluation dicts:
        ``"eval_held_in"``, ``"eval_held_out"``, ``"eval_benign_with_trigger"``
        and ``"eval_benign_clean"``. Each evaluation dict holds ``"n_total"``,
        ``"n_predicted_benign"`` and ``"attack_success_rate"``; the rate is
        ``nan`` for an empty split.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1, or the dataset yields
            no rows labelled ``"jailbreak"``.
    """
    import random as stdlib_random

    import torch
    from datasets import load_dataset

    # Fail fast, before the (expensive) model and dataset loads. A non-positive
    # value would otherwise produce an empty or negatively-sliced held-in set.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    # --- Load model ---
    model = ClassifierHFModel(model_name=model_name)
    # Prompt Guard 2: 0=BENIGN, 1=MALICIOUS
    benign_class_idx = 0

    # --- Load dataset ---
    ds = load_dataset("rogue-security/prompt-injections-benchmark", split="test")
    injections = [row["text"] for row in ds if row["label"] == "jailbreak"]
    benign_texts = [row["text"] for row in ds if row["label"] == "benign"]

    # Shuffle with a private RNG so the split is reproducible for a given seed
    # without touching the global random state.
    rng = stdlib_random.Random(seed)
    rng.shuffle(injections)
    held_in = injections[:n_samples]
    held_out = injections[n_samples:]

    print(f"Dataset: {len(injections)} injections, {len(benign_texts)} benign")
    print(f"Held-in: {len(held_in)}, Held-out: {len(held_out)}")

    if not held_in:
        raise ValueError(
            "No rows labelled 'jailbreak' were found in the dataset; "
            "there is nothing to optimize the trigger on."
        )

    # Build templates: append trigger placeholder to each injection prompt
    templates = [text + " " + OPTIMIZED_TRIGGER_PLACEHOLDER for text in held_in]

    # --- Optimize ---
    result = uat_classifier(
        templates=templates,
        target_class_idx=benign_class_idx,
        model_obj=model,
        tracker=tracker,
        seed=seed,
        trigger_len=trigger_len,
        num_steps=num_steps,
        template_batch_size=template_batch_size,
    )

    assert result.best_trigger_str is not None
    trigger: str = result.best_trigger_str
    print(f"\nOptimized trigger: {trigger!r}")
    print(f"Best loss: {result.best_loss:.4f}")

    # --- Evaluate ---
    def evaluate_split(texts: list[str], append_trigger: bool) -> dict:
        """Classify ``texts`` and report how many are predicted benign."""
        # An empty split (e.g. held-out when n_samples covers every injection)
        # has no defined rate; skip the model call and avoid dividing by zero.
        if not texts:
            return {
                "n_total": 0,
                "n_predicted_benign": 0,
                "attack_success_rate": float("nan"),
            }

        if append_trigger:
            eval_texts = [t + " " + trigger for t in texts]
        else:
            eval_texts = texts

        with torch.no_grad():
            out = model.invoke_from_texts(eval_texts)
        preds = out.output_class_logits.argmax(dim=-1).cpu().tolist()
        n_benign = sum(1 for p in preds if p == benign_class_idx)
        return {
            "n_total": len(texts),
            "n_predicted_benign": n_benign,
            "attack_success_rate": n_benign / len(texts),
        }

    eval_held_in = evaluate_split(held_in, append_trigger=True)
    eval_held_out = evaluate_split(held_out, append_trigger=True)
    eval_benign = evaluate_split(benign_texts, append_trigger=True)
    eval_benign_clean = evaluate_split(benign_texts, append_trigger=False)

    print("\n--- Evaluation ---")
    print(f"Held-in (injections + trigger): {eval_held_in['attack_success_rate']:.1%} classified as benign")
    print(f"Held-out (injections + trigger): {eval_held_out['attack_success_rate']:.1%} classified as benign")
    print(f"Benign (+ trigger): {eval_benign['attack_success_rate']:.1%} classified as benign")
    print(f"Benign (clean, no trigger): {eval_benign_clean['attack_success_rate']:.1%} classified as benign")

    return {
        "result": result,
        "trigger": trigger,
        "eval_held_in": eval_held_in,
        "eval_held_out": eval_held_out,
        "eval_benign_with_trigger": eval_benign,
        "eval_benign_clean": eval_benign_clean,
    }