"""
title: Minos Refusal Classifier Metric
author: your-name
funding_url: https://github.com/open-webui
version: 0.0.1
license: MIT
changelog:
- 0.0.1 - Initial version: emits refusal classification as a chat metric after each assistant response.
"""
import asyncio
from typing import Optional, Callable, Any, Awaitable

import torch
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from open_webui.utils.misc import get_last_assistant_message, get_last_user_message

# Load the Minos tokenizer and sequence classifier a single time, at import,
# so that every classification call reuses the same objects.
_MINOS_CHECKPOINT = "NousResearch/Minos-v1"
_MINOS_ID2LABEL = {0: "Non-refusal", 1: "Refusal"}

tokenizer = AutoTokenizer.from_pretrained(_MINOS_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    _MINOS_CHECKPOINT,
    num_labels=len(_MINOS_ID2LABEL),
    id2label=_MINOS_ID2LABEL,
    # Inverse of the table above: label name -> class index.
    label2id={name: index for index, name in _MINOS_ID2LABEL.items()},
)


class Filter:
    """Filter that reports a Minos refusal classification for each reply.

    ``outlet`` takes the last user message and the last assistant message
    from the request body, classifies that exchange with the module-level
    ``model``/``tokenizer`` pair, and emits the predicted label together
    with its probability as a ``status`` event. The body itself is never
    modified.
    """

    class Valves(BaseModel):
        # Settings exposed by the filter; ``show_refusal`` turns the metric
        # on or off, ``priority`` is not read anywhere in this class.
        priority: int = Field(
            default=5, description="Priority level for the filter operations."
        )
        show_refusal: bool = Field(
            default=True,
            description="Show Minos refusal classification metric",
        )

    def __init__(self):
        self.valves = self.Valves()

    def _encode(self, user_message: str, assistant_message: str):
        """Tokenize one user/assistant exchange into model inputs."""
        text = f"<|user|>\n{user_message}\n<|assistant|>\n{assistant_message}"
        # truncation=True caps the sequence at the tokenizer's configured
        # maximum length instead of feeding an over-long input to the model.
        # Truncation drops tokens from the end, so a very long user message
        # can crowd out part of the assistant reply.
        return tokenizer(text, return_tensors="pt", truncation=True)

    def _predict(self, inputs):
        """Run the classifier on tokenized inputs.

        Returns:
            A ``(label, confidence)`` pair: the ``id2label`` name of the
            highest-probability class and that class's softmax probability.
        """
        with torch.no_grad():
            outputs = model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Resolve the winning class index once and reuse it for both lookups.
        predicted_id = int(torch.argmax(probabilities, dim=-1).item())
        confidence = probabilities[0][predicted_id].item()
        label = model.config.id2label[predicted_id]
        return label, confidence

    def classify_refusal(self, user_message: str, assistant_message: str):
        """Classify a single exchange as refusal or non-refusal.

        Args:
            user_message: Text of the user's prompt.
            assistant_message: Text of the assistant's reply.

        Returns:
            A ``(label, confidence)`` pair, where ``label`` comes from
            ``model.config.id2label`` and ``confidence`` is the softmax
            probability of that label as a float.
        """
        return self._predict(self._encode(user_message, assistant_message))

    def inlet(self, body: dict) -> dict:
        """Return the request body unchanged; this filter only acts on output."""
        return body

    async def outlet(
        self,
        body: dict,
        __event_emitter__: Callable[[Any], Awaitable[None]],
        __model__: Optional[dict] = None,
    ) -> dict:
        """Emit the refusal metric for the latest exchange and return ``body``.

        Args:
            body: Chat payload; its ``"messages"`` list is read, not changed.
            __event_emitter__: Awaitable callback that receives the status
                event dictionary.
            __model__: Unused; accepted for interface compatibility.

        Returns:
            The same ``body`` object that was passed in.
        """
        if not self.valves.show_refusal:
            return body

        # A body without messages has nothing to classify; skip it rather
        # than failing with a KeyError.
        messages = body.get("messages")
        if not messages:
            return body

        assistant_message = get_last_assistant_message(messages)
        user_message = get_last_user_message(messages)
        if not assistant_message or not user_message:
            return body

        # Tokenize on the calling thread, then push only the forward pass to
        # the default executor so the event loop is not blocked while the
        # model runs.
        inputs = self._encode(user_message, assistant_message)
        loop = asyncio.get_running_loop()
        label, confidence = await loop.run_in_executor(
            None, self._predict, inputs
        )

        metric = f"Refusal: {label} ({confidence:.2%})"
        await __event_emitter__(
            {
                "type": "status",
                "data": {
                    "description": metric,
                    "done": True,
                },
            }
        )
        return body