import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tiny_tox_model_path = 'cointegrated/rubert-tiny-toxicity'
tiny_tox_tokenizer = AutoTokenizer.from_pretrained(tiny_tox_model_path)
tiny_tox_model = AutoModelForSequenceClassification.from_pretrained(
    tiny_tox_model_path)

# if torch.cuda.is_available():
#     tiny_tox_model.cuda()


def text2toxicity(text, aggregate=True):
    """Calculate the toxicity of a text (if aggregate=True)
    or a vector of toxicity aspects (if aggregate=False).
    Accepts a single string or a list of strings."""
    # lowercase a single string or each string in a batch
    if isinstance(text, str):
        text = text.lower()
    else:
        text = [t.lower() for t in text]
    with torch.no_grad():
        inputs = tiny_tox_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True
        ).to(tiny_tox_model.device)
        # independent per-aspect probabilities (multi-label head)
        proba = torch.sigmoid(tiny_tox_model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]
    if aggregate:
        # probability that the text is toxic (not non-toxic) or dangerous:
        # column 0 is the 'non-toxic' aspect, the last column is 'dangerous'
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba
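# A minimal usage sketch (an assumption, not part of the original listing):
# per the model card, the five aspects are non-toxic, insult, obscenity,
# threat, and dangerous; the example phrases below are placeholders.
print(text2toxicity('привет, как дела?'))  # "hi, how are you?" -> one aggregated score in [0, 1]
print(text2toxicity('привет, как дела?', aggregate=False))  # vector of five aspect probabilities
print(text2toxicity(['первый текст', 'второй текст']))  # batched input -> one score per text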