welcomecenterbot/nlp/toxicity.py

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Small multilabel toxicity classifier for Russian text.
tiny_tox_model_path = 'cointegrated/rubert-tiny-toxicity'
tiny_tox_tokenizer = AutoTokenizer.from_pretrained(tiny_tox_model_path)
tiny_tox_model = AutoModelForSequenceClassification.from_pretrained(
    tiny_tox_model_path
)

# if torch.cuda.is_available():
#     tiny_tox_model.cuda()


def text2toxicity(text, aggregate=True):
    """Calculate the toxicity of a text (if aggregate=True)
    or a vector of toxicity aspects (if aggregate=False)."""
    with torch.no_grad():
        inputs = tiny_tox_tokenizer(
            text.lower(),
            return_tensors='pt',
            truncation=True,
            padding=True
        ).to(tiny_tox_model.device)
        # Multilabel head: apply a sigmoid per class rather than a softmax.
        proba = torch.sigmoid(tiny_tox_model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        # A single string produces a batch of one; unwrap it.
        proba = proba[0]
    if aggregate:
        # Overall toxicity: 1 - P(non-toxic) * P(not dangerous).
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba
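
# A minimal usage sketch, not part of the original module: the sample strings
# and the printed format below are illustrative assumptions, not values taken
# from the bot's configuration.
if __name__ == '__main__':
    samples = [
        'Добрый день, рады вас видеть!',  # "Good afternoon, glad to see you!"
        'Ты полный идиот',                # "You are a complete idiot"
    ]
    for sample in samples:
        # Aggregated score: a single number in [0, 1].
        score = text2toxicity(sample, aggregate=True)
        print(f'{sample!r}: toxicity={score:.3f}')
        # Per-aspect probabilities are available with aggregate=False.
        print('aspects:', text2toxicity(sample, aggregate=False))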