import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tiny_tox_model_path = 'cointegrated/rubert-tiny-toxicity'
tiny_tox_tokenizer = AutoTokenizer.from_pretrained(tiny_tox_model_path)
tiny_tox_model = AutoModelForSequenceClassification.from_pretrained(
    tiny_tox_model_path)

# if torch.cuda.is_available():
#     tiny_tox_model.cuda()


def text2toxicity(text, aggregate=True):
    """Calculate the toxicity of a text (if aggregate=True)
    or a vector of toxicity aspects (if aggregate=False).
    Accepts a single string or a list of strings."""
    # lowercase a single string or each string in a batch
    if isinstance(text, str):
        text = text.lower()
    else:
        text = [t.lower() for t in text]
    with torch.no_grad():
        inputs = tiny_tox_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True
        ).to(tiny_tox_model.device)
        # independent per-aspect probabilities (multi-label head)
        proba = torch.sigmoid(tiny_tox_model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        proba = proba[0]
    if aggregate:
        # probability that the text is toxic (not non-toxic) or dangerous:
        # column 0 is the 'non-toxic' aspect, the last column is 'dangerous'
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba
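# A minimal usage sketch (an assumption, not part of the original listing):
# per the model card, the five aspects are non-toxic, insult, obscenity,
# threat, and dangerous; the example phrases below are placeholders.
print(text2toxicity('привет, как дела?'))  # "hi, how are you?" -> one aggregated score in [0, 1]
print(text2toxicity('привет, как дела?', aggregate=False))  # vector of five aspect probabilities
print(text2toxicity(['первый текст', 'второй текст']))  # batched input -> one score per text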