toxicity-detector
nlp/toxicity.py · 31 lines · Normal file
@@ -0,0 +1,31 @@
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Multi-label toxicity classifier for Russian text.
tiny_tox_model_path = 'cointegrated/rubert-tiny-toxicity'
tiny_tox_tokenizer = AutoTokenizer.from_pretrained(tiny_tox_model_path)
tiny_tox_model = AutoModelForSequenceClassification.from_pretrained(
    tiny_tox_model_path)


# Optionally move the model to the GPU:
# if torch.cuda.is_available():
#     tiny_tox_model.cuda()


def text2toxicity(text, aggregate=True):
    """Calculate the toxicity of a text (if aggregate=True)
    or a vector of toxicity aspects (if aggregate=False)."""
    with torch.no_grad():
        inputs = tiny_tox_tokenizer(
            # Accept a single string or a list of strings.
            text.lower() if isinstance(text, str) else [t.lower() for t in text],
            return_tensors='pt',
            truncation=True,
            padding=True
        ).to(tiny_tox_model.device)
        # One sigmoid per logit: an independent probability for each toxicity aspect.
        proba = torch.sigmoid(tiny_tox_model(**inputs).logits).cpu().numpy()
    if isinstance(text, str):
        # A single string is a batch of one; unwrap it.
        proba = proba[0]
    if aggregate:
        # Overall toxicity: 1 - P(non-toxic) * (1 - P(dangerous)).
        return 1 - proba.T[0] * (1 - proba.T[-1])
    return proba
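
For reference, a minimal usage sketch, not part of the commit; the `__main__` guard and the sample string are illustrative assumptions (the model itself targets Russian input):

if __name__ == '__main__':
    sample = 'Example sentence to score.'
    # Single aggregated toxicity score in [0, 1].
    print(text2toxicity(sample))
    # Per-aspect probability vector instead of one score.
    print(text2toxicity(sample, aggregate=False))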