welcomecenterbot/nlp/toxicity_detector.py
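"""Toxicity detection for Russian text.

Loads the SkolkovoInstitute/russian_toxicity_classifier BERT model and exposes
detector(text), which returns the probability that the given text is toxic.
"""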

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F

# Load tokenizer and model weights
tokenizer = BertTokenizer.from_pretrained(
    "SkolkovoInstitute/russian_toxicity_classifier"
)
model = BertForSequenceClassification.from_pretrained(
    "SkolkovoInstitute/russian_toxicity_classifier"
)


def detector(text):
    """Return the probability that the given text is toxic."""
    # Prepare the input as a batch of token ids
    batch = tokenizer.encode(text, return_tensors="pt")

    # Inference
    with torch.no_grad():
        result = model(batch)

    # Get logits
    logits = result.logits
    # Convert logits to probabilities using softmax
    probabilities = F.softmax(logits, dim=1)
    # Index 1 corresponds to the toxic class
    return probabilities[0][1].item()


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        p = detector(sys.argv[1])
        toxicity_percentage = p * 100
        print(f"Toxicity Probability: {toxicity_percentage:.2f}%")