ruffed
nlp/normalize.py (Normal file, 85 lines added)
@@ -0,0 +1,85 @@
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Initialize the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

def is_russian_wording(text):
    """
    Check if the text contains any Russian characters by checking
    each character against the Unicode range for Cyrillic.
    """
    for char in text:
        if "\u0400" <= char <= "\u04ff":  # Unicode range for Cyrillic characters
            return True
    return False

def segment_text(text):
    """
    Use a neural network model to segment text into words.
    """
    # Encode the input text for the model
    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Generate predictions
    # Note: no max_new_tokens is set, so generate() falls back to the library's
    # default generation length limit, which can truncate longer inputs.
    with torch.no_grad():
        outputs = model.generate(inputs)

    # Decode the generated tokens back to text
    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return segmented_text

def normalize(text):
    """
    Normalize text by converting Latin look-alike characters to their
    Cyrillic counterparts when the text already contains Cyrillic.
    """
    # Segment the text first
    segmented_text = segment_text(
        text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
    )

    # Normalize after segmentation
    segmented_text = segmented_text.lower()

    if is_russian_wording(segmented_text):
        # Normalize the text by replacing characters
        normalized_text = (
            segmented_text.replace("e", "е")
            .replace("o", "о")
            .replace("x", "х")
            .replace("a", "а")
            .replace("r", "г")
            .replace("m", "м")
            .replace("u", "и")
            .replace("n", "п")
            .replace("p", "р")
            .replace("t", "т")
            .replace("y", "у")
            .replace("h", "н")
            .replace("i", "й")
            .replace("c", "с")
            .replace("k", "к")
            .replace("b", "в")
            .replace("3", "з")
            .replace("4", "ч")
            .replace("0", "о")
            .replace("d", "д")
            .replace("z", "з")
        )

        return normalized_text

    return segmented_text

# Example usage
if __name__ == "__main__":
    input_text = "Hello, this is a test input."

    normalized_output = normalize(input_text)
    print(normalized_output)
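The .replace() chain in normalize() above is a fixed Latin-to-Cyrillic homoglyph map. As a side note, not part of this commit, the same mapping can be written as a translation table; the sketch below only restates the substitutions already listed above:

# Equivalent, table-driven form of the substitution chain in normalize().
HOMOGLYPHS = str.maketrans({
    "e": "е", "o": "о", "x": "х", "a": "а", "r": "г", "m": "м", "u": "и",
    "n": "п", "p": "р", "t": "т", "y": "у", "h": "н", "i": "й", "c": "с",
    "k": "к", "b": "в", "3": "з", "4": "ч", "0": "о", "d": "д", "z": "з",
})

# "npubet" is a Latin look-alike spelling of the Russian word "привет".
assert "npubet".translate(HOMOGLYPHS) == "привет"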
@@ -3,17 +3,22 @@ import torch
import torch.nn.functional as F

# Load tokenizer and model weights
-tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
-model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
+tokenizer = BertTokenizer.from_pretrained(
+    "SkolkovoInstitute/russian_toxicity_classifier"
+)
+model = BertForSequenceClassification.from_pretrained(
+    "SkolkovoInstitute/russian_toxicity_classifier"
+)


def detector(text):
    # Prepare the input
-    batch = tokenizer.encode(text, return_tensors='pt')
+    batch = tokenizer.encode(text, return_tensors="pt")

    # Inference
    with torch.no_grad():
        result = model(batch)

    # Get logits
    logits = result.logits
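The lines between this hunk and the next sit outside the diff context; they presumably turn the logits into class probabilities before the return statement below. A minimal self-contained sketch of that step, assuming a standard two-class softmax with index 1 as the toxic class (the same assumption stated in the code's own comment further down):

import torch
import torch.nn.functional as F

# Sketch only: the actual intermediate lines are not shown in this diff.
logits = torch.tensor([[-1.0, 2.0]])            # placeholder logits of shape (1, 2)
probabilities = F.softmax(logits, dim=1)        # each row sums to 1
toxic_probability = probabilities[0][1].item()  # index 1 assumed to be the toxic class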
@@ -22,9 +27,11 @@ def detector(text):
    return probabilities[0][1].item()


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        p = detector(sys.argv[1])
        toxicity_percentage = p * 100  # Assuming index 1 is for toxic class
        print(f"Toxicity Probability: {toxicity_percentage:.2f}%")
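Taken together, the two files form a small pipeline: normalize() rewrites Latin look-alike characters as Cyrillic, and detector() scores the cleaned text for toxicity. A minimal glue sketch, not part of the commit; it assumes nlp/ is importable as a package, and the module name for detector() is hypothetical since the second file's name is not shown in this diff:

# Hypothetical wiring of the two pieces in this commit.
from nlp.normalize import normalize
from toxicity_classifier import detector  # hypothetical module name for the second file

raw = "пpивeт"            # "привет" written with a mix of Latin and Cyrillic letters
cleaned = normalize(raw)   # Latin look-alikes mapped to Cyrillic
score = detector(cleaned)  # probability of the toxic class
print(f"Toxicity Probability: {score * 100:.2f}%")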