ruffed
nlp/normalize.py (Normal file, 85 lines added)
@@ -0,0 +1,85 @@
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Initialize the T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

def is_russian_wording(text):
    """
    Check if the text contains any Russian characters by checking
    each character against the Unicode range for Cyrillic.
    """
    for char in text:
        if "\u0400" <= char <= "\u04ff":  # Unicode range for Cyrillic characters
            return True
    return False

def segment_text(text):
    """
    Use a neural network model to segment text into words.
    """
    # Encode the input text for the model
    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Generate predictions
    # Note: no max_new_tokens is set, so generate() falls back to the library's
    # default generation length limit, which can truncate longer inputs.
    with torch.no_grad():
        outputs = model.generate(inputs)

    # Decode the generated tokens back to text
    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return segmented_text

def normalize(text):
    """
    Normalize text by converting Latin look-alike characters to their
    Cyrillic counterparts when the text already contains Cyrillic.
    """
    # Segment the text first
    segmented_text = segment_text(
        text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
    )

    # Normalize after segmentation
    segmented_text = segmented_text.lower()

    if is_russian_wording(segmented_text):
        # Normalize the text by replacing characters
        normalized_text = (
            segmented_text.replace("e", "е")
            .replace("o", "о")
            .replace("x", "х")
            .replace("a", "а")
            .replace("r", "г")
            .replace("m", "м")
            .replace("u", "и")
            .replace("n", "п")
            .replace("p", "р")
            .replace("t", "т")
            .replace("y", "у")
            .replace("h", "н")
            .replace("i", "й")
            .replace("c", "с")
            .replace("k", "к")
            .replace("b", "в")
            .replace("3", "з")
            .replace("4", "ч")
            .replace("0", "о")
            .replace("d", "д")
            .replace("z", "з")
        )

        return normalized_text

    return segmented_text

# Example usage
if __name__ == "__main__":
    input_text = "Hello, this is a test input."

    normalized_output = normalize(input_text)
    print(normalized_output)
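The .replace() chain in normalize() above is a fixed Latin-to-Cyrillic homoglyph map. As a side note, not part of this commit, the same mapping can be written as a translation table; the sketch below only restates the substitutions already listed above:

# Equivalent, table-driven form of the substitution chain in normalize().
HOMOGLYPHS = str.maketrans({
    "e": "е", "o": "о", "x": "х", "a": "а", "r": "г", "m": "м", "u": "и",
    "n": "п", "p": "р", "t": "т", "y": "у", "h": "н", "i": "й", "c": "с",
    "k": "к", "b": "в", "3": "з", "4": "ч", "0": "о", "d": "д", "z": "з",
})

# "npubet" is a Latin look-alike spelling of the Russian word "привет".
assert "npubet".translate(HOMOGLYPHS) == "привет"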
@@ -3,17 +3,22 @@ import torch
import torch.nn.functional as F

# Load tokenizer and model weights
-tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
-model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
+tokenizer = BertTokenizer.from_pretrained(
+    "SkolkovoInstitute/russian_toxicity_classifier"
+)
+model = BertForSequenceClassification.from_pretrained(
+    "SkolkovoInstitute/russian_toxicity_classifier"
+)


def detector(text):
    # Prepare the input
-    batch = tokenizer.encode(text, return_tensors='pt')
+    batch = tokenizer.encode(text, return_tensors="pt")

    # Inference
    with torch.no_grad():
        result = model(batch)

    # Get logits
    logits = result.logits
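The lines between this hunk and the next sit outside the diff context; they presumably turn the logits into class probabilities before the return statement below. A minimal self-contained sketch of that step, assuming a standard two-class softmax with index 1 as the toxic class (the same assumption stated in the code's own comment further down):

import torch
import torch.nn.functional as F

# Sketch only: the actual intermediate lines are not shown in this diff.
logits = torch.tensor([[-1.0, 2.0]])            # placeholder logits of shape (1, 2)
probabilities = F.softmax(logits, dim=1)        # each row sums to 1
toxic_probability = probabilities[0][1].item()  # index 1 assumed to be the toxic class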
@@ -22,9 +27,11 @@ def detector(text):
    return probabilities[0][1].item()


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        p = detector(sys.argv[1])
        toxicity_percentage = p * 100  # Assuming index 1 is for toxic class
        print(f"Toxicity Probability: {toxicity_percentage:.2f}%")
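Taken together, the two files form a small pipeline: normalize() rewrites Latin look-alike characters as Cyrillic, and detector() scores the cleaned text for toxicity. A minimal glue sketch, not part of the commit; it assumes nlp/ is importable as a package, and the module name for detector() is hypothetical since the second file's name is not shown in this diff:

# Hypothetical wiring of the two pieces in this commit.
from nlp.normalize import normalize
from toxicity_classifier import detector  # hypothetical module name for the second file

raw = "пpивeт"            # "привет" written with a mix of Latin and Cyrillic letters
cleaned = normalize(raw)   # Latin look-alikes mapped to Cyrillic
score = detector(cleaned)  # probability of the toxic class
print(f"Toxicity Probability: {score * 100:.2f}%")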