# welcomecenterbot/nlp/normalize.py

import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration

# The ByT5 tokenizer and the T5 seq2seq model are both loaded from the same
# pretrained checkpoint, once, when this module is imported.
_BYT5_CHECKPOINT = "google/byt5-small"
tokenizer = ByT5Tokenizer.from_pretrained(_BYT5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(_BYT5_CHECKPOINT)


def is_russian_wording(text):
    """
    Tell whether ``text`` holds at least two Cyrillic characters.

    A character counts as Cyrillic when it falls inside the Unicode range
    U+0400..U+04FF. Returns True once a second such character is found,
    otherwise False.
    """
    # Lazily yields one marker per Cyrillic character, so the scan stops as
    # soon as the second marker has been pulled.
    cyrillic_hits = (True for symbol in text if "\u0400" <= symbol <= "\u04ff")
    found_first = next(cyrillic_hits, False)
    return found_first and next(cyrillic_hits, False)


def segment_text(text):
    """
    Use a neural network model to segment text into words.

    The text is prefixed with ``"segment: "``, encoded with the module-level
    ``tokenizer``, run through ``model.generate`` without gradient tracking,
    and the first generated sequence is decoded back to a string with
    special tokens stripped.

    NOTE(review): the module loads the stock ``google/byt5-small``
    checkpoint; confirm it has actually been fine-tuned for the
    ``"segment: "`` prompt, otherwise the output may not be a segmentation.

    Args:
        text: The string to segment.

    Returns:
        The decoded model output as a string.
    """
    # Encode the prompt for the model; the result is a (1, sequence) tensor.
    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Without an explicit limit, generate() stops at the library's default
    # generation length (20 tokens per the Transformers docs), which cuts the
    # result off after a handful of characters. Segmenting only inserts
    # separators, so twice the encoded input length is a generous budget.
    token_budget = 2 * inputs.shape[-1]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=token_budget)

    # Decode the generated tokens back to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Latin letters and digits mapped to the Cyrillic letter substituted for each.
# Every target is Cyrillic, so applying the whole table in a single pass gives
# the same result as replacing the characters one after another.
_LATIN_TO_CYRILLIC = str.maketrans(
    {
        "e": "е",
        "o": "о",
        "x": "х",
        "a": "а",
        "r": "р",
        "m": "м",
        "u": "и",
        "n": "н",
        "p": "п",
        "t": "т",
        "y": "у",
        "h": "х",
        "i": "и",
        "c": "с",
        "k": "к",
        "b": "в",
        "3": "з",
        "4": "ч",
        "0": "о",
        "d": "д",
        "z": "з",
    }
)


def normalize(text):
    """
    Segment and lowercase ``text``, replacing Latin stand-ins in Russian text.

    Runs of spaces are collapsed to a single space, the result is passed
    through ``segment_text`` and lowercased. If the outcome contains at least
    two Cyrillic characters (see ``is_russian_wording``), the Latin letters
    and digits listed in ``_LATIN_TO_CYRILLIC`` are replaced by their
    Cyrillic counterparts; otherwise the lowercased text is returned as is.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    # Collapse every run of spaces, however long; a fixed number of
    # replace() passes would leave double spaces behind in long runs.
    while "  " in text:
        text = text.replace("  ", " ")

    # Segment the text first, then fold case.
    t = segment_text(text).lower()

    if is_russian_wording(t):
        # Only text already recognised as Russian gets the substitutions.
        return t.translate(_LATIN_TO_CYRILLIC)

    return t


# Demo: normalize a sample phrase when this file is run as a script.
if __name__ == "__main__":
    sample_phrase = "привет шп  ана т у п а я"
    print(normalize(sample_phrase))
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								import torch
-												normlizer-fix2

											
										
										
											2024-09-27 07:15:18 +00:00
+								from transformers import ByT5Tokenizer, T5ForConditionalGeneration
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
-												normlizer-fix

											
										
										
											2024-09-27 07:10:40 +00:00
+								# Use ByT5 for the ByT5 model
 								tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-												normlizer-fix2

											
										
										
											2024-09-27 07:15:18 +00:00
+								model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												toxic-debug15

											
										
										
											2024-09-26 18:36:14 +00:00
+								def is_russian_wording(text):
 								    """
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								    Check if the text contains more than one Russian character by checking
-												toxic-debug15

											
										
										
											2024-09-26 18:36:14 +00:00
+								    each character against the Unicode range for Cyrillic.
 								    """
-												dockerfile-fix

											
										
										
											2024-09-27 06:32:25 +00:00
+								    counter = 0
-												toxic-debug15

											
										
										
											2024-09-26 18:36:14 +00:00
+								    for char in text:
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
+								        if "\u0400" <= char <= "\u04ff":  # Unicode range for Cyrillic characters
-												dockerfile-fix

											
										
										
											2024-09-27 06:32:25 +00:00
+								            counter += 1
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								            if counter > 1:
 								                return True
-												toxic-debug15

											
										
										
											2024-09-26 18:36:14 +00:00
+								    return False
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								def segment_text(text):
 								    """
 								    Use a neural network model to segment text into words.
 								    """
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								    # Encode the input text for the model as UTF-8 bytes
 								    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								    # Generate predictions
 								    with torch.no_grad():
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								        outputs = model.generate(inputs)
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								    # Decode the generated tokens back to text
 								    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								    return segmented_text
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												toxic-debug15

											
										
										
											2024-09-26 18:36:14 +00:00
+								def normalize(text):
 								    """
 								    Normalize English text to resemble Russian characters.
 								    """
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								    # Segment the text first
-												normlizer-fix5

											
										
										
											2024-09-27 08:18:18 +00:00
+								    t = segment_text(text.replace("  ", " ").replace("  ", " ").replace("  ", " "))
 								    t = t.lower()
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												dockerfile-fix

											
										
										
											2024-09-27 06:32:25 +00:00
+								    if is_russian_wording(t):
-												toxic-debug15

											
										
										
											2024-09-26 18:36:14 +00:00
+								        # Normalize the text by replacing characters
-												normlizer-fix5

											
										
										
											2024-09-27 08:18:18 +00:00
+								        normalized_text = (t
-												dockerfile-fix

											
										
										
											2024-09-27 06:32:25 +00:00
+								            .replace("e", "е")
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
+								            .replace("o", "о")
 								            .replace("x", "х")
 								            .replace("a", "а")
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								            .replace("r", "р")
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
+								            .replace("m", "м")
 								            .replace("u", "и")
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								            .replace("n", "н")
 								            .replace("p", "п")
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
+								            .replace("t", "т")
 								            .replace("y", "у")
-												normlizer-fix4

											
										
										
											2024-09-27 07:57:20 +00:00
+								            .replace("h", "х")
 								            .replace("i", "и")
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
+								            .replace("c", "с")
 								            .replace("k", "к")
 								            .replace("b", "в")
 								            .replace("3", "з")
 								            .replace("4", "ч")
 								            .replace("0", "о")
 								            .replace("d", "д")
 								            .replace("z", "з")
 								        )
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								        return normalized_text
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												dockerfile-fix

											
										
										
											2024-09-27 06:32:25 +00:00
+								    return t
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								# Example usage
 								if __name__ == "__main__":
-												normlizer-fix2

											
										
										
											2024-09-27 07:15:18 +00:00
+								    input_text = "привет шп  ана т у п а я"
-												ruffed

											
										
										
											2024-09-27 06:23:55 +00:00
-												add-byt5

											
										
										
											2024-09-26 20:38:05 +00:00
+								    normalized_output = normalize(input_text)
 								    print(normalized_output)