2024-09-26 20:38:05 +00:00
|
|
|
|
import torch
|
2024-09-27 07:15:18 +00:00
|
|
|
|
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
|
2024-09-26 20:38:05 +00:00
|
|
|
|
|
2024-09-27 07:10:40 +00:00
|
|
|
|
# The tokenizer and the seq2seq model are both loaded from the same
# ByT5 checkpoint, so the checkpoint id is spelled out once.
_BYT5_CHECKPOINT = "google/byt5-small"
tokenizer = ByT5Tokenizer.from_pretrained(_BYT5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(_BYT5_CHECKPOINT)
|
|
|
|
|
|
2024-09-27 06:23:55 +00:00
|
|
|
|
|
2024-09-26 18:36:14 +00:00
|
|
|
|
def is_russian_wording(text):
    """
    Report whether at least two characters of ``text`` are Cyrillic.

    A character counts as Cyrillic when it lies in the Unicode range
    U+0400..U+04FF. Returns True as soon as a second such character is
    found, and False when the text holds zero or one of them.
    """
    seen_one = False
    for symbol in text:
        if not ("\u0400" <= symbol <= "\u04ff"):
            continue  # not in the Cyrillic range, ignore
        if seen_one:
            # This is the second Cyrillic character: threshold reached.
            return True
        seen_one = True
    return False
|
|
|
|
|
|
2024-09-27 06:23:55 +00:00
|
|
|
|
|
2024-09-26 20:38:05 +00:00
|
|
|
|
def segment_text(text, max_new_tokens=None):
    """
    Use a neural network model to segment text into words.

    The text is prefixed with ``"segment: "``, encoded with the module-level
    ``tokenizer``, passed to ``model.generate`` and the first generated
    sequence is decoded back to a string.

    Args:
        text: The string to segment.
        max_new_tokens: Upper bound on the number of generated tokens.
            When ``None`` (the default) the budget is twice the encoded
            input length, which leaves room for a separator after every
            input token.

    Returns:
        The decoded model output with special tokens removed.
    """
    # Encode the prompt; the result is a tensor whose last dimension is the
    # number of input tokens.
    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Without an explicit limit, generate() falls back to the generation
    # config's default length cap (20 tokens unless the checkpoint overrides
    # it). ByT5 works on bytes, so that cap would cut the output off after
    # a handful of characters.
    if max_new_tokens is None:
        max_new_tokens = 2 * inputs.shape[-1]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens)

    # Decode the first (and only) generated sequence back to text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
|
|
|
|
2024-09-27 06:23:55 +00:00
|
|
|
|
|
2024-09-26 18:36:14 +00:00
|
|
|
|
# Latin letters and digits, and the Cyrillic letter each one is replaced
# with. No replacement is itself a key, so a single translate() pass gives
# the same result as applying the substitutions one after another.
_LATIN_TO_CYRILLIC = str.maketrans({
    "e": "е",
    "o": "о",
    "x": "х",
    "a": "а",
    "r": "р",
    "m": "м",
    "u": "и",
    "n": "н",
    "p": "п",
    "t": "т",
    "y": "у",
    "h": "х",
    "i": "и",
    "c": "с",
    "k": "к",
    "b": "в",
    "3": "з",
    "4": "ч",
    "0": "о",
    "d": "д",
    "z": "з",
})


def normalize(text):
    """
    Segment and lowercase ``text``, then replace Latin characters with Cyrillic ones.

    The text is passed through ``segment_text`` and lowercased. If the
    result contains more than one Cyrillic character (as decided by
    ``is_russian_wording``), every Latin letter or digit listed in
    ``_LATIN_TO_CYRILLIC`` is replaced by its Cyrillic counterpart.
    Otherwise the lowercased segmentation is returned unchanged.

    Args:
        text: The input string.

    Returns:
        The segmented, lowercased and (for Russian-looking text)
        character-substituted string.
    """
    # NOTE(review): as the file reads here, each of these calls replaces a
    # space with a space and so does nothing. Presumably the originals
    # targeted other whitespace (double spaces, non-breaking spaces) -
    # confirm against the author's copy.
    cleaned = text.replace(" ", " ").replace(" ", " ").replace(" ", " ")

    # Segment the text first, then lowercase so the table only needs
    # lowercase keys.
    t = segment_text(cleaned).lower()

    if not is_russian_wording(t):
        return t

    # Swap all mapped characters in one pass.
    return t.translate(_LATIN_TO_CYRILLIC)
|
2024-09-26 20:38:05 +00:00
|
|
|
|
|
2024-09-27 06:23:55 +00:00
|
|
|
|
|
2024-09-26 20:38:05 +00:00
|
|
|
|
# Demo: when run as a script, normalize one sample phrase and print it.
if __name__ == "__main__":
    sample = "привет шп ана т у п а я"
    print(normalize(sample))
|