# welcomecenterbot/utils/normalize.py
import logging
import re

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Load the tokenizer and the conditional-generation model for the
# "google/byt5-small" checkpoint once, at import time. Both objects are
# module-level globals that segment_text() reads on every call.
# NOTE(review): from_pretrained runs as a side effect of importing this
# module (and presumably fetches the checkpoint when it is not cached) --
# confirm that import-time loading is acceptable for callers.
# NOTE(review): the checkpoint name is a ByT5 one; verify that T5Tokenizer
# (rather than the ByT5-specific tokenizer) loads it as intended.
tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
2024-09-26 18:36:14 +00:00
def is_russian_wording(text: str) -> bool:
    """
    Report whether the text contains at least one Cyrillic character.

    A character counts as Cyrillic when its code point lies in the
    Unicode range U+0400 through U+04FF, both ends inclusive.

    Args:
        text: The string to inspect.

    Returns:
        True if any character of ``text`` falls inside that range,
        False otherwise (including for an empty string).
    """
    # any() short-circuits on the first match, just like an early return
    # from an explicit loop would.
    return any('\u0400' <= char <= '\u04FF' for char in text)
2024-09-26 20:38:05 +00:00
def segment_text(text):
    """
    Use a neural network model to segment text into words.

    The text is prefixed with the task tag ``"segment: "``, encoded with
    the module-level ``tokenizer``, run through the module-level
    ``model``'s ``generate`` method without gradient tracking, and the
    first generated sequence is decoded back into a string.

    Args:
        text: The raw string to segment.

    Returns:
        The decoded model output with special tokens stripped.
    """
    # Encode the input text for the model
    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Without an explicit budget, generate() falls back to the library's
    # small default length limit, which cuts the output short for all but
    # the briefest inputs. Allow up to twice the encoded input length so a
    # separator can be emitted between every input token.
    max_new_tokens = 2 * inputs.shape[-1]

    # Generate predictions; no gradients are needed for inference
    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens)

    # Decode the generated tokens back to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
2024-09-26 18:36:14 +00:00
# Matches any run of two or more consecutive space characters.
_SPACE_RUN = re.compile(r" {2,}")

# Latin letters and digits mapped to the Cyrillic letters that replace
# them. Every key is ASCII and every value is Cyrillic, so no replacement
# can feed into another and a single translate() pass is sufficient.
_LOOKALIKE_TABLE = str.maketrans({
    'e': 'е',
    'o': 'о',
    'x': 'х',
    'a': 'а',
    'r': 'г',
    'm': 'м',
    'u': 'и',
    'n': 'п',
    'p': 'р',
    't': 'т',
    'y': 'у',
    'h': 'н',
    'i': 'й',
    'c': 'с',
    'k': 'к',
    'b': 'в',
    '3': 'з',
    '4': 'ч',
    '0': 'о',
    'd': 'д',
    'z': 'з',
})


def normalize(text):
    """
    Segment, lower-case and (for Cyrillic text) de-obfuscate a string.

    Runs of spaces are collapsed to a single space, the result is passed
    through ``segment_text`` and lower-cased. If the lower-cased text
    contains at least one Cyrillic character (per ``is_russian_wording``),
    Latin letters and digits listed in the look-alike table are replaced
    with their Cyrillic counterparts; otherwise the text is returned
    without character replacement.

    Args:
        text: The raw input string.

    Returns:
        The segmented, lower-cased string, with look-alike characters
        replaced when the text contains Cyrillic.
    """
    # Collapse repeated spaces before handing the text to the model.
    collapsed = _SPACE_RUN.sub(" ", text)

    # Segment the text first, then normalize case.
    segmented_text = segment_text(collapsed).lower()

    # Text without any Cyrillic is left alone: substituting letters in
    # plain Latin text would only mangle it.
    if not is_russian_wording(segmented_text):
        return segmented_text

    # One pass over the string instead of one pass per replaced character.
    return segmented_text.translate(_LOOKALIKE_TABLE)
# Demo: when run as a script, normalize one sample sentence and print it.
if __name__ == "__main__":
    print(normalize("Hello, this is a test input."))