normalizer-fix4

This commit is contained in:
2024-09-27 10:57:20 +03:00
parent e2761ee3d8
commit e241e14764

View File

@@ -8,7 +8,7 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def is_russian_wording(text): def is_russian_wording(text):
""" """
Check if the text contains more than one Russian characters by checking Check if the text contains more than one Russian character by checking
each character against the Unicode range for Cyrillic. each character against the Unicode range for Cyrillic.
""" """
counter = 0 counter = 0
@@ -24,13 +24,12 @@ def segment_text(text):
""" """
Use a neural network model to segment text into words. Use a neural network model to segment text into words.
""" """
# Encode the input text for the model # Encode the input text for the model as UTF-8 bytes
# inputs = tokenizer.encode("segment: " + text, return_tensors="pt") inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
inputs = tokenizer("segment: " + text, return_tensors="pt")
# Generate predictions # Generate predictions
with torch.no_grad(): with torch.no_grad():
outputs = model.generate(**inputs) outputs = model.generate(inputs)
# Decode the generated tokens back to text # Decode the generated tokens back to text
segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True) segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -43,9 +42,7 @@ def normalize(text):
Normalize English text to resemble Russian characters. Normalize English text to resemble Russian characters.
""" """
# Segment the text first # Segment the text first
t = segment_text( t = segment_text(text.replace(" ", " "))
text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
)
if is_russian_wording(t): if is_russian_wording(t):
# Normalize the text by replacing characters # Normalize the text by replacing characters
@@ -55,15 +52,15 @@ def normalize(text):
.replace("o", "о") .replace("o", "о")
.replace("x", "х") .replace("x", "х")
.replace("a", "а") .replace("a", "а")
.replace("r", "г") .replace("r", "р")
.replace("m", "м") .replace("m", "м")
.replace("u", "и") .replace("u", "и")
.replace("n", "п") .replace("n", "н")
.replace("p", "р") .replace("p", "п")
.replace("t", "т") .replace("t", "т")
.replace("y", "у") .replace("y", "у")
.replace("h", "н") .replace("h", "х")
.replace("i", "й") .replace("i", "и")
.replace("c", "с") .replace("c", "с")
.replace("k", "к") .replace("k", "к")
.replace("b", "в") .replace("b", "в")
@@ -85,3 +82,4 @@ if __name__ == "__main__":
normalized_output = normalize(input_text) normalized_output = normalize(input_text)
print(normalized_output) print(normalized_output)