dockerfile-fix

This commit is contained in:
2024-09-27 09:32:25 +03:00
parent a7b1925e8d
commit 14fc115e0f
2 changed files with 15 additions and 9 deletions

View File

@@ -8,11 +8,14 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def is_russian_wording(text):
"""
Check if the text contains any Russian characters by checking
Check if the text contains more than one Russian character by checking
each character against the Unicode range for Cyrillic.
"""
counter = 0
for char in text:
if "\u0400" <= char <= "\u04ff": # Unicode range for Cyrillic characters
counter += 1
if counter > 1:
return True
return False
@@ -39,17 +42,15 @@ def normalize(text):
Normalize English text to resemble Russian characters.
"""
# Segment the text first
segmented_text = segment_text(
t = segment_text(
text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
)
# Normalize after segmentation
segmented_text = segmented_text.lower()
if is_russian_wording(segmented_text):
if is_russian_wording(t):
# Normalize the text by replacing characters
normalized_text = (
segmented_text.replace("e", "е")
t.lower()
.replace("e", "е")
.replace("o", "о")
.replace("x", "х")
.replace("a", "а")
@@ -74,7 +75,7 @@ def normalize(text):
return normalized_text
return segmented_text
return t
# Example usage