dockerfile-fix
This commit is contained in:
@@ -8,11 +8,14 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
|
||||
|
||||
def is_russian_wording(text):
    """
    Check whether the text contains more than one Cyrillic character.

    Each character is compared against the Unicode Cyrillic block
    (U+0400 through U+04FF). A single Cyrillic character is not enough
    for the text to count as Russian wording; at least two are required.

    Args:
        text: The string to inspect.

    Returns:
        True as soon as a second Cyrillic character is found, False
        otherwise (including for an empty string).
    """
    counter = 0
    for char in text:
        if "\u0400" <= char <= "\u04ff":  # Unicode range for Cyrillic characters
            counter += 1
            # Stop scanning early: two Cyrillic characters settle the answer.
            if counter > 1:
                return True
    return False
|
||||
|
||||
@@ -39,17 +42,15 @@ def normalize(text):
|
||||
Normalize English text to resemble Russian characters.
|
||||
"""
|
||||
# Segment the text first
|
||||
segmented_text = segment_text(
|
||||
t = segment_text(
|
||||
text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
|
||||
)
|
||||
|
||||
# Normalize after segmentation
|
||||
segmented_text = segmented_text.lower()
|
||||
|
||||
if is_russian_wording(segmented_text):
|
||||
if is_russian_wording(t):
|
||||
# Normalize the text by replacing characters
|
||||
normalized_text = (
|
||||
segmented_text.replace("e", "е")
|
||||
t.lower()
|
||||
.replace("e", "е")
|
||||
.replace("o", "о")
|
||||
.replace("x", "х")
|
||||
.replace("a", "а")
|
||||
@@ -74,7 +75,7 @@ def normalize(text):
|
||||
|
||||
return normalized_text
|
||||
|
||||
return segmented_text
|
||||
return t
|
||||
|
||||
|
||||
# Example usage
|
||||
|
Reference in New Issue
Block a user