normlizer-fix4

This commit is contained in:
Untone 2024-09-27 10:57:20 +03:00
parent e2761ee3d8
commit e241e14764

View File

@ -8,15 +8,15 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def is_russian_wording(text):
    """
    Check if the text contains more than one Russian character by checking
    each character against the Unicode range for Cyrillic.

    Args:
        text: The string to inspect.

    Returns:
        True as soon as a second character in the range U+0400..U+04FF is
        found; False otherwise (including for empty input).
    """
    counter = 0
    for char in text:
        if "\u0400" <= char <= "\u04ff":  # Unicode range for Cyrillic characters
            counter += 1
            if counter > 1:
                # Two Cyrillic characters are enough; stop scanning early.
                return True
    return False
@ -24,13 +24,12 @@ def segment_text(text):
"""
Use a neural network model to segment text into words.
"""
# Encode the input text for the model
# inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
inputs = tokenizer("segment: " + text, return_tensors="pt")
# Encode the input text for the model as UTF-8 bytes
inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
# Generate predictions
with torch.no_grad():
outputs = model.generate(**inputs)
outputs = model.generate(inputs)
# Decode the generated tokens back to text
segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
@ -43,9 +42,7 @@ def normalize(text):
Normalize English text to resemble Russian characters.
"""
# Segment the text first
t = segment_text(
text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
)
t = segment_text(text.replace(" ", " "))
if is_russian_wording(t):
# Normalize the text by replacing characters
@ -55,15 +52,15 @@ def normalize(text):
.replace("o", "о")
.replace("x", "х")
.replace("a", "а")
.replace("r", "г")
.replace("r", "р")
.replace("m", "м")
.replace("u", "и")
.replace("n", "п")
.replace("p", "р")
.replace("n", "н")
.replace("p", "п")
.replace("t", "т")
.replace("y", "у")
.replace("h", "н")
.replace("i", "й")
.replace("h", "х")
.replace("i", "и")
.replace("c", "с")
.replace("k", "к")
.replace("b", "в")
@ -85,3 +82,4 @@ if __name__ == "__main__":
normalized_output = normalize(input_text)
print(normalized_output)