normalizer-fix4
This commit is contained in:
@@ -8,7 +8,7 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
|
||||
|
||||
def is_russian_wording(text):
|
||||
"""
|
||||
Check if the text contains more than one Russian characters by checking
|
||||
Check if the text contains more than one Russian character by checking
|
||||
each character against the Unicode range for Cyrillic.
|
||||
"""
|
||||
counter = 0
|
||||
@@ -24,13 +24,12 @@ def segment_text(text):
|
||||
"""
|
||||
Use a neural network model to segment text into words.
|
||||
"""
|
||||
# Encode the input text for the model
|
||||
# inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
|
||||
inputs = tokenizer("segment: " + text, return_tensors="pt")
|
||||
# Encode the input text for the model as UTF-8 bytes
|
||||
inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
|
||||
|
||||
# Generate predictions
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(**inputs)
|
||||
outputs = model.generate(inputs)
|
||||
|
||||
# Decode the generated tokens back to text
|
||||
segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
@@ -43,9 +42,7 @@ def normalize(text):
|
||||
Normalize English text to resemble Russian characters.
|
||||
"""
|
||||
# Segment the text first
|
||||
t = segment_text(
|
||||
text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
|
||||
)
|
||||
t = segment_text(text.replace(" ", " "))
|
||||
|
||||
if is_russian_wording(t):
|
||||
# Normalize the text by replacing characters
|
||||
@@ -55,15 +52,15 @@ def normalize(text):
|
||||
.replace("o", "о")
|
||||
.replace("x", "х")
|
||||
.replace("a", "а")
|
||||
.replace("r", "г")
|
||||
.replace("r", "р")
|
||||
.replace("m", "м")
|
||||
.replace("u", "и")
|
||||
.replace("n", "п")
|
||||
.replace("p", "р")
|
||||
.replace("n", "н")
|
||||
.replace("p", "п")
|
||||
.replace("t", "т")
|
||||
.replace("y", "у")
|
||||
.replace("h", "н")
|
||||
.replace("i", "й")
|
||||
.replace("h", "х")
|
||||
.replace("i", "и")
|
||||
.replace("c", "с")
|
||||
.replace("k", "к")
|
||||
.replace("b", "в")
|
||||
@@ -85,3 +82,4 @@ if __name__ == "__main__":
|
||||
|
||||
normalized_output = normalize(input_text)
|
||||
print(normalized_output)
|
||||
|
Reference in New Issue
Block a user