welcomecenterbot/nlp/normalize.py
2024-09-28 10:20:00 +03:00

67 lines
1.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
# Module-level logger shared by the functions in this file; the logger name
# "nlp.normalize" is hard-coded rather than derived from __name__.
logger = logging.getLogger("nlp.normalize")
def is_russian_wording(text):
    """
    Tell whether the text holds at least two Cyrillic characters.

    A character counts when it lies in the Unicode range U+0400..U+04FF
    (the basic Cyrillic block, which covers Russian but is not limited to
    it). Scanning stops as soon as the second match is found.

    Returns True when two or more such characters are present, otherwise
    False (including for empty text).
    """
    # Lazily yields one marker per Cyrillic character encountered.
    cyrillic_hits = (
        symbol for symbol in text if "\u0400" <= symbol <= "\u04ff"
    )
    # Discard the first match, if there is one; a single Cyrillic character
    # is not enough to qualify.
    next(cyrillic_hits, None)
    # A second match settles the answer without reading the rest of the text.
    return next(cyrillic_hits, None) is not None
# Latin letters and digits that are used as visual stand-ins for Cyrillic
# letters, mapped to the Cyrillic letter they imitate. Every key is ASCII and
# every value is Cyrillic, so no substitution can feed another one; a single
# translate() pass therefore yields the same text as applying the
# replacements one after another.
_LOOKALIKE_TO_CYRILLIC = str.maketrans(
    {
        "e": "е",
        "o": "о",
        "x": "х",
        "a": "а",
        "r": "р",
        "m": "м",
        "u": "и",
        "n": "н",
        "p": "п",
        "t": "т",
        "y": "у",
        "h": "х",
        "i": "и",
        "c": "с",
        "k": "к",
        "b": "в",
        "3": "з",
        "4": "ч",
        "0": "о",
        "d": "д",
        "z": "з",
    }
)


def normalize(text):
    """
    Lowercase the text and turn Latin/digit look-alikes into Cyrillic.

    Runs of spaces are collapsed to a single space and the text is
    lowercased. If the result passes is_russian_wording(), every Latin
    letter or digit listed in the look-alike table is replaced by its
    Cyrillic counterpart; otherwise the lowercased text is returned as is.
    Note that digits are substituted too, so "2024" inside Russian text
    becomes "2о2ч".

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string, with look-alike characters substituted when
        the text qualifies as Russian wording.
    """
    t = text
    # NOTE(review): this step used to be three chained replace(" ", " ")
    # calls, which are no-ops as written; presumably the intent was to
    # collapse repeated spaces - confirm. Looping collapses runs of any
    # length instead of only what a fixed number of passes can reach.
    while "  " in t:
        t = t.replace("  ", " ")
    # NOTE: a segmentation step (segment_text) was disabled at this point
    # and remains disabled.
    t = t.lower()
    if is_russian_wording(t):
        t = t.translate(_LOOKALIKE_TO_CYRILLIC)
    # Log the final result on both paths; lazy %-args skip the string
    # formatting when DEBUG logging is off.
    logger.debug("normalized: %s", t)
    return t
# Demo: when the module is run directly, print the normalized sample phrase
if __name__ == "__main__":
    print(normalize("привет шп ана т у п а я"))