welcomecenterbot/nlp/normalize.py

67 lines
1.7 KiB
Python
Raw Normal View History

2024-09-27 10:30:29 +00:00
import logging
logger = logging.getLogger("nlp.normalize")
2024-09-26 20:38:05 +00:00
2024-09-27 06:23:55 +00:00
2024-09-26 18:36:14 +00:00
def is_russian_wording(text):
    """
    Tell whether ``text`` holds more than one Cyrillic character.

    A character counts as Cyrillic when it lies in the Unicode range
    U+0400..U+04FF. Scanning stops as soon as the second match is seen.
    """
    still_needed = 2
    for symbol in text:
        if not ("\u0400" <= symbol <= "\u04ff"):
            continue  # outside the Cyrillic range
        still_needed -= 1
        if still_needed == 0:
            return True
    return False
# Single-character substitutions applied by normalize(): Latin letters and
# digits on the left, the Cyrillic letter each one is replaced with on the right.
_LATIN_TO_CYRILLIC = str.maketrans(
    {
        "e": "е",
        "o": "о",
        "x": "х",
        "a": "а",
        "r": "р",
        "m": "м",
        "u": "и",
        "n": "н",
        "p": "п",
        "t": "т",
        "y": "у",
        "h": "х",
        "i": "и",
        "c": "с",
        "k": "к",
        "b": "в",
        "3": "з",
        "4": "ч",
        "0": "о",
        "d": "д",
        "z": "з",
    }
)


def normalize(text):
    """
    Lowercase ``text`` and replace Latin look-alikes with Cyrillic letters.

    The substitution is applied only when the lowercased text already
    contains more than one Cyrillic character (see ``is_russian_wording``);
    otherwise the lowercased text is returned as is.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string, with the Latin letters and digits listed in
        ``_LATIN_TO_CYRILLIC`` replaced when the Cyrillic check passes.
    """
    # NOTE(review): as written, each call below swaps a space for a space and
    # therefore changes nothing. Presumably the search strings were meant to be
    # other whitespace (longer runs or non-ASCII spaces) -- confirm against the
    # repository before changing them.
    t = text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
    # The segmentation step (segment_text) is currently disabled.
    t = t.lower()

    if is_russian_wording(t):
        # One pass over the string instead of one pass per substitution. The
        # result matches the chained replaces because no replacement character
        # is itself a key of the table.
        t = t.translate(_LATIN_TO_CYRILLIC)

    # Log the value actually returned, on both paths; %-style arguments defer
    # formatting until the debug level is enabled.
    logger.debug("normalized: %s", t)
    return t
2024-09-26 20:38:05 +00:00
2024-09-27 06:23:55 +00:00
2024-09-26 20:38:05 +00:00
# Demo: run this module directly to print one normalized sample string.
if __name__ == "__main__":
    sample = "привет шп ана т у п а я"
    print(normalize(sample))