# welcomecenterbot/nlp/normalize.py

import logging

logger = logging.getLogger("nlp.normalize")


def is_russian_wording(text):
    """
    Report whether the text holds at least two Cyrillic characters.

    A character counts as Cyrillic when it lies in the Unicode block
    U+0400..U+04FF. Scanning stops as soon as the second match is found.
    """
    seen_one = False
    for symbol in text:
        if not ("\u0400" <= symbol <= "\u04ff"):
            # Not in the Cyrillic block; keep scanning
            continue
        if seen_one:
            # This is the second Cyrillic character: enough to decide
            return True
        seen_one = True
    return False

# Latin letters and digits that are commonly used as visual stand-ins for
# Cyrillic letters, mapped to the Cyrillic letter they imitate. Every key is a
# single character and no value is itself a key, so one translate() pass gives
# the same result as applying the replacements one after another.
_LATIN_TO_CYRILLIC = str.maketrans(
    {
        "e": "е",
        "o": "о",
        "x": "х",
        "a": "а",
        "r": "р",
        "m": "м",
        "u": "и",
        "n": "н",
        "p": "п",
        "t": "т",
        "y": "у",
        "h": "х",
        "i": "и",
        "c": "с",
        "k": "к",
        "b": "в",
        "3": "з",
        "4": "ч",
        "0": "о",
        "d": "д",
        "z": "з",
    }
)


def normalize(text):
    """
    Lower-case the text, collapse repeated spaces and, for text that already
    contains Cyrillic, replace Latin look-alike characters with Cyrillic ones.

    Args:
        text: The input string.

    Returns:
        The lower-cased text with every run of spaces reduced to one space.
        When the text contains more than one Cyrillic character (see
        ``is_russian_wording``), the Latin letters and digits listed in
        ``_LATIN_TO_CYRILLIC`` are additionally replaced by their Cyrillic
        counterparts. Otherwise the text is returned without substitution
        and logged at debug level.
    """
    t = text
    # Repeat until stable: a fixed number of passes leaves double spaces
    # behind when the original run of spaces is long enough.
    while "  " in t:
        t = t.replace("  ", " ")

    # NOTE: a text segmentation step (segment_text) used to be sketched here
    # and is currently disabled.

    t = t.lower()

    if is_russian_wording(t):
        # Mixed-script text: map the Latin stand-ins onto Cyrillic in one pass
        return t.translate(_LATIN_TO_CYRILLIC)

    logger.debug("normalized: %s", t)
    return t


# Manual smoke check: run this module directly to print a normalized sample
if __name__ == "__main__":
    sample = "привет шп  ана т у п а я"
    print(normalize(sample))