"""Replace Latin look-alike characters with Cyrillic ones in mostly-Cyrillic text."""

import logging

logger = logging.getLogger("nlp.normalize")

# Inclusive bounds of the Unicode "Cyrillic" block (U+0400..U+04FF).
_CYRILLIC_FIRST = "\u0400"
_CYRILLIC_LAST = "\u04ff"

# Latin letters / digits (keys) and the Cyrillic letter each one is replaced
# with (values).  Every key is ASCII and every value is Cyrillic, so no
# substitution can feed another one; a single ``str.translate`` pass therefore
# gives the same result as applying the replacements one after another.
_LATIN_TO_CYRILLIC = str.maketrans(
    {
        "e": "е",
        "o": "о",
        "x": "х",
        "a": "а",
        "r": "р",
        "m": "м",
        "u": "и",
        "n": "н",
        "p": "п",
        "t": "т",
        "y": "у",
        "h": "х",
        "i": "и",
        "c": "с",
        "k": "к",
        "b": "в",
        "3": "з",
        "4": "ч",
        "0": "о",
        "d": "д",
        "z": "з",
    }
)


def is_russian_wording(text: str) -> bool:
    """Return True if *text* contains at least two Cyrillic characters.

    A character counts as Cyrillic when it lies in the Unicode block
    U+0400..U+04FF.  That block is not limited to the Russian alphabet, so
    other Cyrillic-script text passes the check as well.

    Args:
        text: The string to inspect.

    Returns:
        True as soon as a second Cyrillic character is found, otherwise
        False (including for the empty string).
    """
    cyrillic_seen = 0
    for char in text:
        if _CYRILLIC_FIRST <= char <= _CYRILLIC_LAST:
            cyrillic_seen += 1
            # Stop scanning early: two hits are enough to decide.
            if cyrillic_seen > 1:
                return True
    return False


def normalize(text: str) -> str:
    """Lower-case *text* and replace Latin look-alikes with Cyrillic letters.

    The substitution is applied only when the lower-cased text already
    contains at least two Cyrillic characters (see ``is_russian_wording``);
    otherwise the lower-cased text is returned without substitutions.

    Args:
        text: The string to normalize.

    Returns:
        The lower-cased string, with the Latin letters and digits listed in
        ``_LATIN_TO_CYRILLIC`` replaced by their Cyrillic counterparts when
        the text is detected as Cyrillic.
    """
    # NOTE(review): as they appear here, these three replacements map a space
    # to a space and therefore change nothing.  Presumably the original
    # literals were different whitespace characters (e.g. a double or
    # non-breaking space) that were lost in transit -- confirm and restore.
    t = text.replace(" ", " ").replace(" ", " ").replace(" ", " ")
    # NOTE: a text-segmentation step used to be sketched here but is disabled.
    t = t.lower()

    if is_russian_wording(t):
        t = t.translate(_LATIN_TO_CYRILLIC)

    # Log the final result on both paths (previously the log was skipped
    # exactly when a substitution had been applied).
    logger.debug("normalized: %s", t)
    return t


# Example usage
if __name__ == "__main__":
    input_text = "привет шп ана т у п а я"
    normalized_output = normalize(input_text)
    print(normalized_output)