# Reconstructed from a git patch whose line breaks had been lost:
#   commit 8d6daeabe2216c0f45265b24ae8d1abf8ae2cee1
#   From: Untone, Thu, 26 Sep 2024 21:36:14 +0300, subject "toxic-debug15"
#
# The patch creates utils/normalize.py (the code below) and changes one line
# of messages_routing() in handlers/messages_routing.py from
#     toxic_score = detector(text.lower())
# to
#     toxic_score = detector(normalize(text))
#
# NOTE(review): the diffstat lists a single changed line in
# handlers/messages_routing.py, so this patch does not add an import of
# `normalize` there -- confirm that it is imported by some other change.

# Latin letter -> Cyrillic letter substitutions applied by normalize().
# Targets are written as escapes because several of them are
# indistinguishable from their Latin counterparts in most fonts.
_LATIN_TO_CYRILLIC = str.maketrans({
    "e": "\u0435",  # е
    "o": "\u043e",  # о
    "x": "\u0445",  # х
    "a": "\u0430",  # а
    "r": "\u0433",  # г
    "m": "\u043c",  # м
    "u": "\u0438",  # и
    "n": "\u043f",  # п
    "p": "\u0440",  # р
})


def is_russian_wording(text: str) -> bool:
    """Return True if *text* contains at least one Cyrillic character.

    A character counts as Cyrillic when its code point lies in the
    Unicode range U+0400..U+04FF. An empty string yields False.
    """
    return any("\u0400" <= char <= "\u04FF" for char in text)


def normalize(text: str) -> str:
    """Lower-case *text* and map Latin letters to Cyrillic in Cyrillic text.

    The result is always lower-cased, because the call site this function
    replaced passed ``text.lower()`` to the detector; returning text without
    Cyrillic in its original case would silently change that input.

    Only when *text* contains at least one Cyrillic character (see
    ``is_russian_wording``) are the Latin letters listed in
    ``_LATIN_TO_CYRILLIC`` replaced by their Cyrillic substitutes. Text
    without any Cyrillic is returned lower-cased but otherwise untouched.
    """
    lowered = text.lower()
    if not is_russian_wording(text):
        return lowered
    # Single pass; every source is Latin and every target is Cyrillic, so
    # the result matches the original chain of str.replace() calls.
    return lowered.translate(_LATIN_TO_CYRILLIC)