less-norm

2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@@ -48,40 +48,42 @@ async def messages_routing(msg, state):
                reply_to_msg_id = reply_msg.get("message_id")
                if not reply_to_msg_id and latest_toxic_message_id:
                    reply_to_msg_id = int(latest_toxic_message_id)
+                
+                # count toxicity
+                if reply_to_msg_id:
+                    # count one message score
+                    one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
+                    reply_text = ""
+                    if one_score:
+                        logger.debug(one_score)
+                        reply_text += f"{int(one_score)}% токсичности\n"

-            # count average between all of messages
-            toxic_pattern = f"toxic:{cid}:{uid}:*"
-            toxic_score = await get_average_pattern(toxic_pattern)
+                    # count average between all of messages
+                    toxic_pattern = f"toxic:{cid}:{uid}:*"
+                    toxic_score = await get_average_pattern(toxic_pattern)

-            # current mesasage toxicity
-            if reply_to_msg_id:
-                one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
-                reply_text = ""
-                if one_score:
-                    logger.debug(one_score)
-                    reply_text += f"{int(one_score)}% токсичности\n"
-                if toxic_score:
-                    emoji = (
-                        "😳"
-                        if toxic_score > 90
-                        else "😟"
-                        if toxic_score > 80
-                        else "😏"
-                        if toxic_score > 60
-                        else "🙂"
-                        if toxic_score > 20
-                        else "😇"
-                    )
-                    reply_text += (
-                        f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
-                    )
-                if reply_text:
-                    await telegram_api(
-                        "sendMessage",
-                        chat_id=cid,
-                        reply_to_message_id=reply_to_msg_id,
-                        text=reply_text,
-                    )
+                    if toxic_score:
+                        emoji = (
+                            "😳"
+                            if toxic_score > 90
+                            else "😟"
+                            if toxic_score > 80
+                            else "😏"
+                            if toxic_score > 60
+                            else "🙂"
+                            if toxic_score > 20
+                            else "😇"
+                        )
+                        reply_text += (
+                            f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
+                        )
+                    if reply_text:
+                        await telegram_api(
+                            "sendMessage",
+                            chat_id=cid,
+                            reply_to_message_id=reply_to_msg_id,
+                            text=reply_text,
+                        )
            try:
                await telegram_api("deleteMessage", chat_id=cid, message_id=mid)
            except Exception:
--- a/nlp/normalize.py
+++ b/nlp/normalize.py
@@ -1,13 +1,7 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 import logging

 logger = logging.getLogger("nlp.normalize")

-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-

 def is_russian_wording(text):
    """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
                return True
    return False

-
-def segment_text(text):
-    """
-    Use a neural network model to segment text into words.
-    """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return segmented_text
-
-
 def normalize(text):
    """
    Normalize English text to resemble Russian characters.
--- a/nlp/segment_text.py
+++ b/nlp/segment_text.py
@@ -0,0 +1,25 @@
+import torch
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+# Load the pretrained google/byt5-small tokenizer and model once, at import time
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+
+
+
+def segment_text(text):
+    """
+    Ask the ByT5 model to segment text into words; returns the decoded output string.
+    """
+    # Build the "segment: " prompt and encode it as a PyTorch tensor of token ids
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+
+    # No gradients needed; lift generate()'s short default length cap for long text
+    with torch.no_grad():
+        outputs = model.generate(inputs, max_new_tokens=2 * inputs.shape[-1])
+
+    # Decode the first generated sequence, dropping special tokens
+    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return segmented_text
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,5 @@ redis[hiredis]
 aiohttp
 torch
 transformers
-protobuf
-sentencepiece
+# protobuf
+# sentencepiece