less-norm
@@ -49,17 +49,19 @@ async def messages_routing(msg, state):
     if not reply_to_msg_id and latest_toxic_message_id:
         reply_to_msg_id = int(latest_toxic_message_id)
 
-    # count average between all of messages
-    toxic_pattern = f"toxic:{cid}:{uid}:*"
-    toxic_score = await get_average_pattern(toxic_pattern)
-
-    # current mesasage toxicity
+    # count toxicity
     if reply_to_msg_id:
+        # count one message score
         one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
         reply_text = ""
         if one_score:
             logger.debug(one_score)
             reply_text += f"{int(one_score)}% токсичности\n"
+
+        # count average between all of messages
+        toxic_pattern = f"toxic:{cid}:{uid}:*"
+        toxic_score = await get_average_pattern(toxic_pattern)
+
         if toxic_score:
             emoji = (
                 "😳"
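
Note: get_average_pattern() is called above but not defined in this diff. A minimal sketch of what such a helper could look like, assuming a redis.asyncio client bound to the module-level name `redis` and numeric per-message scores; the repository's real implementation may differ:

# Illustrative sketch only -- not part of this commit.
async def get_average_pattern(pattern):
    scores = []
    # scan_iter walks all keys matching the glob pattern without blocking Redis
    async for key in redis.scan_iter(match=pattern):
        value = await redis.get(key)
        if value is not None:
            scores.append(float(value))
    # return None when nothing matched, so the `if toxic_score:` guard skips the reply
    return sum(scores) / len(scores) if scores else None
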
@@ -1,13 +1,7 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 import logging
 
 logger = logging.getLogger("nlp.normalize")
-
-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
 
 
 def is_russian_wording(text):
     """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
            return True
    return False
 
 
-def segment_text(text):
-    """
-    Use a neural network model to segment text into words.
-    """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return segmented_text
-
-
 def normalize(text):
     """
     Normalize English text to resemble Russian characters.

nlp/segment_text.py (new file)
@@ -0,0 +1,25 @@
+import torch
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+# Use ByT5 for the ByT5 model
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+
+
+
+def segment_text(text):
+    """
+    Use a neural network model to segment text into words.
+    """
+    # Encode the input text for the model as UTF-8 bytes
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+
+    # Generate predictions
+    with torch.no_grad():
+        outputs = model.generate(inputs)
+
+    # Decode the generated tokens back to text
+    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return segmented_text
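
The relocated helper can be exercised on its own; a hedged example, assuming the module is importable as nlp.segment_text and the google/byt5-small weights are available locally or can be downloaded:

# Illustrative usage, not part of this commit.
from nlp.segment_text import segment_text

if __name__ == "__main__":
    # ByT5 works on raw UTF-8 bytes, so the same call handles Cyrillic input.
    print(segment_text("приветкакдела"))
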
@@ -2,5 +2,5 @@ redis[hiredis]
 aiohttp
 torch
 transformers
-protobuf
-sentencepiece
+# protobuf
+# sentencepiece
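
Commenting out protobuf and sentencepiece appears consistent with relying on ByT5Tokenizer, which operates directly on UTF-8 bytes and needs no SentencePiece vocabulary. A quick, assumption-laden way to confirm the tokenizer still loads without those packages:

# Illustrative check, not part of this commit.
from transformers import ByT5Tokenizer

tok = ByT5Tokenizer.from_pretrained("google/byt5-small")
print(tok("пример").input_ids[:10])  # byte-level ids, no sentencepiece model involved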