less-norm

This commit is contained in:
2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions

View File

@@ -49,39 +49,41 @@ async def messages_routing(msg, state):
if not reply_to_msg_id and latest_toxic_message_id: if not reply_to_msg_id and latest_toxic_message_id:
reply_to_msg_id = int(latest_toxic_message_id) reply_to_msg_id = int(latest_toxic_message_id)
# count average between all of messages # count toxicity
toxic_pattern = f"toxic:{cid}:{uid}:*" if reply_to_msg_id:
toxic_score = await get_average_pattern(toxic_pattern) # count one message score
one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
reply_text = ""
if one_score:
logger.debug(one_score)
reply_text += f"{int(one_score)}% токсичности\n"
# current message toxicity # count average between all of messages
if reply_to_msg_id: toxic_pattern = f"toxic:{cid}:{uid}:*"
one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}") toxic_score = await get_average_pattern(toxic_pattern)
reply_text = ""
if one_score: if toxic_score:
logger.debug(one_score) emoji = (
reply_text += f"{int(one_score)}% токсичности\n" "😳"
if toxic_score: if toxic_score > 90
emoji = ( else "😟"
"😳" if toxic_score > 80
if toxic_score > 90 else "😏"
else "😟" if toxic_score > 60
if toxic_score > 80 else "🙂"
else "😏" if toxic_score > 20
if toxic_score > 60 else "😇"
else "🙂" )
if toxic_score > 20 reply_text += (
else "😇" f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
) )
reply_text += ( if reply_text:
f"Средняя токсичность сообщений: {toxic_score}% {emoji}" await telegram_api(
) "sendMessage",
if reply_text: chat_id=cid,
await telegram_api( reply_to_message_id=reply_to_msg_id,
"sendMessage", text=reply_text,
chat_id=cid, )
reply_to_message_id=reply_to_msg_id,
text=reply_text,
)
try: try:
await telegram_api("deleteMessage", chat_id=cid, message_id=mid) await telegram_api("deleteMessage", chat_id=cid, message_id=mid)
except Exception: except Exception:

View File

@@ -1,13 +1,7 @@
import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
import logging import logging
logger = logging.getLogger("nlp.normalize") logger = logging.getLogger("nlp.normalize")
# Use ByT5 for the ByT5 model
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def is_russian_wording(text): def is_russian_wording(text):
""" """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
return True return True
return False return False
def segment_text(text):
"""
Use a neural network model to segment text into words.
"""
# Encode the input text for the model as UTF-8 bytes
inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
# Generate predictions
with torch.no_grad():
outputs = model.generate(inputs)
# Decode the generated tokens back to text
segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
return segmented_text
def normalize(text): def normalize(text):
""" """
Normalize English text to resemble Russian characters. Normalize English text to resemble Russian characters.

25
nlp/segment_text.py Normal file
View File

@@ -0,0 +1,25 @@
import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
# Use ByT5 for the ByT5 model
# NOTE(review): loading happens at import time and downloads weights on first
# run — presumably acceptable for this worker process; confirm for cold starts.
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def segment_text(text):
    """Segment a run-together string into words with the ByT5 model.

    The input is prefixed with the ``segment:`` task tag, byte-tokenized,
    passed through the seq2seq model, and the generated ids are decoded
    back into plain text.
    """
    # Prefix with the task tag and tokenize (ByT5 operates on UTF-8 bytes).
    encoded = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Inference only — disable gradient tracking while generating.
    with torch.no_grad():
        generated = model.generate(encoded)

    # Turn the generated token ids back into a readable string.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

View File

@@ -2,5 +2,5 @@ redis[hiredis]
aiohttp aiohttp
torch torch
transformers transformers
protobuf # protobuf
sentencepiece # sentencepiece