less-norm

This commit is contained in:
2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions

View File

@@ -48,40 +48,42 @@ async def messages_routing(msg, state):
reply_to_msg_id = reply_msg.get("message_id")
if not reply_to_msg_id and latest_toxic_message_id:
reply_to_msg_id = int(latest_toxic_message_id)
# count toxicity
if reply_to_msg_id:
# count one message score
one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
reply_text = ""
if one_score:
logger.debug(one_score)
reply_text += f"{int(one_score)}% токсичности\n"
# count average between all of messages
toxic_pattern = f"toxic:{cid}:{uid}:*"
toxic_score = await get_average_pattern(toxic_pattern)
# count average between all of messages
toxic_pattern = f"toxic:{cid}:{uid}:*"
toxic_score = await get_average_pattern(toxic_pattern)
# current mesasage toxicity
if reply_to_msg_id:
one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
reply_text = ""
if one_score:
logger.debug(one_score)
reply_text += f"{int(one_score)}% токсичности\n"
if toxic_score:
emoji = (
"😳"
if toxic_score > 90
else "😟"
if toxic_score > 80
else "😏"
if toxic_score > 60
else "🙂"
if toxic_score > 20
else "😇"
)
reply_text += (
f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
)
if reply_text:
await telegram_api(
"sendMessage",
chat_id=cid,
reply_to_message_id=reply_to_msg_id,
text=reply_text,
)
if toxic_score:
emoji = (
"😳"
if toxic_score > 90
else "😟"
if toxic_score > 80
else "😏"
if toxic_score > 60
else "🙂"
if toxic_score > 20
else "😇"
)
reply_text += (
f"Средняя токсичность сообщений: {toxic_score}% {emoji}"
)
if reply_text:
await telegram_api(
"sendMessage",
chat_id=cid,
reply_to_message_id=reply_to_msg_id,
text=reply_text,
)
try:
await telegram_api("deleteMessage", chat_id=cid, message_id=mid)
except Exception:

View File

@@ -1,13 +1,7 @@
import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
import logging
logger = logging.getLogger("nlp.normalize")
# Use ByT5 for the ByT5 model
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def is_russian_wording(text):
"""
@@ -22,24 +16,6 @@ def is_russian_wording(text):
return True
return False
def segment_text(text):
"""
Use a neural network model to segment text into words.
"""
# Encode the input text for the model as UTF-8 bytes
inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
# Generate predictions
with torch.no_grad():
outputs = model.generate(inputs)
# Decode the generated tokens back to text
segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
return segmented_text
def normalize(text):
"""
Normalize English text to resemble Russian characters.

25
nlp/segment_text.py Normal file
View File

@@ -0,0 +1,25 @@
import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
# Use ByT5 for the ByT5 model
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
def segment_text(text):
"""
Use a neural network model to segment text into words.
"""
# Encode the input text for the model as UTF-8 bytes
inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
# Generate predictions
with torch.no_grad():
outputs = model.generate(inputs)
# Decode the generated tokens back to text
segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
return segmented_text

View File

@@ -2,5 +2,5 @@ redis[hiredis]
aiohttp
torch
transformers
protobuf
sentencepiece
# protobuf
# sentencepiece