less-norm
@@ -49,17 +49,19 @@ async def messages_routing(msg, state):
     if not reply_to_msg_id and latest_toxic_message_id:
         reply_to_msg_id = int(latest_toxic_message_id)
 
-    # count average between all of messages
-    toxic_pattern = f"toxic:{cid}:{uid}:*"
-    toxic_score = await get_average_pattern(toxic_pattern)
-
-    # current mesasage toxicity
+    # count toxicity
     if reply_to_msg_id:
+        # count one message score
         one_score = await redis.get(f"toxic:{cid}:{uid}:{reply_to_msg_id}")
         reply_text = ""
         if one_score:
             logger.debug(one_score)
             reply_text += f"{int(one_score)}% токсичности\n"
+
+        # count average between all of messages
+        toxic_pattern = f"toxic:{cid}:{uid}:*"
+        toxic_score = await get_average_pattern(toxic_pattern)
+
         if toxic_score:
             emoji = (
                 "😳"
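
Note: get_average_pattern() is called above but not defined in this diff. A minimal sketch of what such a helper could look like, assuming a redis.asyncio client bound to the module-level name `redis` and numeric per-message scores; the repository's real implementation may differ:

# Illustrative sketch only -- not part of this commit.
async def get_average_pattern(pattern):
    scores = []
    # scan_iter walks all keys matching the glob pattern without blocking Redis
    async for key in redis.scan_iter(match=pattern):
        value = await redis.get(key)
        if value is not None:
            scores.append(float(value))
    # return None when nothing matched, so the `if toxic_score:` guard skips the reply
    return sum(scores) / len(scores) if scores else None
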
@@ -1,13 +1,7 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 import logging
 
 logger = logging.getLogger("nlp.normalize")
-
-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
 
 
 def is_russian_wording(text):
     """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
            return True
    return False
 
 
-def segment_text(text):
-    """
-    Use a neural network model to segment text into words.
-    """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return segmented_text
-
-
 def normalize(text):
     """
     Normalize English text to resemble Russian characters.

nlp/segment_text.py (new file)
@@ -0,0 +1,25 @@
+import torch
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+# Use ByT5 for the ByT5 model
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+
+
+
+def segment_text(text):
+    """
+    Use a neural network model to segment text into words.
+    """
+    # Encode the input text for the model as UTF-8 bytes
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+
+    # Generate predictions
+    with torch.no_grad():
+        outputs = model.generate(inputs)
+
+    # Decode the generated tokens back to text
+    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return segmented_text
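
The relocated helper can be exercised on its own; a hedged example, assuming the module is importable as nlp.segment_text and the google/byt5-small weights are available locally or can be downloaded:

# Illustrative usage, not part of this commit.
from nlp.segment_text import segment_text

if __name__ == "__main__":
    # ByT5 works on raw UTF-8 bytes, so the same call handles Cyrillic input.
    print(segment_text("приветкакдела"))
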
@@ -2,5 +2,5 @@ redis[hiredis]
 aiohttp
 torch
 transformers
-protobuf
-sentencepiece
+# protobuf
+# sentencepiece
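
Commenting out protobuf and sentencepiece appears consistent with relying on ByT5Tokenizer, which operates directly on UTF-8 bytes and needs no SentencePiece vocabulary. A quick, assumption-laden way to confirm the tokenizer still loads without those packages:

# Illustrative check, not part of this commit.
from transformers import ByT5Tokenizer

tok = ByT5Tokenizer.from_pretrained("google/byt5-small")
print(tok("пример").input_ids[:10])  # byte-level ids, no sentencepiece model involved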