nosegm
This commit is contained in:
parent
20cbc6dab6
commit
22ed5f6335
|
@ -5,10 +5,11 @@ from state.scan import get_average_pattern
|
||||||
from bot.api import telegram_api, download_file
|
from bot.api import telegram_api, download_file
|
||||||
from bot.config import FEEDBACK_CHAT_ID
|
from bot.config import FEEDBACK_CHAT_ID
|
||||||
from handlers.handle_private import handle_private
|
from handlers.handle_private import handle_private
|
||||||
from nlp.segment_text import segment_text
|
|
||||||
from nlp.toxicity_detector import detector
|
from nlp.toxicity_detector import detector
|
||||||
from nlp.normalize import normalize
|
from nlp.normalize import normalize
|
||||||
from nlp.ocr import ocr_recognize
|
from nlp.ocr import ocr_recognize
|
||||||
|
from nlp.stopwords_detector import check_stopwords
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger("handlers.messages_routing")
|
logger = logging.getLogger("handlers.messages_routing")
|
||||||
|
|
||||||
|
@ -110,19 +111,21 @@ async def messages_routing(msg, state):
|
||||||
text += '\n'
|
text += '\n'
|
||||||
|
|
||||||
normalized_text = normalize(text)
|
normalized_text = normalize(text)
|
||||||
segmented_text = segment_text(normalized_text)
|
toxic_score = detector(normalized_text)
|
||||||
toxic_score = detector(segmented_text)
|
|
||||||
toxic_perc = math.floor(toxic_score * 100)
|
toxic_perc = math.floor(toxic_score * 100)
|
||||||
logger.info(f"\segmented_text: {segmented_text}\ntoxic: {toxic_perc}%")
|
logger.info(f"\text: {normalized_text}\ntoxic: {toxic_perc}%")
|
||||||
|
|
||||||
nospaces_text = text.replace(' ', '')
|
nospaces_text = text.replace(" ", "")
|
||||||
|
if nospaces_text != text:
|
||||||
nospaces_normalized_text = normalize(nospaces_text)
|
nospaces_normalized_text = normalize(nospaces_text)
|
||||||
nospaces_segmented_text = segment_text(nospaces_normalized_text)
|
nospaces_text_score = detector(nospaces_normalized_text)
|
||||||
nospaces_text_score = detector(nospaces_segmented_text)
|
|
||||||
nospaces_text_perc = math.floor(nospaces_text_score * 100)
|
nospaces_text_perc = math.floor(nospaces_text_score * 100)
|
||||||
logger.info(f"\nospaces_segmented_text: {nospaces_segmented_text}\nnospaces_toxic: {toxic_perc}%")
|
if check_stopwords(nospaces_normalized_text):
|
||||||
|
logger.info('stopword detected with no spaces, toxicity +40%')
|
||||||
|
nospaces_text_perc += 40
|
||||||
|
logger.info(f"\nospaces_text: {nospaces_normalized_text}\nnospaces_toxic: {nospaces_text_perc}%")
|
||||||
|
|
||||||
if (nospaces_text != text and nospaces_text_score > toxic_score) or nospaces_text_perc > 95:
|
if nospaces_text_score > toxic_score or nospaces_text_perc > 95:
|
||||||
text_perc = nospaces_text_perc
|
text_perc = nospaces_text_perc
|
||||||
|
|
||||||
await redis.set(f"toxic:{cid}", mid)
|
await redis.set(f"toxic:{cid}", mid)
|
||||||
|
|
|
@ -1,17 +0,0 @@
|
||||||
import spacy
|
|
||||||
|
|
||||||
# Load the Russian language model
|
|
||||||
nlp = spacy.load("ru_core_news_md")
|
|
||||||
|
|
||||||
def segment_text(text):
    """
    Run ``text`` through the loaded SpaCy pipeline and re-join its tokens.

    Tokens that SpaCy marks as whitespace are dropped; the texts of the
    remaining tokens are joined with single spaces and returned.
    """
    words = []
    for tok in nlp(text):
        # Skip whitespace-only tokens so they do not produce doubled spaces
        if tok.is_space:
            continue
        words.append(tok.text)

    return ' '.join(words)
|
|
||||||
|
|
1238
nlp/stop_words.txt
Normal file
1238
nlp/stop_words.txt
Normal file
File diff suppressed because it is too large
Load Diff
27
nlp/stopwords_detector.py
Normal file
27
nlp/stopwords_detector.py
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
from pathlib import Path

# The word list is stored next to this module (nlp/stop_words.txt), so resolve
# it relative to this file instead of the process working directory.
_STOPWORDS_PATH = Path(__file__).with_name('stop_words.txt')

with open(_STOPWORDS_PATH, 'r', encoding='utf-8') as file:
    # NOTE(review): assumes one stopword per line - confirm against
    # stop_words.txt. Entries are stripped and lowercased because the text
    # being checked is lowercased before lookup; blank lines are skipped.
    stopwords = [line.strip().lower() for line in file if line.strip()]

# Convert stopwords to a set for faster lookup
stopword_set = set(stopwords)
|
||||||
|
|
||||||
|
def check_stopwords(text, stopwords=None):
    """
    Check if any words from the stopwords list are present in the given text.

    Args:
        text (str): The input text to check.
        stopwords (iterable of str, optional): Stopwords to check against.
            When omitted, the module-level ``stopword_set`` is used, so the
            function can be called with the text alone.

    Returns:
        bool: True if any stopword is found in the text, False otherwise.
    """
    # Nothing to match in empty (or missing) text
    if not text:
        return False

    if stopwords is None:
        candidates = stopword_set
    else:
        # Lowercase caller-supplied words so they compare against the
        # lowercased tokens below
        candidates = {word.lower() for word in stopwords}

    # Normalize the text by converting it to lower case and splitting into words
    words = text.lower().split()

    # Whole-token match only: "badge" does not match the stopword "bad"
    return any(word in candidates for word in words)
|
|
@ -1,6 +1,5 @@
|
||||||
redis[hiredis]
|
redis[hiredis]
|
||||||
aiohttp
|
aiohttp
|
||||||
aiofiles
|
aiofiles
|
||||||
spacy
|
|
||||||
transformers
|
transformers
|
||||||
easyocr
|
easyocr
|
Loading…
Reference in New Issue
Block a user