nosegm

2024-09-29 09:47:49 +03:00 · 2024-09-29 09:47:49 +03:00 · 22ed5f6335
commit 22ed5f6335
parent 20cbc6dab6
5 changed files with 1280 additions and 30 deletions
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@ -5,10 +5,11 @@ from state.scan import get_average_pattern
 from bot.api import telegram_api, download_file
 from bot.config import FEEDBACK_CHAT_ID
 from handlers.handle_private import handle_private
 from nlp.segment_text import segment_text
 from nlp.toxicity_detector import detector
 from nlp.normalize import normalize
 from nlp.ocr import ocr_recognize
 from nlp.stopwords_detector import check_stopwords
 logger = logging.getLogger("handlers.messages_routing")
@ -110,19 +111,21 @@ async def messages_routing(msg, state):
                        text += '\n'
            normalized_text = normalize(text)
-            segmented_text = segment_text(normalized_text)
+            toxic_score = detector(normalized_text)
            toxic_score = detector(segmented_text)
            toxic_perc = math.floor(toxic_score * 100)
-            logger.info(f"\segmented_text: {segmented_text}\ntoxic: {toxic_perc}%")
+            logger.info(f"\text: {normalized_text}\ntoxic: {toxic_perc}%")
-            nospaces_text = text.replace(' ', '')
+            nospaces_text = text.replace(" ", "")
            if nospaces_text != text:
                nospaces_normalized_text = normalize(nospaces_text)
-            nospaces_segmented_text = segment_text(nospaces_normalized_text)
+                nospaces_text_score = detector(nospaces_normalized_text)
            nospaces_text_score = detector(nospaces_segmented_text)
                nospaces_text_perc = math.floor(nospaces_text_score * 100)
-            logger.info(f"\nospaces_segmented_text: {nospaces_segmented_text}\nnospaces_toxic: {toxic_perc}%")
+                if check_stopwords(nospaces_normalized_text):
                    logger.info('stopword detected with no spaces, toxicity +40%')
                    nospaces_text_perc += 40
                logger.info(f"\nospaces_text: {nospaces_normalized_text}\nnospaces_toxic: {nospaces_text_perc}%")
-            if (nospaces_text != text and nospaces_text_score > toxic_score) or nospaces_text_perc > 95:
+                if nospaces_text_score > toxic_score or nospaces_text_perc > 95:
                    text_perc = nospaces_text_perc
            await redis.set(f"toxic:{cid}", mid)
--- a/nlp/segment_text.py
+++ b/nlp/segment_text.py
@ -1,17 +0,0 @@
 import spacy
 # Load the Russian language model once at import time; segment_text()
 # below reuses this shared pipeline object for every call.
 nlp = spacy.load("ru_core_news_md")
 def segment_text(text):
    """
    Split text into tokens with the module-level SpaCy pipeline.

    Whitespace-only tokens are skipped; the remaining token texts are
    joined back together with single spaces.
    """
    pieces = []
    for token in nlp(text):
        # Drop tokens that consist purely of whitespace
        if token.is_space:
            continue
        pieces.append(token.text)
    return ' '.join(pieces)
--- a/nlp/stop_words.txt
+++ b/nlp/stop_words.txt
--- a/nlp/stopwords_detector.py
+++ b/nlp/stopwords_detector.py
@ -0,0 +1,27 @@
 from pathlib import Path

 # Resolve the data file next to this module so loading does not depend on
 # the process's current working directory.
 _STOPWORDS_PATH = Path(__file__).with_name('stop_words.txt')

 # One stop word per line; blank lines are skipped and entries are
 # lower-cased because check_stopwords() lower-cases the text it inspects.
 with open(_STOPWORDS_PATH, 'r', encoding='utf-8') as file:
    stopwords = [line.strip().lower() for line in file if line.strip()]
 # Convert stopwords to a set for faster lookup
 stopword_set = set(stopwords)
 def check_stopwords(text, stopwords=None):
    """
    Check if any words from the stopwords list are present in the given text.

    Matching is done on whole whitespace-separated tokens, case-insensitively.

    Args:
        text (str): The input text to check.
        stopwords (list, optional): Stop words to look for. When omitted,
            the module-level ``stopword_set`` is used.
    Returns:
        bool: True if any stopword is found in the text, False otherwise.
    """
    # Nothing to scan in empty/None input
    if not text:
        return False
    if stopwords is None:
        # Default to the set loaded at import time
        candidates = stopword_set
    else:
        # The text is lower-cased below, so compare against lower-cased entries
        candidates = {word.lower() for word in stopwords}
    # NOTE(review): text without any whitespace is treated as a single token,
    # so only an exact match of the entire text can hit - confirm this is the
    # intended behaviour for callers that strip spaces first.
    return any(word in candidates for word in text.lower().split())
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,5 @@
 redis[hiredis]
 aiohttp
 aiofiles
 spacy
 transformers
 easyocr