diff --git a/handlers/messages_routing.py b/handlers/messages_routing.py
index 8f69766..f588efa 100644
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@@ -115,12 +115,8 @@ async def messages_routing(msg, state):
         toxic_score = detector(normalized_text)
         toxic_perc = math.floor(toxic_score * 100)
         if toxic_score < 0.9 and text != text.replace(' ', ''):
-            logger.info('check without spaces')
-            if check_stopwords(normalized_text):
-                logger.info('stopword detected without spaces, toxicity +40%')
-                toxic_perc += 40
-            else:
-                logger.info('ok')
+            logger.info('re-check without spaces')
+            toxic_perc += check_stopwords(normalized_text)
 
         logger.info(f"text: {normalized_text}\ntoxic: {toxic_perc}%")
         await redis.set(f"toxic:{cid}", mid)
diff --git a/nlp/stopwords_detector.py b/nlp/stopwords_detector.py
index 50965a7..161ec89 100644
--- a/nlp/stopwords_detector.py
+++ b/nlp/stopwords_detector.py
@@ -68,21 +68,29 @@ def check_stopwords(text):
         text (str): The input normalized text to check.
 
     Returns:
-        bool: True if any stopword is found in the text, False otherwise.
+        int: The score based on the number of stopwords found in the text.
 
     Examples:
         >>> check_stopwords("this is a хуй")
-        True
-
+        40
+
         >>> check_stopwords("this is clean")
-        False
+        0
+
+        >>> check_stopwords("хуй is a хуй")
+        80
+
+        >>> check_stopwords("clean is clean")
+        0
     """
-
-    # Normalize the text by splitting into words
-    words = set(text.split())
-
-    # Check for any intersection with stopword_set
-    return not stopword_set.isdisjoint(words)
+    # Split into words, keeping duplicates so every occurrence is scored
+    words = text.split()
+
+    # Each stopword occurrence contributes 40 points (see doctest examples)
+    score = sum(40 for word in words if word in stopword_set)
+
+    return score
 
 # Example usage
 if __name__ == "__main__":