stopwords-fix

This commit is contained in:
Untone 2024-09-29 13:08:39 +03:00
parent 3e4aefc429
commit 5f3bbe6229
2 changed files with 20 additions and 13 deletions

View File

@ -115,12 +115,8 @@ async def messages_routing(msg, state):
toxic_score = detector(normalized_text) toxic_score = detector(normalized_text)
toxic_perc = math.floor(toxic_score * 100) toxic_perc = math.floor(toxic_score * 100)
if toxic_score < 0.9 and text != text.replace(' ', ''): if toxic_score < 0.9 and text != text.replace(' ', ''):
logger.info('check without spaces') logger.info('re-check without spaces')
if check_stopwords(normalized_text): toxic_perc += check_stopwords(normalized_text):
logger.info('stopword detected without spaces, toxicity +40%')
toxic_perc += 40
else:
logger.info('ok')
logger.info(f"text: {normalized_text}\ntoxic: {toxic_perc}%") logger.info(f"text: {normalized_text}\ntoxic: {toxic_perc}%")
await redis.set(f"toxic:{cid}", mid) await redis.set(f"toxic:{cid}", mid)

View File

@ -68,21 +68,32 @@ def check_stopwords(text):
text (str): The input normalized text to check. text (str): The input normalized text to check.
Returns: Returns:
bool: True if any stopword is found in the text, False otherwise. int: The score based on the number of stopwords found in the text.
Examples: Examples:
>>> check_stopwords("this is a хуй") >>> check_stopwords("this is a хуй")
True 40
>>> check_stopwords("this is clean") >>> check_stopwords("this is clean")
False 0
>>> check_stopwords("хуй is a хуй")
80
>>> check_stopwords("clean is clean")
0
""" """
# Normalize the text by splitting into words # Normalize the text by splitting into words
words = set(text.split()) words = set(text.split())
# Check for any intersection with stopword_set # Check for any intersection with stopword_set
return not stopword_set.isdisjoint(words) stopwords_found = stopword_set.intersection(words)
# Calculate the score based on the number of stopwords found
score = len(stopwords_found) * 46
return score
# Example usage # Example usage
if __name__ == "__main__": if __name__ == "__main__":