diff --git a/handlers/messages_routing.py b/handlers/messages_routing.py
index 0df2af5..8f69766 100644
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@@ -116,7 +116,7 @@ async def messages_routing(msg, state):
     toxic_perc = math.floor(toxic_score * 100)
     if toxic_score < 0.9 and text != text.replace(' ', ''):
         logger.info('check without spaces')
-        if check_stopwords(normalized_text.replace(' ', '')):
+        if check_stopwords(normalized_text):
             logger.info('stopword detected without spaces, toxicity +40%')
             toxic_perc += 40
     else:
diff --git a/nlp/ocr.py b/nlp/ocr.py
index 8372be7..bd287ad 100644
--- a/nlp/ocr.py
+++ b/nlp/ocr.py
@@ -11,7 +11,7 @@ def ocr_recognize(file_path):
     # Use EasyOCR to detect text in the photo
     results = reader.readtext(file_path)
 
-    result = result[-1]
+    result = results[-1]
     [_coords, ocr_text, ocr_accuracy] = result
     logger.debug("OCR Result: %s", ocr_text)
     if ocr_accuracy.item() > 0.5:
diff --git a/nlp/stopwords_detector.py b/nlp/stopwords_detector.py
index 968bf58..50965a7 100644
--- a/nlp/stopwords_detector.py
+++ b/nlp/stopwords_detector.py
@@ -1,25 +1,90 @@
 stopwords = []
 with open('nlp/stop_words.txt', 'r', encoding='utf-8') as file:
-    stopwords = file.readlines()
+    stopwords = [word.strip() for word in file.readlines()]
 
 # Convert stopwords to a set for faster lookup
 stopword_set = set(stopwords)
 
+def combine_words(words, index, current, combinations):
+    """
+    Helper function to recursively generate combinations of words.
+
+    Args:
+        words (list): List of words to combine.
+        index (int): Current index in the list of words.
+        current (str): Current combination being built.
+        combinations (set): Set to store unique combinations.
+
+    Examples:
+        >>> combinations = set()
+        >>> combine_words(['a', 'b', 'c'], 0, '', combinations)
+        >>> sorted(combinations)
+        ['a', 'ab', 'abc', 'b', 'bc', 'c']
+    """
+    # Add the words accumulated in the current combination to the set
+    combinations.update(current.split())
+
+    # If we have reached the end of the list of words, return
+    if index == len(words):
+        return
+
+    # Concatenate the current word directly onto the running combination
+    combine_words(words, index + 1, current + words[index], combinations)
+
+    # Or keep the space after the current word, unless it is the last word
+    if index < len(words) - 1:
+        combine_words(words, index + 1, current + words[index] + ' ', combinations)
+
+
+def generate_combinations(text):
+    """
+    Generate all possible combinations of words from the given text,
+    treating spaces as potential concatenation points between adjacent words.
+
+    Args:
+        text (str): The input text to process.
+
+    Returns:
+        list: A list of unique word combinations.
+
+    Examples:
+        >>> generate_combinations("stu pid as sho le")
+        ['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']
+
+        >>> generate_combinations("singleword")
+        ['singleword']
+    """
+    combinations = set()
+    combine_words(text.split(), 0, "", combinations)
+
+    # Filter out any empty strings and sort the results
+    return sorted(filter(lambda x: x != "", combinations))
+
 def check_stopwords(text):
     """
     Check if any words from the stopwords list are present in the given text.
 
     Args:
-        text (str): The input text to check.
-        stopwords (list): A list of stopwords.
+        text (str): The normalized input text to check.
 
     Returns:
         bool: True if any stopword is found in the text, False otherwise.
+
+    Examples:
+        >>> check_stopwords("this is a хуй")
+        True
+
+        >>> check_stopwords("this is clean")
+        False
     """
+
+    # Split the normalized text into individual words
+    words = set(text.split())
+
+    # Check for any intersection with stopword_set
+    return not stopword_set.isdisjoint(words)
-    # Iterate through each word and check for stopwords
-    for word in stopword_set:
-        if word in text:
-            return True  # Stop iteration and return True if a stopword is found
-
-    return False  # Return False if no stopwords are found
 
+# Run the doctests when this module is executed directly
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
\ No newline at end of file
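Note for reviewers: the patch adds `generate_combinations` but does not show it being called from `check_stopwords` or from `messages_routing`. The snippet below is a minimal sketch of how the two new helpers could work together to catch spaced-out obfuscations; the inline `stopword_set` is a stand-in for the real list loaded from `nlp/stop_words.txt`, and the final `any(...)` wiring is an assumption, not code from this patch.

```python
# Sketch only: stand-in stopword list; the real one comes from nlp/stop_words.txt.
stopword_set = {"asshole", "stupid"}


def combine_words(words, index, current, combinations):
    """Recursively collect every join-or-split reading of the word list."""
    combinations.update(current.split())
    if index == len(words):
        return
    # Either glue the next word on directly...
    combine_words(words, index + 1, current + words[index], combinations)
    # ...or keep the space (skipped after the last word).
    if index < len(words) - 1:
        combine_words(words, index + 1, current + words[index] + ' ', combinations)


def generate_combinations(text):
    combinations = set()
    combine_words(text.split(), 0, "", combinations)
    return sorted(c for c in combinations if c)


def check_stopwords(text):
    """Exact set-membership test, as in the patched detector."""
    return not stopword_set.isdisjoint(text.split())


# The plain word check misses the spaced-out insult...
print(check_stopwords("as sho le"))          # False
# ...but one generated concatenation ("asshole") hits the stopword set.
print(any(check_stopwords(c) for c in generate_combinations("as sho le")))  # True
```

Because the new `check_stopwords` compares whole words instead of scanning substrings, obfuscated matches now depend on some concatenation reproducing a stopword exactly, which is the gap `generate_combinations` appears intended to fill.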