"""Stopword detection utilities.

Loads a stopword list from ``nlp/stop_words.txt`` at import time and provides
helpers for generating word combinations (to catch words obfuscated by
inserted spaces) and for checking text against the stopword set.
"""


def _load_stopwords(path='nlp/stop_words.txt'):
    """Load stopwords from *path*, one word per line, stripped of whitespace.

    Returns:
        list: The stripped lines of the file, in file order. If the file is
        missing, an empty list is returned (with a warning) so the module can
        still be imported in environments without the data file.
    """
    try:
        with open(path, 'r', encoding='utf-8') as file:
            return [word.strip() for word in file]
    except FileNotFoundError:
        import warnings
        warnings.warn(f"stopword file not found: {path}; using empty stopword set")
        return []


stopwords = _load_stopwords()
# Convert stopwords to a set for O(1) membership lookup in check_stopwords().
stopword_set = set(stopwords)


def combine_words(words, index, current, combinations):
    """
    Helper function to recursively generate combinations of words.

    Every concatenation of a contiguous run of words is collected: at each
    step the current word is either glued directly onto the accumulator or
    followed by a space, and the accumulator is split on spaces so each
    space-separated fragment is recorded as its own combination.

    Args:
        words (list): List of words to combine.
        index (int): Current index in the list of words.
        current (str): Current combination being built.
        combinations (set): Set to store unique combinations (mutated in place).

    Examples:
        >>> combinations = set()
        >>> combine_words(['a', 'b', 'c'], 0, '', combinations)
        >>> sorted(combinations)
        ['a', 'ab', 'abc', 'b', 'bc', 'c']
    """
    # Record every space-separated fragment of the accumulator so far.
    combinations.update(current.split())
    # If we have reached the end of the list of words, return.
    if index == len(words):
        return
    # Branch 1: glue the current word directly onto the accumulator.
    combine_words(words, index + 1, current + words[index], combinations)
    # Branch 2: keep the current word separate (space-joined) — skipped at the
    # last word since a trailing space would add nothing new.
    if index < len(words) - 1:
        combine_words(words, index + 1, current + words[index] + ' ', combinations)


def generate_combinations(text):
    """
    Generate every concatenation of contiguous words from the given text,
    treating each space as a potential join point between adjacent words.

    Args:
        text (str): The input text to process.

    Returns:
        list: A sorted list of unique word combinations (empty strings removed).

    Examples:
        >>> generate_combinations("stu pid as sho le")
        ['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']
        >>> generate_combinations("singleword")
        ['singleword']
    """
    combinations = set()
    combine_words(text.split(), 0, "", combinations)
    # Filter out any empty strings and sort the results for deterministic output.
    return sorted(filter(lambda x: x != "", combinations))


def check_stopwords(text):
    """
    Check if any words from the stopwords list are present in the given text.

    Args:
        text (str): The input normalized text to check.

    Returns:
        set: The set of stopwords found in the text (empty if none matched).

    Examples:
        The result depends on the contents of nlp/stop_words.txt, so this
        example is skipped during automated doctest runs:

        >>> check_stopwords("this is a хуй")  # doctest: +SKIP
        {'хуй'}
    """
    # Normalize the text by splitting into words.
    words = set(text.split())
    # Check for any intersection with stopword_set.
    stopwords_found = stopword_set.intersection(words)
    return stopwords_found


# Example usage
if __name__ == "__main__":
    import doctest
    doctest.testmod()