# welcomecenterbot/nlp/stopwords_detector.py

# Read the stop-word list: one entry per line, with surrounding whitespace
# (including the line terminator) removed. The path is relative, so it is
# resolved against the working directory of the running process.
with open('nlp/stop_words.txt', 'r', encoding='utf-8') as file:
    stopwords = list(map(str.strip, file))

# Set view of the same entries, used for membership / intersection tests.
stopword_set = set(stopwords)

def _open_tail(text):
    """
    Return the token at the end of ``text`` that can still be extended.

    This is the trailing run of non-whitespace characters, or ``''`` when
    ``text`` is empty or ends with whitespace.
    """
    if not text or text[-1].isspace():
        return ''
    return text.split()[-1]


def combine_words(words, index, current, combinations):
    """
    Collect every token obtainable by gluing neighbouring words together.

    Starting from ``current`` and walking ``words`` from position ``index``,
    each word is appended directly to the text built so far, and after every
    word except the last one the text may optionally be followed by a space.
    Every whitespace-separated token of every text that can be built this way
    is added to ``combinations``.

    Trying each "with space / without space" choice separately takes
    2**(n-1) steps for n words. Only the unfinished token at the end of a
    partial text influences later tokens, so this implementation tracks just
    the distinct unfinished tokens, which keeps the work quadratic in the
    number of words.

    Args:
        words (list): List of words to combine.
        index (int): Position in ``words`` to start from.
        current (str): Text built so far. Its tokens are recorded as well, and
            its trailing token (if not followed by whitespace) is extended by
            the next word.
        combinations (set): Set that receives the tokens; updated in place.

    Returns:
        None

    Raises:
        IndexError: If ``index`` lies past the end of ``words``.

    Examples:
        >>> combinations = set()
        >>> combine_words(['a', 'b', 'c'], 0, '', combinations)
        >>> sorted(combinations)
        ['a', 'ab', 'abc', 'b', 'bc', 'c']
    """
    # Tokens already present in the text built so far are always recorded.
    combinations.update(current.split())

    total = len(words)
    if index > total:
        # Keep signalling an out-of-range start position to the caller.
        raise IndexError("list index out of range")

    # Distinct unfinished tokens at the end of all partial texts built so far.
    tails = {_open_tail(current)}
    for position in range(index, total):
        word = words[position]
        extended = set()
        for tail in tails:
            glued = tail + word
            combinations.update(glued.split())
            extended.add(_open_tail(glued))
        # A space may follow any word but the last; it closes the open token.
        if position < total - 1:
            extended.add('')
        tails = extended


def generate_combinations(text):
    """
    List every word obtainable by joining neighbouring words of the text.

    The text is split on whitespace, and every run of one or more consecutive
    words is concatenated without a separator. A word that was broken up with
    spaces ("stu pid") is therefore also produced in its joined form
    ("stupid"). No length limit is applied to the words being joined.

    The runs are enumerated directly with two nested loops, so the number of
    steps grows quadratically with the word count.

    Args:
        text (str): The input text to process.

    Returns:
        list: The unique joined words, sorted in ascending order. Empty when
        the text contains no words.

    Examples:
        >>> generate_combinations("stu pid as sho le")
        ['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']

        >>> generate_combinations("singleword")
        ['singleword']

        >>> generate_combinations("")
        []
    """
    words = text.split()
    joined_runs = set()

    for start in range(len(words)):
        # Grow the run one word at a time, recording each intermediate form.
        run = ""
        for word in words[start:]:
            run += word
            joined_runs.add(run)

    # split() never yields empty strings, so no empty entry can appear here.
    return sorted(joined_runs)

def check_stopwords(text):
    """
    Score a text by how many distinct stopwords it contains.

    The text is split on whitespace and the resulting tokens are compared, as
    exact matches, against the module-level ``stopword_set``. Every distinct
    stopword that occurs contributes 46 points; a stopword repeated several
    times in the text is still counted once.

    For instance, a text containing exactly one listed stopword (once or
    several times) scores 46, and a text containing none scores 0.

    Args:
        text (str): The input text to check. It is expected to be normalized
            already; nothing beyond whitespace splitting is done here.

    Returns:
        int: 46 multiplied by the number of distinct stopwords found in the
        text, or 0 when there are none.
    """
    # Duplicates collapse here, so repeated words cannot raise the score.
    tokens = set(text.split())

    # Stopwords that appear verbatim among the tokens.
    hits = stopword_set & tokens

    # NOTE(review): the weight of 46 per distinct stopword is taken from the
    # code as written -- confirm it is the intended value.
    return 46 * len(hits)

# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
nosegm 2024-09-29 06:47:49 +00:00			`stopwords = []`
nosegm3 2024-09-29 07:05:14 +00:00			`with open('nlp/stop_words.txt', 'r', encoding='utf-8') as file:`
stopwords-fix 2024-09-29 10:02:30 +00:00			`stopwords = [word.strip() for word in file.readlines()]`
nosegm 2024-09-29 06:47:49 +00:00
			`# Convert stopwords to a set for faster lookup`
			`stopword_set = set(stopwords)`

stopwords-fix 2024-09-29 10:02:30 +00:00			`def combine_words(words, index, current, combinations):`
			`"""`
			`Helper function to recursively generate combinations of words.`

			`Args:`
			`words (list): List of words to combine.`
			`index (int): Current index in the list of words.`
			`current (str): Current combination being built.`
			`combinations (set): Set to store unique combinations.`

			`Examples:`
			`>>> combinations = set()`
			`>>> combine_words(['a', 'b', 'c'], 0, '', combinations)`
			`>>> sorted(combinations)`
			`['a', 'ab', 'abc', 'b', 'bc', 'c']`
			`"""`
			`# Add the current combination to the set of combinations`
			`combinations.update(current.split())`

			`# If we have reached the end of the list of words, return`
			`if index == len(words):`
			`return`

			`# Include the current word`
			`combine_words(words, index + 1, current + words[index], combinations)`

			`# Include the current word with a space if not at the last word`
			`if index < len(words) - 1:`
			`combine_words(words, index + 1, current + words[index] + ' ', combinations)`


			`def generate_combinations(text):`
			`"""`
			`Generate all possible combinations of words from the given text,`
			`treating spaces as potential concatenation points for words less than 5 characters.`

			`Args:`
			`text (str): The input text to process.`

			`Returns:`
			`list: A list of unique word combinations.`

			`Examples:`
			`>>> generate_combinations("stu pid as sho le")`
			`['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']`

			`>>> generate_combinations("singleword")`
			`['singleword']`
			`"""`
			`combinations = set()`
			`combine_words(text.split(), 0, "", combinations)`

			`# Filter out any empty strings and sort the results`
			`return sorted(filter(lambda x: x != "", combinations))`

nosegm4 2024-09-29 07:08:25 +00:00			`def check_stopwords(text):`
nosegm 2024-09-29 06:47:49 +00:00			`"""`
			`Check if any words from the stopwords list are present in the given text.`

			`Args:`
stopwords-fix 2024-09-29 10:02:30 +00:00			`text (str): The input normalized text to check.`
nosegm 2024-09-29 06:47:49 +00:00
			`Returns:`
stopwords-fix 2024-09-29 10:08:39 +00:00			`int: The score based on the number of stopwords found in the text.`
nosegm 2024-09-29 06:47:49 +00:00
stopwords-fix 2024-09-29 10:02:30 +00:00			`Examples:`
			`>>> check_stopwords("this is a хуй")`
stopwords-fix 2024-09-29 10:08:39 +00:00			`40`

stopwords-fix 2024-09-29 10:02:30 +00:00			`>>> check_stopwords("this is clean")`
stopwords-fix 2024-09-29 10:08:39 +00:00			`0`

			`>>> check_stopwords("хуй is a хуй")`
			`80`

			`>>> check_stopwords("clean is clean")`
			`0`
stopwords-fix 2024-09-29 10:02:30 +00:00			`"""`
stopwords-fix 2024-09-29 10:08:39 +00:00
stopwords-fix 2024-09-29 10:02:30 +00:00			`# Normalize the text by splitting into words`
			`words = set(text.split())`
stopwords-fix 2024-09-29 10:08:39 +00:00
stopwords-fix 2024-09-29 10:02:30 +00:00			`# Check for any intersection with stopword_set`
stopwords-fix 2024-09-29 10:08:39 +00:00			`stopwords_found = stopword_set.intersection(words)`

			`# Calculate the score based on the number of stopwords found`
			`score = len(stopwords_found) * 46`

			`return score`
nosegm 2024-09-29 06:47:49 +00:00
stopwords-fix 2024-09-29 10:02:30 +00:00			`# Example usage`
			`if __name__ == "__main__":`
			`import doctest`
			`doctest.testmod()`