2024-09-29 06:47:49 +00:00
|
|
|
# Load the stop-word list: one entry per line, surrounding whitespace removed.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the process that imports this module.
with open('nlp/stop_words.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file]

# Keep a set alongside the list so membership tests are constant time
stopword_set = set(stopwords)
|
|
|
|
|
2024-09-29 10:02:30 +00:00
|
|
|
def combine_words(words, index, current, combinations):
    """
    Collect every token that can arise from gluing consecutive words together.

    Starting at ``words[index]``, each word is appended to ``current`` either
    directly or followed by a single space (the space variant is skipped for
    the last word). Every whitespace-separated token of every string that can
    be built this way, including ``current`` itself, is added to
    ``combinations``.

    Exploring each glue/space choice separately takes about
    ``2 ** len(words)`` steps and one stack frame per word. Only the
    unfinished token at the end of a partial string affects which tokens can
    still appear, so this implementation walks the words once and tracks the
    set of distinct unfinished tokens instead. The resulting set is the same,
    the work is polynomial, and no recursion is involved.

    Args:
        words (list): List of words to combine.
        index (int): Position in ``words`` to start from.
        current (str): Text built so far; its tokens are always recorded.
        combinations (set): Set that receives the tokens (mutated in place).

    Raises:
        IndexError: If ``index`` is greater than ``len(words)``.

    Examples:
        >>> combinations = set()
        >>> combine_words(['a', 'b', 'c'], 0, '', combinations)
        >>> sorted(combinations)
        ['a', 'ab', 'abc', 'b', 'bc', 'c']
    """

    def trailing_token(chunk):
        # Text after the last whitespace; '' if chunk is empty or ends in whitespace.
        if not chunk or chunk[-1].isspace():
            return ""
        return chunk.split()[-1]

    # Tokens of the text built so far are recorded unconditionally
    combinations.update(current.split())

    total = len(words)
    if index == total:
        return
    if index > total:
        # Keep the failure mode of indexing past the end of the list
        raise IndexError("list index out of range")

    # Unfinished tokens that the next word could still be glued onto
    open_tokens = {trailing_token(current)}
    for position in range(index, total):
        word = words[position]
        next_open = set()
        for prefix in open_tokens:
            piece = prefix + word
            combinations.update(piece.split())
            next_open.add(trailing_token(piece))
        # A space after this word (never after the last one) starts a fresh token
        if position < total - 1:
            next_open.add("")
        open_tokens = next_open
|
|
|
|
|
|
|
|
|
|
|
|
def generate_combinations(text):
    """
    Generate every concatenation of consecutive words in the given text.

    The text is split on whitespace and each run of one or more adjacent
    words is joined without separators, so that a term broken up by spaces
    can be recognised again. No word-length limit is applied.

    The runs are enumerated directly (quadratic in the number of words)
    rather than by trying every glue/space choice, which would grow
    exponentially with the length of the text.

    Args:
        text (str): The input text to process.

    Returns:
        list: A sorted list of unique word combinations.

    Examples:
        >>> generate_combinations("stu pid as sho le")
        ['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']

        >>> generate_combinations("singleword")
        ['singleword']
    """
    words = text.split()
    combinations = set()

    # For each starting word, grow the run one word at a time
    for start in range(len(words)):
        joined = ""
        for end in range(start, len(words)):
            joined += words[end]
            combinations.add(joined)

    # split() never yields empty strings, so no filtering is needed
    return sorted(combinations)
|
|
|
|
|
2024-09-29 07:08:25 +00:00
|
|
|
def check_stopwords(text):
    """
    Score a text by the stop words it contains.

    The text is split on whitespace and every word that is a member of the
    module-level ``stopword_set`` adds 40 points. Repeated occurrences are
    counted each time, as the examples below show.

    Args:
        text (str): The input normalized text to check.

    Returns:
        int: 40 times the number of stop-word occurrences in the text.

    Examples:
        (These assume the loaded stop-word list contains the word used.)

        >>> check_stopwords("this is a хуй")
        40

        >>> check_stopwords("this is clean")
        0

        >>> check_stopwords("хуй is a хуй")
        80

        >>> check_stopwords("clean is clean")
        0
    """
    points_per_hit = 40

    # Count occurrences, not distinct words, so repeats raise the score
    hits = sum(1 for word in text.split() if word in stopword_set)

    return hits * points_per_hit
|
2024-09-29 06:47:49 +00:00
|
|
|
|
2024-09-29 10:02:30 +00:00
|
|
|
# Run the doctests embedded in this module when it is executed as a script
if __name__ == "__main__":
    import doctest

    doctest.testmod()
|