stopwords-fix

2024-09-29 13:02:30 +03:00
parent 4b517441ee
commit 3e4aefc429
3 changed files with 76 additions and 11 deletions
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@@ -116,7 +116,7 @@ async def messages_routing(msg, state):
            toxic_perc = math.floor(toxic_score * 100)
            if toxic_score < 0.9 and text != text.replace(' ', ''):
                logger.info('check without spaces')
-                if check_stopwords(normalized_text.replace(' ', '')):
+                if check_stopwords(normalized_text):
                    logger.info('stopword detected without spaces, toxicity +40%')
                    toxic_perc += 40
                else:
--- a/nlp/ocr.py
+++ b/nlp/ocr.py
@@ -11,7 +11,7 @@ def ocr_recognize(file_path):

    # Use EasyOCR to detect text in the photo
    results = reader.readtext(file_path)
-    result = result[-1]
+    result = results[-1]
    [_coords, ocr_text, ocr_accuracy] = result
    logger.debug("OCR Result: %s", ocr_text)
    if ocr_accuracy.item() > 0.5:
--- a/nlp/stopwords_detector.py
+++ b/nlp/stopwords_detector.py
@@ -1,25 +1,90 @@
 stopwords = []
 with open('nlp/stop_words.txt', 'r', encoding='utf-8') as file:
-    stopwords = file.readlines()
+    # readlines() keeps each line's trailing newline; strip() removes it so
+    # entries can equal the tokens produced by str.split()
+    stopwords = [word.strip() for word in file.readlines()]

 # Convert stopwords to a set for faster lookup
 stopword_set = set(stopwords)

+def combine_words(words, index, current, combinations):
+    """
+    Collect every token that can be formed by joining adjacent words.
+
+    Starting at ``words[index]``, each gap between neighbouring words is
+    treated as either kept (the words stay separate tokens) or removed (the
+    words are glued together). Every token that occurs in any such variant
+    is added to ``combinations``; in effect this is the set of
+    concatenations of contiguous runs of words.
+
+    The runs are enumerated directly with two nested loops rather than by
+    recursing over all 2**(n-1) spacing variants, so the work stays
+    polynomial in the number of words and long inputs cannot hit the
+    recursion limit.
+
+    Args:
+        words (list): List of words to combine.
+        index (int): Index of the first word to process.
+        current (str): Text already built in front of ``words[index]``. Its
+            tokens are added too; if it does not end with whitespace, its
+            last token is glued to the run that starts at ``words[index]``.
+        combinations (set): Set to store unique combinations (updated in place).
+
+    Returns:
+        None: results are delivered through ``combinations``.
+
+    Examples:
+        >>> combinations = set()
+        >>> combine_words(['a', 'b', 'c'], 0, '', combinations)
+        >>> sorted(combinations)
+        ['a', 'ab', 'abc', 'b', 'bc', 'c']
+    """
+    # Tokens already present in the prefix always belong to the result
+    combinations.update(current.split())
+
+    for start in range(index, len(words)):
+        # Only the first run continues the prefix; every later run starts
+        # fresh because a space was kept in front of it.
+        run = current if start == index else ""
+        for word in words[start:]:
+            run += word
+            # split() drops the finished prefix tokens' whitespace and
+            # yields the glued token together with any prefix tokens
+            combinations.update(run.split())
+
+
+def generate_combinations(text):
+    """
+    Generate every token obtainable by removing spaces between adjacent words.
+
+    The text is split on whitespace and each contiguous run of words is
+    concatenated into one candidate, so a word that was broken up with
+    spaces (e.g. "stu pid") appears again as a single token. No limit on
+    word length is applied. For n words there are at most n * (n + 1) / 2
+    candidates.
+
+    Args:
+        text (str): The input text to process.
+
+    Returns:
+        list: The unique candidates in sorted order; empty for blank text.
+
+    Examples:
+        >>> generate_combinations("stu pid as sho le")
+        ['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']
+
+        >>> generate_combinations("singleword")
+        ['singleword']
+
+        >>> generate_combinations("")
+        []
+    """
+    combinations = set()
+    combine_words(text.split(), 0, "", combinations)
+
+    # Tokens come from str.split(), which never yields empty strings; the
+    # guard is kept so the result is clean whatever the helper collects
+    return sorted(token for token in combinations if token)
+
 def check_stopwords(text):
    """
    Check if any words from the stopwords list are present in the given text.

    Args:
-        text (str): The input text to check.
-        stopwords (list): A list of stopwords.
+        text (str): The input normalized text to check.

    Returns:
        bool: True if any stopword is found in the text, False otherwise.
+
+    Examples:
+        >>> check_stopwords("this is a хуй")
+        True
+        
+        >>> check_stopwords("this is clean")
+        False
    """
    
-    # Iterate through each word and check for stopwords
-    for word in stopword_set:
-        if word in text:
-            return True  # Stop iteration and return True if a stopword is found
+    # Split the text into its unique whitespace-separated tokens
+    # NOTE(review): only whole tokens are compared, so a stopword typed with
+    # spaces between its parts is not matched here; generate_combinations()
+    # is not called in the visible code - confirm whether it was meant to
+    # feed this check.
+    words = set(text.split())
    
-    return False  # Return False if no stopwords are found
+    # True as soon as at least one token is also in stopword_set
+    return not stopword_set.isdisjoint(words)
+
+# Run the doctests embedded in this module's docstrings when executed as a script
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()