stopwords-fix
commit 3e4aefc429
parent 4b517441ee
@@ -116,7 +116,7 @@ async def messages_routing(msg, state):
     toxic_perc = math.floor(toxic_score * 100)
     if toxic_score < 0.9 and text != text.replace(' ', ''):
         logger.info('check without spaces')
-        if check_stopwords(normalized_text.replace(' ', '')):
+        if check_stopwords(normalized_text):
             logger.info('stopword detected without spaces, toxicity +40%')
             toxic_perc += 40
     else:
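Note: a minimal sketch of why the argument changes here, based on the check_stopwords rewrite in the last hunk of this commit. The rewritten check splits its input on whitespace and intersects the words with the stopword set, so a string with its spaces stripped collapses into a single token. The stopword value below is an illustration only, not the real nlp/stop_words.txt contents.

# Illustration only; stopword_set is normally loaded from nlp/stop_words.txt.
stopword_set = {'хуй'}

def check_stopwords(text):
    # New behaviour: exact word-level intersection with the stopword set.
    return not stopword_set.isdisjoint(set(text.split()))

print(check_stopwords('это хуй'))                   # True: 'хуй' is its own word
print(check_stopwords('это хуй'.replace(' ', '')))  # False: 'этохуй' is one token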
@@ -11,7 +11,7 @@ def ocr_recognize(file_path):

     # Use EasyOCR to detect text in the photo
     results = reader.readtext(file_path)
-    result = result[-1]
+    result = results[-1]
     [_coords, ocr_text, ocr_accuracy] = result
     logger.debug("OCR Result: %s", ocr_text)
     if ocr_accuracy.item() > 0.5:
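Note: a minimal sketch of the corrected flow around the renamed variable, assuming reader is an easyocr.Reader as used above. The language list, the empty-result guard, and the return values are assumptions for illustration; the last-detection pick and the 0.5 confidence cutoff come from the hunk itself.

import easyocr

reader = easyocr.Reader(['ru', 'en'])  # language list is an assumption

def ocr_recognize(file_path):
    # readtext returns a list of (bbox, text, confidence) detections
    results = reader.readtext(file_path)
    if not results:  # guard added for illustration; not part of the diff
        return None
    _coords, ocr_text, ocr_accuracy = results[-1]
    # Keep only reasonably confident detections
    return ocr_text if float(ocr_accuracy) > 0.5 else None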
@@ -1,25 +1,90 @@
 stopwords = []
 with open('nlp/stop_words.txt', 'r', encoding='utf-8') as file:
-    stopwords = file.readlines()
+    stopwords = [word.strip() for word in file.readlines()]

 # Convert stopwords to a set for faster lookup
 stopword_set = set(stopwords)

+def combine_words(words, index, current, combinations):
+    """
+    Helper function to recursively generate combinations of words.
+
+    Args:
+        words (list): List of words to combine.
+        index (int): Current index in the list of words.
+        current (str): Current combination being built.
+        combinations (set): Set to store unique combinations.
+
+    Examples:
+        >>> combinations = set()
+        >>> combine_words(['a', 'b', 'c'], 0, '', combinations)
+        >>> sorted(combinations)
+        ['a', 'ab', 'abc', 'b', 'bc', 'c']
+    """
+    # Add the current combination to the set of combinations
+    combinations.update(current.split())
+
+    # If we have reached the end of the list of words, return
+    if index == len(words):
+        return
+
+    # Include the current word
+    combine_words(words, index + 1, current + words[index], combinations)
+
+    # Include the current word with a space if not at the last word
+    if index < len(words) - 1:
+        combine_words(words, index + 1, current + words[index] + ' ', combinations)
+
+
+def generate_combinations(text):
+    """
+    Generate all possible combinations of words from the given text,
+    treating every space as a potential concatenation point between adjacent words.
+
+    Args:
+        text (str): The input text to process.
+
+    Returns:
+        list: A list of unique word combinations.
+
+    Examples:
+        >>> generate_combinations("stu pid as sho le")
+        ['as', 'assho', 'asshole', 'le', 'pid', 'pidas', 'pidassho', 'pidasshole', 'sho', 'shole', 'stu', 'stupid', 'stupidas', 'stupidassho', 'stupidasshole']
+
+        >>> generate_combinations("singleword")
+        ['singleword']
+    """
+    combinations = set()
+    combine_words(text.split(), 0, "", combinations)
+
+    # Filter out any empty strings and sort the results
+    return sorted(filter(lambda x: x != "", combinations))
+
 def check_stopwords(text):
     """
     Check if any words from the stopwords list are present in the given text.

     Args:
-        text (str): The input text to check.
-        stopwords (list): A list of stopwords.
+        text (str): The input normalized text to check.

     Returns:
         bool: True if any stopword is found in the text, False otherwise.
+
+    Examples:
+        >>> check_stopwords("this is a хуй")
+        True
+
+        >>> check_stopwords("this is clean")
+        False
     """

-    # Iterate through each word and check for stopwords
-    for word in stopword_set:
-        if word in text:
-            return True  # Stop iteration and return True if a stopword is found
+    # Normalize the text by splitting into words
+    words = set(text.split())

-    return False  # Return False if no stopwords are found
+    # Check for any intersection with stopword_set
+    return not stopword_set.isdisjoint(words)
+
+# Example usage
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
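Note: this diff does not show a call site for generate_combinations. One hypothetical way to wire it into the word-level check, for readers tracing the stopword flow (check_spaced_stopwords is an invented name, not part of the commit):

def check_spaced_stopwords(text):
    # Re-join split-up fragments ("stu pid" -> "stupid", "stupidas", ...) and
    # test every candidate against the same module-level stopword set.
    return not stopword_set.isdisjoint(generate_combinations(text))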