diff --git a/Dockerfile b/Dockerfile index 3afd607..9d26b02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,9 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends wget gcc libffi # Install Python dependencies including redis with hiredis support RUN pip install --no-cache-dir -r requirements.txt -# Download and install the Russian language model -RUN python -m spacy download ru_core_news_md - COPY . . EXPOSE 8080 diff --git a/handlers/messages_routing.py b/handlers/messages_routing.py index e60a4a2..df1cc07 100644 --- a/handlers/messages_routing.py +++ b/handlers/messages_routing.py @@ -114,19 +114,9 @@ async def messages_routing(msg, state): toxic_score = detector(normalized_text) toxic_perc = math.floor(toxic_score * 100) logger.info(f"\text: {normalized_text}\ntoxic: {toxic_perc}%") - - nospaces_text = text.replace(" ", "") - if nospaces_text != text: - nospaces_normalized_text = normalize(nospaces_text) - nospaces_text_score = detector(nospaces_normalized_text) - nospaces_text_perc = math.floor(nospaces_text_score * 100) - if check_stopwords(nospaces_normalized_text): - logger.info('stopword detected with no spaces, toxicity +40%') - nospaces_text_perc += 40 - logger.info(f"\nospaces_text: {nospaces_normalized_text}\nnospaces_toxic: {nospaces_text_perc}%") - - if nospaces_text_score > toxic_score or nospaces_text_perc > 95: - text_perc = nospaces_text_perc + if check_stopwords(normalized_text): + logger.info('stopword detected with no spaces, toxicity +40%') + toxic_perc += 40 await redis.set(f"toxic:{cid}", mid) await redis.set(f"toxic:{cid}:{uid}:{mid}", toxic_perc, ex=60 * 60 * 24 * 3) diff --git a/nlp/stopwords_detector.py b/nlp/stopwords_detector.py index f5d1573..e6c2600 100644 --- a/nlp/stopwords_detector.py +++ b/nlp/stopwords_detector.py @@ -17,7 +17,7 @@ def check_stopwords(text, stopwords): bool: True if any stopword is found in the text, False otherwise. """ # Normalize the text by converting it to lower case and splitting into words - words = text.lower().split() + words = text.replace(' ', '').lower().split() # Iterate through each word and check for stopwords for word in words: