toxicity-detector

2024-02-12 15:50:35 +03:00
parent ec82174dc9
commit 7fd358931a
10 changed files with 166 additions and 255 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,119 +0,0 @@
-## [0.2.2]
-
- сообщает о новых заявках в FEEDBACK_CHAT
-
-
-## [0.2.1]
-
- реорганизация кодовой базы
- сокращение логики обработчиков
- отказ от хранения пользовательских данных
- использование реакций
-
-## [0.2.0]
-
- реорганизация кодовой базы
- удаление зависимости от aiogram
- удаление остатков зависимостей от vercel
- переход на poetry
-
-
-## [0.0.12]
-
- множество исправлений в роутинге сообщений
- исправления в коммандах /ask и /my
- исправление обработки случая без фото
- добавлен автомат состояний
- добавлена возможность высказаться "на кругу" без одобрения заявки
- асинхронное api
-
-
-## [0.0.11]
-
- отображение одобренности заявки на кнопке
- разрешения по умолчанию для unmute_member
- исправлены ошибки перепроверки заявок на старте
- фильтрация вступления в чат обратной связи
- исправлен kick/approve на старте
- сообщение во все чаты, если отменили все одобрения
- ответы учитываются только от админов FEEDBACK-чата
- логика целостности данных для связей
- команда /unlink по внутреннему айди
-
-
-## [0.0.10]
-
- добавлено фото к заявке пользователя, если есть
- изменена надпись на русском
- исправлены ошибки
- добавлена сервисная команда для показа потерянных заявок
-
-
-## [0.0.9]
-
- исправление логики show_request_msg
- логика перепроверки на старте
- двуязычный интерфейс без переменных среды
- kick для тех, от кого отказались поручители
- bugfix: нестандартные символы в имени
-
-
-## [0.0.8]
-
- генерация древовидного графа, с опорой на одного участника
- /my для просмотра и изменения связей
- возможность отмены поручительства
- рефакторинг
-
-
-## [0.0.7]
-
- мьют на входе, там где заявки не включены
- одобрение заявки: любой участник может поручиться
- за всех кто уже в чате и пишет сообщения
-
-
-## [0.0.6]
-
- совместимость с механизмом заявок для публичных групп
- бот работает во всех чатах, где он админ
- убраны кнопки ответов
-
-
-## [0.0.5]
-
- добавлена возможность поручиться
- обычные сообщения в общем чате больше никак не обрабатываются
- унифицированный механизм хранения профилей пользователей
- рефакторинг
-
-
-## [0.0.4]
-
- управление правами на отправку сообщений
- сохранение айди автора сообщения обратной связи
-
-
-## [0.0.3]
-
- подключение независимого от перезапусков хранилища redis
- многозадачные хранимые сессии пользователей
- доработки логов отладки
- bugfix: пропуск приглашённых участников
- bugfix: учитывание редактируемого сообщения обратной связи
- удаление приветствия для покинувших канал без ответа
- обработка ответов на сообщения в чате отзывов
- рефакторинг
-
-## [0.0.2]
-
- добавлена функция для обратной связи
- исправлена ошибка повторной регистрации хука
-
-## [0.0.1]
-
- фундаментальная кодовая база на основе Sanic и requests
- настройки на основе переменных среды
- функция приветственного сообщения с кнопками
- ограничения для новоприбывших и неверно отвечающих
- распознание приветствия в тексте
--- a/21
+++ b/21
@@ -1,14 +1,17 @@
 FROM python:slim

 WORKDIR /app
-COPY . /app

-RUN apt-get update && \
-    apt-get install -y git gcc curl && \
-    curl -sSL https://install.python-poetry.org | python - && \
-    echo "export PATH=$PATH:/root/.local/bin" >> ~/.bashrc && \
-    . ~/.bashrc && \
-    poetry config virtualenvs.create false && \
-    poetry install --no-dev
+# Copy just the requirements file first
+COPY requirements.txt .

-CMD python main.py
+# Install requirements
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
+COPY . .
+
+EXPOSE 8080
+
+# Set the entry point
+CMD ["python", "main.py"]
--- a/bot/announce.py
+++ b/bot/announce.py
@@ -27,10 +27,9 @@ async def show_announce(msg):
    newcomer_message = get_newcomer_message(msg)

    userphotos_response = await telegram_api("getUserphotos", user_id=from_id)
-    logger.debug(userphotos_response)

    file_id = ""
-    if userphotos_response["ok"] and userphotos_response["result"]["total_count"] > 0:
+    if isinstance(userphotos_response, dict) and userphotos_response["ok"] and userphotos_response["result"]["total_count"] > 0:
        logger.info("showing button with photo")
        file_id = userphotos_response["result"]["photos"][0][0]["file_id"]

--- a/bot/api.py
+++ b/bot/api.py
@@ -5,15 +5,26 @@ from bot.config import BOT_TOKEN
 import logging

 # Create a logger instance
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('[tgbot.api] ')
+logging.basicConfig(level=logging.DEBUG)

-apiBase = f"https://api.telegram.org/bot{BOT_TOKEN}/"
+api_base = f"https://api.telegram.org/bot{BOT_TOKEN}/"


-async def telegram_api(endpoint: str, **kwargs):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(apiBase + f"{endpoint}?{urlencode(kwargs)}") as response:
-            data = await response.json()
-            logger.info("Telegram API response: %s", data)
-            return data
+async def telegram_api(endpoint: str, json_data=None, **kwargs):
+    """Call a Telegram Bot API method and return the decoded JSON response.
+
+    Args:
+        endpoint: Bot API method name, e.g. ``"sendMessage"``.
+        json_data: Optional payload; when given it is sent as a JSON body.
+        **kwargs: Method parameters, URL-encoded into the query string.
+
+    Returns:
+        The decoded JSON response, or ``None`` when the request or the
+        decoding fails for any reason. The failure is logged, not raised.
+    """
+    # Local import keeps this function self-contained.
+    import json
+
+    # Long-poll calls are not logged to avoid flooding the log.
+    is_polling = endpoint == 'getUpdates'
+    try:
+        url = api_base + f"{endpoint}?{urlencode(kwargs)}"
+        request_kwargs = {}
+        if json_data is not None:
+            # Attach a body only when there is a payload; otherwise
+            # json.dumps(None) would send a literal "null" JSON document.
+            request_kwargs['data'] = json.dumps(json_data)
+            request_kwargs['headers'] = {'Content-Type': 'application/json'}
+        async with aiohttp.ClientSession() as session:
+            if not is_polling:
+                # Log the method and its parameters rather than the URL:
+                # the URL embeds BOT_TOKEN and must not reach the logs.
+                logger.info(f' >>> {endpoint} {kwargs} {json_data if json_data else ""}')
+            async with session.get(url, **request_kwargs) as response:
+                data = await response.json()
+                if not is_polling:
+                    logger.info(f' <<< {data}')
+                return data
+    except Exception:
+        # Best-effort by design: report through the module logger and let
+        # the caller see None instead of an exception.
+        logger.exception('Telegram API call %s failed', endpoint)
+        return None
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@@ -1,6 +1,7 @@
 from bot.api import telegram_api
 from bot.config import FEEDBACK_CHAT_ID
-
+from nlp.toxicity import text2toxicity
+from nlp.replying import get_toxic_reply
 import logging

 from handlers.handle_private import handle_private
@@ -28,5 +29,17 @@ async def messages_routing(msg, state):
            if reply_chat_id != FEEDBACK_CHAT_ID:
                await telegram_api("sendMessage", chat_id=reply_chat_id, text=text, reply_to=reply_msg.get("message_id"))

+    elif bool(text):
+        toxic_score = text2toxicity(text)
+        if toxic_score > 0.71:
+            toxic_reply = get_toxic_reply(toxic_score)
+            await telegram_api(
+                "setMessageReaction",
+                chat_id=cid,
+                is_big=True,
+                message_id=msg.get("message_id"),
+                reaction=f'[{{"type":"emoji", "emoji":"{toxic_reply}"}}]'
+            )
+
    else:
        pass
--- a/main.py
+++ b/main.py
@@ -12,44 +12,39 @@ logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 state = dict()

-api_url = f'https://api.telegram.org/bot{BOT_TOKEN}/'
-
-
-async def fetch(session, url):
-    logger.debug(f"fetching: {url}")
-    async with session.get(url) as response:
-        logger.debug(response)
-        return await response.json()
-

 async def start():
+    """Poll Telegram for updates forever and dispatch each one to a handler.
+
+    Messages (new or edited) go to ``messages_routing``, join requests to
+    ``handle_join_request`` and reactions to ``handle_reaction_on_request``.
+    If a handler raises, the error is logged and its traceback is sent to
+    FEEDBACK_CHAT_ID; polling then continues with the next update.
+    """
+    import json
+
    logger.info("\tstarted")
    async with ClientSession() as session:
-        offset = 0  # начальное значение offset
+        offset = 0  # initial value of the update offset
        while True:
-            updates = await fetch(session, f"{api_url}getUpdates?offset={offset}&allowed_updates=['message', 'join_chat_request', 'message_reaction']")
-            # logger.info('.' if updates['result'] == [] else updates)
-            for update in updates.get("result", []):
-                try:
-                    message = update.get("message")
-                    join_chat_request = update.get("join_chat_request")
-                    message_reaction = update.get("message_reaction")
-                    if message:
-                        await messages_routing(message, state)
-                    elif join_chat_request:
-                        await handle_join_request(join_chat_request)
-                    elif message_reaction:
-                        await handle_reaction_on_request(message_reaction)
+            # allowed_updates must be a JSON-serialized list; a plain Python
+            # list would be URL-encoded as its single-quoted repr.
+            # "edited_message" is requested because the loop below reads it.
+            response = await telegram_api(
+                "getUpdates",
+                offset=offset,
+                allowed_updates=json.dumps(["message", "edited_message", "message_reaction"]),
+            )
+            if isinstance(response, dict):
+                result = response.get("result", [])
+                for update in result:
+                    try:
+                        message = update.get("message", update.get("edited_message"))
+                        # NOTE(review): the Bot API names this update
+                        # "chat_join_request" and it is not requested in
+                        # allowed_updates above - confirm whether join
+                        # requests are still meant to be handled.
+                        join_chat_request = update.get("join_chat_request")
+                        message_reaction = update.get("message_reaction")
+                        if message:
+                            await messages_routing(message, state)
+                        elif join_chat_request:
+                            await handle_join_request(join_chat_request)
+                        elif message_reaction:
+                            await handle_reaction_on_request(message_reaction)

-                except Exception as e:
-                    logger.error(e)
-                    import traceback
-                    text = traceback.format_exc()
-                    await telegram_api("sendMessage", chat_id=FEEDBACK_CHAT_ID, text=text)
+                    except Exception as e:
+                        logger.error(e)
+                        import traceback
+                        text = traceback.format_exc()
+                        await telegram_api("sendMessage", chat_id=FEEDBACK_CHAT_ID, text=text)

-                offset = update["update_id"] + 1
+                    # Confirm this update so it is not delivered again.
+                    offset = update["update_id"] + 1

-            await asyncio.sleep(1.0)
+                await asyncio.sleep(1.0)
+            else:
+                # telegram_api did not return a dict: back off before retrying.
+                logger.error(' \n\n\n!!! getUpdates polling error\n\n\n')
+                await asyncio.sleep(30.0)


 if __name__ == "__main__":
--- a/nlp/replying.py
+++ b/nlp/replying.py
@@ -0,0 +1,59 @@
+#  "👍", "👎", "❤", "🔥", "🥰", "👏", "😁",
+# "🤔", "🤯", "😱", "🤬", "😢", "🎉", "🤩",
+# "🤮", "💩", "🙏", "👌", "🕊", "🤡", "🥱",
+# "🥴", "😍", "🐳", "❤‍🔥", "🌚", "🌭", "💯",
+# "🤣", "⚡", "🍌", "🏆", "💔", "🤨", "😐",
+# "🍓", "🍾", "💋", "🖕", "😈", "😴", "😭",
+# "🤓", "👻", "👨‍💻", "👀", "🎃", "🙈", "😇",
+# "😨", "🤝", "✍", "🤗", "🫡", "🎅", "🎄",
+# "☃", "💅", "🤪", "🗿", "🆒", "💘", "🙉",
+# "🦄", "😘", "💊", "🙊", "😎", "👾", "🤷‍♂",
+# "🤷", "🤷‍♀", "😡"
+
+# Reaction emoji keyed by a score threshold in percent, written as a
+# zero-padded three-digit string ("071" stands for 71).
+toxic_reactions = {
+    "071": "🕊",
+    "073": "👀",
+    "075": "🙈",
+    "077": "🙊",
+    "079": "💩",
+    "081": "🤔",
+    "083": "😐",
+    "085": "🤨",
+    "087": "🥴",
+    "089": "🤯",
+    "091": "😢",
+    "093": "😭",
+    "095": "😨",
+    "097": "😱",
+    "099": "🤬"
+}
+
+# Thresholds from highest to lowest. The keys share one width, so the
+# string ordering matches the numeric ordering.
+grads = sorted(toxic_reactions, reverse=True)
+
+# Same layout as toxic_reactions, with a different set of emoji.
+abusive_reactions = {
+    "085": "🫡",
+    "088": "💅",
+    "091": "🤷‍♀",
+    "094": "👾",
+    "097": "👻",
+    "099": "😈"
+}
+
+# Thresholds from highest to lowest, as for grads.
+abusive_grads = sorted(abusive_reactions, reverse=True)
+
+def get_toxic_reply(tx):
+    """Return the reaction emoji for the toxicity score ``tx``.
+
+    The score is multiplied by 100 and checked against the thresholds in
+    ``grads`` in their stored order. The emoji of the first threshold that
+    the scaled score strictly exceeds is returned, or ``None`` if it
+    exceeds none of them.
+    """
+    scaled = tx * 100
+    matches = (toxic_reactions[level] for level in grads if scaled > int(level))
+    return next(matches, None)
+
+
+def get_abusive_reply(tx):
+    """Return the reaction emoji for the abusiveness score ``tx``.
+
+    The score is multiplied by 100 and checked against the thresholds in
+    ``abusive_grads`` in their stored order. The emoji of the first
+    threshold that the scaled score strictly exceeds is returned, or
+    ``None`` if it exceeds none of them.
+    """
+    scaled = tx * 100
+    matches = (abusive_reactions[level] for level in abusive_grads if scaled > int(level))
+    return next(matches, None)
--- a/nlp/toxicity.py
+++ b/nlp/toxicity.py
@@ -0,0 +1,31 @@
+import torch
+from transformers import AutoTokenizer, \
+    AutoModelForSequenceClassification
+
+tiny_tox_model_path = 'cointegrated/rubert-tiny-toxicity'
+tiny_tox_tokenizer = AutoTokenizer.from_pretrained(tiny_tox_model_path)
+tiny_tox_model = AutoModelForSequenceClassification.from_pretrained(
+    tiny_tox_model_path)
+
+
+# if torch.cuda.is_available():
+#    model.cuda()
+
+
+def text2toxicity(text, aggregate=True):
+    """Score the toxicity of a text with the module-level classifier.
+
+    Args:
+        text: One string, or a list of strings scored as a batch.
+        aggregate: If true, collapse the per-aspect probabilities into a
+            single score; if false, return the per-aspect probabilities.
+
+    Returns:
+        A Python ``float`` for one string with ``aggregate=True``.
+        Otherwise a NumPy array: the aspect vector for one string, or one
+        score / one aspect row per input for a batch.
+    """
+    single = isinstance(text, str)
+    # Lowercase element-wise so that a list of texts is accepted too.
+    lowered = text.lower() if single else [item.lower() for item in text]
+    with torch.no_grad():
+        inputs = tiny_tox_tokenizer(
+            lowered,
+            return_tensors='pt',
+            truncation=True,
+            padding=True
+        ).to(tiny_tox_model.device)
+        proba = torch.sigmoid(tiny_tox_model(**inputs).logits).cpu().numpy()
+    if single:
+        # Drop the batch dimension for a single input.
+        proba = proba[0]
+    if not aggregate:
+        # The aspect vector has several elements, so it cannot be passed
+        # through float(); return it unchanged.
+        return proba
+    # Combines the first and the last aspect probability.
+    # NOTE(review): presumably these are the "non-toxic" and "dangerous"
+    # labels of the model card - confirm against the model's label order.
+    score = 1 - proba.T[0] * (1 - proba.T[-1])
+    return float(score) if single else score
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,86 +0,0 @@
-[tool.poetry]
-name = "welcomecenterbot"
-version = "0.2.1"
-description = "telegram group helper"
-authors = ["rainbowdev circle"]
-license = "Open Source"
-readme = "README.md"
-
-[tool.poetry.dependencies]
-python = "^3.12"
-aiohttp = "^3.9.1"
-redis = "^5.0.1"
-
-[tool.poetry.group.dev.dependencies]
-setuptools = "^69.0.2"
-mypy = "^1.6.1"
-black = "^23.10.1"
-ruff = "^0.1.2"
-isort = "^5.12.0"
-
-[tool.black]
-line-length = 120
-target-version = ['py312']
-include = '\.pyi?$'
-exclude = '''
-(
-  /(
-      \.eggs
-    | \.git
-    | \.hg
-    | \.mypy_cache
-    | \.tox
-    | \.venv
-    | _build
-    | buck-out
-    | build
-    | dist
-  )/
-  | foo.py
-)
-'''
-
-[tool.isort]
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-use_parentheses = true
-ensure_newline_before_comments = true
-line_length = 120
-
-[tool.ruff]
-select = ["E4", "E7", "E9", "F"]
-ignore = []
-line-length = 120
-target-version = "py312"
-
-[tool.pyright]
-venvPath = "."
-venv = ".venv"
-include = ["bot/."]
-useLibraryCodeForTypes = false
-disableLanguageServices = false
-disableOrganizeImports = false
-reportMissingImports = true
-reportMissingModuleSource = "warning"
-reportImportCycles = "warning"
-maxMemoryForLargeFile = 4096
-pythonVersion = "3.12"
-autoImportCompletions = true
-useVirtualEnv = true
-typeCheckingMode = "basic"
-disableJediCompletion = true
-disableCompletion = false
-disableSnippetCompletion = false
-disableGoToDefinition = false
-disableRenaming = false
-disableSignatureHelp = false
-diagnostics = true
-logLevel = "debug"
-pluginSearchPaths = []
-typings = {}
-mergeTypeStubPackages = false
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+torch
+transformers
+transliterate
+aiohttp
+redis[hiredis]