From 7fd358931acc93abad71c64aa1cb7148bd611e8f Mon Sep 17 00:00:00 2001
From: Untone <anton.rewin@gmail.com>
Date: Mon, 12 Feb 2024 15:50:35 +0300
Subject: [PATCH] toxicity-detector

---
 CHANGELOG.md                 | 119 -----------------------------------
 Dockerfile                   |  21 ++++---
 bot/announce.py              |   3 +-
 bot/api.py                   |  29 ++++++---
 handlers/messages_routing.py |  15 ++++-
 main.py                      |  53 +++++++---------
 nlp/replying.py              |  59 +++++++++++++++++
 nlp/toxicity.py              |  31 +++++++++
 pyproject.toml               |  86 -------------------------
 requirements.txt             |   5 ++
 10 files changed, 166 insertions(+), 255 deletions(-)
 delete mode 100644 CHANGELOG.md
 create mode 100644 nlp/replying.py
 create mode 100644 nlp/toxicity.py
 delete mode 100644 pyproject.toml
 create mode 100644 requirements.txt

diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index 14c7ecf..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,119 +0,0 @@
-## [0.2.2]
-
-- сообщает о новых заявках в FEEDBACK_CHAT
-
-
-## [0.2.1]
-
-- реорганизация кодовой базы
-- сокращение логики обработчиков
-- отказ от хранения пользовательских данных
-- использование реакций
-
-## [0.2.0]
-
-- реорганизация кодовой базы
-- удаление зависимости от aiogram
-- удаление остатков зависимостей от vercel
-- переход на poetry
-
-
-## [0.0.12]
-
-- множество исправлений в роутинге сообщений
-- исправления в коммандах /ask и /my
-- исправление обработки случая без фото
-- добавлен автомат состояний
-- добавлена возможность высказаться "на кругу" без одобрения заявки
-- асинхронное api
-
-
-## [0.0.11]
-
-- отображение одобренности заявки на кнопке
-- разрешения по умолчанию для unmute_member
-- исправлены ошибки перепроверки заявок на старте
-- фильтрация вступления в чат обратной связи
-- исправлен kick/approve на старте
-- сообщение во все чаты, если отменили все одобрения
-- ответы учитываются только от админов FEEDBACK-чата
-- логика целостности данных для связей
-- команда /unlink по внутреннему айди
-
-
-## [0.0.10]
-
-- добавлено фото к заявке пользователя, если есть
-- изменена надпись на русском
-- исправлены ошибки
-- добавлена сервисная команда для показа потерянных заявок
-
-
-## [0.0.9]
-
-- исправление логики show_request_msg
-- логика перепроверки на старте
-- двуязычный интерфейс без переменных среды
-- kick для тех, от кого отказались поручители
-- bugfix: нестандартные символы в имени
-
-
-## [0.0.8]
-
-- генерация древовидного графа, с опорой на одного участника
-- /my для просмотра и изменения связей
-- возможность отмены поручительства
-- рефакторинг
-
-
-## [0.0.7]
-
-- мьют на входе, там где заявки не включены
-- одобрение заявки: любой участник может поручиться
-- за всех кто уже в чате и пишет сообщения
-
-
-## [0.0.6]
-
-- совместимость с механизмом заявок для публичных групп
-- бот работает во всех чатах, где он админ
-- убраны кнопки ответов
-
-
-## [0.0.5]
-
-- добавлена возможность поручиться
-- обычные сообщения в общем чате больше никак не обрабатываются
-- унифицированный механизм хранения профилей пользователей
-- рефакторинг
-
-
-## [0.0.4]
-
-- управление правами на отправку сообщений
-- сохранение айди автора сообщения обратной связи
-
-
-## [0.0.3]
-
-- подключение независимого от перезапусков хранилища redis
-- многозадачные хранимые сессии пользователей
-- доработки логов отладки
-- bugfix: пропуск приглашённых участников
-- bugfix: учитывание редактируемого сообщения обратной связи
-- удаление приветствия для покинувших канал без ответа
-- обработка ответов на сообщения в чате отзывов
-- рефакторинг
-
-## [0.0.2]
-
-- добавлена функция для обратной связи
-- исправлена ошибка повторной регистрации хука
-
-## [0.0.1]
-
-- фундаментальная кодовая база на основе Sanic и requests
-- настройки на основе переменных среды
-- функция приветственного сообщения с кнопками
-- ограничения для новоприбывших и неверно отвечающих
-- распознание приветствия в тексте
diff --git a/Dockerfile b/Dockerfile
index 86d035a..e5f4561 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,14 +1,17 @@
 FROM python:slim
 
 WORKDIR /app
-COPY . /app
 
-RUN apt-get update && \
-    apt-get install -y git gcc curl && \
-    curl -sSL https://install.python-poetry.org | python - && \
-    echo "export PATH=$PATH:/root/.local/bin" >> ~/.bashrc && \
-    . ~/.bashrc && \
-    poetry config virtualenvs.create false && \
-    poetry install --no-dev
+# Copy just the requirements file first
+COPY requirements.txt .
 
-CMD python main.py
+# Install requirements
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
+COPY . .
+
+EXPOSE 8080
+
+# Set the entry point
+CMD ["python", "main.py"]
diff --git a/bot/announce.py b/bot/announce.py
index 18ae18d..e362db2 100644
--- a/bot/announce.py
+++ b/bot/announce.py
@@ -27,10 +27,9 @@ async def show_announce(msg):
     newcomer_message = get_newcomer_message(msg)
 
     userphotos_response = await telegram_api("getUserphotos", user_id=from_id)
-    logger.debug(userphotos_response)
 
     file_id = ""
-    if userphotos_response["ok"] and userphotos_response["result"]["total_count"] > 0:
+    if isinstance(userphotos_response, dict) and userphotos_response["ok"] and userphotos_response["result"]["total_count"] > 0:
         logger.info("showing button with photo")
         file_id = userphotos_response["result"]["photos"][0][0]["file_id"]
 
diff --git a/bot/api.py b/bot/api.py
index 401afba..13e11f3 100644
--- a/bot/api.py
+++ b/bot/api.py
@@ -5,15 +5,45 @@ from bot.config import BOT_TOKEN
 import logging
 
 # Create a logger instance
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger('[tgbot.api] ')
+logging.basicConfig(level=logging.DEBUG)
 
-apiBase = f"https://api.telegram.org/bot{BOT_TOKEN}/"
+api_base = f"https://api.telegram.org/bot{BOT_TOKEN}/"
 
 
-async def telegram_api(endpoint: str, **kwargs):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(apiBase + f"{endpoint}?{urlencode(kwargs)}") as response:
-            data = await response.json()
-            logger.info("Telegram API response: %s", data)
-            return data
+async def telegram_api(endpoint: str, json_data=None, **kwargs):
+    """Call a Telegram Bot API method and return the decoded JSON reply.
+
+    Args:
+        endpoint: Bot API method name, e.g. "sendMessage".
+        json_data: optional JSON-serialisable payload sent as the request body.
+        **kwargs: method parameters, URL-encoded into the query string.
+
+    Returns:
+        The decoded JSON response, or None when the request or the decoding
+        failed; the error is logged and never propagated.
+    """
+    try:
+        url = api_base + f"{endpoint}?{urlencode(kwargs)}"
+        # getUpdates is polled in a loop, so its traffic is kept out of the logs
+        is_polling = endpoint == 'getUpdates'
+        request_args = {}
+        if json_data is not None:
+            # attach a JSON body only when there is one; a literal "null" used to be sent
+            request_args = {
+                'data': json.dumps(json_data),
+                'headers': {'Content-Type': 'application/json'},
+            }
+        async with aiohttp.ClientSession() as session:
+            if not is_polling:
+                # log endpoint and params rather than the URL, which embeds BOT_TOKEN
+                logger.info(f' >>> {endpoint} {kwargs} {json_data if json_data else ""}')
+            async with session.get(url, **request_args) as response:
+                data = await response.json()
+                if not is_polling:
+                    logger.info(f' <<< {data}')
+                return data
+    except Exception:
+        # best-effort boundary: callers check for a dict result instead of catching
+        logger.exception(f'Telegram API call {endpoint} failed')
+        return None
diff --git a/handlers/messages_routing.py b/handlers/messages_routing.py
index 582ccd1..f265522 100644
--- a/handlers/messages_routing.py
+++ b/handlers/messages_routing.py
@@ -1,6 +1,7 @@
 from bot.api import telegram_api
 from bot.config import FEEDBACK_CHAT_ID
-
+from nlp.toxicity import text2toxicity
+from nlp.replying import get_toxic_reply
 import logging
 
 from handlers.handle_private import handle_private
@@ -28,5 +29,17 @@ async def messages_routing(msg, state):
             if reply_chat_id != FEEDBACK_CHAT_ID:
                 await telegram_api("sendMessage", chat_id=reply_chat_id, text=text, reply_to=reply_msg.get("message_id"))
 
+    elif bool(text):
+        toxic_score = text2toxicity(text)
+        if toxic_score > 0.71:
+            toxic_reply = get_toxic_reply(toxic_score)
+            await telegram_api(
+                "setMessageReaction",
+                chat_id=cid,
+                is_big=True,
+                message_id=msg.get("message_id"),
+                reaction=f'[{{"type":"emoji", "emoji":"{toxic_reply}"}}]'
+            )
+
     else:
         pass
diff --git a/main.py b/main.py
index 571e0c6..e901cca 100644
--- a/main.py
+++ b/main.py
@@ -12,44 +12,51 @@ logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 state = dict()
 
-api_url = f'https://api.telegram.org/bot{BOT_TOKEN}/'
-
-
-async def fetch(session, url):
-    logger.debug(f"fetching: {url}")
-    async with session.get(url) as response:
-        logger.debug(response)
-        return await response.json()
-
 
 async def start():
+    """Long-poll Telegram for updates forever and dispatch each to its handler.
+
+    An exception raised by a handler is logged and its traceback is sent to
+    FEEDBACK_CHAT_ID; the offset still advances past the failing update.
+    """
     logger.info("\tstarted")
     async with ClientSession() as session:
         offset = 0  # начальное значение offset
         while True:
-            updates = await fetch(session, f"{api_url}getUpdates?offset={offset}&allowed_updates=['message', 'join_chat_request', 'message_reaction']")
-            # logger.info('.' if updates['result'] == [] else updates)
-            for update in updates.get("result", []):
-                try:
-                    message = update.get("message")
-                    join_chat_request = update.get("join_chat_request")
-                    message_reaction = update.get("message_reaction")
-                    if message:
-                        await messages_routing(message, state)
-                    elif join_chat_request:
-                        await handle_join_request(join_chat_request)
-                    elif message_reaction:
-                        await handle_reaction_on_request(message_reaction)
+            # NOTE(review): urlencode() in telegram_api stringifies this list with
+            # single quotes; the Bot API expects a JSON-serialized list - confirm
+            # the filter is actually applied.
+            response = await telegram_api("getUpdates", offset=offset, allowed_updates=['message', 'message_reaction'])
+            if isinstance(response, dict):
+                result = response.get("result", [])
+                for update in result:
+                    try:
+                        message = update.get("message", update.get("edited_message"))
+                        # Telegram names this field chat_join_request; the legacy
+                        # key is still accepted as a fallback
+                        join_chat_request = update.get("chat_join_request", update.get("join_chat_request"))
+                        # was read from "join_chat_request", so reactions never reached their handler
+                        message_reaction = update.get("message_reaction")
+                        if message:
+                            await messages_routing(message, state)
+                        elif join_chat_request:
+                            await handle_join_request(join_chat_request)
+                        elif message_reaction:
+                            await handle_reaction_on_request(message_reaction)
 
-                except Exception as e:
-                    logger.error(e)
-                    import traceback
-                    text = traceback.format_exc()
-                    await telegram_api("sendMessage", chat_id=FEEDBACK_CHAT_ID, text=text)
+                    except Exception as e:
+                        logger.error(e)
+                        import traceback
+                        text = traceback.format_exc()
+                        await telegram_api("sendMessage", chat_id=FEEDBACK_CHAT_ID, text=text)
 
-                offset = update["update_id"] + 1
+                    offset = update["update_id"] + 1
 
-            await asyncio.sleep(1.0)
+                await asyncio.sleep(1.0)
+            else:
+                # a non-dict reply means the getUpdates call failed; back off before retrying
+                logger.error(' \n\n\n!!! getUpdates polling error\n\n\n')
+                await asyncio.sleep(30.0)
 
 
 if __name__ == "__main__":
diff --git a/nlp/replying.py b/nlp/replying.py
new file mode 100644
index 0000000..d84821e
--- /dev/null
+++ b/nlp/replying.py
@@ -0,0 +1,59 @@
+#  "👍", "👎", "❤", "🔥", "🥰", "👏", "😁",
+# "🤔", "🤯", "😱", "🤬", "😢", "🎉", "🤩",
+# "🤮", "💩", "🙏", "👌", "🕊", "🤡", "🥱",
+# "🥴", "😍", "🐳", "❤‍🔥", "🌚", "🌭", "💯",
+# "🤣", "⚡", "🍌", "🏆", "💔", "🤨", "😐",
+# "🍓", "🍾", "💋", "🖕", "😈", "😴", "😭",
+# "🤓", "👻", "👨‍💻", "👀", "🎃", "🙈", "😇",
+# "😨", "🤝", "✍", "🤗", "🫡", "🎅", "🎄",
+# "☃", "💅", "🤪", "🗿", "🆒", "💘", "🙉",
+# "🦄", "😘", "💊", "🙊", "😎", "👾", "🤷‍♂",
+# "🤷", "🤷‍♀", "😡"
+
+toxic_reactions = {
+    "071": "🕊",
+    "073": "👀",
+    "075": "🙈",
+    "077": "🙊",
+    "079": "💩",
+    "081": "🤔",
+    "083": "😐",
+    "085": "🤨",
+    "087": "🥴",
+    "089": "🤯",
+    "091": "😢",
+    "093": "😭",
+    "095": "😨",
+    "097": "😱",
+    "099": "🤬"
+}
+
+# grade keys from the strictest threshold down, so the first one a score
+# exceeds in get_toxic_reply is the highest grade it reaches
+grads = sorted(toxic_reactions, reverse=True)
+
+abusive_reactions = {
+    "085": "🫡",
+    "088": "💅",
+    "091": "🤷‍♀",
+    "094": "👾",
+    "097": "👻",
+    "099": "😈"
+}
+
+# same ordering as for the toxic grades: highest threshold first, so
+# get_abusive_reply stops at the strictest grade a score exceeds
+abusive_grads = sorted(abusive_reactions, reverse=True)
+
+def get_toxic_reply(tx):
+    """Return the reaction for score ``tx`` (scaled by 100 against the grade keys), or None."""
+    percentage = tx * 100
+    matched = next((key for key in grads if percentage > int(key)), None)
+    return toxic_reactions.get(matched)
+
+
+def get_abusive_reply(tx):
+    """Return the reaction for score ``tx`` (scaled by 100 against the grade keys), or None."""
+    percentage = tx * 100
+    reached = [abusive_reactions[key] for key in abusive_grads if percentage > int(key)]
+    return reached[0] if reached else None
diff --git a/nlp/toxicity.py b/nlp/toxicity.py
new file mode 100644
index 0000000..14f787e
--- /dev/null
+++ b/nlp/toxicity.py
@@ -0,0 +1,31 @@
+import torch
+from transformers import AutoTokenizer, \
+    AutoModelForSequenceClassification
+
+tiny_tox_model_path = 'cointegrated/rubert-tiny-toxicity'
+tiny_tox_tokenizer = AutoTokenizer.from_pretrained(tiny_tox_model_path)
+tiny_tox_model = AutoModelForSequenceClassification.from_pretrained(
+    tiny_tox_model_path)
+
+
+# if torch.cuda.is_available():
+#    model.cuda()
+
+
+def text2toxicity(text, aggregate=True) -> float:
+    """Score the toxicity of ``text`` (one string) with the rubert-tiny model.
+
+    Returns a plain float built from the first and last aspect probabilities
+    when ``aggregate`` is true, otherwise the whole per-aspect vector."""
+    with torch.no_grad():
+        inputs = tiny_tox_tokenizer(
+            text.lower(),
+            return_tensors='pt',
+            truncation=True,
+            padding=True
+        ).to(tiny_tox_model.device)
+        # .lower() restricts text to str, so the batch always has one row
+        proba = torch.sigmoid(tiny_tox_model(**inputs).logits).cpu().numpy()[0]
+    if aggregate:
+        return float(1 - proba[0] * (1 - proba[-1]))
+    return proba  # float(proba) fails on a vector with more than one element
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 3a4afe4..0000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,86 +0,0 @@
-[tool.poetry]
-name = "welcomecenterbot"
-version = "0.2.1"
-description = "telegram group helper"
-authors = ["rainbowdev circle"]
-license = "Open Source"
-readme = "README.md"
-
-[tool.poetry.dependencies]
-python = "^3.12"
-aiohttp = "^3.9.1"
-redis = "^5.0.1"
-
-[tool.poetry.group.dev.dependencies]
-setuptools = "^69.0.2"
-mypy = "^1.6.1"
-black = "^23.10.1"
-ruff = "^0.1.2"
-isort = "^5.12.0"
-
-[tool.black]
-line-length = 120
-target-version = ['py312']
-include = '\.pyi?$'
-exclude = '''
-(
-  /(
-      \.eggs
-    | \.git
-    | \.hg
-    | \.mypy_cache
-    | \.tox
-    | \.venv
-    | _build
-    | buck-out
-    | build
-    | dist
-  )/
-  | foo.py
-)
-'''
-
-[tool.isort]
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-use_parentheses = true
-ensure_newline_before_comments = true
-line_length = 120
-
-[tool.ruff]
-select = ["E4", "E7", "E9", "F"]
-ignore = []
-line-length = 120
-target-version = "py312"
-
-[tool.pyright]
-venvPath = "."
-venv = ".venv"
-include = ["bot/."]
-useLibraryCodeForTypes = false
-disableLanguageServices = false
-disableOrganizeImports = false
-reportMissingImports = true
-reportMissingModuleSource = "warning"
-reportImportCycles = "warning"
-maxMemoryForLargeFile = 4096
-pythonVersion = "3.12"
-autoImportCompletions = true
-useVirtualEnv = true
-typeCheckingMode = "basic"
-disableJediCompletion = true
-disableCompletion = false
-disableSnippetCompletion = false
-disableGoToDefinition = false
-disableRenaming = false
-disableSignatureHelp = false
-diagnostics = true
-logLevel = "debug"
-pluginSearchPaths = []
-typings = {}
-mergeTypeStubPackages = false
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b902ae1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+torch
+transformers
+transliterate
+aiohttp
+redis[hiredis]