deps-fixes

2023-10-05 22:38:35 +03:00
parent 458823b894
commit 9537814718
3 changed files with 3 additions and 87 deletions
--- a/ai/preprocess.py
+++ b/ai/preprocess.py
@@ -1,75 +0,0 @@
-import re
-import nltk
-from bs4 import BeautifulSoup
-from nltk.corpus import stopwords
-from pymystem3 import Mystem
-from string import punctuation
-from transformers import BertTokenizer
-
-nltk.download("stopwords")
-
-
-def get_clear_text(text):
-    soup = BeautifulSoup(text, 'html.parser')
-
-    # extract the plain text from the HTML document without tags
-    clear_text = ''
-    for tag in soup.find_all():
-        clear_text += tag.string or ''
-
-    clear_text = re.sub(pattern='[\u202F\u00A0\n]+', repl=' ', string=clear_text)
-
-    # only words
-    clear_text = re.sub(pattern='[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)
-
-    clear_text = re.sub(pattern='\s+', repl=' ', string=clear_text)
-
-    clear_text = clear_text.lower()
-
-    mystem = Mystem()
-    russian_stopwords = stopwords.words("russian")
-
-    tokens = mystem.lemmatize(clear_text)
-    tokens = [token for token in tokens if token not in russian_stopwords \
-              and token != " " \
-              and token.strip() not in punctuation]
-
-    clear_text = " ".join(tokens)
-
-    return clear_text
-
-
-# if __name__ == '__main__':
-#
-#     # initialize the tokenizer with the pre-trained BERT model and vocabulary
-#     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
-#
-#     # split each text into smaller segments of maximum length 512
-#     max_length = 512
-#     segmented_texts = []
-#     for text in [clear_text1, clear_text2]:
-#         segmented_text = []
-#         for i in range(0, len(text), max_length):
-#             segment = text[i:i+max_length]
-#             segmented_text.append(segment)
-#         segmented_texts.append(segmented_text)
-#
-#     # tokenize each segment using the BERT tokenizer
-#     tokenized_texts = []
-#     for segmented_text in segmented_texts:
-#         tokenized_text = []
-#         for segment in segmented_text:
-#             segment_tokens = tokenizer.tokenize(segment)
-#             segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
-#             tokenized_text.append(segment_tokens)
-#         tokenized_texts.append(tokenized_text)
-#
-#     input_ids = []
-#     for tokenized_text in tokenized_texts:
-#         input_id = []
-#         for segment_tokens in tokenized_text:
-#             segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
-#             input_id.append(segment_id)
-#         input_ids.append(input_id)
-#
-#     print(input_ids)
--- a/auth/email.py
+++ b/auth/email.py
@@ -1,4 +1,4 @@
-from httpx import AsyncClient
+import requests

 from settings import MAILGUN_API_KEY, MAILGUN_DOMAIN

@@ -24,13 +24,7 @@ async def send_auth_email(user, token, lang="ru", template="email_confirmation")
        print("[auth.email] payload: %r" % payload)
        # debug
        # print('http://localhost:3000/?modal=auth&mode=confirm-email&token=%s' % token)
-        async with AsyncClient() as client:
-            response = await client.post(api_url, headers=headers, data=gql)
-            if response.status_code != 200:
-                return False, None
-            r = response.json()
-                api_url, auth=("api", MAILGUN_API_KEY), data=payload
-            )
+        response = requests.post(api_url, auth=("api", MAILGUN_API_KEY), data=payload)
        response.raise_for_status()
    except Exception as e:
        print(e)
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,9 +24,6 @@ python-dateutil~=2.8.2
 beautifulsoup4~=4.11.1
 lxml
 sentry-sdk>=1.14.0
-nltk~=3.8.1
-pymystem3~=0.2.0
-transformers
 boto3~=1.28.2
 botocore~=1.31.2
 python-multipart~=0.0.6