deps-fixes
This commit is contained in:
parent
458823b894
commit
9537814718
|
@ -1,75 +0,0 @@
|
||||||
import re
|
|
||||||
import nltk
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from nltk.corpus import stopwords
|
|
||||||
from pymystem3 import Mystem
|
|
||||||
from string import punctuation
|
|
||||||
from transformers import BertTokenizer
|
|
||||||
|
|
||||||
# Import-time side effect: make sure the NLTK "stopwords" corpus is available,
# because get_clear_text() reads it through stopwords.words("russian").
nltk.download("stopwords")
|
|
||||||
|
|
||||||
|
|
||||||
def get_clear_text(text):
    """Convert an HTML fragment into a normalized string of lemmas.

    Processing steps:
      1. Strip the markup and keep only the text nodes.
      2. Replace narrow/non-breaking spaces and newlines with plain spaces.
      3. Drop every character that is not a Latin/Cyrillic letter, a space
         or a hyphen, collapse whitespace and lowercase the result.
      4. Lemmatize with Mystem and discard Russian stop words and tokens
         that consist only of whitespace or punctuation.

    :param text: HTML (or plain text) to clean; ``None`` and ``''`` are allowed.
    :return: the remaining lemmas joined by single spaces, or ``''`` when
        nothing is left.
    """
    if not text:
        return ''

    soup = BeautifulSoup(text, 'html.parser')

    # extract the plain text from the HTML document without tags.
    # get_text() yields every text node exactly once, whereas collecting
    # ``tag.string`` for each tag repeats the text of nested single-child tags
    # and skips tags with mixed content. The separator keeps words from
    # adjacent elements (e.g. two <p> blocks) from being glued together.
    clear_text = soup.get_text(separator=' ')

    # turn special spaces and newlines into regular spaces *before* the
    # "only words" filter below, otherwise they would be deleted outright
    clear_text = re.sub(pattern='[\u202F\u00A0\n]+', repl=' ', string=clear_text)

    # only words
    clear_text = re.sub(pattern='[^A-ZА-ЯЁ -]', repl='', string=clear_text, flags=re.IGNORECASE)

    clear_text = re.sub(pattern=r'\s+', repl=' ', string=clear_text)

    clear_text = clear_text.lower().strip()
    if not clear_text:
        # nothing to lemmatize, so avoid starting Mystem at all
        return ''

    mystem = Mystem()
    # a set gives O(1) membership checks for every token
    russian_stopwords = set(stopwords.words("russian"))

    words = []
    for token in mystem.lemmatize(clear_text):
        word = token.strip()
        # lemmatize() also returns the whitespace between words
        if not word or word in russian_stopwords:
            continue
        # skip punctuation-only tokens such as "-" or "--"
        if all(char in punctuation for char in word):
            continue
        words.append(word)

    return " ".join(words)
|
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
|
||||||
#
|
|
||||||
# # initialize the tokenizer with the pre-trained BERT model and vocabulary
|
|
||||||
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
|
|
||||||
#
|
|
||||||
# # split each text into smaller segments of maximum length 512
|
|
||||||
# max_length = 512
|
|
||||||
# segmented_texts = []
|
|
||||||
# for text in [clear_text1, clear_text2]:
|
|
||||||
# segmented_text = []
|
|
||||||
# for i in range(0, len(text), max_length):
|
|
||||||
# segment = text[i:i+max_length]
|
|
||||||
# segmented_text.append(segment)
|
|
||||||
# segmented_texts.append(segmented_text)
|
|
||||||
#
|
|
||||||
# # tokenize each segment using the BERT tokenizer
|
|
||||||
# tokenized_texts = []
|
|
||||||
# for segmented_text in segmented_texts:
|
|
||||||
# tokenized_text = []
|
|
||||||
# for segment in segmented_text:
|
|
||||||
# segment_tokens = tokenizer.tokenize(segment)
|
|
||||||
# segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
|
|
||||||
# tokenized_text.append(segment_tokens)
|
|
||||||
# tokenized_texts.append(tokenized_text)
|
|
||||||
#
|
|
||||||
# input_ids = []
|
|
||||||
# for tokenized_text in tokenized_texts:
|
|
||||||
# input_id = []
|
|
||||||
# for segment_tokens in tokenized_text:
|
|
||||||
# segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
|
|
||||||
# input_id.append(segment_id)
|
|
||||||
# input_ids.append(input_id)
|
|
||||||
#
|
|
||||||
# print(input_ids)
|
|
|
@ -1,4 +1,4 @@
|
||||||
from httpx import AsyncClient
|
import requests
|
||||||
|
|
||||||
from settings import MAILGUN_API_KEY, MAILGUN_DOMAIN
|
from settings import MAILGUN_API_KEY, MAILGUN_DOMAIN
|
||||||
|
|
||||||
|
@ -24,13 +24,7 @@ async def send_auth_email(user, token, lang="ru", template="email_confirmation")
|
||||||
print("[auth.email] payload: %r" % payload)
|
print("[auth.email] payload: %r" % payload)
|
||||||
# debug
|
# debug
|
||||||
# print('http://localhost:3000/?modal=auth&mode=confirm-email&token=%s' % token)
|
# print('http://localhost:3000/?modal=auth&mode=confirm-email&token=%s' % token)
|
||||||
async with AsyncClient() as client:
|
response = requests.post(api_url, auth=("api", MAILGUN_API_KEY), data=payload)
|
||||||
response = await client.post(api_url, headers=headers, data=gql)
|
response.raise_for_status()
|
||||||
if response.status_code != 200:
|
|
||||||
return False, None
|
|
||||||
r = response.json()
|
|
||||||
api_url, auth=("api", MAILGUN_API_KEY), data=payload
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
|
|
|
@ -24,9 +24,6 @@ python-dateutil~=2.8.2
|
||||||
beautifulsoup4~=4.11.1
|
beautifulsoup4~=4.11.1
|
||||||
lxml
|
lxml
|
||||||
sentry-sdk>=1.14.0
|
sentry-sdk>=1.14.0
|
||||||
nltk~=3.8.1
|
|
||||||
pymystem3~=0.2.0
|
|
||||||
transformers
|
|
||||||
boto3~=1.28.2
|
boto3~=1.28.2
|
||||||
botocore~=1.31.2
|
botocore~=1.31.2
|
||||||
python-multipart~=0.0.6
|
python-multipart~=0.0.6
|
||||||
|
|
Loading…
Reference in New Issue
Block a user