core/ai/preprocess.py

import re
from string import punctuation

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pymystem3 import Mystem

nltk.download("stopwords")


def get_clear_text(text):
    """Return lowercase, lemmatized, stopword-free text extracted from an HTML document."""
    soup = BeautifulSoup(text, "html.parser")

    # extract the plain text from the HTML document without tags
    clear_text = soup.get_text(separator=" ")

    # replace narrow/non-breaking spaces and newlines with regular spaces
    clear_text = re.sub(pattern="[\u202F\u00A0\n]+", repl=" ", string=clear_text)

    # keep only words (Latin and Cyrillic letters, hyphens and spaces)
    clear_text = re.sub(pattern="[^A-ZА-ЯЁ -]", repl="", string=clear_text, flags=re.IGNORECASE)

    # collapse runs of whitespace
    clear_text = re.sub(pattern=r"\s+", repl=" ", string=clear_text)

    clear_text = clear_text.lower()

    # lemmatize with Mystem, then drop Russian stopwords and punctuation tokens
    mystem = Mystem()
    russian_stopwords = stopwords.words("russian")
    tokens = mystem.lemmatize(clear_text)
    tokens = [
        token
        for token in tokens
        if token not in russian_stopwords and token != " " and token.strip() not in punctuation
    ]

    clear_text = " ".join(tokens)
    return clear_text


# Draft of downstream BERT segmentation/tokenization, kept commented out. It needs a
# BertTokenizer import (e.g. `from transformers import BertTokenizer`) and two cleaned
# texts (clear_text1, clear_text2) produced by get_clear_text().
# if __name__ == '__main__':
#
#     # initialize the tokenizer with the pre-trained BERT model and vocabulary
#     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
#
#     # split each text into smaller segments of maximum length 512
#     max_length = 512
#     segmented_texts = []
#     for text in [clear_text1, clear_text2]:
#         segmented_text = []
#         for i in range(0, len(text), max_length):
#             segment = text[i:i + max_length]
#             segmented_text.append(segment)
#         segmented_texts.append(segmented_text)
#
#     # tokenize each segment using the BERT tokenizer
#     tokenized_texts = []
#     for segmented_text in segmented_texts:
#         tokenized_text = []
#         for segment in segmented_text:
#             segment_tokens = tokenizer.tokenize(segment)
#             segment_tokens = ['[CLS]'] + segment_tokens + ['[SEP]']
#             tokenized_text.append(segment_tokens)
#         tokenized_texts.append(tokenized_text)
#
#     # convert the tokens of each segment to vocabulary ids
#     input_ids = []
#     for tokenized_text in tokenized_texts:
#         input_id = []
#         for segment_tokens in tokenized_text:
#             segment_id = tokenizer.convert_tokens_to_ids(segment_tokens)
#             input_id.append(segment_id)
#         input_ids.append(input_id)
#
#     print(input_ids)
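

# Minimal usage sketch (not part of the original module): the sample HTML below is an
# invented fragment, used only to show how get_clear_text() is meant to be called.
if __name__ == "__main__":
    sample_html = "<p>Мама мыла раму, а папа читал интересные книги!</p>"
    print(get_clear_text(sample_html))  # prints lowercase, lemmatized, stopword-free text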