2024-09-29 09:47:49 +03:00
parent 20cbc6dab6
commit 22ed5f6335
5 changed files with 1280 additions and 30 deletions


@@ -1,17 +0,0 @@
import spacy

# Load the Russian language model
nlp = spacy.load("ru_core_news_md")


def segment_text(text):
    """
    Use SpaCy to segment text into words.
    """
    # Process the text with SpaCy
    doc = nlp(text)
    # Extract words from the processed document
    segmented_text = ' '.join([token.text for token in doc if not token.is_space])
    return segmented_text
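
For reference, a minimal usage sketch of the removed segment_text helper; the sample sentence is illustrative and assumes the ru_core_news_md model is installed:

# Illustrative call; exact tokenization depends on the spaCy model version
print(segment_text("Привет, мир! Это пример текста."))
# -> "Привет , мир ! Это пример текста ."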

nlp/stop_words.txt (new file, 1238 lines)

File diff suppressed because it is too large.

nlp/stopwords_detector.py (new file, 27 lines)

@@ -0,0 +1,27 @@
# Load the stopword list (one word per line); stop_words.txt sits next to this script in nlp/
stopwords = []
with open('stop_words.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file if line.strip()]

# Convert stopwords to a set for faster lookup
stopword_set = set(stopwords)


def check_stopwords(text, stopwords):
    """
    Check if any words from the stopwords list are present in the given text.

    Args:
        text (str): The input text to check.
        stopwords (list): A list of stopwords.

    Returns:
        bool: True if any stopword is found in the text, False otherwise.
    """
    # Normalize the text by converting it to lower case and splitting into words
    words = text.lower().split()
    # Build a set from the provided stopwords for O(1) membership checks
    stopword_set = set(stopwords)
    # Iterate through each word and check for stopwords
    for word in words:
        if word in stopword_set:
            return True  # Stop iteration and return True if a stopword is found
    return False  # Return False if no stopwords are found
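
A minimal usage sketch of check_stopwords; the sample sentence and the three-word stopword list below are illustrative, while the real list is loaded from stop_words.txt:

sample_stopwords = ["и", "в", "не"]  # hypothetical entries for illustration only
print(check_stopwords("Кошка сидит в доме", sample_stopwords))  # True: "в" is in the list
print(check_stopwords("Кошка сидит дома", sample_stopwords))    # False: no stopwords present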