nosegm
This commit is contained in:
@@ -1,17 +0,0 @@
|
||||
import spacy
|
||||
|
||||
# Russian spaCy pipeline ("ru_core_news_md"), loaded once when the module is
# imported so that every call below reuses the same model object.
nlp = spacy.load("ru_core_news_md")
|
||||
|
||||
def segment_text(text):
    """
    Tokenize *text* with the module-level SpaCy pipeline.

    Returns a single string made of the token texts joined by one space;
    tokens that SpaCy marks as whitespace are left out.
    """
    pieces = []
    # Run the pipeline and walk over the resulting tokens
    for token in nlp(text):
        if token.is_space:
            # Whitespace-only tokens carry no word content
            continue
        pieces.append(token.text)

    return ' '.join(pieces)
|
||||
|
1238
nlp/stop_words.txt
Normal file
1238
nlp/stop_words.txt
Normal file
File diff suppressed because it is too large
Load Diff
27
nlp/stopwords_detector.py
Normal file
27
nlp/stopwords_detector.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Read the stop-word file once at import time. The path is relative, so it is
# resolved against the current working directory of the running process.
with open('stop_words.txt', 'r', encoding='utf-8') as file:
    text = file.readlines()

# Build the stop-word list from the lines that were read (previously the list
# was left empty, so no stop word could ever match). Entries are stripped of
# surrounding whitespace and lower-cased because check_stopwords lower-cases
# the text it inspects; blank lines are skipped.
# NOTE(review): assumes one stop word per line - confirm against stop_words.txt
stopwords = [line.strip().lower() for line in text if line.strip()]

# Convert stopwords to a set for faster lookup
stopword_set = set(stopwords)
|
||||
|
||||
def check_stopwords(text, stopwords):
    """
    Check if any words from the stopwords list are present in the given text.

    The text is lower-cased and split on whitespace, and each resulting word
    is compared against the lower-cased stopwords. Punctuation attached to a
    word is not stripped, so "word," does not match the stopword "word".

    Args:
        text (str): The input text to check.
        stopwords (Iterable[str]): The stopwords to look for (list, set, ...).

    Returns:
        bool: True if any stopword is found in the text, False otherwise.
    """
    # Build the lookup from the argument that was actually passed in, instead
    # of silently ignoring it in favour of a module-level global. Lower-casing
    # keeps the comparison consistent with the lower-cased text below.
    lookup = {stopword.lower() for stopword in stopwords}

    # any() stops at the first matching word
    return any(word in lookup for word in text.lower().split())
|
Reference in New Issue
Block a user