2024-09-29 09:47:49 +03:00
parent 20cbc6dab6
commit 22ed5f6335
5 changed files with 1280 additions and 30 deletions


@@ -1,17 +0,0 @@
import spacy

# Load the Russian language model
nlp = spacy.load("ru_core_news_md")


def segment_text(text):
    """
    Use SpaCy to segment text into words.
    """
    # Process the text with SpaCy
    doc = nlp(text)
    # Extract words from the processed document
    segmented_text = ' '.join([token.text for token in doc if not token.is_space])
    return segmented_text
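
For reference, a minimal usage sketch of the removed segment_text helper; the sample sentence is illustrative and assumes the ru_core_news_md model is installed:

# Illustrative call; exact tokenization depends on the spaCy model version
print(segment_text("Привет, мир! Это пример текста."))
# -> "Привет , мир ! Это пример текста ."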

nlp/stop_words.txt (new file, 1238 lines)

File diff suppressed because it is too large.

nlp/stopwords_detector.py (new file, 27 lines)

@@ -0,0 +1,27 @@
# Load the stopword list (one word per line); stop_words.txt sits next to this script in nlp/
stopwords = []
with open('stop_words.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file if line.strip()]

# Convert stopwords to a set for faster lookup
stopword_set = set(stopwords)


def check_stopwords(text, stopwords):
    """
    Check if any words from the stopwords list are present in the given text.

    Args:
        text (str): The input text to check.
        stopwords (list): A list of stopwords.

    Returns:
        bool: True if any stopword is found in the text, False otherwise.
    """
    # Normalize the text by converting it to lower case and splitting into words
    words = text.lower().split()
    # Build a set from the provided stopwords for O(1) membership checks
    stopword_set = set(stopwords)
    # Iterate through each word and check for stopwords
    for word in words:
        if word in stopword_set:
            return True  # Stop iteration and return True if a stopword is found
    return False  # Return False if no stopwords are found
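
A minimal usage sketch of check_stopwords; the sample sentence and the three-word stopword list below are illustrative, while the real list is loaded from stop_words.txt:

sample_stopwords = ["и", "в", "не"]  # hypothetical entries for illustration only
print(check_stopwords("Кошка сидит в доме", sample_stopwords))  # True: "в" is in the list
print(check_stopwords("Кошка сидит дома", sample_stopwords))    # False: no stopwords present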