2024-09-28 08:51:24 +00:00
|
|
|
import spacy
|
2024-09-27 10:51:55 +00:00
|
|
|
|
2024-09-28 08:51:24 +00:00
|
|
|
# Load the Russian language model
|
2024-09-28 09:06:24 +00:00
|
|
|
# Shared pipeline object: created once here and reused by segment_text on
# every call. NOTE(review): the "ru_core_news_md" package must be installed
# separately for this load to succeed — confirm in deployment setup.
nlp = spacy.load("ru_core_news_md")
|
2024-09-27 10:51:55 +00:00
|
|
|
|
|
|
|
def segment_text(text):
    """
    Split ``text`` into tokens with the module-level SpaCy pipeline.

    Returns a single string in which the text of every token is joined by
    one space. Tokens that SpaCy marks as whitespace (``is_space``) are
    left out of the result.
    """
    # Run the pipeline over the input to obtain its tokens
    parsed = nlp(text)

    # Keep the surface form of each token that is not pure whitespace
    words = []
    for tok in parsed:
        if tok.is_space:
            continue
        words.append(tok.text)

    return ' '.join(words)
|
2024-09-28 08:51:24 +00:00
|
|
|
|