welcomecenterbot/nlp/segment_text.py

18 lines
381 B
Python
Raw Normal View History

2024-09-28 08:51:24 +00:00
import spacy
2024-09-27 10:51:55 +00:00
2024-09-28 08:51:24 +00:00
# Load the Russian language model
2024-09-28 09:06:24 +00:00
# Built once at import time; segment_text() reuses this single pipeline object
# on every call instead of loading the model again.
# NOTE(review): presumably the "ru_core_news_md" model package must be
# installed for this import to succeed — confirm against deployment setup.
nlp = spacy.load("ru_core_news_md")
2024-09-27 10:51:55 +00:00
def segment_text(text):
    """
    Tokenize ``text`` with the module-level SpaCy pipeline.

    Returns a single string made of the text of every token that is not
    flagged as whitespace, joined by single spaces.
    """
    # Run the input through the shared SpaCy pipeline
    parsed = nlp(text)

    # Gather each token's text, skipping tokens marked as whitespace
    pieces = []
    for tok in parsed:
        if tok.is_space:
            continue
        pieces.append(tok.text)

    return ' '.join(pieces)
2024-09-28 08:51:24 +00:00