spacy-words-separation

2024-09-28 11:51:24 +03:00
parent d9e9c547ef
commit 56a2632980
3 changed files with 30 additions and 34 deletions


@@ -1,25 +1,17 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
-
-# Use the ByT5 tokenizer for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+import spacy
+
+# Load the Russian language model
+nlp = spacy.load("ru_core_news_sm")
 
 def segment_text(text):
     """
-    Use a neural network model to segment text into words.
+    Use spaCy to segment text into words.
     """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Process the text with spaCy's tokenizer
+    doc = nlp(text)
+
+    # Extract words from the processed document
+    segmented_text = ' '.join([token.text for token in doc])
     return segmented_text