spacy-words-separation

2024-09-28 11:51:24 +03:00
parent d9e9c547ef
commit 56a2632980
3 changed files with 30 additions and 34 deletions


@@ -1,25 +1,17 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
-
-# Use the ByT5 tokenizer for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+import spacy
+
+# Load the Russian language model
+nlp = spacy.load("ru_core_news_sm")
 
 def segment_text(text):
     """
-    Use a neural network model to segment text into words.
+    Use spaCy to segment text into words.
     """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Process the text with spaCy's tokenizer
+    doc = nlp(text)
+
+    # Extract words from the processed document
+    segmented_text = ' '.join([token.text for token in doc])
     return segmented_text