spacy-words-separation
@@ -1,25 +1,17 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
-
-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-
-
+import spacy
+
+# Load the Russian language model
+nlp = spacy.load("ru_core_news_sm")
 
 def segment_text(text):
     """
-    Use a neural network model to segment text into words.
+    Use SpaCy to segment text into words.
     """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    # Process the text with SpaCy
+    doc = nlp(text)
 
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Extract words from the processed document
+    segmented_text = ' '.join([token.text for token in doc])
 
     return segmented_text
 
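A minimal usage sketch of the new spaCy-based segment_text (assuming the ru_core_news_sm pipeline has already been downloaded, e.g. with "python -m spacy download ru_core_news_sm"; the sample sentence is only illustrative):

import spacy

# Load the small Russian pipeline; spacy.load raises OSError if the
# model has not been downloaded first.
nlp = spacy.load("ru_core_news_sm")

def segment_text(text):
    """Use SpaCy to segment text into words."""
    doc = nlp(text)
    # Join token texts with single spaces, mirroring the committed code
    return ' '.join(token.text for token in doc)

print(segment_text("Привет, мир!"))
# spaCy's tokenizer splits punctuation into separate tokens, so this
# should print: Привет , мир !

Compared to the removed ByT5 path, this drops the torch/transformers dependency and the generate/decode round trip in favor of spaCy's deterministic, rule-based tokenizer, which is generally sufficient for splitting whitespace-delimited Russian text into words.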