diff --git a/nlp/normalize.py b/nlp/normalize.py
index e89ac0f..b7e7d2b 100644
--- a/nlp/normalize.py
+++ b/nlp/normalize.py
@@ -1,10 +1,9 @@
 import torch
-from transformers import T5Tokenizer, T5ForConditionalGeneration
-
-# Initialize the T5 model and tokenizer
-tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+# ByT5 needs its byte-level tokenizer; the model class is still T5ForConditionalGeneration
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
 
 
 def is_russian_wording(text):
     """
@@ -25,11 +24,12 @@ def segment_text(text):
     Use a neural network model to segment text into words.
     """
     # Encode the input text for the model
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    inputs = tokenizer("segment: " + text, return_tensors="pt")
 
     # Generate predictions
     with torch.no_grad():
-        outputs = model.generate(inputs)
+        # ByT5 emits one token per output byte, so leave room for the segmented text
+        outputs = model.generate(**inputs, max_new_tokens=2 * inputs["input_ids"].shape[1])
 
     # Decode the generated tokens back to text
     segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
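
Reviewer note (not part of the patch): a minimal smoke test for the corrected helper, sketched under stated assumptions. The module path nlp/normalize.py and the "segment: " task prefix come straight from the patch; the driver filename and sample input are hypothetical, and a stock google/byt5-small checkpoint is not fine-tuned for segmentation, so meaningful word boundaries assume a checkpoint fine-tuned on that objective.

    # smoke_test.py -- hypothetical driver, assumes the patched nlp/normalize.py
    from nlp.normalize import segment_text

    if __name__ == "__main__":
        # Run-together transliterated Russian ("privet kak dela"); a fine-tuned
        # ByT5 checkpoint would be expected to restore the word boundaries.
        print(segment_text("privetkakdela"))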