normlizer-fix

Untone 2024-09-27 10:10:40 +03:00
parent 224cf08603
commit bebf0caf65


@@ -1,10 +1,9 @@
 import torch
-from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import ByT5Tokenizer, ByT5ForConditionalGeneration
 
-# Initialize the T5 model and tokenizer
-tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+# Use ByT5 for the ByT5 model
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = ByT5ForConditionalGeneration.from_pretrained("google/byt5-small")
 
 def is_russian_wording(text):
     """
@@ -25,11 +24,12 @@ def segment_text(text):
     Use a neural network model to segment text into words.
     """
     # Encode the input text for the model
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    # inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    inputs = tokenizer("segment: " + input_text, return_tensors="pt")
     # Generate predictions
     with torch.no_grad():
-        outputs = model.generate(inputs)
+        outputs = model.generate(**inputs)
     # Decode the generated tokens back to text
     segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
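
For reference, below is a minimal corrected sketch of the code this diff touches, not the commit itself. Two caveats about the committed version: transformers ships ByT5Tokenizer, but it does not define a ByT5ForConditionalGeneration class, so ByT5 checkpoints such as google/byt5-small are normally loaded with T5ForConditionalGeneration (or AutoModelForSeq2SeqLM); and the added line reads input_text although the function parameter is text. The "segment: " prompt is kept from the diff, and max_new_tokens is an assumed generation cap, not something the commit sets.

import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration

# ByT5 reuses the T5 seq2seq architecture, so only the tokenizer is ByT5-specific.
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

def segment_text(text):
    """
    Use a neural network model to segment text into words.
    """
    # Encode the input text for the model; the parameter is `text`, not `input_text`
    inputs = tokenizer("segment: " + text, return_tensors="pt")
    # Generate predictions (max_new_tokens is an assumed cap, not part of the commit)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    # Decode the generated byte tokens back to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Calling the tokenizer directly, rather than tokenizer.encode, returns both input_ids and attention_mask, which is why generate is invoked with **inputs in the diff.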