normalizer-fix
parent 224cf08603
commit bebf0caf65
@@ -1,10 +1,9 @@
 import torch
-from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 
-# Initialize the T5 model and tokenizer
-tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
+# Load the byte-level ByT5 tokenizer; ByT5 reuses the T5 generation architecture
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
 model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-
 
 def is_russian_wording(text):
     """
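A note on the import change: in Hugging Face transformers, ByT5 ships its own byte-level tokenizer (ByT5Tokenizer) but no dedicated ByT5ForConditionalGeneration class; the ByT5 checkpoints reuse the T5 architecture, so google/byt5-small loads through T5ForConditionalGeneration or the Auto classes. A minimal loading sketch, equivalent to the lines above:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# "google/byt5-small" resolves to the byte-level ByT5 tokenizer,
# and the checkpoint itself is a T5-architecture seq2seq model.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")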
@@ -25,11 +24,12 @@ def segment_text(text):
     Use a neural network model to segment text into words.
     """
     # Encode the input text for the model
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    # inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    inputs = tokenizer("segment: " + text, return_tensors="pt")
 
     # Generate predictions
     with torch.no_grad():
-        outputs = model.generate(inputs)
+        outputs = model.generate(**inputs)
 
     # Decode the generated tokens back to text
     segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
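The re-encoded input matters for the generate call: tokenizer.encode(...) returns a bare tensor of input IDs, while calling the tokenizer directly returns a BatchEncoding dict carrying input_ids and attention_mask, which is why the new code unpacks it with model.generate(**inputs). A rough end-to-end sketch of the patched function, assuming the module-level tokenizer and model from the first hunk (the stock google/byt5-small checkpoint is not fine-tuned for segmentation, so meaningful output assumes a fine-tuned model):

import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration

tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

def segment_text(text):
    """
    Use a neural network model to segment text into words.
    """
    # tokenizer(...) returns input_ids plus attention_mask
    inputs = tokenizer("segment: " + text, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs)
    # Decode the generated tokens back to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(segment_text("сегментируйэтоттекст"))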