normlizer-fix

Untone 2024-09-27 10:10:40 +03:00
parent 224cf08603
commit bebf0caf65


@@ -1,10 +1,9 @@
 import torch
-from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import ByT5Tokenizer, ByT5ForConditionalGeneration
 
-# Initialize the T5 model and tokenizer
-tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+# Use ByT5 for the ByT5 model
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = ByT5ForConditionalGeneration.from_pretrained("google/byt5-small")
 
 def is_russian_wording(text):
     """
@@ -25,11 +24,12 @@ def segment_text(text):
     Use a neural network model to segment text into words.
     """
     # Encode the input text for the model
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    # inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    inputs = tokenizer("segment: " + input_text, return_tensors="pt")
     # Generate predictions
     with torch.no_grad():
-        outputs = model.generate(inputs)
+        outputs = model.generate(**inputs)
     # Decode the generated tokens back to text
     segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
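
For reference, below is a minimal corrected sketch of the code this diff touches, not the commit itself. Two caveats about the committed version: transformers ships ByT5Tokenizer, but it does not define a ByT5ForConditionalGeneration class, so ByT5 checkpoints such as google/byt5-small are normally loaded with T5ForConditionalGeneration (or AutoModelForSeq2SeqLM); and the added line reads input_text although the function parameter is text. The "segment: " prompt is kept from the diff, and max_new_tokens is an assumed generation cap, not something the commit sets.

import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration

# ByT5 reuses the T5 seq2seq architecture, so only the tokenizer is ByT5-specific.
tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

def segment_text(text):
    """
    Use a neural network model to segment text into words.
    """
    # Encode the input text for the model; the parameter is `text`, not `input_text`
    inputs = tokenizer("segment: " + text, return_tensors="pt")
    # Generate predictions (max_new_tokens is an assumed cap, not part of the commit)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=256)
    # Decode the generated byte tokens back to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Calling the tokenizer directly, rather than tokenizer.encode, returns both input_ids and attention_mask, which is why generate is invoked with **inputs in the diff.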