less-norm

2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions
--- a/nlp/normalize.py
+++ b/nlp/normalize.py
@@ -1,13 +1,7 @@
-import torch
-from transformers import ByT5Tokenizer, T5ForConditionalGeneration
 import logging

 logger = logging.getLogger("nlp.normalize")

-# Use ByT5 for the ByT5 model
-tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
-model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
-

 def is_russian_wording(text):
    """
@@ -22,24 +16,6 @@ def is_russian_wording(text):
                return True
    return False

-
-def segment_text(text):
-    """
-    Use a neural network model to segment text into words.
-    """
-    # Encode the input text for the model as UTF-8 bytes
-    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-
-    # Generate predictions
-    with torch.no_grad():
-        outputs = model.generate(inputs)
-
-    # Decode the generated tokens back to text
-    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    return segmented_text
-
-
 def normalize(text):
    """
    Normalize English text to resemble Russian characters.