less-norm

This commit is contained in:
2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions

View File

@@ -1,13 +1,7 @@
import torch
from transformers import ByT5Tokenizer, T5ForConditionalGeneration
import logging
# Module-level logger for the text-normalization helpers.
logger = logging.getLogger("nlp.normalize")

# The tokenizer and the seq2seq model are both loaded, at import time, from
# the same ByT5 checkpoint; naming it once keeps the two in sync.
_BYT5_CHECKPOINT = "google/byt5-small"
tokenizer = ByT5Tokenizer.from_pretrained(_BYT5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(_BYT5_CHECKPOINT)
def is_russian_wording(text):
"""
@@ -22,24 +16,6 @@ def is_russian_wording(text):
return True
return False
def segment_text(text):
    """
    Use a neural network model to segment text into words.

    The text is prefixed with the ``"segment: "`` task marker, encoded with
    the module-level tokenizer, run through the module-level model's
    ``generate`` and decoded back into a string.

    Args:
        text: The string to segment.

    Returns:
        The decoded model output with special tokens stripped, or an empty
        string when ``text`` is empty.
    """
    # Nothing to segment: skip the model call instead of decoding whatever
    # the model would generate from the bare task prefix.
    if not text:
        return ""

    # Encode the prefixed input text for the model as UTF-8 bytes.
    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Without an explicit budget, generate() falls back to the generation
    # config's default length (20 tokens in stock transformers configs),
    # which for a byte-level model cuts the output off after ~20 bytes.
    # Segmentation can add at most one separator per input byte, so twice
    # the encoded input length is a sufficient upper bound.
    max_new_tokens = 2 * inputs.shape[-1]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_new_tokens)

    # Decode the first (only) generated sequence back to text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def normalize(text):
"""
Normalize English text to resemble Russian characters.