add-byt5

2024-09-26 23:38:05 +03:00
parent b9ac3ee3c6
commit 905b9b177c
1 changed files with 65 additions and 33 deletions
--- a/utils/normalize.py
+++ b/utils/normalize.py
@@ -1,25 +1,50 @@
+import logging
+import torch
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+# Load the ByT5 ("google/byt5-small") tokenizer and model once, at import time
+tokenizer = T5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
 def is_russian_wording(text):
    """
    Check if the text contains any Russian characters by checking 
    each character against the Unicode range for Cyrillic.
    """
-    # Check if any character in the text is a Cyrillic character
    for char in text:
        if '\u0400' <= char <= '\u04FF':  # Unicode range for Cyrillic characters
            return True
    return False

+def segment_text(text):
+    """
+    Segment text into words by prompting the ByT5 model with a 'segment: ' prefix.
+    """
+    # Encode the input text for the model
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    
+    # Generate predictions
+    with torch.no_grad():
+        outputs = model.generate(inputs)
+    
+    # Decode the generated tokens back to text
+    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    
+    return segmented_text
+
 def normalize(text):
    """
    Normalize English text to resemble Russian characters.
    """
-    text = text.lower()
-    if is_russian_wording(text):
+    # Segment the text first
+    segmented_text = segment_text(text.replace('  ', ' ').replace('  ', ' ').replace('  ', ' '))
+    
+    # Lowercase the segmented text before the character replacements below
+    segmented_text = segmented_text.lower()
+    
+    if is_russian_wording(segmented_text):
        # Normalize the text by replacing characters
-        text = (text
-                    .replace('  ', ' ')
-                    .replace('  ', ' ')
-                    .replace('  ', ' ')
+        normalized_text = (segmented_text
                           .replace('e', 'е')
                           .replace('o', 'о')
                           .replace('x', 'х')
@@ -32,7 +57,6 @@ def normalize(text):
                           .replace('t', 'т')
                           .replace('y', 'у')
                           .replace('h', 'н')
-                    .replace('p', 'р')
                           .replace('i', 'й')
                           .replace('c', 'с')
                           .replace('k', 'к')
@@ -40,9 +64,17 @@ def normalize(text):
                           .replace('3', 'з')
                           .replace('4', 'ч')
                           .replace('0', 'о')
-                    .replace('e', 'е')
                           .replace('d', 'д')
-                    .replace('z', 'з')
-                    )
+                           .replace('z', 'з'))
+        
+        return normalized_text
+    
+    return segmented_text
+
+# Example usage
+if __name__ == "__main__":
+    input_text = "Hello, this is a test input."
+    
+    normalized_output = normalize(input_text)
+    print(normalized_output)

-    return text