normlizer-fix4

2024-09-27 10:57:20 +03:00
parent e2761ee3d8
commit e241e14764
1 changed file with 13 additions and 15 deletions
--- a/nlp/normalize.py
+++ b/nlp/normalize.py
@@ -8,15 +8,15 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

 def is_russian_wording(text):
    """
-    Check if the text contains more than one Russian characters by checking
+    Check if the text contains more than one Russian character by checking
    each character against the Unicode range for Cyrillic.
    """
    counter = 0
    for char in text:
        if "\u0400" <= char <= "\u04ff":  # Unicode range for Cyrillic characters
            counter += 1
-        if counter > 1:
-            return True
+            if counter > 1:
+                return True
    return False


@@ -24,13 +24,12 @@ def segment_text(text):
    """
    Use a neural network model to segment text into words.
    """
-    # Encode the input text for the model
-    # inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
-    inputs = tokenizer("segment: " + text, return_tensors="pt")
+    # Encode the input text for the model as UTF-8 bytes
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")

    # Generate predictions
    with torch.no_grad():
-        outputs = model.generate(**inputs)
+        outputs = model.generate(inputs)

    # Decode the generated tokens back to text
    segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -43,9 +42,7 @@ def normalize(text):
    Normalize English text to resemble Russian characters.
    """
    # Segment the text first
-    t = segment_text(
-        text.replace("  ", " ").replace("  ", " ").replace("  ", " ")
-    )
+    t = segment_text(text.replace("  ", " "))

    if is_russian_wording(t):
        # Normalize the text by replacing characters
@@ -55,15 +52,15 @@ def normalize(text):
            .replace("o", "о")
            .replace("x", "х")
            .replace("a", "а")
-            .replace("r", "г")
+            .replace("r", "р")
            .replace("m", "м")
            .replace("u", "и")
-            .replace("n", "п")
-            .replace("p", "р")
+            .replace("n", "н")
+            .replace("p", "п")
            .replace("t", "т")
            .replace("y", "у")
-            .replace("h", "н")
-            .replace("i", "й")
+            .replace("h", "х")
+            .replace("i", "и")
            .replace("c", "с")
            .replace("k", "к")
            .replace("b", "в")
@@ -85,3 +82,4 @@ if __name__ == "__main__":

    normalized_output = normalize(input_text)
    print(normalized_output)
+