diff --git a/nlp/normalize.py b/nlp/normalize.py index 4495e90..4db8b60 100644 --- a/nlp/normalize.py +++ b/nlp/normalize.py @@ -8,15 +8,15 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small") def is_russian_wording(text): """ - Check if the text contains more than one Russian characters by checking + Check if the text contains more than one Russian character by checking each character against the Unicode range for Cyrillic. """ counter = 0 for char in text: if "\u0400" <= char <= "\u04ff": # Unicode range for Cyrillic characters counter += 1 - if counter > 1: - return True + if counter > 1: + return True return False @@ -24,13 +24,12 @@ def segment_text(text): """ Use a neural network model to segment text into words. """ - # Encode the input text for the model - # inputs = tokenizer.encode("segment: " + text, return_tensors="pt") - inputs = tokenizer("segment: " + text, return_tensors="pt") + # Encode the input text for the model as UTF-8 bytes + inputs = tokenizer.encode("segment: " + text, return_tensors="pt") # Generate predictions with torch.no_grad(): - outputs = model.generate(**inputs) + outputs = model.generate(inputs) # Decode the generated tokens back to text segmented_text = tokenizer.decode(outputs[0], skip_special_tokens=True) @@ -43,9 +42,7 @@ def normalize(text): Normalize English text to resemble Russian characters. """ # Segment the text first - t = segment_text( - text.replace(" ", " ").replace(" ", " ").replace(" ", " ") - ) + t = segment_text(text.replace(" ", " ")) if is_russian_wording(t): # Normalize the text by replacing characters @@ -55,15 +52,15 @@ def normalize(text): .replace("o", "о") .replace("x", "х") .replace("a", "а") - .replace("r", "г") + .replace("r", "р") .replace("m", "м") .replace("u", "и") - .replace("n", "п") - .replace("p", "р") + .replace("n", "н") + .replace("p", "п") .replace("t", "т") .replace("y", "у") - .replace("h", "н") - .replace("i", "й") + .replace("h", "х") + .replace("i", "и") .replace("c", "с") .replace("k", "к") .replace("b", "в") @@ -85,3 +82,4 @@ if __name__ == "__main__": normalized_output = normalize(input_text) print(normalized_output) + \ No newline at end of file