less-norm

2024-09-27 13:51:55 +03:00
parent 984630d4c1
commit a2545217e8
4 changed files with 61 additions and 58 deletions
--- a/nlp/segment_text.py
+++ b/nlp/segment_text.py
@@ -0,0 +1,25 @@
+import torch
+from transformers import ByT5Tokenizer, T5ForConditionalGeneration
+
+# ByT5 is byte-level, so pair it with ByT5Tokenizer; both load at import time.
+tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+
+
+
+def segment_text(text: str) -> str:
+    """Segment ``text`` into words with the module-level ByT5 model.
+
+    The prompt ``"segment: " + text`` is encoded, passed to
+    ``model.generate`` and the first generated sequence is decoded.
+    Returns the decoded string with special tokens removed.
+    """
+    # ByT5 tokenizes UTF-8 bytes, so lengths below are counted in bytes.
+    inputs = tokenizer.encode("segment: " + text, return_tensors="pt")
+    # generate() defaults to a ~20-token cap, which would cut the result
+    # short; allow twice the input length so inserted separators fit.
+    with torch.no_grad():
+        outputs = model.generate(inputs, max_new_tokens=2 * inputs.shape[-1])
+    # Decode the first (only) sequence back to text, dropping special tokens.
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)