spacy-words-separation2
parent 56a2632980
commit 7030e58f4b
@@ -6,11 +6,14 @@ WORKDIR /app
 COPY requirements.txt .

 # Install system dependencies required for building Python packages
-RUN apt-get update && apt-get install -y --no-install-recommends gcc libffi-dev libssl-dev
+RUN apt-get update && apt-get install -y --no-install-recommends wget gcc libffi-dev libssl-dev

 # Install Python dependencies including redis with hiredis support
 RUN pip install --no-cache-dir -r requirements.txt

+# Download and install the Russian language model
+RUN python -m spacy download ru_core_news_md
+
 COPY . .

 EXPOSE 8080
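Because the image now runs `python -m spacy download ru_core_news_md` at build time, the model package is baked into the image and `spacy.load` should need no network access at runtime. A minimal sanity check along these lines (not part of the repo) could be run inside the built container:

import spacy

# spacy.load raises OSError if the ru_core_news_md package is missing from the image
nlp = spacy.load("ru_core_news_md")
print(nlp.meta["lang"], nlp.meta["name"], nlp.meta["version"])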
@@ -1,7 +1,8 @@
 import spacy
 from spacy.lang.ru.examples import

 # Load the Russian language model
-nlp = spacy.load("ru_core_news_sm")
+nlp = spacy.load("ru_core_news_md")

 def segment_text(text):
     """
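For context on the model bump in this hunk: the medium Russian pipeline (ru_core_news_md) bundles static word vectors that ru_core_news_sm does not, which is the main practical difference between the two. A quick check under that assumption:

import spacy

nlp = spacy.load("ru_core_news_md")
# non-empty (rows, dims) for the md pipeline; the sm pipeline ships no static vectors
print(nlp.vocab.vectors.shape)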
@@ -11,7 +12,7 @@ def segment_text(text):
     doc = nlp(text)

     # Extract words from the processed document
-    segmented_text = ' '.join([token.text for token in doc])
+    segmented_text = ' '.join([token.text for token in doc if not token.is_space])

     return segmented_text
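A standalone usage sketch of the updated function (the file/module name is not shown in the diff, so the model load and the function body are mirrored here); the new `is_space` filter keeps newline and multi-space tokens out of the joined output:

import spacy

nlp = spacy.load("ru_core_news_md")

def segment_text(text):
    doc = nlp(text)
    # drop pure-whitespace tokens (newlines, runs of spaces) before joining
    return ' '.join(token.text for token in doc if not token.is_space)

print(segment_text("Мама мыла раму.\nЭто   пример."))
# expected output, roughly: Мама мыла раму . Это пример .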
@@ -4,5 +4,3 @@ aiofiles
 spacy
 transformers
 easyocr
-# protobuf
-# sentencepiece