diff --git a/Dockerfile b/Dockerfile
index a3f735f..3afd607 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,11 +6,14 @@ WORKDIR /app
 COPY requirements.txt .
 
 # Install system dependencies required for building Python packages
-RUN apt-get update && apt-get install -y --no-install-recommends gcc libffi-dev libssl-dev
+RUN apt-get update && apt-get install -y --no-install-recommends wget gcc libffi-dev libssl-dev
 
 # Install Python dependencies including redis with hiredis support
 RUN pip install --no-cache-dir -r requirements.txt
 
+# Download and install the Russian language model
+RUN python -m spacy download ru_core_news_md
+
 COPY . .
 
 EXPOSE 8080
diff --git a/nlp/segment_text.py b/nlp/segment_text.py
index 56af820..6e3d02f 100644
--- a/nlp/segment_text.py
+++ b/nlp/segment_text.py
@@ -1,7 +1,8 @@
 import spacy
+from spacy.lang.ru.examples import sentences
 
 # Load the Russian language model
-nlp = spacy.load("ru_core_news_sm")
+nlp = spacy.load("ru_core_news_md")
 
 def segment_text(text):
     """
@@ -11,7 +12,7 @@ def segment_text(text):
     doc = nlp(text)
 
     # Extract words from the processed document
-    segmented_text = ' '.join([token.text for token in doc])
+    segmented_text = ' '.join([token.text for token in doc if not token.is_space])
 
     return segmented_text
 
diff --git a/requirements.txt b/requirements.txt
index c4d9f2b..0c7e787 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,4 @@ aiohttp
 aiofiles
 spacy
 transformers
-easyocr
-# protobuf
-# sentencepiece
\ No newline at end of file
+easyocr
\ No newline at end of file
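
Note on the segmentation change: a minimal usage sketch of the updated
segment_text(), assuming the ru_core_news_md model has already been
installed (e.g. via `python -m spacy download ru_core_news_md`, as the
Dockerfile change above does at build time). The sample sentence is
illustrative only:

    import spacy

    # Medium-size Russian pipeline, as switched to in this patch
    nlp = spacy.load("ru_core_news_md")

    def segment_text(text):
        doc = nlp(text)
        # token.is_space filters whitespace-only tokens (newlines, runs of spaces)
        return ' '.join(token.text for token in doc if not token.is_space)

    print(segment_text("Привет, мир!\nКак дела?"))
    # With the is_space filter, the "\n" token no longer leaks into the output.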