diff --git a/Dockerfile b/Dockerfile index 1f753b9..081ccf4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,8 +5,10 @@ WORKDIR /app COPY requirements.txt . +# Install system dependencies required for building Python packages RUN apt-get update && apt-get install -y --no-install-recommends gcc libffi-dev libssl-dev -RUN pip install asyncio aiohttp redis[hiredis] + +# Install Python dependencies including redis with hiredis support RUN pip install --no-cache-dir -r requirements.txt # Stage 2: Final stage @@ -15,7 +17,10 @@ FROM python:slim WORKDIR /app # Copy only necessary files from the builder stage +COPY --from=builder /usr/local/lib/python/dist-packages /usr/local/lib/python/dist-packages +COPY --from=builder /usr/local/lib/python3/dist-packages /usr/local/lib/python3/dist-packages COPY --from=builder /usr/local/lib/python3.*/dist-packages /usr/local/lib/python3.*/dist-packages + COPY . . EXPOSE 8080 diff --git a/nlp/normalize.py b/nlp/normalize.py index 319cc58..e89ac0f 100644 --- a/nlp/normalize.py +++ b/nlp/normalize.py @@ -8,11 +8,14 @@ model = T5ForConditionalGeneration.from_pretrained("google/byt5-small") def is_russian_wording(text): """ - Check if the text contains any Russian characters by checking + Check if the text contains more than one Russian character by checking each character against the Unicode range for Cyrillic. """ + counter = 0 for char in text: if "\u0400" <= char <= "\u04ff": # Unicode range for Cyrillic characters + counter += 1 + if counter > 1: return True return False @@ -39,17 +42,15 @@ def normalize(text): Normalize English text to resemble Russian characters. 
""" # Segment the text first - segmented_text = segment_text( + t = segment_text( text.replace(" ", " ").replace(" ", " ").replace(" ", " ") ) - # Normalize after segmentation - segmented_text = segmented_text.lower() - - if is_russian_wording(segmented_text): + if is_russian_wording(t): # Normalize the text by replacing characters normalized_text = ( - segmented_text.replace("e", "е") + t.lower() + .replace("e", "е") .replace("o", "о") .replace("x", "х") .replace("a", "а") @@ -74,7 +75,7 @@ def normalize(text): return normalized_text - return segmented_text + return t # Example usage