From d55448398dc91d8770181b154bf34b5bdab7684a Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 5 Mar 2025 20:08:21 +0000 Subject: [PATCH 01/40] feat(search.py): change to txtai server, with ai model. And fix granian workers --- .gitea/workflows/main.yml | 12 +- .gitignore | 3 +- main.py | 9 +- requirements.txt | 4 + server.py | 12 +- services/search.py | 353 +++++++++++++++++++------------------- 6 files changed, 211 insertions(+), 182 deletions(-) diff --git a/.gitea/workflows/main.yml b/.gitea/workflows/main.yml index 18730b95..86ae2988 100644 --- a/.gitea/workflows/main.yml +++ b/.gitea/workflows/main.yml @@ -29,7 +29,17 @@ jobs: if: github.ref == 'refs/heads/dev' uses: dokku/github-action@master with: - branch: 'dev' + branch: 'main' force: true git_remote_url: 'ssh://dokku@v2.discours.io:22/core' ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Push to dokku for staging branch + if: github.ref == 'refs/heads/staging' + uses: dokku/github-action@master + with: + branch: 'main' + force: true + git_remote_url: 'ssh://dokku@staging.discours.io:22/core + ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} + git_push_flags: '--force' \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4db9e7e4..502d180d 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,5 @@ views.json *.key *.crt *cache.json -.cursor \ No newline at end of file +.cursor +.devcontainer/ diff --git a/main.py b/main.py index ff64c974..7c4a722f 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,8 @@ from cache.revalidator import revalidation_manager from services.exception import ExceptionHandlerMiddleware from services.redis import redis from services.schema import create_all_tables, resolvers -from services.search import search_service +#from services.search import search_service +from services.search import search_service, initialize_search_index from services.viewed import ViewedStorage from services.webhook import WebhookEndpoint, create_webhook_endpoint from settings import DEV_SERVER_PID_FILE_NAME, MODE @@ -47,6 +48,12 @@ async def lifespan(_app): start(), revalidation_manager.start(), ) + + # After basic initialization is complete, fetch shouts and initialize search + from services.db import fetch_all_shouts # Import your database access function + all_shouts = await fetch_all_shouts() # Replace with your actual function + await initialize_search_index(all_shouts) + yield finally: tasks = [redis.disconnect(), ViewedStorage.stop(), revalidation_manager.stop()] diff --git a/requirements.txt b/requirements.txt index 56b09175..ccab19f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,10 @@ gql ariadne granian +# NLP and search +txtai[embeddings] +sentence-transformers + pydantic fakeredis pytest diff --git a/server.py b/server.py index 30009c89..e34609b1 100644 --- a/server.py +++ b/server.py @@ -3,7 +3,8 @@ from pathlib import Path from granian.constants import Interfaces from granian.log import LogLevels -from granian.server import Granian +from granian.server import Server +from sentence_transformers import SentenceTransformer from settings import PORT from utils.logger import root_logger as logger @@ -11,12 +12,17 @@ from utils.logger import root_logger as logger if __name__ == "__main__": logger.info("started") try: - granian_instance = Granian( + # Preload the model before starting the server + logger.info("Loading sentence transformer model...") + model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2') + logger.info("Model loaded successfully!") + 
+ granian_instance = Server( "main:app", address="0.0.0.0", port=PORT, interface=Interfaces.ASGI, - threads=4, + workers=4, websockets=False, log_level=LogLevels.debug, backlog=2048, diff --git a/services/search.py b/services/search.py index 9c9b13e9..b8b97b60 100644 --- a/services/search.py +++ b/services/search.py @@ -2,8 +2,9 @@ import asyncio import json import logging import os +import concurrent.futures -from opensearchpy import OpenSearch +from txtai.embeddings import Embeddings from services.redis import redis from utils.encoders import CustomJSONEncoder @@ -12,220 +13,220 @@ from utils.encoders import CustomJSONEncoder logger = logging.getLogger("search") logger.setLevel(logging.WARNING) -ELASTIC_HOST = os.environ.get("ELASTIC_HOST", "").replace("https://", "") -ELASTIC_USER = os.environ.get("ELASTIC_USER", "") -ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", "") -ELASTIC_PORT = os.environ.get("ELASTIC_PORT", 9200) -ELASTIC_URL = os.environ.get( - "ELASTIC_URL", - f"https://{ELASTIC_USER}:{ELASTIC_PASSWORD}@{ELASTIC_HOST}:{ELASTIC_PORT}", -) REDIS_TTL = 86400 # 1 день в секундах -index_settings = { - "settings": { - "index": {"number_of_shards": 1, "auto_expand_replicas": "0-all"}, - "analysis": { - "analyzer": { - "ru": { - "tokenizer": "standard", - "filter": ["lowercase", "ru_stop", "ru_stemmer"], - } - }, - "filter": { - "ru_stemmer": {"type": "stemmer", "language": "russian"}, - "ru_stop": {"type": "stop", "stopwords": "_russian_"}, - }, - }, - }, - "mappings": { - "properties": { - "body": {"type": "text", "analyzer": "ru"}, - "title": {"type": "text", "analyzer": "ru"}, - "subtitle": {"type": "text", "analyzer": "ru"}, - "lead": {"type": "text", "analyzer": "ru"}, - "media": {"type": "text", "analyzer": "ru"}, - } - }, -} - -expected_mapping = index_settings["mappings"] - -# Создание цикла событий -search_loop = asyncio.get_event_loop() - -# В начале файла добавим флаг -SEARCH_ENABLED = bool(os.environ.get("ELASTIC_HOST", "")) - - -def get_indices_stats(): - indices_stats = search_service.client.cat.indices(format="json") - for index_info in indices_stats: - index_name = index_info["index"] - if not index_name.startswith("."): - index_health = index_info["health"] - index_status = index_info["status"] - pri_shards = index_info["pri"] - rep_shards = index_info["rep"] - docs_count = index_info["docs.count"] - docs_deleted = index_info["docs.deleted"] - store_size = index_info["store.size"] - pri_store_size = index_info["pri.store.size"] - - logger.info(f"Index: {index_name}") - logger.info(f"Health: {index_health}") - logger.info(f"Status: {index_status}") - logger.info(f"Primary Shards: {pri_shards}") - logger.info(f"Replica Shards: {rep_shards}") - logger.info(f"Documents Count: {docs_count}") - logger.info(f"Deleted Documents: {docs_deleted}") - logger.info(f"Store Size: {store_size}") - logger.info(f"Primary Store Size: {pri_store_size}") +# Configuration for txtai search +SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) +# Thread executor for non-blocking initialization +thread_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) class SearchService: def __init__(self, index_name="search_index"): logger.info("Инициализируем поиск...") self.index_name = index_name - self.client = None - self.lock = asyncio.Lock() - - # Инициализация клиента OpenSearch только если поиск включен - if SEARCH_ENABLED: - try: - self.client = OpenSearch( - hosts=[{"host": ELASTIC_HOST, "port": ELASTIC_PORT}], - http_compress=True, 
- http_auth=(ELASTIC_USER, ELASTIC_PASSWORD), - use_ssl=True, - verify_certs=False, - ssl_assert_hostname=False, - ssl_show_warn=False, - ) - logger.info("Клиент OpenSearch.org подключен") - search_loop.create_task(self.check_index()) - except Exception as exc: - logger.warning(f"Поиск отключен из-за ошибки подключения: {exc}") - self.client = None - else: - logger.info("Поиск отключен (ELASTIC_HOST не установлен)") + self.embeddings = None + self._initialization_future = None + self.available = SEARCH_ENABLED + + if not self.available: + logger.info("Поиск отключен (SEARCH_ENABLED = False)") + return + + # Initialize embeddings in background thread + self._initialization_future = thread_executor.submit(self._init_embeddings) + + def _init_embeddings(self): + """Initialize txtai embeddings in a background thread""" + try: + # Use the same model as in TopicClassifier + model_path = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" + + # Configure embeddings with content storage and quantization for lower memory usage + self.embeddings = Embeddings({ + "path": model_path, + "content": True, + "quantize": True + }) + logger.info("txtai embeddings initialized successfully") + return True + except Exception as e: + logger.error(f"Failed to initialize txtai embeddings: {e}") + self.available = False + return False async def info(self): - if not SEARCH_ENABLED: + """Return information about search service""" + if not self.available: return {"status": "disabled"} try: - return get_indices_stats() + if not self.is_ready(): + return {"status": "initializing", "model": "paraphrase-multilingual-mpnet-base-v2"} + + return { + "status": "active", + "count": len(self.embeddings) if self.embeddings else 0, + "model": "paraphrase-multilingual-mpnet-base-v2" + } except Exception as e: logger.error(f"Failed to get search info: {e}") return {"status": "error", "message": str(e)} - def delete_index(self): - if self.client: - logger.warning(f"[!!!] Удаляем индекс {self.index_name}") - self.client.indices.delete(index=self.index_name, ignore_unavailable=True) - - def create_index(self): - if self.client: - logger.info(f"Создается индекс: {self.index_name}") - self.client.indices.create(index=self.index_name, body=index_settings) - logger.info(f"Индекс {self.index_name} создан") - - async def check_index(self): - if self.client: - logger.info(f"Проверяем индекс {self.index_name}...") - if not self.client.indices.exists(index=self.index_name): - self.create_index() - self.client.indices.put_mapping(index=self.index_name, body=expected_mapping) - else: - logger.info(f"Найден существующий индекс {self.index_name}") - # Проверка и обновление структуры индекса, если необходимо - result = self.client.indices.get_mapping(index=self.index_name) - if isinstance(result, str): - result = json.loads(result) - if isinstance(result, dict): - mapping = result.get(self.index_name, {}).get("mappings") - logger.info(f"Найдена структура индексации: {mapping['properties'].keys()}") - expected_keys = expected_mapping["properties"].keys() - if mapping and mapping["properties"].keys() != expected_keys: - logger.info(f"Ожидаемая структура индексации: {expected_mapping}") - logger.warning("[!!!] 
Требуется переиндексация всех данных") - self.delete_index() - self.client = None - else: - logger.error("клиент не инициализован, невозможно проверить индекс") + def is_ready(self): + """Check if embeddings are fully initialized and ready""" + return self.embeddings is not None and self.available def index(self, shout): - if not SEARCH_ENABLED: + """Index a single document""" + if not self.available: return - if self.client: - logger.info(f"Индексируем пост {shout.id}") - index_body = { - "body": shout.body, - "title": shout.title, - "subtitle": shout.subtitle, - "lead": shout.lead, - "media": shout.media, - } - asyncio.create_task(self.perform_index(shout, index_body)) + logger.info(f"Индексируем пост {shout.id}") + + # Start in background to not block + asyncio.create_task(self.perform_index(shout)) - async def perform_index(self, shout, index_body): - if self.client: - try: - await asyncio.wait_for( - self.client.index(index=self.index_name, id=str(shout.id), body=index_body), timeout=40.0 - ) - except asyncio.TimeoutError: - logger.error(f"Indexing timeout for shout {shout.id}") - except Exception as e: - logger.error(f"Indexing error for shout {shout.id}: {e}") + async def perform_index(self, shout): + """Actually perform the indexing operation""" + if not self.is_ready(): + # If embeddings not ready, wait for initialization + if self._initialization_future and not self._initialization_future.done(): + try: + # Wait for initialization to complete with timeout + await asyncio.get_event_loop().run_in_executor( + None, lambda: self._initialization_future.result(timeout=30)) + except Exception as e: + logger.error(f"Embeddings initialization failed: {e}") + return + + if not self.is_ready(): + logger.error(f"Cannot index shout {shout.id}: embeddings not ready") + return + + try: + # Combine all text fields + text = " ".join(filter(None, [ + shout.title or "", + shout.subtitle or "", + shout.lead or "", + shout.body or "", + shout.media or "" + ])) + + # Use upsert for individual documents + await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.embeddings.upsert([(str(shout.id), text, None)]) + ) + logger.info(f"Пост {shout.id} успешно индексирован") + except Exception as e: + logger.error(f"Indexing error for shout {shout.id}: {e}") + + async def bulk_index(self, shouts): + """Index multiple documents at once""" + if not self.available or not shouts: + return + + if not self.is_ready(): + # Wait for initialization if needed + if self._initialization_future and not self._initialization_future.done(): + try: + await asyncio.get_event_loop().run_in_executor( + None, lambda: self._initialization_future.result(timeout=30)) + except Exception as e: + logger.error(f"Embeddings initialization failed: {e}") + return + + if not self.is_ready(): + logger.error("Cannot perform bulk indexing: embeddings not ready") + return + + documents = [] + for shout in shouts: + text = " ".join(filter(None, [ + shout.title or "", + shout.subtitle or "", + shout.lead or "", + shout.body or "", + shout.media or "" + ])) + documents.append((str(shout.id), text, None)) + + try: + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.embeddings.upsert(documents)) + logger.info(f"Bulk indexed {len(documents)} documents") + except Exception as e: + logger.error(f"Bulk indexing error: {e}") async def search(self, text, limit, offset): - if not SEARCH_ENABLED: + """Search documents""" + if not self.available: return [] - + + # Check Redis cache first + redis_key = 
f"search:{text}:{offset}+{limit}" + cached = await redis.get(redis_key) + if cached: + return json.loads(cached) + logger.info(f"Ищем: {text} {offset}+{limit}") - search_body = { - "query": {"multi_match": {"query": text, "fields": ["title", "lead", "subtitle", "body", "media"]}} - } - - if self.client: - search_response = self.client.search( - index=self.index_name, - body=search_body, - size=limit, - from_=offset, - _source=False, - _source_excludes=["title", "body", "subtitle", "media", "lead", "_index"], - ) - hits = search_response["hits"]["hits"] - results = [{"id": hit["_id"], "score": hit["_score"]} for hit in hits] - - # если результаты не пустые - if results: - # Кэширование в Redis с TTL - redis_key = f"search:{text}:{offset}+{limit}" + + if not self.is_ready(): + # Wait for initialization if needed + if self._initialization_future and not self._initialization_future.done(): + try: + await asyncio.get_event_loop().run_in_executor( + None, lambda: self._initialization_future.result(timeout=30)) + except Exception as e: + logger.error(f"Embeddings initialization failed: {e}") + return [] + + if not self.is_ready(): + logger.error("Cannot search: embeddings not ready") + return [] + + try: + # Search with txtai (need to request more to handle offset) + total = offset + limit + results = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.embeddings.search(text, total)) + + # Apply offset and convert to the expected format + results = results[offset:offset+limit] + formatted_results = [{"id": doc_id, "score": float(score)} for score, doc_id in results] + + # Cache results + if formatted_results: await redis.execute( "SETEX", redis_key, REDIS_TTL, - json.dumps(results, cls=CustomJSONEncoder), + json.dumps(formatted_results, cls=CustomJSONEncoder), ) - return results - return [] + return formatted_results + except Exception as e: + logger.error(f"Search error: {e}") + return [] +# Create the search service singleton search_service = SearchService() +# Keep the API exactly the same to maintain compatibility async def search_text(text: str, limit: int = 50, offset: int = 0): payload = [] - if search_service.client: - # Использование метода search_post из OpenSearchService + if search_service.available: payload = await search_service.search(text, limit, offset) return payload -# Проверить что URL корректный -OPENSEARCH_URL = os.getenv("OPENSEARCH_URL", "rc1a-3n5pi3bhuj9gieel.mdb.yandexcloud.net") +# Function to initialize search with existing data +async def initialize_search_index(shouts_data): + """Initialize search index with existing data during application startup""" + if SEARCH_ENABLED: + logger.info("Initializing search index with existing data...") + await search_service.bulk_index(shouts_data) + logger.info(f"Search index initialized with {len(shouts_data)} documents") -- 2.43.0 From e6a416383d985739b2863f1216bd961821e793ac Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 5 Mar 2025 20:16:32 +0000 Subject: [PATCH 02/40] debug: fixed workflows gitea --- .gitea/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitea/workflows/main.yml b/.gitea/workflows/main.yml index 86ae2988..d7c55ea0 100644 --- a/.gitea/workflows/main.yml +++ b/.gitea/workflows/main.yml @@ -39,7 +39,6 @@ jobs: uses: dokku/github-action@master with: branch: 'main' - force: true - git_remote_url: 'ssh://dokku@staging.discours.io:22/core + git_remote_url: 'ssh://dokku@staging.discours.io:22/core' ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 
git_push_flags: '--force' \ No newline at end of file -- 2.43.0 From 59e71c81448998d79eef064f240914fc54eafcc2 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 5 Mar 2025 20:17:34 +0000 Subject: [PATCH 03/40] debug: fixed workflows gitea --- .gitea/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitea/workflows/main.yml b/.gitea/workflows/main.yml index d7c55ea0..f65ae48a 100644 --- a/.gitea/workflows/main.yml +++ b/.gitea/workflows/main.yml @@ -38,7 +38,7 @@ jobs: if: github.ref == 'refs/heads/staging' uses: dokku/github-action@master with: - branch: 'main' + branch: 'dev' git_remote_url: 'ssh://dokku@staging.discours.io:22/core' ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} git_push_flags: '--force' \ No newline at end of file -- 2.43.0 From c0b2116da2c0696b6908da752750fc4e1c966311 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 5 Mar 2025 20:32:34 +0000 Subject: [PATCH 04/40] feat(db.py): added fetch_all_shouts, to populate the search index --- services/db.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/services/db.py b/services/db.py index bd3072e4..01e3961c 100644 --- a/services/db.py +++ b/services/db.py @@ -181,3 +181,27 @@ def get_json_builder(): # Используем их в коде json_builder, json_array_builder, json_cast = get_json_builder() + +async def fetch_all_shouts(session=None): + """Fetch all published shouts for search indexing""" + from orm.shout import Shout + + close_session = False + if session is None: + session = local_session() + close_session = True + + try: + # Fetch only published and non-deleted shouts + query = session.query(Shout).filter( + Shout.published_at.is_not(None), + Shout.deleted_at.is_(None) + ) + shouts = query.all() + return shouts + except Exception as e: + logger.error(f"Error fetching shouts for search indexing: {e}") + return [] + finally: + if close_session: + session.close() \ No newline at end of file -- 2.43.0 From f249752db5b53c5ee359760cde7263cdf0ba7203 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 12 Mar 2025 12:06:09 -0300 Subject: [PATCH 05/40] feat: moved txtai and search procedure in different instance --- main.py | 10 +++- requirements.txt | 3 +- server.py | 7 +-- services/search.py | 144 ++++++++++++--------------------------------- 4 files changed, 48 insertions(+), 116 deletions(-) diff --git a/main.py b/main.py index 7c4a722f..6f88b382 100644 --- a/main.py +++ b/main.py @@ -35,6 +35,14 @@ async def start(): f.write(str(os.getpid())) print(f"[main] process started in {MODE} mode") +async def check_search_service(): + """Check if search service is available and log result""" + info = await search_service.info() + if info.get("status") in ["error", "unavailable"]: + print(f"[WARNING] Search service unavailable: {info.get('message', 'unknown reason')}") + else: + print(f"[INFO] Search service is available: {info}") + async def lifespan(_app): try: @@ -44,7 +52,7 @@ async def lifespan(_app): precache_data(), ViewedStorage.init(), create_webhook_endpoint(), - search_service.info(), + check_search_service(), start(), revalidation_manager.start(), ) diff --git a/requirements.txt b/requirements.txt index ccab19f3..40212b99 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,8 +18,7 @@ ariadne granian # NLP and search -txtai[embeddings] -sentence-transformers +httpx pydantic fakeredis diff --git a/server.py b/server.py index e34609b1..0ba5b97c 100644 --- a/server.py +++ b/server.py @@ -4,7 +4,6 @@ from pathlib import Path from 
granian.constants import Interfaces from granian.log import LogLevels from granian.server import Server -from sentence_transformers import SentenceTransformer from settings import PORT from utils.logger import root_logger as logger @@ -12,11 +11,7 @@ from utils.logger import root_logger as logger if __name__ == "__main__": logger.info("started") try: - # Preload the model before starting the server - logger.info("Loading sentence transformer model...") - model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2') - logger.info("Model loaded successfully!") - + granian_instance = Server( "main:app", address="0.0.0.0", diff --git a/services/search.py b/services/search.py index b8b97b60..4328ce4c 100644 --- a/services/search.py +++ b/services/search.py @@ -2,9 +2,7 @@ import asyncio import json import logging import os -import concurrent.futures - -from txtai.embeddings import Embeddings +import httpx from services.redis import redis from utils.encoders import CustomJSONEncoder @@ -13,96 +11,53 @@ from utils.encoders import CustomJSONEncoder logger = logging.getLogger("search") logger.setLevel(logging.WARNING) -REDIS_TTL = 86400 # 1 день в секундах +REDIS_TTL = 86400 # 1 day in seconds -# Configuration for txtai search +# Configuration for search service SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) -# Thread executor for non-blocking initialization -thread_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) +TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "http://txtai-service:8000") class SearchService: - def __init__(self, index_name="search_index"): - logger.info("Инициализируем поиск...") - self.index_name = index_name - self.embeddings = None - self._initialization_future = None + def __init__(self): + logger.info("Initializing search service...") self.available = SEARCH_ENABLED + self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) if not self.available: - logger.info("Поиск отключен (SEARCH_ENABLED = False)") - return + logger.info("Search disabled (SEARCH_ENABLED = False)") - # Initialize embeddings in background thread - self._initialization_future = thread_executor.submit(self._init_embeddings) - - def _init_embeddings(self): - """Initialize txtai embeddings in a background thread""" - try: - # Use the same model as in TopicClassifier - model_path = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" - - # Configure embeddings with content storage and quantization for lower memory usage - self.embeddings = Embeddings({ - "path": model_path, - "content": True, - "quantize": True - }) - logger.info("txtai embeddings initialized successfully") - return True - except Exception as e: - logger.error(f"Failed to initialize txtai embeddings: {e}") - self.available = False - return False - async def info(self): """Return information about search service""" if not self.available: return {"status": "disabled"} try: - if not self.is_ready(): - return {"status": "initializing", "model": "paraphrase-multilingual-mpnet-base-v2"} - - return { - "status": "active", - "count": len(self.embeddings) if self.embeddings else 0, - "model": "paraphrase-multilingual-mpnet-base-v2" - } + response = await self.client.get("/info") + response.raise_for_status() + return response.json() except Exception as e: logger.error(f"Failed to get search info: {e}") return {"status": "error", "message": str(e)} def is_ready(self): - """Check if embeddings are fully initialized and ready""" - return 
self.embeddings is not None and self.available + """Check if service is available""" + return self.available def index(self, shout): """Index a single document""" if not self.available: return - logger.info(f"Индексируем пост {shout.id}") + logger.info(f"Indexing post {shout.id}") # Start in background to not block asyncio.create_task(self.perform_index(shout)) async def perform_index(self, shout): """Actually perform the indexing operation""" - if not self.is_ready(): - # If embeddings not ready, wait for initialization - if self._initialization_future and not self._initialization_future.done(): - try: - # Wait for initialization to complete with timeout - await asyncio.get_event_loop().run_in_executor( - None, lambda: self._initialization_future.result(timeout=30)) - except Exception as e: - logger.error(f"Embeddings initialization failed: {e}") - return - - if not self.is_ready(): - logger.error(f"Cannot index shout {shout.id}: embeddings not ready") - return + if not self.available: + return try: # Combine all text fields @@ -114,12 +69,13 @@ class SearchService: shout.media or "" ])) - # Use upsert for individual documents - await asyncio.get_event_loop().run_in_executor( - None, - lambda: self.embeddings.upsert([(str(shout.id), text, None)]) + # Send to txtai service + response = await self.client.post( + "/index", + json={"id": str(shout.id), "text": text} ) - logger.info(f"Пост {shout.id} успешно индексирован") + response.raise_for_status() + logger.info(f"Post {shout.id} successfully indexed") except Exception as e: logger.error(f"Indexing error for shout {shout.id}: {e}") @@ -127,20 +83,6 @@ class SearchService: """Index multiple documents at once""" if not self.available or not shouts: return - - if not self.is_ready(): - # Wait for initialization if needed - if self._initialization_future and not self._initialization_future.done(): - try: - await asyncio.get_event_loop().run_in_executor( - None, lambda: self._initialization_future.result(timeout=30)) - except Exception as e: - logger.error(f"Embeddings initialization failed: {e}") - return - - if not self.is_ready(): - logger.error("Cannot perform bulk indexing: embeddings not ready") - return documents = [] for shout in shouts: @@ -151,11 +93,14 @@ class SearchService: shout.body or "", shout.media or "" ])) - documents.append((str(shout.id), text, None)) + documents.append({"id": str(shout.id), "text": text}) try: - await asyncio.get_event_loop().run_in_executor( - None, lambda: self.embeddings.upsert(documents)) + response = await self.client.post( + "/bulk-index", + json={"documents": documents} + ) + response.raise_for_status() logger.info(f"Bulk indexed {len(documents)} documents") except Exception as e: logger.error(f"Bulk indexing error: {e}") @@ -171,31 +116,16 @@ class SearchService: if cached: return json.loads(cached) - logger.info(f"Ищем: {text} {offset}+{limit}") - - if not self.is_ready(): - # Wait for initialization if needed - if self._initialization_future and not self._initialization_future.done(): - try: - await asyncio.get_event_loop().run_in_executor( - None, lambda: self._initialization_future.result(timeout=30)) - except Exception as e: - logger.error(f"Embeddings initialization failed: {e}") - return [] - - if not self.is_ready(): - logger.error("Cannot search: embeddings not ready") - return [] + logger.info(f"Searching: {text} {offset}+{limit}") try: - # Search with txtai (need to request more to handle offset) - total = offset + limit - results = await asyncio.get_event_loop().run_in_executor( - 
None, lambda: self.embeddings.search(text, total)) - - # Apply offset and convert to the expected format - results = results[offset:offset+limit] - formatted_results = [{"id": doc_id, "score": float(score)} for score, doc_id in results] + response = await self.client.post( + "/search", + json={"text": text, "limit": limit, "offset": offset} + ) + response.raise_for_status() + result = response.json() + formatted_results = result.get("results", []) # Cache results if formatted_results: @@ -229,4 +159,4 @@ async def initialize_search_index(shouts_data): if SEARCH_ENABLED: logger.info("Initializing search index with existing data...") await search_service.bulk_index(shouts_data) - logger.info(f"Search index initialized with {len(shouts_data)} documents") + logger.info(f"Search index initialized with {len(shouts_data)} documents") \ No newline at end of file -- 2.43.0 From a9c7ac49d6324569aa6fa11f7fb204421e26fb06 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 12 Mar 2025 13:07:27 -0300 Subject: [PATCH 06/40] feat: with logs >>> --- services/search.py | 146 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 21 deletions(-) diff --git a/services/search.py b/services/search.py index 4328ce4c..ca5f8fff 100644 --- a/services/search.py +++ b/services/search.py @@ -3,6 +3,7 @@ import json import logging import os import httpx +from typing import List, Dict, Any, Optional from services.redis import redis from utils.encoders import CustomJSONEncoder @@ -15,18 +16,84 @@ REDIS_TTL = 86400 # 1 day in seconds # Configuration for search service SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) -TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "http://txtai-service:8000") +TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL") +# Add retry configuration +MAX_RETRIES = int(os.environ.get("TXTAI_MAX_RETRIES", "3")) +RETRY_DELAY = float(os.environ.get("TXTAI_RETRY_DELAY", "1.0")) +# Add request timeout configuration +REQUEST_TIMEOUT = float(os.environ.get("TXTAI_REQUEST_TIMEOUT", "30.0")) +# Add health check configuration +HEALTH_CHECK_INTERVAL = int(os.environ.get("SEARCH_HEALTH_CHECK_INTERVAL", "300")) # 5 minutes class SearchService: def __init__(self): - logger.info("Initializing search service...") + logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}") self.available = SEARCH_ENABLED - self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) + self.client = httpx.AsyncClient(timeout=REQUEST_TIMEOUT, base_url=TXTAI_SERVICE_URL) + self.last_health_check = 0 + self.health_status = False if not self.available: logger.info("Search disabled (SEARCH_ENABLED = False)") + + # Schedule health check if enabled + if self.available: + asyncio.create_task(self._schedule_health_checks()) + async def _schedule_health_checks(self): + """Schedule periodic health checks""" + while True: + try: + await self._check_health() + await asyncio.sleep(HEALTH_CHECK_INTERVAL) + except Exception as e: + logger.error(f"Error in health check scheduler: {e}") + await asyncio.sleep(HEALTH_CHECK_INTERVAL) + + async def _check_health(self): + """Check if search service is healthy""" + try: + info = await self.info() + self.health_status = info.get("status") != "error" + if self.health_status: + logger.info("Search service is healthy") + else: + logger.warning("Search service is unhealthy") + return self.health_status + except Exception as e: + self.health_status = False + logger.warning(f"Search health check 
failed: {e}") + return False + + async def _retry_operation(self, operation_func, *args, **kwargs): + """Execute an operation with retries""" + if not self.available: + return None + + for attempt in range(MAX_RETRIES): + try: + return await operation_func(*args, **kwargs) + except httpx.ReadTimeout: + if attempt == MAX_RETRIES - 1: + raise + logger.warning(f"Request timed out, retrying ({attempt+1}/{MAX_RETRIES})") + await asyncio.sleep(RETRY_DELAY * (attempt + 1)) + except httpx.ConnectError as e: + if attempt == MAX_RETRIES - 1: + self.available = False + logger.error(f"Connection error after {MAX_RETRIES} attempts: {e}") + raise + logger.warning(f"Connection error, retrying ({attempt+1}/{MAX_RETRIES}): {e}") + await asyncio.sleep(RETRY_DELAY * (attempt + 1)) + except Exception as e: + if attempt == MAX_RETRIES - 1: + raise + logger.warning(f"Error, retrying ({attempt+1}/{MAX_RETRIES}): {e}") + await asyncio.sleep(RETRY_DELAY * (attempt + 1)) + + return None # Should not reach here + async def info(self): """Return information about search service""" if not self.available: @@ -42,7 +109,7 @@ class SearchService: def is_ready(self): """Check if service is available""" - return self.available + return self.available and self.health_status def index(self, shout): """Index a single document""" @@ -69,15 +136,24 @@ class SearchService: shout.media or "" ])) - # Send to txtai service - response = await self.client.post( - "/index", - json={"id": str(shout.id), "text": text} + # Send to txtai service with retry + await self._retry_operation( + self._perform_index_request, + str(shout.id), + text ) - response.raise_for_status() logger.info(f"Post {shout.id} successfully indexed") except Exception as e: logger.error(f"Indexing error for shout {shout.id}: {e}") + + async def _perform_index_request(self, id, text): + """Execute the actual index request""" + response = await self.client.post( + "/index", + json={"id": id, "text": text} + ) + response.raise_for_status() + return response.json() async def bulk_index(self, shouts): """Index multiple documents at once""" @@ -96,14 +172,29 @@ class SearchService: documents.append({"id": str(shout.id), "text": text}) try: - response = await self.client.post( - "/bulk-index", - json={"documents": documents} - ) - response.raise_for_status() - logger.info(f"Bulk indexed {len(documents)} documents") + # Using chunking to avoid large requests + chunk_size = 100 # Adjust based on your needs + for i in range(0, len(documents), chunk_size): + chunk = documents[i:i+chunk_size] + logger.info(f"Bulk indexing chunk {i//chunk_size + 1}/{(len(documents)-1)//chunk_size + 1} ({len(chunk)} documents)") + + await self._retry_operation( + self._perform_bulk_index_request, + chunk + ) + + logger.info(f"Bulk indexed {len(documents)} documents in total") except Exception as e: logger.error(f"Bulk indexing error: {e}") + + async def _perform_bulk_index_request(self, documents): + """Execute the actual bulk index request""" + response = await self.client.post( + "/bulk-index", + json={"documents": documents} + ) + response.raise_for_status() + return response.json() async def search(self, text, limit, offset): """Search documents""" @@ -119,12 +210,16 @@ class SearchService: logger.info(f"Searching: {text} {offset}+{limit}") try: - response = await self.client.post( - "/search", - json={"text": text, "limit": limit, "offset": offset} + result = await self._retry_operation( + self._perform_search_request, + text, + limit, + offset ) - response.raise_for_status() - result = 
response.json() + + if not result: + return [] + formatted_results = result.get("results", []) # Cache results @@ -139,6 +234,15 @@ class SearchService: except Exception as e: logger.error(f"Search error: {e}") return [] + + async def _perform_search_request(self, text, limit, offset): + """Execute the actual search request""" + response = await self.client.post( + "/search", + json={"text": text, "limit": limit, "offset": offset} + ) + response.raise_for_status() + return response.json() # Create the search service singleton @@ -153,7 +257,7 @@ async def search_text(text: str, limit: int = 50, offset: int = 0): return payload -# Function to initialize search with existing data +# Function to initialize search index with existing data async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" if SEARCH_ENABLED: -- 2.43.0 From 24cca7f2cb7f030cac72355a052d1902474bb01b Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 12 Mar 2025 13:11:19 -0300 Subject: [PATCH 07/40] debug: something wrong one stap back with logs --- services/search.py | 144 +++++++-------------------------------------- 1 file changed, 20 insertions(+), 124 deletions(-) diff --git a/services/search.py b/services/search.py index ca5f8fff..c8186802 100644 --- a/services/search.py +++ b/services/search.py @@ -3,7 +3,6 @@ import json import logging import os import httpx -from typing import List, Dict, Any, Optional from services.redis import redis from utils.encoders import CustomJSONEncoder @@ -17,83 +16,17 @@ REDIS_TTL = 86400 # 1 day in seconds # Configuration for search service SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL") -# Add retry configuration -MAX_RETRIES = int(os.environ.get("TXTAI_MAX_RETRIES", "3")) -RETRY_DELAY = float(os.environ.get("TXTAI_RETRY_DELAY", "1.0")) -# Add request timeout configuration -REQUEST_TIMEOUT = float(os.environ.get("TXTAI_REQUEST_TIMEOUT", "30.0")) -# Add health check configuration -HEALTH_CHECK_INTERVAL = int(os.environ.get("SEARCH_HEALTH_CHECK_INTERVAL", "300")) # 5 minutes class SearchService: def __init__(self): - logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}") + logger.info("Initializing search service...") self.available = SEARCH_ENABLED - self.client = httpx.AsyncClient(timeout=REQUEST_TIMEOUT, base_url=TXTAI_SERVICE_URL) - self.last_health_check = 0 - self.health_status = False + self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) if not self.available: logger.info("Search disabled (SEARCH_ENABLED = False)") - - # Schedule health check if enabled - if self.available: - asyncio.create_task(self._schedule_health_checks()) - async def _schedule_health_checks(self): - """Schedule periodic health checks""" - while True: - try: - await self._check_health() - await asyncio.sleep(HEALTH_CHECK_INTERVAL) - except Exception as e: - logger.error(f"Error in health check scheduler: {e}") - await asyncio.sleep(HEALTH_CHECK_INTERVAL) - - async def _check_health(self): - """Check if search service is healthy""" - try: - info = await self.info() - self.health_status = info.get("status") != "error" - if self.health_status: - logger.info("Search service is healthy") - else: - logger.warning("Search service is unhealthy") - return self.health_status - except Exception as e: - self.health_status = False - logger.warning(f"Search health check failed: {e}") - return False - - async 
def _retry_operation(self, operation_func, *args, **kwargs): - """Execute an operation with retries""" - if not self.available: - return None - - for attempt in range(MAX_RETRIES): - try: - return await operation_func(*args, **kwargs) - except httpx.ReadTimeout: - if attempt == MAX_RETRIES - 1: - raise - logger.warning(f"Request timed out, retrying ({attempt+1}/{MAX_RETRIES})") - await asyncio.sleep(RETRY_DELAY * (attempt + 1)) - except httpx.ConnectError as e: - if attempt == MAX_RETRIES - 1: - self.available = False - logger.error(f"Connection error after {MAX_RETRIES} attempts: {e}") - raise - logger.warning(f"Connection error, retrying ({attempt+1}/{MAX_RETRIES}): {e}") - await asyncio.sleep(RETRY_DELAY * (attempt + 1)) - except Exception as e: - if attempt == MAX_RETRIES - 1: - raise - logger.warning(f"Error, retrying ({attempt+1}/{MAX_RETRIES}): {e}") - await asyncio.sleep(RETRY_DELAY * (attempt + 1)) - - return None # Should not reach here - async def info(self): """Return information about search service""" if not self.available: @@ -109,7 +42,7 @@ class SearchService: def is_ready(self): """Check if service is available""" - return self.available and self.health_status + return self.available def index(self, shout): """Index a single document""" @@ -136,24 +69,15 @@ class SearchService: shout.media or "" ])) - # Send to txtai service with retry - await self._retry_operation( - self._perform_index_request, - str(shout.id), - text + # Send to txtai service + response = await self.client.post( + "/index", + json={"id": str(shout.id), "text": text} ) + response.raise_for_status() logger.info(f"Post {shout.id} successfully indexed") except Exception as e: logger.error(f"Indexing error for shout {shout.id}: {e}") - - async def _perform_index_request(self, id, text): - """Execute the actual index request""" - response = await self.client.post( - "/index", - json={"id": id, "text": text} - ) - response.raise_for_status() - return response.json() async def bulk_index(self, shouts): """Index multiple documents at once""" @@ -172,29 +96,14 @@ class SearchService: documents.append({"id": str(shout.id), "text": text}) try: - # Using chunking to avoid large requests - chunk_size = 100 # Adjust based on your needs - for i in range(0, len(documents), chunk_size): - chunk = documents[i:i+chunk_size] - logger.info(f"Bulk indexing chunk {i//chunk_size + 1}/{(len(documents)-1)//chunk_size + 1} ({len(chunk)} documents)") - - await self._retry_operation( - self._perform_bulk_index_request, - chunk - ) - - logger.info(f"Bulk indexed {len(documents)} documents in total") + response = await self.client.post( + "/bulk-index", + json={"documents": documents} + ) + response.raise_for_status() + logger.info(f"Bulk indexed {len(documents)} documents") except Exception as e: logger.error(f"Bulk indexing error: {e}") - - async def _perform_bulk_index_request(self, documents): - """Execute the actual bulk index request""" - response = await self.client.post( - "/bulk-index", - json={"documents": documents} - ) - response.raise_for_status() - return response.json() async def search(self, text, limit, offset): """Search documents""" @@ -210,16 +119,12 @@ class SearchService: logger.info(f"Searching: {text} {offset}+{limit}") try: - result = await self._retry_operation( - self._perform_search_request, - text, - limit, - offset + response = await self.client.post( + "/search", + json={"text": text, "limit": limit, "offset": offset} ) - - if not result: - return [] - + response.raise_for_status() + result = 
response.json() formatted_results = result.get("results", []) # Cache results @@ -234,15 +139,6 @@ class SearchService: except Exception as e: logger.error(f"Search error: {e}") return [] - - async def _perform_search_request(self, text, limit, offset): - """Execute the actual search request""" - response = await self.client.post( - "/search", - json={"text": text, "limit": limit, "offset": offset} - ) - response.raise_for_status() - return response.json() # Create the search service singleton @@ -257,7 +153,7 @@ async def search_text(text: str, limit: int = 50, offset: int = 0): return payload -# Function to initialize search index with existing data +# Function to initialize search with existing data async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" if SEARCH_ENABLED: -- 2.43.0 From 39242d5e6c49499dc091987a4544219f34f0422a Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 12 Mar 2025 14:13:55 -0300 Subject: [PATCH 08/40] debug: add logs in search.py and change and input validation ... index ver too --- services/search.py | 192 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 161 insertions(+), 31 deletions(-) diff --git a/services/search.py b/services/search.py index c8186802..7b5bfb11 100644 --- a/services/search.py +++ b/services/search.py @@ -3,24 +3,26 @@ import json import logging import os import httpx +import time from services.redis import redis from utils.encoders import CustomJSONEncoder -# Set redis logging level to suppress DEBUG messages +# Set up proper logging logger = logging.getLogger("search") -logger.setLevel(logging.WARNING) +logger.setLevel(logging.INFO) # Change to INFO to see more details REDIS_TTL = 86400 # 1 day in seconds # Configuration for search service SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) -TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL") +TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "http://search-txtai.web.1:8000") +MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "100")) class SearchService: def __init__(self): - logger.info("Initializing search service...") + logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}") self.available = SEARCH_ENABLED self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) @@ -35,7 +37,9 @@ class SearchService: try: response = await self.client.get("/info") response.raise_for_status() - return response.json() + result = response.json() + logger.info(f"Search service info: {result}") + return result except Exception as e: logger.error(f"Failed to get search info: {e}") return {"status": "error", "message": str(e)} @@ -69,63 +73,163 @@ class SearchService: shout.media or "" ])) + if not text.strip(): + logger.warning(f"No text content to index for shout {shout.id}") + return + + # Log the document being indexed + logger.info(f"Indexing document: ID={shout.id}, Text length={len(text)}") + # Send to txtai service response = await self.client.post( "/index", json={"id": str(shout.id), "text": text} ) response.raise_for_status() - logger.info(f"Post {shout.id} successfully indexed") + result = response.json() + logger.info(f"Post {shout.id} successfully indexed: {result}") except Exception as e: logger.error(f"Indexing error for shout {shout.id}: {e}") async def bulk_index(self, shouts): """Index multiple documents at once""" if not self.available or not shouts: + logger.warning(f"Bulk indexing skipped: 
available={self.available}, shouts_count={len(shouts) if shouts else 0}") return + + start_time = time.time() + logger.info(f"Starting bulk indexing of {len(shouts)} documents") - documents = [] - for shout in shouts: - text = " ".join(filter(None, [ - shout.title or "", - shout.subtitle or "", - shout.lead or "", - shout.body or "", - shout.media or "" - ])) - documents.append({"id": str(shout.id), "text": text}) + # Process documents in batches + batch_size = MAX_BATCH_SIZE + total_indexed = 0 + total_skipped = 0 + + for i in range(0, len(shouts), batch_size): + batch = shouts[i:i+batch_size] + logger.info(f"Processing batch {i//batch_size + 1} of {(len(shouts)-1)//batch_size + 1}, size {len(batch)}") - try: - response = await self.client.post( - "/bulk-index", - json={"documents": documents} - ) - response.raise_for_status() - logger.info(f"Bulk indexed {len(documents)} documents") - except Exception as e: - logger.error(f"Bulk indexing error: {e}") + documents = [] + for shout in batch: + try: + # Clean and combine all text fields + text_fields = [] + for field_name in ['title', 'subtitle', 'lead', 'body']: + field_value = getattr(shout, field_name, None) + if field_value and isinstance(field_value, str) and field_value.strip(): + text_fields.append(field_value.strip()) + + # Process media field if it exists + media = getattr(shout, 'media', None) + if media: + if isinstance(media, str): + # Try to parse if it's JSON + try: + media_json = json.loads(media) + if isinstance(media_json, dict) and 'title' in media_json: + text_fields.append(media_json['title']) + if isinstance(media_json, dict) and 'body' in media_json: + text_fields.append(media_json['body']) + except json.JSONDecodeError: + text_fields.append(media) + elif isinstance(media, dict): + if 'title' in media: + text_fields.append(media['title']) + if 'body' in media: + text_fields.append(media['body']) + + # Combine fields into one text + text = " ".join(text_fields) + + if not text.strip(): + logger.debug(f"Skipping shout {shout.id}: no text content") + total_skipped += 1 + continue + + # Add to batch + documents.append({ + "id": str(shout.id), + "text": text + }) + total_indexed += 1 + + except Exception as e: + logger.error(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}") + total_skipped += 1 + + if not documents: + logger.warning(f"No valid documents in batch {i//batch_size + 1}") + continue + + try: + # Log a sample of the batch for debugging + if documents: + sample = documents[0] + logger.info(f"Sample document: id={sample['id']}, text_length={len(sample['text'])}") + + # Send batch to txtai service + logger.info(f"Sending batch of {len(documents)} documents to search service") + response = await self.client.post( + "/bulk-index", + json={"documents": documents} + ) + response.raise_for_status() + result = response.json() + logger.info(f"Batch {i//batch_size + 1} indexed successfully: {result}") + except Exception as e: + logger.error(f"Bulk indexing error for batch {i//batch_size + 1}: {e}") + + elapsed = time.time() - start_time + logger.info(f"Bulk indexing completed in {elapsed:.2f}s: {total_indexed} indexed, {total_skipped} skipped") async def search(self, text, limit, offset): """Search documents""" if not self.available: + logger.warning("Search not available") return [] + + # Validate input + if not isinstance(text, str) or not text.strip(): + logger.warning(f"Invalid search text: {text}") + return [] + + logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset})") # 
Check Redis cache first redis_key = f"search:{text}:{offset}+{limit}" cached = await redis.get(redis_key) if cached: - return json.loads(cached) - - logger.info(f"Searching: {text} {offset}+{limit}") + cached_results = json.loads(cached) + logger.info(f"Retrieved {len(cached_results)} results from cache for '{text}'") + return cached_results try: + # Log request + logger.info(f"Sending search request: text='{text}', limit={limit}, offset={offset}") + + # Send search request to txtai service response = await self.client.post( "/search", json={"text": text, "limit": limit, "offset": offset} ) response.raise_for_status() + + # Log raw response for debugging + logger.info(f"Raw search response: {response.text}") + + # Parse response result = response.json() + logger.info(f"Parsed search response: {result}") + + # Extract results formatted_results = result.get("results", []) + logger.info(f"Search for '{text}' returned {len(formatted_results)} results") + + # Log sample results for debugging + if formatted_results: + logger.info(f"Sample result: {formatted_results[0]}") + else: + logger.warning(f"No results found for '{text}'") # Cache results if formatted_results: @@ -137,7 +241,7 @@ class SearchService: ) return formatted_results except Exception as e: - logger.error(f"Search error: {e}") + logger.error(f"Search error for '{text}': {e}", exc_info=True) return [] @@ -157,6 +261,32 @@ async def search_text(text: str, limit: int = 50, offset: int = 0): async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" if SEARCH_ENABLED: - logger.info("Initializing search index with existing data...") + if not shouts_data: + logger.warning("No shouts data provided for search indexing") + return + + logger.info(f"Initializing search index with {len(shouts_data)} documents") + + # Check if search service is available first + info = await search_service.info() + if info.get("status") in ["error", "unavailable", "disabled"]: + logger.error(f"Cannot initialize search index: {info}") + return + + # Start the bulk indexing process await search_service.bulk_index(shouts_data) - logger.info(f"Search index initialized with {len(shouts_data)} documents") \ No newline at end of file + + # Verify indexing worked by testing with a search + try: + test_query = "test" + logger.info(f"Verifying search index with query: '{test_query}'") + test_results = await search_text(test_query, 5) + + if test_results: + logger.info(f"Search verification successful: found {len(test_results)} results") + else: + logger.warning("Search verification returned no results. 
Index may be empty or not working.") + except Exception as e: + logger.error(f"Error verifying search index: {e}") + else: + logger.info("Search indexing skipped (SEARCH_ENABLED=False)") \ No newline at end of file -- 2.43.0 From ad0ca75aa98504849eed75e236e017da6344d42e Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 19 Mar 2025 14:47:31 -0300 Subject: [PATCH 09/40] debug: no redis for indexing in nackend side --- services/search.py | 67 +++++++++------------------------------------- 1 file changed, 13 insertions(+), 54 deletions(-) diff --git a/services/search.py b/services/search.py index 7b5bfb11..4f5f7105 100644 --- a/services/search.py +++ b/services/search.py @@ -5,27 +5,24 @@ import os import httpx import time -from services.redis import redis -from utils.encoders import CustomJSONEncoder - # Set up proper logging logger = logging.getLogger("search") logger.setLevel(logging.INFO) # Change to INFO to see more details -REDIS_TTL = 86400 # 1 day in seconds - # Configuration for search service SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "http://search-txtai.web.1:8000") -MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "100")) +MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "25")) class SearchService: def __init__(self): logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}") self.available = SEARCH_ENABLED + # Use different timeout settings for indexing and search requests self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) - + self.index_client = httpx.AsyncClient(timeout=120.0, base_url=TXTAI_SERVICE_URL) + if not self.available: logger.info("Search disabled (SEARCH_ENABLED = False)") @@ -54,7 +51,6 @@ class SearchService: return logger.info(f"Indexing post {shout.id}") - # Start in background to not block asyncio.create_task(self.perform_index(shout)) @@ -77,7 +73,6 @@ class SearchService: logger.warning(f"No text content to index for shout {shout.id}") return - # Log the document being indexed logger.info(f"Indexing document: ID={shout.id}, Text length={len(text)}") # Send to txtai service @@ -100,11 +95,10 @@ class SearchService: start_time = time.time() logger.info(f"Starting bulk indexing of {len(shouts)} documents") - # Process documents in batches batch_size = MAX_BATCH_SIZE total_indexed = 0 total_skipped = 0 - + i = 0 for i in range(0, len(shouts), batch_size): batch = shouts[i:i+batch_size] logger.info(f"Processing batch {i//batch_size + 1} of {(len(shouts)-1)//batch_size + 1}, size {len(batch)}") @@ -112,24 +106,22 @@ class SearchService: documents = [] for shout in batch: try: - # Clean and combine all text fields text_fields = [] for field_name in ['title', 'subtitle', 'lead', 'body']: field_value = getattr(shout, field_name, None) if field_value and isinstance(field_value, str) and field_value.strip(): text_fields.append(field_value.strip()) - # Process media field if it exists media = getattr(shout, 'media', None) if media: if isinstance(media, str): - # Try to parse if it's JSON try: media_json = json.loads(media) - if isinstance(media_json, dict) and 'title' in media_json: - text_fields.append(media_json['title']) - if isinstance(media_json, dict) and 'body' in media_json: - text_fields.append(media_json['body']) + if isinstance(media_json, dict): + if 'title' in media_json: + text_fields.append(media_json['title']) + if 'body' in media_json: + text_fields.append(media_json['body']) 
except json.JSONDecodeError: text_fields.append(media) elif isinstance(media, dict): @@ -138,7 +130,6 @@ class SearchService: if 'body' in media: text_fields.append(media['body']) - # Combine fields into one text text = " ".join(text_fields) if not text.strip(): @@ -146,7 +137,6 @@ class SearchService: total_skipped += 1 continue - # Add to batch documents.append({ "id": str(shout.id), "text": text @@ -162,14 +152,12 @@ class SearchService: continue try: - # Log a sample of the batch for debugging if documents: sample = documents[0] logger.info(f"Sample document: id={sample['id']}, text_length={len(sample['text'])}") - # Send batch to txtai service logger.info(f"Sending batch of {len(documents)} documents to search service") - response = await self.client.post( + response = await self.index_client.post( "/bulk-index", json={"documents": documents} ) @@ -188,57 +176,32 @@ class SearchService: logger.warning("Search not available") return [] - # Validate input if not isinstance(text, str) or not text.strip(): logger.warning(f"Invalid search text: {text}") return [] logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset})") - # Check Redis cache first - redis_key = f"search:{text}:{offset}+{limit}" - cached = await redis.get(redis_key) - if cached: - cached_results = json.loads(cached) - logger.info(f"Retrieved {len(cached_results)} results from cache for '{text}'") - return cached_results - try: - # Log request logger.info(f"Sending search request: text='{text}', limit={limit}, offset={offset}") - - # Send search request to txtai service response = await self.client.post( "/search", json={"text": text, "limit": limit, "offset": offset} ) response.raise_for_status() - # Log raw response for debugging logger.info(f"Raw search response: {response.text}") - - # Parse response result = response.json() logger.info(f"Parsed search response: {result}") - # Extract results formatted_results = result.get("results", []) logger.info(f"Search for '{text}' returned {len(formatted_results)} results") - # Log sample results for debugging if formatted_results: logger.info(f"Sample result: {formatted_results[0]}") else: logger.warning(f"No results found for '{text}'") - # Cache results - if formatted_results: - await redis.execute( - "SETEX", - redis_key, - REDIS_TTL, - json.dumps(formatted_results, cls=CustomJSONEncoder), - ) return formatted_results except Exception as e: logger.error(f"Search error for '{text}': {e}", exc_info=True) @@ -249,7 +212,7 @@ class SearchService: search_service = SearchService() -# Keep the API exactly the same to maintain compatibility +# API-compatible function to perform a search async def search_text(text: str, limit: int = 50, offset: int = 0): payload = [] if search_service.available: @@ -257,7 +220,6 @@ async def search_text(text: str, limit: int = 50, offset: int = 0): return payload -# Function to initialize search with existing data async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" if SEARCH_ENABLED: @@ -267,16 +229,13 @@ async def initialize_search_index(shouts_data): logger.info(f"Initializing search index with {len(shouts_data)} documents") - # Check if search service is available first info = await search_service.info() if info.get("status") in ["error", "unavailable", "disabled"]: logger.error(f"Cannot initialize search index: {info}") return - # Start the bulk indexing process await search_service.bulk_index(shouts_data) - # Verify indexing worked by testing with a search try: 
test_query = "test" logger.info(f"Verifying search index with query: '{test_query}'") @@ -289,4 +248,4 @@ async def initialize_search_index(shouts_data): except Exception as e: logger.error(f"Error verifying search index: {e}") else: - logger.info("Search indexing skipped (SEARCH_ENABLED=False)") \ No newline at end of file + logger.info("Search indexing skipped (SEARCH_ENABLED=False)") -- 2.43.0 From 90699768ff000aa932f5b79bc0afc854562f81d8 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 13:30:23 -0300 Subject: [PATCH 10/40] debug: start index --- main.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 6f88b382..44c727af 100644 --- a/main.py +++ b/main.py @@ -44,6 +44,11 @@ async def check_search_service(): print(f"[INFO] Search service is available: {info}") +# indexing DB data +# async def indexing(): +# from services.db import fetch_all_shouts +# all_shouts = await fetch_all_shouts() +# await initialize_search_index(all_shouts) async def lifespan(_app): try: create_all_tables() @@ -59,7 +64,7 @@ async def lifespan(_app): # After basic initialization is complete, fetch shouts and initialize search from services.db import fetch_all_shouts # Import your database access function - all_shouts = await fetch_all_shouts() # Replace with your actual function + all_shouts = await fetch_all_shouts() await initialize_search_index(all_shouts) yield -- 2.43.0 From 385057ffcd7edfb2abc7056a31378f07a14da612 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 13:45:50 -0300 Subject: [PATCH 11/40] debug: with logs in indexing procedure --- main.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 44c727af..9d57ac20 100644 --- a/main.py +++ b/main.py @@ -51,6 +51,7 @@ async def check_search_service(): # await initialize_search_index(all_shouts) async def lifespan(_app): try: + print("[lifespan] Starting application initialization") create_all_tables() await asyncio.gather( redis.connect(), @@ -61,18 +62,24 @@ async def lifespan(_app): start(), revalidation_manager.start(), ) + print("[lifespan] Basic initialization complete") # After basic initialization is complete, fetch shouts and initialize search + print("[lifespan] Starting search indexing process") from services.db import fetch_all_shouts # Import your database access function all_shouts = await fetch_all_shouts() + print(f"[lifespan] Fetched {len(all_shouts) if all_shouts else 0} shouts for indexing") + + print("[lifespan] Initializing search index...") await initialize_search_index(all_shouts) + print("[lifespan] Search index initialization complete") yield finally: + print("[lifespan] Shutting down application services") tasks = [redis.disconnect(), ViewedStorage.stop(), revalidation_manager.stop()] await asyncio.gather(*tasks, return_exceptions=True) - - + print("[lifespan] Shutdown complete") # Создаем экземпляр GraphQL graphql_app = GraphQL(schema, debug=True) -- 2.43.0 From 57e1e8e6bdbb2b8982bfec69113dfdef728b2b17 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 14:10:09 -0300 Subject: [PATCH 12/40] debug: more logs in indexing --- services/search.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/search.py b/services/search.py index 4f5f7105..78fb0daa 100644 --- a/services/search.py +++ b/services/search.py @@ -161,6 +161,19 @@ class SearchService: "/bulk-index", json={"documents": documents} ) + # Error Handling + if response.status_code == 422: + 
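# A 422 means the search service rejected this payload as invalid: log the
+                    # returned detail, flag any unusually long documents, then `continue`
+                    # to the next batch instead of aborting the whole bulk-index run.
+                    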
error_detail = response.json() + logger.error(f"Validation error from search service: {error_detail}") + + # Try to identify problematic documents + for doc in documents: + if len(doc['text']) > 10000: # Adjust threshold as needed + logger.warning(f"Document {doc['id']} has very long text: {len(doc['text'])} chars") + + # Continue with next batch instead of failing completely + continue + response.raise_for_status() result = response.json() logger.info(f"Batch {i//batch_size + 1} indexed successfully: {result}") -- 2.43.0 From 19c5028a0caca8cc6b109146c025c37dca6b2d32 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 14:18:32 -0300 Subject: [PATCH 13/40] debug: Limit max chars for bulk indexing --- services/search.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/services/search.py b/services/search.py index 78fb0daa..7e007d6f 100644 --- a/services/search.py +++ b/services/search.py @@ -94,11 +94,14 @@ class SearchService: start_time = time.time() logger.info(f"Starting bulk indexing of {len(shouts)} documents") - + + MAX_TEXT_LENGTH = 8000 # Maximum text length to send in a single request batch_size = MAX_BATCH_SIZE total_indexed = 0 total_skipped = 0 + total_truncated = 0 i = 0 + for i in range(0, len(shouts), batch_size): batch = shouts[i:i+batch_size] logger.info(f"Processing batch {i//batch_size + 1} of {(len(shouts)-1)//batch_size + 1}, size {len(batch)}") @@ -136,6 +139,13 @@ class SearchService: logger.debug(f"Skipping shout {shout.id}: no text content") total_skipped += 1 continue + + # Truncate text if it exceeds the maximum length + original_length = len(text) + if original_length > MAX_TEXT_LENGTH: + text = text[:MAX_TEXT_LENGTH] + logger.info(f"Truncated document {shout.id} from {original_length} to {MAX_TEXT_LENGTH} chars") + total_truncated += 1 documents.append({ "id": str(shout.id), -- 2.43.0 From eb4b9363abc18752ee2df19925809d00ff375fba Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 14:32:45 -0300 Subject: [PATCH 14/40] debug: change logs entris and indexing not wraps all in documents --- services/search.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/services/search.py b/services/search.py index 7e007d6f..aca6ba1e 100644 --- a/services/search.py +++ b/services/search.py @@ -169,12 +169,32 @@ class SearchService: logger.info(f"Sending batch of {len(documents)} documents to search service") response = await self.index_client.post( "/bulk-index", - json={"documents": documents} + json=documents ) # Error Handling if response.status_code == 422: error_detail = response.json() - logger.error(f"Validation error from search service: {error_detail}") + + # Create a truncated version of the error detail for logging + truncated_detail = error_detail.copy() if isinstance(error_detail, dict) else error_detail + + # If it's a validation error with details list + if isinstance(truncated_detail, dict) and 'detail' in truncated_detail and isinstance(truncated_detail['detail'], list): + for i, item in enumerate(truncated_detail['detail']): + # Handle case where input contains document text + if isinstance(item, dict) and 'input' in item: + if isinstance(item['input'], dict) and any(k in item['input'] for k in ['documents', 'text']): + # Check for documents list + if 'documents' in item['input'] and isinstance(item['input']['documents'], list): + for j, doc in enumerate(item['input']['documents']): + if 'text' in doc and isinstance(doc['text'], str) and 
len(doc['text']) > 100: + item['input']['documents'][j]['text'] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]" + + # Check for direct text field + if 'text' in item['input'] and isinstance(item['input']['text'], str) and len(item['input']['text']) > 100: + item['input']['text'] = f"{item['input']['text'][:100]}... [truncated, total {len(item['input']['text'])} chars]" + + logger.error(f"Validation error from search service: {truncated_detail}") # Try to identify problematic documents for doc in documents: -- 2.43.0 From 50a8c24ead9d87cf8337af309b6185859fde99b4 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 15:40:29 -0300 Subject: [PATCH 15/40] feat(search.py): documnet for bulk indexing are categorized --- services/search.py | 324 ++++++++++++++++++++++++++++++--------------- 1 file changed, 215 insertions(+), 109 deletions(-) diff --git a/services/search.py b/services/search.py index aca6ba1e..a29ead9b 100644 --- a/services/search.py +++ b/services/search.py @@ -4,6 +4,7 @@ import logging import os import httpx import time +import random # Set up proper logging logger = logging.getLogger("search") @@ -11,7 +12,7 @@ logger.setLevel(logging.INFO) # Change to INFO to see more details # Configuration for search service SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]) -TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "http://search-txtai.web.1:8000") +TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "none") MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "25")) @@ -87,7 +88,7 @@ class SearchService: logger.error(f"Indexing error for shout {shout.id}: {e}") async def bulk_index(self, shouts): - """Index multiple documents at once""" + """Index multiple documents at once with adaptive batch sizing""" if not self.available or not shouts: logger.warning(f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}") return @@ -96,122 +97,227 @@ class SearchService: logger.info(f"Starting bulk indexing of {len(shouts)} documents") MAX_TEXT_LENGTH = 8000 # Maximum text length to send in a single request - batch_size = MAX_BATCH_SIZE + max_batch_size = MAX_BATCH_SIZE total_indexed = 0 total_skipped = 0 total_truncated = 0 - i = 0 - - for i in range(0, len(shouts), batch_size): - batch = shouts[i:i+batch_size] - logger.info(f"Processing batch {i//batch_size + 1} of {(len(shouts)-1)//batch_size + 1}, size {len(batch)}") - - documents = [] - for shout in batch: - try: - text_fields = [] - for field_name in ['title', 'subtitle', 'lead', 'body']: - field_value = getattr(shout, field_name, None) - if field_value and isinstance(field_value, str) and field_value.strip(): - text_fields.append(field_value.strip()) - - media = getattr(shout, 'media', None) - if media: - if isinstance(media, str): - try: - media_json = json.loads(media) - if isinstance(media_json, dict): - if 'title' in media_json: - text_fields.append(media_json['title']) - if 'body' in media_json: - text_fields.append(media_json['body']) - except json.JSONDecodeError: - text_fields.append(media) - elif isinstance(media, dict): - if 'title' in media: - text_fields.append(media['title']) - if 'body' in media: - text_fields.append(media['body']) - - text = " ".join(text_fields) - - if not text.strip(): - logger.debug(f"Skipping shout {shout.id}: no text content") - total_skipped += 1 - continue - - # Truncate text if it exceeds the maximum length - original_length = len(text) - if 
original_length > MAX_TEXT_LENGTH: - text = text[:MAX_TEXT_LENGTH] - logger.info(f"Truncated document {shout.id} from {original_length} to {MAX_TEXT_LENGTH} chars") - total_truncated += 1 - - documents.append({ - "id": str(shout.id), - "text": text - }) - total_indexed += 1 - - except Exception as e: - logger.error(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}") - total_skipped += 1 - - if not documents: - logger.warning(f"No valid documents in batch {i//batch_size + 1}") - continue - + total_retries = 0 + + # Group documents by size to process smaller documents in larger batches + small_docs = [] + medium_docs = [] + large_docs = [] + + # First pass: prepare all documents and categorize by size + for shout in shouts: try: - if documents: - sample = documents[0] - logger.info(f"Sample document: id={sample['id']}, text_length={len(sample['text'])}") + text_fields = [] + for field_name in ['title', 'subtitle', 'lead', 'body']: + field_value = getattr(shout, field_name, None) + if field_value and isinstance(field_value, str) and field_value.strip(): + text_fields.append(field_value.strip()) - logger.info(f"Sending batch of {len(documents)} documents to search service") - response = await self.index_client.post( - "/bulk-index", - json=documents - ) - # Error Handling - if response.status_code == 422: - error_detail = response.json() - - # Create a truncated version of the error detail for logging - truncated_detail = error_detail.copy() if isinstance(error_detail, dict) else error_detail - - # If it's a validation error with details list - if isinstance(truncated_detail, dict) and 'detail' in truncated_detail and isinstance(truncated_detail['detail'], list): - for i, item in enumerate(truncated_detail['detail']): - # Handle case where input contains document text - if isinstance(item, dict) and 'input' in item: - if isinstance(item['input'], dict) and any(k in item['input'] for k in ['documents', 'text']): - # Check for documents list - if 'documents' in item['input'] and isinstance(item['input']['documents'], list): - for j, doc in enumerate(item['input']['documents']): - if 'text' in doc and isinstance(doc['text'], str) and len(doc['text']) > 100: - item['input']['documents'][j]['text'] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]" - - # Check for direct text field - if 'text' in item['input'] and isinstance(item['input']['text'], str) and len(item['input']['text']) > 100: - item['input']['text'] = f"{item['input']['text'][:100]}... 
[truncated, total {len(item['input']['text'])} chars]" - - logger.error(f"Validation error from search service: {truncated_detail}") - - # Try to identify problematic documents - for doc in documents: - if len(doc['text']) > 10000: # Adjust threshold as needed - logger.warning(f"Document {doc['id']} has very long text: {len(doc['text'])} chars") - - # Continue with next batch instead of failing completely + # Media field processing remains the same + media = getattr(shout, 'media', None) + if media: + # Your existing media processing logic + if isinstance(media, str): + try: + media_json = json.loads(media) + if isinstance(media_json, dict): + if 'title' in media_json: + text_fields.append(media_json['title']) + if 'body' in media_json: + text_fields.append(media_json['body']) + except json.JSONDecodeError: + text_fields.append(media) + elif isinstance(media, dict): + if 'title' in media: + text_fields.append(media['title']) + if 'body' in media: + text_fields.append(media['body']) + + text = " ".join(text_fields) + + if not text.strip(): + logger.debug(f"Skipping shout {shout.id}: no text content") + total_skipped += 1 continue - - response.raise_for_status() - result = response.json() - logger.info(f"Batch {i//batch_size + 1} indexed successfully: {result}") + + # Truncate text if it exceeds the maximum length + original_length = len(text) + if original_length > MAX_TEXT_LENGTH: + text = text[:MAX_TEXT_LENGTH] + logger.info(f"Truncated document {shout.id} from {original_length} to {MAX_TEXT_LENGTH} chars") + total_truncated += 1 + + document = { + "id": str(shout.id), + "text": text + } + + # Categorize by size + text_len = len(text) + if text_len > 5000: + large_docs.append(document) + elif text_len > 2000: + medium_docs.append(document) + else: + small_docs.append(document) + + total_indexed += 1 + except Exception as e: - logger.error(f"Bulk indexing error for batch {i//batch_size + 1}: {e}") + logger.error(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}") + total_skipped += 1 + + # Process each category with appropriate batch sizes + logger.info(f"Documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large") + + # Process small documents (larger batches) + if small_docs: + batch_size = min(max_batch_size, 25) + await self._process_document_batches(small_docs, batch_size, "small") + + # Process medium documents (medium batches) + if medium_docs: + batch_size = min(max_batch_size, 15) + await self._process_document_batches(medium_docs, batch_size, "medium") + + # Process large documents (small batches) + if large_docs: + batch_size = min(max_batch_size, 5) + await self._process_document_batches(large_docs, batch_size, "large") elapsed = time.time() - start_time - logger.info(f"Bulk indexing completed in {elapsed:.2f}s: {total_indexed} indexed, {total_skipped} skipped") + logger.info(f"Bulk indexing completed in {elapsed:.2f}s: {total_indexed} indexed, {total_skipped} skipped, {total_truncated} truncated, {total_retries} retries") + + async def _process_document_batches(self, documents, batch_size, size_category): + """Process document batches with retry logic""" + for i in range(0, len(documents), batch_size): + batch = documents[i:i+batch_size] + batch_id = f"{size_category}-{i//batch_size + 1}" + logger.info(f"Processing {size_category} batch {batch_id} of {len(batch)} documents") + + retry_count = 0 + max_retries = 3 + success = False + + # Process with retries + while not success and retry_count < max_retries: + 
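# Each attempt below either succeeds, gives up on a validation error (422),
+                # retries a server error with exponential backoff plus jitter
+                # (2 ** retry_count + random.random() * 0.5 seconds), or finally
+                # splits the batch in half and indexes each half separately.
+                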
try: + if batch: + sample = batch[0] + logger.info(f"Sample document in batch {batch_id}: id={sample['id']}, text_length={len(sample['text'])}") + + logger.info(f"Sending batch {batch_id} of {len(batch)} documents to search service (attempt {retry_count+1})") + response = await self.index_client.post( + "/bulk-index", + json=batch, + timeout=120.0 # Explicit longer timeout for large batches + ) + + # Handle 422 validation errors - these won't be fixed by retrying + if response.status_code == 422: + error_detail = response.json() + truncated_error = self._truncate_error_detail(error_detail) + logger.error(f"Validation error from search service for batch {batch_id}: {truncated_error}") + + # Individual document validation often won't benefit from splitting + break + + # Handle 500 server errors - these might be fixed by retrying with smaller batches + elif response.status_code == 500: + if retry_count < max_retries - 1: + retry_count += 1 + wait_time = (2 ** retry_count) + (random.random() * 0.5) # Exponential backoff with jitter + logger.warning(f"Server error for batch {batch_id}, retrying in {wait_time:.1f}s (attempt {retry_count+1}/{max_retries})") + await asyncio.sleep(wait_time) + continue + + # Final retry, split the batch + elif len(batch) > 1: + logger.warning(f"Splitting batch {batch_id} after repeated failures") + mid = len(batch) // 2 + await self._process_single_batch(batch[:mid], f"{batch_id}-A") + await self._process_single_batch(batch[mid:], f"{batch_id}-B") + break + else: + # Can't split a single document + logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts") + break + + # Normal success case + response.raise_for_status() + result = response.json() + logger.info(f"Batch {batch_id} indexed successfully: {result}") + success = True + + except Exception as e: + if retry_count < max_retries - 1: + retry_count += 1 + wait_time = (2 ** retry_count) + (random.random() * 0.5) + logger.warning(f"Error for batch {batch_id}, retrying in {wait_time:.1f}s: {str(e)[:200]}") + await asyncio.sleep(wait_time) + else: + # Last resort - try to split the batch + if len(batch) > 1: + logger.warning(f"Splitting batch {batch_id} after exception: {str(e)[:200]}") + mid = len(batch) // 2 + await self._process_single_batch(batch[:mid], f"{batch_id}-A") + await self._process_single_batch(batch[mid:], f"{batch_id}-B") + else: + logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts: {e}") + break + + async def _process_single_batch(self, documents, batch_id): + """Process a single batch with maximum reliability""" + try: + if not documents: + return + + logger.info(f"Processing sub-batch {batch_id} with {len(documents)} documents") + response = await self.index_client.post( + "/bulk-index", + json=documents, + timeout=90.0 + ) + response.raise_for_status() + result = response.json() + logger.info(f"Sub-batch {batch_id} indexed successfully: {result}") + except Exception as e: + logger.error(f"Error indexing sub-batch {batch_id}: {str(e)[:200]}") + + # For tiny batches, try one-by-one as last resort + if len(documents) > 1: + logger.info(f"Processing documents in sub-batch {batch_id} individually") + for i, doc in enumerate(documents): + try: + resp = await self.index_client.post("/index", json=doc, timeout=30.0) + resp.raise_for_status() + logger.info(f"Indexed document {doc['id']} individually") + except Exception as e2: + logger.error(f"Failed to index document {doc['id']} individually: {str(e2)[:100]}") + + def 
_truncate_error_detail(self, error_detail): + """Truncate error details for logging""" + truncated_detail = error_detail.copy() if isinstance(error_detail, dict) else error_detail + + if isinstance(truncated_detail, dict) and 'detail' in truncated_detail and isinstance(truncated_detail['detail'], list): + for i, item in enumerate(truncated_detail['detail']): + if isinstance(item, dict) and 'input' in item: + if isinstance(item['input'], dict) and any(k in item['input'] for k in ['documents', 'text']): + # Check for documents list + if 'documents' in item['input'] and isinstance(item['input']['documents'], list): + for j, doc in enumerate(item['input']['documents']): + if 'text' in doc and isinstance(doc['text'], str) and len(doc['text']) > 100: + item['input']['documents'][j]['text'] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]" + + # Check for direct text field + if 'text' in item['input'] and isinstance(item['input']['text'], str) and len(item['input']['text']) > 100: + item['input']['text'] = f"{item['input']['text'][:100]}... [truncated, total {len(item['input']['text'])} chars]" + + return truncated_detail async def search(self, text, limit, offset): """Search documents""" -- 2.43.0 From ebb67eb3111ece9c4bef75121b63712c78689ed8 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 16:53:00 -0300 Subject: [PATCH 16/40] debug: decrease chars in search.py for bulk indexing --- services/search.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/search.py b/services/search.py index a29ead9b..3121a405 100644 --- a/services/search.py +++ b/services/search.py @@ -96,7 +96,7 @@ class SearchService: start_time = time.time() logger.info(f"Starting bulk indexing of {len(shouts)} documents") - MAX_TEXT_LENGTH = 8000 # Maximum text length to send in a single request + MAX_TEXT_LENGTH = 4000 # Maximum text length to send in a single request max_batch_size = MAX_BATCH_SIZE total_indexed = 0 total_skipped = 0 @@ -176,17 +176,17 @@ class SearchService: # Process small documents (larger batches) if small_docs: - batch_size = min(max_batch_size, 25) + batch_size = min(max_batch_size, 15) await self._process_document_batches(small_docs, batch_size, "small") # Process medium documents (medium batches) if medium_docs: - batch_size = min(max_batch_size, 15) + batch_size = min(max_batch_size, 10) await self._process_document_batches(medium_docs, batch_size, "medium") # Process large documents (small batches) if large_docs: - batch_size = min(max_batch_size, 5) + batch_size = min(max_batch_size, 3) await self._process_document_batches(large_docs, batch_size, "large") elapsed = time.time() - start_time -- 2.43.0 From f1d9f4e0361653fbce779ba4f2ce3ec9037ee7fa Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 17:28:54 -0300 Subject: [PATCH 17/40] feat(search.py): with db reset endpoint --- services/search.py | 51 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/services/search.py b/services/search.py index 3121a405..c7e9c80c 100644 --- a/services/search.py +++ b/services/search.py @@ -194,6 +194,9 @@ class SearchService: async def _process_document_batches(self, documents, batch_size, size_category): """Process document batches with retry logic""" + # Check for possible database corruption before starting + db_error_count = 0 + for i in range(0, len(documents), batch_size): batch = documents[i:i+batch_size] batch_id = f"{size_category}-{i//batch_size + 1}" 
@@ -222,12 +225,25 @@ class SearchService: error_detail = response.json() truncated_error = self._truncate_error_detail(error_detail) logger.error(f"Validation error from search service for batch {batch_id}: {truncated_error}") - - # Individual document validation often won't benefit from splitting break # Handle 500 server errors - these might be fixed by retrying with smaller batches elif response.status_code == 500: + db_error_count += 1 + + # If we've seen multiple 500s, check for DB corruption + if db_error_count >= 3: + logger.warning("Multiple server errors detected, attempting to reset search service") + reset_result = await self.reset_search_service() + if reset_result["status"] == "reset": + logger.info("Search service has been reset, restarting batch processing") + # Wait a moment for the service to stabilize + await asyncio.sleep(2) + # Only retry current batch + retry_count = 0 + continue + + # Try again with exponential backoff if retry_count < max_retries - 1: retry_count += 1 wait_time = (2 ** retry_count) + (random.random() * 0.5) # Exponential backoff with jitter @@ -252,8 +268,22 @@ class SearchService: result = response.json() logger.info(f"Batch {batch_id} indexed successfully: {result}") success = True + db_error_count = 0 # Reset error counter on success except Exception as e: + # Check if it looks like a database corruption error + error_str = str(e).lower() + if "duplicate key" in error_str or "unique constraint" in error_str or "nonetype" in error_str: + db_error_count += 1 + if db_error_count >= 2: + logger.warning(f"Database corruption detected: {error_str}") + reset_result = await self.reset_search_service() + if reset_result["status"] == "reset": + logger.info("Search service has been reset, restarting batch processing") + await asyncio.sleep(2) + retry_count = 0 + continue + if retry_count < max_retries - 1: retry_count += 1 wait_time = (2 ** retry_count) + (random.random() * 0.5) @@ -318,6 +348,23 @@ class SearchService: item['input']['text'] = f"{item['input']['text'][:100]}... 
[truncated, total {len(item['input']['text'])} chars]" return truncated_detail + + async def reset_search_service(self): + """Reset the search service to recover from database corruption""" + if not self.available: + logger.warning("Search not available, cannot reset") + return {"status": "disabled"} + + try: + logger.warning("Resetting search service due to database corruption") + response = await self.client.post("/initialize") + response.raise_for_status() + result = response.json() + logger.info(f"Search service reset: {result}") + return {"status": "reset", "message": "Search index has been reset"} + except Exception as e: + logger.error(f"Failed to reset search service: {e}") + return {"status": "error", "message": str(e)} async def search(self, text, limit, offset): """Search documents""" -- 2.43.0 From fb820f67fd73e56d9ec0f405ba9968abf9e6a6ce Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 17:48:26 -0300 Subject: [PATCH 18/40] debug(search.py): encrease batch size for bulk indexing --- services/search.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/search.py b/services/search.py index c7e9c80c..9a0dafbf 100644 --- a/services/search.py +++ b/services/search.py @@ -96,7 +96,7 @@ class SearchService: start_time = time.time() logger.info(f"Starting bulk indexing of {len(shouts)} documents") - MAX_TEXT_LENGTH = 4000 # Maximum text length to send in a single request + MAX_TEXT_LENGTH = 8000 # Maximum text length to send in a single request max_batch_size = MAX_BATCH_SIZE total_indexed = 0 total_skipped = 0 @@ -176,17 +176,17 @@ class SearchService: # Process small documents (larger batches) if small_docs: - batch_size = min(max_batch_size, 15) + batch_size = min(max_batch_size, 25) await self._process_document_batches(small_docs, batch_size, "small") # Process medium documents (medium batches) if medium_docs: - batch_size = min(max_batch_size, 10) + batch_size = min(max_batch_size, 15) await self._process_document_batches(medium_docs, batch_size, "medium") # Process large documents (small batches) if large_docs: - batch_size = min(max_batch_size, 3) + batch_size = min(max_batch_size, 5) await self._process_document_batches(large_docs, batch_size, "large") elapsed = time.time() - start_time -- 2.43.0 From 316375bf186b48838921535d7a035ba95a5936dc Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Fri, 21 Mar 2025 17:56:54 -0300 Subject: [PATCH 19/40] debug(search.py): encrease batch size for bulk indexing --- services/search.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/services/search.py b/services/search.py index 9a0dafbf..c7e9c80c 100644 --- a/services/search.py +++ b/services/search.py @@ -96,7 +96,7 @@ class SearchService: start_time = time.time() logger.info(f"Starting bulk indexing of {len(shouts)} documents") - MAX_TEXT_LENGTH = 8000 # Maximum text length to send in a single request + MAX_TEXT_LENGTH = 4000 # Maximum text length to send in a single request max_batch_size = MAX_BATCH_SIZE total_indexed = 0 total_skipped = 0 @@ -176,17 +176,17 @@ class SearchService: # Process small documents (larger batches) if small_docs: - batch_size = min(max_batch_size, 25) + batch_size = min(max_batch_size, 15) await self._process_document_batches(small_docs, batch_size, "small") # Process medium documents (medium batches) if medium_docs: - batch_size = min(max_batch_size, 15) + batch_size = min(max_batch_size, 10) await self._process_document_batches(medium_docs, batch_size, "medium") # Process 
large documents (small batches) if large_docs: - batch_size = min(max_batch_size, 5) + batch_size = min(max_batch_size, 3) await self._process_document_batches(large_docs, batch_size, "large") elapsed = time.time() - start_time -- 2.43.0 From 60a13a90972088e9f36707870ec44fd3b76fb246 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Mon, 24 Mar 2025 19:47:02 -0300 Subject: [PATCH 20/40] refactor(search.py): moved initialization logic in search-txtai instance --- services/search.py | 199 +++++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 114 deletions(-) diff --git a/services/search.py b/services/search.py index c7e9c80c..d434b195 100644 --- a/services/search.py +++ b/services/search.py @@ -193,112 +193,100 @@ class SearchService: logger.info(f"Bulk indexing completed in {elapsed:.2f}s: {total_indexed} indexed, {total_skipped} skipped, {total_truncated} truncated, {total_retries} retries") async def _process_document_batches(self, documents, batch_size, size_category): - """Process document batches with retry logic""" - # Check for possible database corruption before starting - db_error_count = 0 - - for i in range(0, len(documents), batch_size): - batch = documents[i:i+batch_size] - batch_id = f"{size_category}-{i//batch_size + 1}" - logger.info(f"Processing {size_category} batch {batch_id} of {len(batch)} documents") + """Process document batches with retry logic""" + # Check for possible database corruption before starting + db_error_count = 0 - retry_count = 0 - max_retries = 3 - success = False - - # Process with retries - while not success and retry_count < max_retries: - try: - if batch: - sample = batch[0] - logger.info(f"Sample document in batch {batch_id}: id={sample['id']}, text_length={len(sample['text'])}") - - logger.info(f"Sending batch {batch_id} of {len(batch)} documents to search service (attempt {retry_count+1})") - response = await self.index_client.post( - "/bulk-index", - json=batch, - timeout=120.0 # Explicit longer timeout for large batches - ) - - # Handle 422 validation errors - these won't be fixed by retrying - if response.status_code == 422: - error_detail = response.json() - truncated_error = self._truncate_error_detail(error_detail) - logger.error(f"Validation error from search service for batch {batch_id}: {truncated_error}") - break - - # Handle 500 server errors - these might be fixed by retrying with smaller batches - elif response.status_code == 500: - db_error_count += 1 + for i in range(0, len(documents), batch_size): + batch = documents[i:i+batch_size] + batch_id = f"{size_category}-{i//batch_size + 1}" + logger.info(f"Processing {size_category} batch {batch_id} of {len(batch)} documents") + + retry_count = 0 + max_retries = 3 + success = False + + # Process with retries + while not success and retry_count < max_retries: + try: + if batch: + sample = batch[0] + logger.info(f"Sample document in batch {batch_id}: id={sample['id']}, text_length={len(sample['text'])}") - # If we've seen multiple 500s, check for DB corruption - if db_error_count >= 3: - logger.warning("Multiple server errors detected, attempting to reset search service") - reset_result = await self.reset_search_service() - if reset_result["status"] == "reset": - logger.info("Search service has been reset, restarting batch processing") - # Wait a moment for the service to stabilize - await asyncio.sleep(2) - # Only retry current batch - retry_count = 0 + logger.info(f"Sending batch {batch_id} of {len(batch)} documents to search service (attempt 
{retry_count+1})") + response = await self.index_client.post( + "/bulk-index", + json=batch, + timeout=120.0 # Explicit longer timeout for large batches + ) + + # Handle 422 validation errors - these won't be fixed by retrying + if response.status_code == 422: + error_detail = response.json() + truncated_error = self._truncate_error_detail(error_detail) + logger.error(f"Validation error from search service for batch {batch_id}: {truncated_error}") + break + + # Handle 500 server errors - these might be fixed by retrying with smaller batches + elif response.status_code == 500: + db_error_count += 1 + + # If we've seen multiple 500s, log a critical error + if db_error_count >= 3: + logger.critical(f"Multiple server errors detected (500). The search service may need manual intervention. Stopping batch {batch_id} processing.") + break + + # Try again with exponential backoff + if retry_count < max_retries - 1: + retry_count += 1 + wait_time = (2 ** retry_count) + (random.random() * 0.5) # Exponential backoff with jitter + logger.warning(f"Server error for batch {batch_id}, retrying in {wait_time:.1f}s (attempt {retry_count+1}/{max_retries})") + await asyncio.sleep(wait_time) continue - # Try again with exponential backoff + # Final retry, split the batch + elif len(batch) > 1: + logger.warning(f"Splitting batch {batch_id} after repeated failures") + mid = len(batch) // 2 + await self._process_single_batch(batch[:mid], f"{batch_id}-A") + await self._process_single_batch(batch[mid:], f"{batch_id}-B") + break + else: + # Can't split a single document + logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts") + break + + # Normal success case + response.raise_for_status() + result = response.json() + logger.info(f"Batch {batch_id} indexed successfully: {result}") + success = True + db_error_count = 0 # Reset error counter on success + + except Exception as e: + # Check if it looks like a database corruption error + error_str = str(e).lower() + if "duplicate key" in error_str or "unique constraint" in error_str or "nonetype" in error_str: + db_error_count += 1 + if db_error_count >= 2: + logger.critical(f"Potential database corruption detected: {error_str}. The search service may need manual intervention. 
Stopping batch {batch_id} processing.") + break + if retry_count < max_retries - 1: retry_count += 1 - wait_time = (2 ** retry_count) + (random.random() * 0.5) # Exponential backoff with jitter - logger.warning(f"Server error for batch {batch_id}, retrying in {wait_time:.1f}s (attempt {retry_count+1}/{max_retries})") + wait_time = (2 ** retry_count) + (random.random() * 0.5) + logger.warning(f"Error for batch {batch_id}, retrying in {wait_time:.1f}s: {str(e)[:200]}") await asyncio.sleep(wait_time) - continue - - # Final retry, split the batch - elif len(batch) > 1: - logger.warning(f"Splitting batch {batch_id} after repeated failures") - mid = len(batch) // 2 - await self._process_single_batch(batch[:mid], f"{batch_id}-A") - await self._process_single_batch(batch[mid:], f"{batch_id}-B") - break else: - # Can't split a single document - logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts") + # Last resort - try to split the batch + if len(batch) > 1: + logger.warning(f"Splitting batch {batch_id} after exception: {str(e)[:200]}") + mid = len(batch) // 2 + await self._process_single_batch(batch[:mid], f"{batch_id}-A") + await self._process_single_batch(batch[mid:], f"{batch_id}-B") + else: + logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts: {e}") break - - # Normal success case - response.raise_for_status() - result = response.json() - logger.info(f"Batch {batch_id} indexed successfully: {result}") - success = True - db_error_count = 0 # Reset error counter on success - - except Exception as e: - # Check if it looks like a database corruption error - error_str = str(e).lower() - if "duplicate key" in error_str or "unique constraint" in error_str or "nonetype" in error_str: - db_error_count += 1 - if db_error_count >= 2: - logger.warning(f"Database corruption detected: {error_str}") - reset_result = await self.reset_search_service() - if reset_result["status"] == "reset": - logger.info("Search service has been reset, restarting batch processing") - await asyncio.sleep(2) - retry_count = 0 - continue - - if retry_count < max_retries - 1: - retry_count += 1 - wait_time = (2 ** retry_count) + (random.random() * 0.5) - logger.warning(f"Error for batch {batch_id}, retrying in {wait_time:.1f}s: {str(e)[:200]}") - await asyncio.sleep(wait_time) - else: - # Last resort - try to split the batch - if len(batch) > 1: - logger.warning(f"Splitting batch {batch_id} after exception: {str(e)[:200]}") - mid = len(batch) // 2 - await self._process_single_batch(batch[:mid], f"{batch_id}-A") - await self._process_single_batch(batch[mid:], f"{batch_id}-B") - else: - logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts: {e}") - break async def _process_single_batch(self, documents, batch_id): """Process a single batch with maximum reliability""" @@ -348,23 +336,6 @@ class SearchService: item['input']['text'] = f"{item['input']['text'][:100]}... 
[truncated, total {len(item['input']['text'])} chars]" return truncated_detail - - async def reset_search_service(self): - """Reset the search service to recover from database corruption""" - if not self.available: - logger.warning("Search not available, cannot reset") - return {"status": "disabled"} - - try: - logger.warning("Resetting search service due to database corruption") - response = await self.client.post("/initialize") - response.raise_for_status() - result = response.json() - logger.info(f"Search service reset: {result}") - return {"status": "reset", "message": "Search index has been reset"} - except Exception as e: - logger.error(f"Failed to reset search service: {e}") - return {"status": "error", "message": str(e)} async def search(self, text, limit, offset): """Search documents""" -- 2.43.0 From 077cb464821470a373666d7fbe86dcd0c914b7cc Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Mon, 24 Mar 2025 20:16:07 -0300 Subject: [PATCH 21/40] debug: server.py -> threds 1 , search.py -> add 3 times reconect --- server.py | 2 +- services/search.py | 68 +++++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/server.py b/server.py index 0ba5b97c..281f50ff 100644 --- a/server.py +++ b/server.py @@ -17,7 +17,7 @@ if __name__ == "__main__": address="0.0.0.0", port=PORT, interface=Interfaces.ASGI, - workers=4, + workers=1, websockets=False, log_level=LogLevels.debug, backlog=2048, diff --git a/services/search.py b/services/search.py index d434b195..5401d0f6 100644 --- a/services/search.py +++ b/services/search.py @@ -290,32 +290,50 @@ class SearchService: async def _process_single_batch(self, documents, batch_id): """Process a single batch with maximum reliability""" - try: - if not documents: - return + max_retries = 3 + retry_count = 0 + + while retry_count < max_retries: + try: + if not documents: + return + + logger.info(f"Processing sub-batch {batch_id} with {len(documents)} documents") + response = await self.index_client.post( + "/bulk-index", + json=documents, + timeout=90.0 + ) + response.raise_for_status() + result = response.json() + logger.info(f"Sub-batch {batch_id} indexed successfully: {result}") + return # Success, exit the retry loop - logger.info(f"Processing sub-batch {batch_id} with {len(documents)} documents") - response = await self.index_client.post( - "/bulk-index", - json=documents, - timeout=90.0 - ) - response.raise_for_status() - result = response.json() - logger.info(f"Sub-batch {batch_id} indexed successfully: {result}") - except Exception as e: - logger.error(f"Error indexing sub-batch {batch_id}: {str(e)[:200]}") - - # For tiny batches, try one-by-one as last resort - if len(documents) > 1: - logger.info(f"Processing documents in sub-batch {batch_id} individually") - for i, doc in enumerate(documents): - try: - resp = await self.index_client.post("/index", json=doc, timeout=30.0) - resp.raise_for_status() - logger.info(f"Indexed document {doc['id']} individually") - except Exception as e2: - logger.error(f"Failed to index document {doc['id']} individually: {str(e2)[:100]}") + except Exception as e: + error_str = str(e).lower() + retry_count += 1 + + # Check if it's a transient error that txtai might recover from internally + if "dictionary changed size" in error_str or "transaction error" in error_str: + wait_time = (2 ** retry_count) + (random.random() * 0.5) + logger.warning(f"Transient txtai error in sub-batch {batch_id}, waiting {wait_time:.1f}s for recovery: {str(e)[:200]}") + await 
asyncio.sleep(wait_time) # Wait for txtai to recover + continue # Try again + + # For other errors or final retry failure + logger.error(f"Error indexing sub-batch {batch_id} (attempt {retry_count}/{max_retries}): {str(e)[:200]}") + + # Only try one-by-one on the final retry + if retry_count >= max_retries and len(documents) > 1: + logger.info(f"Processing documents in sub-batch {batch_id} individually") + for i, doc in enumerate(documents): + try: + resp = await self.index_client.post("/index", json=doc, timeout=30.0) + resp.raise_for_status() + logger.info(f"Indexed document {doc['id']} individually") + except Exception as e2: + logger.error(f"Failed to index document {doc['id']} individually: {str(e2)[:100]}") + return # Exit after individual processing attempt def _truncate_error_detail(self, error_detail): """Truncate error details for logging""" -- 2.43.0 From 6e284640c07e096da9059fe029894db5eec8887c Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Mon, 24 Mar 2025 21:42:51 -0300 Subject: [PATCH 22/40] feat: give little timeout for resource stab --- main.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 9d57ac20..536dfdc2 100644 --- a/main.py +++ b/main.py @@ -64,15 +64,12 @@ async def lifespan(_app): ) print("[lifespan] Basic initialization complete") - # After basic initialization is complete, fetch shouts and initialize search - print("[lifespan] Starting search indexing process") - from services.db import fetch_all_shouts # Import your database access function - all_shouts = await fetch_all_shouts() - print(f"[lifespan] Fetched {len(all_shouts) if all_shouts else 0} shouts for indexing") - - print("[lifespan] Initializing search index...") - await initialize_search_index(all_shouts) - print("[lifespan] Search index initialization complete") + # Add a delay before starting the intensive search indexing + print("[lifespan] Waiting for system stabilization before search indexing...") + await asyncio.sleep(10) # 10-second delay to let the system stabilize + + # Start search indexing as a background task with lower priority + asyncio.create_task(initialize_search_index_background()) yield finally: @@ -80,6 +77,26 @@ async def lifespan(_app): tasks = [redis.disconnect(), ViewedStorage.stop(), revalidation_manager.stop()] await asyncio.gather(*tasks, return_exceptions=True) print("[lifespan] Shutdown complete") + +# Initialize search index in the background +async def initialize_search_index_background(): + """Run search indexing as a background task with low priority""" + try: + print("[search] Starting background search indexing process") + from services.db import fetch_all_shouts + + # Get total count first (optional) + all_shouts = await fetch_all_shouts() + total_count = len(all_shouts) if all_shouts else 0 + print(f"[search] Fetched {total_count} shouts for background indexing") + + # Start the indexing process with the fetched shouts + print("[search] Beginning background search index initialization...") + await initialize_search_index(all_shouts) + print("[search] Background search index initialization complete") + except Exception as e: + print(f"[search] Error in background search indexing: {str(e)}") + # Создаем экземпляр GraphQL graphql_app = GraphQL(schema, debug=True) -- 2.43.0 From 88012f1b8c2958154c798ba6a3639786c3099420 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 25 Mar 2025 12:21:59 -0300 Subject: [PATCH 23/40] debug(server.py): with 4 workers (threds). 
cheking reindexing --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 281f50ff..0ba5b97c 100644 --- a/server.py +++ b/server.py @@ -17,7 +17,7 @@ if __name__ == "__main__": address="0.0.0.0", port=PORT, interface=Interfaces.ASGI, - workers=1, + workers=4, websockets=False, log_level=LogLevels.debug, backlog=2048, -- 2.43.0 From 1fd623a660c87561f03545bcff7e7f02696788d1 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 25 Mar 2025 13:31:45 -0300 Subject: [PATCH 24/40] feat: with index sync endpoints configs --- services/search.py | 106 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 26 deletions(-) diff --git a/services/search.py b/services/search.py index 5401d0f6..a463d578 100644 --- a/services/search.py +++ b/services/search.py @@ -45,6 +45,30 @@ class SearchService: def is_ready(self): """Check if service is available""" return self.available + + async def verify_docs(self, doc_ids): + """Verify which documents exist in the search index""" + if not self.available: + return {"status": "disabled"} + + try: + logger.info(f"Verifying {len(doc_ids)} documents in search index") + response = await self.client.post( + "/verify-docs", + json={"doc_ids": doc_ids}, + timeout=60.0 # Longer timeout for potentially large ID lists + ) + response.raise_for_status() + result = response.json() + + # Log summary of verification results + missing_count = len(result.get("missing", [])) + logger.info(f"Document verification complete: {missing_count} missing out of {len(doc_ids)} total") + + return result + except Exception as e: + logger.error(f"Document verification error: {e}") + return {"status": "error", "message": str(e)} def index(self, shout): """Index a single document""" @@ -407,30 +431,60 @@ async def search_text(text: str, limit: int = 50, offset: int = 0): async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" - if SEARCH_ENABLED: - if not shouts_data: - logger.warning("No shouts data provided for search indexing") - return - - logger.info(f"Initializing search index with {len(shouts_data)} documents") - - info = await search_service.info() - if info.get("status") in ["error", "unavailable", "disabled"]: - logger.error(f"Cannot initialize search index: {info}") - return - - await search_service.bulk_index(shouts_data) - - try: - test_query = "test" - logger.info(f"Verifying search index with query: '{test_query}'") - test_results = await search_text(test_query, 5) - - if test_results: - logger.info(f"Search verification successful: found {len(test_results)} results") - else: - logger.warning("Search verification returned no results. 
Index may be empty or not working.") - except Exception as e: - logger.error(f"Error verifying search index: {e}") - else: + if not SEARCH_ENABLED: logger.info("Search indexing skipped (SEARCH_ENABLED=False)") + return + + if not shouts_data: + logger.warning("No shouts data provided for search indexing") + return + + logger.info(f"Checking search index status for {len(shouts_data)} documents") + + # Get the current index info + info = await search_service.info() + if info.get("status") in ["error", "unavailable", "disabled"]: + logger.error(f"Cannot initialize search index: {info}") + return + + # Check if index has approximately right number of documents + index_stats = info.get("index_stats", {}) + indexed_doc_count = index_stats.get("document_count", 0) + + # If counts are significantly different, do verification + if abs(indexed_doc_count - len(shouts_data)) > 10: + logger.info(f"Document count mismatch: {indexed_doc_count} in index vs {len(shouts_data)} in database. Verifying...") + + # Get all document IDs from your database + doc_ids = [str(shout.id) for shout in shouts_data] + + # Verify which ones are missing from the index + verification = await search_service.verify_docs(doc_ids) + + if verification.get("status") == "error": + logger.error(f"Document verification failed: {verification.get('message')}") + return + + # Index only missing documents + missing_ids = verification.get("missing", []) + if missing_ids: + logger.info(f"Found {len(missing_ids)} documents missing from index. Indexing them...") + missing_docs = [shout for shout in shouts_data if str(shout.id) in missing_ids] + await search_service.bulk_index(missing_docs) + else: + logger.info("All documents are already indexed.") + else: + logger.info(f"Search index appears to be in sync ({indexed_doc_count} documents indexed).") + + # Verify with test query + try: + test_query = "test" + logger.info(f"Verifying search index with query: '{test_query}'") + test_results = await search_text(test_query, 5) + + if test_results: + logger.info(f"Search verification successful: found {len(test_results)} results") + else: + logger.warning("Search verification returned no results. Index may be empty or not working.") + except Exception as e: + logger.error(f"Error verifying search index: {e}") -- 2.43.0 From f089a32394dc87a6786ca19057e09fe90575b493 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 25 Mar 2025 14:44:05 -0300 Subject: [PATCH 25/40] debug(search.py): with more logs when check sync of indexing --- server.py | 2 +- services/search.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/server.py b/server.py index 0ba5b97c..281f50ff 100644 --- a/server.py +++ b/server.py @@ -17,7 +17,7 @@ if __name__ == "__main__": address="0.0.0.0", port=PORT, interface=Interfaces.ASGI, - workers=4, + workers=1, websockets=False, log_level=LogLevels.debug, backlog=2048, diff --git a/services/search.py b/services/search.py index a463d578..1f40d69e 100644 --- a/services/search.py +++ b/services/search.py @@ -451,6 +451,24 @@ async def initialize_search_index(shouts_data): index_stats = info.get("index_stats", {}) indexed_doc_count = index_stats.get("document_count", 0) + # Log database document summary + db_ids = [str(shout.id) for shout in shouts_data] + logger.info(f"Database contains {len(shouts_data)} documents. 
Sample IDs: {', '.join(db_ids[:5])}...") + + # Calculate summary by ID range to understand the coverage + try: + # Parse numeric IDs where possible to analyze coverage + numeric_ids = [int(sid) for sid in db_ids if sid.isdigit()] + if numeric_ids: + min_id = min(numeric_ids) + max_id = max(numeric_ids) + id_range = max_id - min_id + 1 + coverage_pct = (len(numeric_ids) / id_range) * 100 if id_range > 0 else 0 + logger.info(f"ID range analysis: min_id={min_id}, max_id={max_id}, range={id_range}, " + f"coverage={coverage_pct:.1f}% ({len(numeric_ids)}/{id_range})") + except Exception as e: + logger.warning(f"Could not analyze ID ranges: {e}") + # If counts are significantly different, do verification if abs(indexed_doc_count - len(shouts_data)) > 10: logger.info(f"Document count mismatch: {indexed_doc_count} in index vs {len(shouts_data)} in database. Verifying...") @@ -469,12 +487,29 @@ async def initialize_search_index(shouts_data): missing_ids = verification.get("missing", []) if missing_ids: logger.info(f"Found {len(missing_ids)} documents missing from index. Indexing them...") + logger.info(f"Sample missing IDs: {', '.join(missing_ids[:10])}...") missing_docs = [shout for shout in shouts_data if str(shout.id) in missing_ids] await search_service.bulk_index(missing_docs) else: logger.info("All documents are already indexed.") else: logger.info(f"Search index appears to be in sync ({indexed_doc_count} documents indexed).") + + # Optional sample verification (can be slow with large document sets) + # Uncomment if you want to periodically check a random sample even when counts match + """ + sample_size = 10 + if len(db_ids) > sample_size: + sample_ids = random.sample(db_ids, sample_size) + logger.info(f"Performing random sample verification on {sample_size} documents...") + verification = await search_service.verify_docs(sample_ids) + if verification.get("missing"): + missing_count = len(verification.get("missing", [])) + logger.warning(f"Random verification found {missing_count}/{sample_size} missing docs " + f"despite count match. Consider full verification.") + else: + logger.info("Random document sample verification passed.") + """ # Verify with test query try: @@ -484,6 +519,15 @@ async def initialize_search_index(shouts_data): if test_results: logger.info(f"Search verification successful: found {len(test_results)} results") + # Log categories covered by search results + categories = set() + for result in test_results: + result_id = result.get("id") + matching_shouts = [s for s in shouts_data if str(s.id) == result_id] + if matching_shouts and hasattr(matching_shouts[0], 'category'): + categories.add(getattr(matching_shouts[0], 'category', 'unknown')) + if categories: + logger.info(f"Search results cover categories: {', '.join(categories)}") else: logger.warning("Search verification returned no results. 
Index may be empty or not working.") except Exception as e: -- 2.43.0 From 7f36f93d92e7fed08b48a505ade0befccb4a8220 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 25 Mar 2025 15:18:29 -0300 Subject: [PATCH 26/40] feat(search.py): detects both missing documents and null embeddings --- services/search.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/services/search.py b/services/search.py index 1f40d69e..3ed09c9f 100644 --- a/services/search.py +++ b/services/search.py @@ -415,6 +415,30 @@ class SearchService: except Exception as e: logger.error(f"Search error for '{text}': {e}", exc_info=True) return [] + + async def check_index_status(self): + """Get detailed statistics about the search index health""" + if not self.available: + return {"status": "disabled"} + + try: + response = await self.client.get("/index-status") + response.raise_for_status() + result = response.json() + logger.info(f"Index status check: {result['status']}, {result['documents_count']} documents, {result['embeddings_count']} embeddings") + + # Log warnings for any inconsistencies + if result["consistency"]["status"] != "ok": + if result["consistency"]["missing_embeddings_count"] > 0: + logger.warning(f"Found {result['consistency']['missing_embeddings_count']} documents without embeddings. Sample IDs: {result['consistency']['missing_embeddings_sample']}") + + if result["consistency"]["null_embeddings_count"] > 0: + logger.warning(f"Found {result['consistency']['null_embeddings_count']} documents with NULL embeddings. Sample IDs: {result['consistency']['null_embeddings_sample']}") + + return result + except Exception as e: + logger.error(f"Failed to check index status: {e}") + return {"status": "error", "message": str(e)} # Create the search service singleton @@ -450,6 +474,24 @@ async def initialize_search_index(shouts_data): # Check if index has approximately right number of documents index_stats = info.get("index_stats", {}) indexed_doc_count = index_stats.get("document_count", 0) + + # Add a more detailed status check + index_status = await search_service.check_index_status() + if index_status.get("status") == "healthy": + logger.info("Index status check passed") + elif index_status.get("status") == "inconsistent": + logger.warning("Index status check found inconsistencies") + + # Get both missing documents and documents with null embeddings + problem_ids = [] + problem_ids.extend(index_status.get("consistency", {}).get("missing_embeddings_sample", [])) + problem_ids.extend(index_status.get("consistency", {}).get("null_embeddings_sample", [])) + + if problem_ids: + logger.info(f"Repairing {len(problem_ids)} problem documents") + problem_docs = [shout for shout in shouts_data if str(shout.id) in problem_ids] + if problem_docs: + await search_service.bulk_index(problem_docs) # Log database document summary db_ids = [str(shout.id) for shout in shouts_data] -- 2.43.0 From e405fb527bffba501cd2c3782f705ff2dddd171e Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 25 Mar 2025 16:42:44 -0300 Subject: [PATCH 27/40] refactor(search.py): moved to use one table docs for embdings and docs store --- services/search.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/services/search.py b/services/search.py index 3ed09c9f..40211b8b 100644 --- a/services/search.py +++ b/services/search.py @@ -425,15 +425,13 @@ class SearchService: response = await self.client.get("/index-status") response.raise_for_status() result = 
response.json() - logger.info(f"Index status check: {result['status']}, {result['documents_count']} documents, {result['embeddings_count']} embeddings") + logger.info(f"Index status check: {result['status']}, {result['documents_count']} documents") # Log warnings for any inconsistencies - if result["consistency"]["status"] != "ok": - if result["consistency"]["missing_embeddings_count"] > 0: - logger.warning(f"Found {result['consistency']['missing_embeddings_count']} documents without embeddings. Sample IDs: {result['consistency']['missing_embeddings_sample']}") - - if result["consistency"]["null_embeddings_count"] > 0: - logger.warning(f"Found {result['consistency']['null_embeddings_count']} documents with NULL embeddings. Sample IDs: {result['consistency']['null_embeddings_sample']}") + if result.get("consistency", {}).get("status") != "ok": + null_count = result.get("consistency", {}).get("null_embeddings_count", 0) + if null_count > 0: + logger.warning(f"Found {null_count} documents with NULL embeddings") return result except Exception as e: @@ -482,13 +480,11 @@ async def initialize_search_index(shouts_data): elif index_status.get("status") == "inconsistent": logger.warning("Index status check found inconsistencies") - # Get both missing documents and documents with null embeddings - problem_ids = [] - problem_ids.extend(index_status.get("consistency", {}).get("missing_embeddings_sample", [])) - problem_ids.extend(index_status.get("consistency", {}).get("null_embeddings_sample", [])) - + # Get documents with null embeddings + problem_ids = index_status.get("consistency", {}).get("null_embeddings_sample", []) + if problem_ids: - logger.info(f"Repairing {len(problem_ids)} problem documents") + logger.info(f"Repairing {len(problem_ids)} documents with NULL embeddings") problem_docs = [shout for shout in shouts_data if str(shout.id) in problem_ids] if problem_docs: await search_service.bulk_index(problem_docs) -- 2.43.0 From 34a354e9e386232c5dea4b1a0b3a389a4b9155bd Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 27 Mar 2025 11:54:56 -0300 Subject: [PATCH 28/40] debug(reader.py): bring shout id back in the query call --- resolvers/reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index 0be80af4..79d604ca 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -423,8 +423,8 @@ async def load_shouts_search(_, info, text, options): q = apply_sorting(q, options) shouts = get_shouts_with_links(info, q, limit, offset) for shout in shouts: - shout.score = scores[f"{shout.id}"] - shouts.sort(key=lambda x: x.score, reverse=True) + shout["score"] = scores[f"{shout['id']}"] + shouts.sort(key=lambda x: x["score"], reverse=True) return shouts return [] -- 2.43.0 From ae85b32f697ef053a960d736968ee02f79a81b9a Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 27 Mar 2025 14:06:52 -0300 Subject: [PATCH 29/40] feat(type.graphql): add shout id to SearchResult --- schema/type.graphql | 1 + 1 file changed, 1 insertion(+) diff --git a/schema/type.graphql b/schema/type.graphql index 7a8344da..b4d4ba50 100644 --- a/schema/type.graphql +++ b/schema/type.graphql @@ -207,6 +207,7 @@ type CommonResult { } type SearchResult { + id: Int! slug: String! title: String!
cover: String -- 2.43.0 From ff3a4debce85fe906bfee73dfcb3564220304fcf Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 27 Mar 2025 14:43:17 -0300 Subject: [PATCH 30/40] debug(reader.py): trying to handle main topic ids founded --- resolvers/reader.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index 79d604ca..95bc6dc8 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -253,10 +253,10 @@ def get_shouts_with_links(info, q, limit=20, offset=0): "is_main": True, } elif not main_topic: - logger.warning(f"No main_topic and no topics found for shout#{shout_id}") + logger.debug(f"No main_topic and no topics found for shout#{shout_id}") main_topic = {"id": 0, "title": "no topic", "slug": "notopic", "is_main": True} shout_dict["main_topic"] = main_topic - # logger.debug(f"Final main_topic for shout#{shout_id}: {main_topic}") + logger.debug(f"Final main_topic for shout#{shout_id}: {main_topic}") if has_field(info, "authors") and hasattr(row, "authors"): shout_dict["authors"] = ( @@ -420,6 +420,13 @@ async def load_shouts_search(_, info, text, options): ) q = q.filter(Shout.id.in_(hits_ids)) q = apply_filters(q, options) + + # added this to join topics + topic_join = aliased(ShoutTopic) + topic = aliased(Topic) + q = q.outerjoin(topic_join, topic_join.shout == Shout.id) + q = q.outerjoin(topic, topic.id == topic_join.topic) + q = apply_sorting(q, options) shouts = get_shouts_with_links(info, q, limit, offset) for shout in shouts: -- 2.43.0 From 0bc55977ac59a57ce97e471c0d62e1b9f1e0b9a3 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 27 Mar 2025 15:18:08 -0300 Subject: [PATCH 31/40] debug(reader.py): query_with_stat(info) always --- resolvers/reader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index 95bc6dc8..666761c4 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -413,11 +413,13 @@ async def load_shouts_search(_, info, text, options): scores[shout_id] = sr.get("score") hits_ids.append(shout_id) - q = ( + """ q = ( query_with_stat(info) if has_field(info, "stat") else select(Shout).filter(and_(Shout.published_at.is_not(None), Shout.deleted_at.is_(None))) - ) + ) """ + q = query_with_stat(info) + q = q.filter(Shout.id.in_(hits_ids)) q = apply_filters(q, options) @@ -427,7 +429,6 @@ async def load_shouts_search(_, info, text, options): q = q.outerjoin(topic_join, topic_join.shout == Shout.id) q = q.outerjoin(topic, topic.id == topic_join.topic) - q = apply_sorting(q, options) shouts = get_shouts_with_links(info, q, limit, offset) for shout in shouts: shout["score"] = scores[f"{shout['id']}"] -- 2.43.0 From 9ebb81cbd3c8b8d3b21271ef10745af3506ad731 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Mon, 31 Mar 2025 13:32:51 -0300 Subject: [PATCH 32/40] refactor(reader.py): rm debug line --- resolvers/reader.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index 666761c4..91eddad9 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -413,11 +413,6 @@ async def load_shouts_search(_, info, text, options): scores[shout_id] = sr.get("score") hits_ids.append(shout_id) - """ q = ( - query_with_stat(info) - if has_field(info, "stat") - else select(Shout).filter(and_(Shout.published_at.is_not(None), Shout.deleted_at.is_(None))) - ) """ q = query_with_stat(info) q = q.filter(Shout.id.in_(hits_ids)) -- 2.43.0 From ecc443c3adae54aa87243fd8de260b55e3103b98 Mon Sep 
17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 1 Apr 2025 12:57:46 -0300 Subject: [PATCH 33/40] refactor(reader.py): remove the unnecessary topic joins that cause duplicate results --- resolvers/reader.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index f9cea228..e9e2d207 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -417,12 +417,6 @@ async def load_shouts_search(_, info, text, options): q = q.filter(Shout.id.in_(hits_ids)) q = apply_filters(q, options) - # added this to join topics - topic_join = aliased(ShoutTopic) - topic = aliased(Topic) - q = q.outerjoin(topic_join, topic_join.shout == Shout.id) - q = q.outerjoin(topic, topic.id == topic_join.topic) - shouts = get_shouts_with_links(info, q, limit, offset) for shout in shouts: shout["score"] = scores[f"{shout['id']}"] -- 2.43.0 From a0db5707c4cf996f66a9eb199dfc388e88b3e149 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Tue, 1 Apr 2025 16:01:09 -0300 Subject: [PATCH 34/40] feat: add a cache for storing search results and keep them for pagination. Now we have an offset to use on the frontend --- .gitignore | 3 + resolvers/reader.py | 43 +++++++- schema/query.graphql | 1 + schema/type.graphql | 4 + services/search.py | 249 ++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 284 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 502d180d..4ba0aec2 100644 --- a/.gitignore +++ b/.gitignore @@ -128,6 +128,9 @@ dmypy.json .idea temp.* +# Debug +DEBUG.log + discours.key discours.crt discours.pem diff --git a/resolvers/reader.py b/resolvers/reader.py index e9e2d207..48b3f695 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -10,7 +10,7 @@ from orm.shout import Shout, ShoutAuthor, ShoutTopic from orm.topic import Topic from services.db import json_array_builder, json_builder, local_session from services.schema import query -from services.search import search_text +from services.search import search_text, get_search_count from services.viewed import ViewedStorage from utils.logger import root_logger as logger @@ -401,8 +401,17 @@ async def load_shouts_search(_, info, text, options): """ limit = options.get("limit", 10) offset = options.get("offset", 0) + if isinstance(text, str) and len(text) > 2: + # Get search results with pagination results = await search_text(text, limit, offset) + + # If no results, return empty list + if not results: + logger.info(f"No search results found for '{text}'") + return [] + + # Build a map of document IDs to their search scores scores = {} hits_ids = [] for sr in results: @@ -412,19 +421,45 @@ async def load_shouts_search(_, info, text, options): scores[shout_id] = sr.get("score") hits_ids.append(shout_id) + # Build query to fetch shout details q = query_with_stat(info) - q = q.filter(Shout.id.in_(hits_ids)) - q = apply_filters(q, options) + q = q.filter(Shout.id.in_(hits_ids)) + q = apply_filters(q, options.get("filters", {})) + # Fetch shout details shouts = get_shouts_with_links(info, q, limit, offset) + + # Populate search scores in results and sort by score for shout in shouts: - shout["score"] = scores[f"{shout['id']}"] + shout_id = str(shout['id']) + if shout_id in scores: + shout["score"] = scores[shout_id] + else: + shout["score"] = 0 # Default score if not found in search results + + # Sort by score (highest first) shouts.sort(key=lambda x: x["score"], reverse=True) + return shouts return [] +@query.field("get_search_results_count") +async def get_search_results_count(_, info, text): + """ + Returns the total count
of search results for a search query. + + :param _: Root query object (unused) + :param info: GraphQL context information + :param text: Search query text + :return: Total count of results + """ + if isinstance(text, str) and len(text) > 2: + count = await get_search_count(text) + return {"count": count} + return {"count": 0} + + @query.field("load_shouts_unrated") async def load_shouts_unrated(_, info, options): """ diff --git a/schema/query.graphql b/schema/query.graphql index e07954ae..96055bc6 100644 --- a/schema/query.graphql +++ b/schema/query.graphql @@ -33,6 +33,7 @@ type Query { get_shout(slug: String, shout_id: Int): Shout load_shouts_by(options: LoadShoutsOptions): [Shout] load_shouts_search(text: String!, options: LoadShoutsOptions): [SearchResult] + get_search_results_count(text: String!): CountResult! load_shouts_bookmarked(options: LoadShoutsOptions): [Shout] # rating diff --git a/schema/type.graphql b/schema/type.graphql index 17d0c84b..ff82b4bc 100644 --- a/schema/type.graphql +++ b/schema/type.graphql @@ -276,3 +276,7 @@ type MyRateComment { my_rate: ReactionKind } +type CountResult { + count: Int! +} + diff --git a/services/search.py b/services/search.py index 40211b8b..bc77dfb6 100644 --- a/services/search.py +++ b/services/search.py @@ -4,7 +4,8 @@ import logging import os import httpx import time -import random +from collections import defaultdict +from datetime import datetime, timedelta # Set up proper logging logger = logging.getLogger("search") @@ -15,6 +16,168 @@ SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "none") MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "25")) +# Search cache configuration +SEARCH_CACHE_ENABLED = bool(os.environ.get("SEARCH_CACHE_ENABLED", "true").lower() in ["true", "1", "yes"]) +SEARCH_CACHE_TTL_SECONDS = int(os.environ.get("SEARCH_CACHE_TTL_SECONDS", "900")) # Default: 15 minutes +SEARCH_MIN_SCORE = float(os.environ.get("SEARCH_MIN_SCORE", "0.1")) +SEARCH_PREFETCH_SIZE = int(os.environ.get("SEARCH_PREFETCH_SIZE", "200")) +SEARCH_USE_REDIS = bool(os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"]) + +# Import Redis client if Redis caching is enabled +if SEARCH_USE_REDIS: + try: + from services.redis import redis + logger.info("Redis client imported for search caching") + except ImportError: + logger.warning("Redis client import failed, falling back to memory cache") + SEARCH_USE_REDIS = False + +class SearchCache: + """Cache for search results to enable efficient pagination""" + + def __init__(self, ttl_seconds=SEARCH_CACHE_TTL_SECONDS, max_items=100): + self.cache = {} # Maps search query to list of results + self.last_accessed = {} # Maps search query to last access timestamp + self.ttl = ttl_seconds + self.max_items = max_items + self._redis_prefix = "search_cache:" + + async def store(self, query, results): + """Store search results for a query""" + normalized_query = self._normalize_query(query) + + if SEARCH_USE_REDIS: + try: + serialized_results = json.dumps(results) + await redis.set( + f"{self._redis_prefix}{normalized_query}", + serialized_results, + ex=self.ttl + ) + logger.info(f"Stored {len(results)} search results for query '{query}' in Redis") + return True + except Exception as e: + logger.error(f"Error storing search results in Redis: {e}") + # Fall back to memory cache if Redis fails + + # First cleanup if needed for memory cache + if len(self.cache) >= self.max_items: + self._cleanup() + + 
# Store results and update timestamp + self.cache[normalized_query] = results + self.last_accessed[normalized_query] = time.time() + logger.info(f"Cached {len(results)} search results for query '{query}' in memory") + return True + + async def get(self, query, limit=10, offset=0): + """Get paginated results for a query""" + normalized_query = self._normalize_query(query) + all_results = None + + # Try to get from Redis first + if SEARCH_USE_REDIS: + try: + cached_data = await redis.get(f"{self._redis_prefix}{normalized_query}") + if cached_data: + all_results = json.loads(cached_data) + logger.info(f"Retrieved search results for '{query}' from Redis") + # Redis TTL is auto-extended when setting the key with expiry + except Exception as e: + logger.error(f"Error retrieving search results from Redis: {e}") + + # Fall back to memory cache if not in Redis + if all_results is None and normalized_query in self.cache: + all_results = self.cache[normalized_query] + self.last_accessed[normalized_query] = time.time() + logger.info(f"Retrieved search results for '{query}' from memory cache") + + # If not found in any cache + if all_results is None: + logger.debug(f"Cache miss for query '{query}'") + return None + + # Return paginated subset + end_idx = min(offset + limit, len(all_results)) + if offset >= len(all_results): + logger.warning(f"Requested offset {offset} exceeds result count {len(all_results)}") + return [] + + logger.info(f"Cache hit for '{query}': serving {offset}:{end_idx} of {len(all_results)} results") + return all_results[offset:end_idx] + + async def has_query(self, query): + """Check if query exists in cache""" + normalized_query = self._normalize_query(query) + + # Check Redis first + if SEARCH_USE_REDIS: + try: + exists = await redis.get(f"{self._redis_prefix}{normalized_query}") + if exists: + return True + except Exception as e: + logger.error(f"Error checking Redis for query existence: {e}") + + # Fall back to memory cache + return normalized_query in self.cache + + async def get_total_count(self, query): + """Get total count of results for a query""" + normalized_query = self._normalize_query(query) + + # Check Redis first + if SEARCH_USE_REDIS: + try: + cached_data = await redis.get(f"{self._redis_prefix}{normalized_query}") + if cached_data: + all_results = json.loads(cached_data) + return len(all_results) + except Exception as e: + logger.error(f"Error getting result count from Redis: {e}") + + # Fall back to memory cache + if normalized_query in self.cache: + return len(self.cache[normalized_query]) + + return 0 + + def _normalize_query(self, query): + """Normalize query string for cache key""" + if not query: + return "" + # Simple normalization - lowercase and strip whitespace + return query.lower().strip() + + def _cleanup(self): + """Remove oldest entries if memory cache is full""" + now = time.time() + # First remove expired entries + expired_keys = [ + key for key, last_access in self.last_accessed.items() + if now - last_access > self.ttl + ] + + for key in expired_keys: + if key in self.cache: + del self.cache[key] + if key in self.last_accessed: + del self.last_accessed[key] + + logger.info(f"Cleaned up {len(expired_keys)} expired search cache entries") + + # If still above max size, remove oldest entries + if len(self.cache) >= self.max_items: + # Sort by last access time + sorted_items = sorted(self.last_accessed.items(), key=lambda x: x[1]) + # Remove oldest 20% + remove_count = max(1, int(len(sorted_items) * 0.2)) + for key, _ in sorted_items[:remove_count]: 
+ if key in self.cache: + del self.cache[key] + if key in self.last_accessed: + del self.last_accessed[key] + logger.info(f"Removed {remove_count} oldest search cache entries") class SearchService: def __init__(self): @@ -23,15 +186,21 @@ class SearchService: # Use different timeout settings for indexing and search requests self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) self.index_client = httpx.AsyncClient(timeout=120.0, base_url=TXTAI_SERVICE_URL) - + # Initialize search cache + self.cache = SearchCache() if SEARCH_CACHE_ENABLED else None + if not self.available: logger.info("Search disabled (SEARCH_ENABLED = False)") + + if SEARCH_CACHE_ENABLED: + cache_location = "Redis" if SEARCH_USE_REDIS else "Memory" + logger.info(f"Search caching enabled using {cache_location} cache with TTL={SEARCH_CACHE_TTL_SECONDS}s") + logger.info(f"Minimum score filter: {SEARCH_MIN_SCORE}, prefetch size: {SEARCH_PREFETCH_SIZE}") async def info(self): """Return information about search service""" if not self.available: return {"status": "disabled"} - try: response = await self.client.get("/info") response.raise_for_status() @@ -41,7 +210,7 @@ class SearchService: except Exception as e: logger.error(f"Failed to get search info: {e}") return {"status": "error", "message": str(e)} - + def is_ready(self): """Check if service is available""" return self.available @@ -69,12 +238,11 @@ class SearchService: except Exception as e: logger.error(f"Document verification error: {e}") return {"status": "error", "message": str(e)} - + def index(self, shout): """Index a single document""" if not self.available: return - logger.info(f"Indexing post {shout.id}") # Start in background to not block asyncio.create_task(self.perform_index(shout)) @@ -390,12 +558,30 @@ class SearchService: return [] logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset})") - + + # Check if we can serve from cache + if SEARCH_CACHE_ENABLED: + has_cache = await self.cache.has_query(text) + if has_cache: + cached_results = await self.cache.get(text, limit, offset) + if cached_results is not None: + logger.info(f"Serving search results for '{text}' from cache (offset={offset}, limit={limit})") + return cached_results + + # Not in cache or cache disabled, perform new search try: - logger.info(f"Sending search request: text='{text}', limit={limit}, offset={offset}") + search_limit = limit + search_offset = offset + + # If cache is enabled, prefetch more results to store in cache + if SEARCH_CACHE_ENABLED and offset == 0: + search_limit = SEARCH_PREFETCH_SIZE # Fetch more results to cache + search_offset = 0 # Always start from beginning for cache + + logger.info(f"Sending search request: text='{text}', limit={search_limit}, offset={search_offset}") response = await self.client.post( "/search", - json={"text": text, "limit": limit, "offset": offset} + json={"text": text, "limit": search_limit, "offset": search_offset} ) response.raise_for_status() @@ -404,13 +590,46 @@ class SearchService: logger.info(f"Parsed search response: {result}") formatted_results = result.get("results", []) - logger.info(f"Search for '{text}' returned {len(formatted_results)} results") + + # Filter out non-numeric IDs to prevent database errors + valid_results = [] + for item in formatted_results: + doc_id = item.get("id") + if doc_id and doc_id.isdigit(): + valid_results.append(item) + else: + logger.warning(f"Filtered out non-numeric document ID: {doc_id}") + + if len(valid_results) != len(formatted_results): + logger.info(f"Filtered 
{len(formatted_results) - len(valid_results)} results with non-numeric IDs") + formatted_results = valid_results + + # Filter out low-score results + if SEARCH_MIN_SCORE > 0: + initial_count = len(formatted_results) + formatted_results = [r for r in formatted_results if r.get("score", 0) >= SEARCH_MIN_SCORE] + if len(formatted_results) != initial_count: + logger.info(f"Filtered {initial_count - len(formatted_results)} results with score < {SEARCH_MIN_SCORE}") + + logger.info(f"Search for '{text}' returned {len(formatted_results)} valid results") if formatted_results: logger.info(f"Sample result: {formatted_results[0]}") else: logger.warning(f"No results found for '{text}'") + # Store full results in cache if caching is enabled + if SEARCH_CACHE_ENABLED and offset == 0: + # Store normal sorted results + await self.cache.store(text, formatted_results) + + # Return only the requested page + if limit < len(formatted_results): + page_results = formatted_results[:limit] + logger.info(f"Returning first page of {len(page_results)} results " + + f"(out of {len(formatted_results)} total)") + return page_results + return formatted_results except Exception as e: logger.error(f"Search error for '{text}': {e}", exc_info=True) @@ -438,11 +657,9 @@ class SearchService: logger.error(f"Failed to check index status: {e}") return {"status": "error", "message": str(e)} - # Create the search service singleton search_service = SearchService() - # API-compatible function to perform a search async def search_text(text: str, limit: int = 50, offset: int = 0): payload = [] @@ -450,6 +667,14 @@ async def search_text(text: str, limit: int = 50, offset: int = 0): payload = await search_service.search(text, limit, offset) return payload +async def get_search_count(text: str): + """Get total count of results for a query without fetching all results""" + if search_service.available and SEARCH_CACHE_ENABLED: + if await search_service.cache.has_query(text): + return await search_service.cache.get_total_count(text) + # If not cached, we'll need to perform the full search once + results = await search_text(text, SEARCH_PREFETCH_SIZE, 0) + return len(results) async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" -- 2.43.0
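The cached pagination added in this patch works by prefetching one large, score-sorted result set per query, storing it under a normalized query key (in Redis or in memory), and answering later pages by slicing that stored list instead of calling the txtai service again. Below is a rough, self-contained sketch of that idea; it uses only an in-memory dict, and the names (TinyResultCache, get_page) are illustrative, not the class shipped in services/search.py.

import time

class TinyResultCache:
    # Minimal in-memory model of the prefetch-then-slice pagination idea.
    def __init__(self, ttl_seconds=900):
        self.results = {}    # normalized query -> full result list
        self.stored_at = {}  # normalized query -> store timestamp
        self.ttl = ttl_seconds

    def _key(self, query):
        # Same normalization as the cache key in the patch: lowercase, stripped
        return query.lower().strip()

    def store(self, query, results):
        key = self._key(query)
        self.results[key] = results
        self.stored_at[key] = time.time()

    def get_page(self, query, limit=10, offset=0):
        key = self._key(query)
        if key not in self.results or time.time() - self.stored_at[key] > self.ttl:
            return None  # cache miss: the caller runs a real search and stores it
        full = self.results[key]
        if offset >= len(full):
            return []    # requested page is past the end of the prefetched set
        return full[offset:offset + limit]

cache = TinyResultCache()
cache.store("test", [{"id": str(i), "score": 1.0 - i / 200} for i in range(200)])
page_one = cache.get_page("test", limit=10, offset=0)
page_two = cache.get_page("test", limit=10, offset=10)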
From bc4ec79240b88c39d86f547b3c173893f4b3f396 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 3 Apr 2025 13:10:53 -0300 Subject: [PATCH 35/40] fix(search.py): store all results in the cache, not only the first offset --- services/search.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/services/search.py b/services/search.py index bc77dfb6..90e127ba 100644 --- a/services/search.py +++ b/services/search.py @@ -23,6 +23,8 @@ SEARCH_MIN_SCORE = float(os.environ.get("SEARCH_MIN_SCORE", "0.1")) SEARCH_PREFETCH_SIZE = int(os.environ.get("SEARCH_PREFETCH_SIZE", "200")) SEARCH_USE_REDIS = bool(os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"]) +search_offset = 0 + # Import Redis client if Redis caching is enabled if SEARCH_USE_REDIS: try: @@ -573,10 +575,13 @@ class SearchService: search_limit = limit search_offset = offset - # If cache is enabled, prefetch more results to store in cache - if SEARCH_CACHE_ENABLED and offset == 0: - search_limit = SEARCH_PREFETCH_SIZE # Fetch more results to cache - search_offset = 0 # Always start from beginning for cache + # Always prefetch full results when caching is enabled + if SEARCH_CACHE_ENABLED: + search_limit = SEARCH_PREFETCH_SIZE # Always fetch a large set + search_offset = 0 # Always start from beginning + else: + search_limit = limit + search_offset = offset logger.info(f"Sending search request: text='{text}', limit={search_limit}, offset={search_offset}") response = await self.client.post( @@ -585,7 +590,7 @@ class SearchService: ) response.raise_for_status() - logger.info(f"Raw search response: {response.text}") + # logger.info(f"Raw search response: {response.text}") result = response.json() logger.info(f"Parsed search response: {result}") @@ -617,18 +622,15 @@ class SearchService: logger.info(f"Sample result: {formatted_results[0]}") else: logger.warning(f"No results found for '{text}'") + - # Store full results in cache if caching is enabled - if SEARCH_CACHE_ENABLED and offset == 0: - # Store normal sorted results - await self.cache.store(text, formatted_results) - - # Return only the requested page - if limit < len(formatted_results): - page_results = formatted_results[:limit] - logger.info(f"Returning first page of {len(page_results)} results " + - f"(out of {len(formatted_results)} total)") - return page_results + if SEARCH_CACHE_ENABLED: + logger.info(f"Storing {len(formatted_results)} results in cache for query '{text}'") + await self.cache.store(text, formatted_results) # Return the proper page slice from the full results stored in cache end_idx = offset + limit + end_idx = offset + limit + page_results = formatted_results[offset:end_idx] + logger.info(f"Returning results from {offset} to {end_idx} (of {len(formatted_results)} total)") + return page_results return formatted_results except Exception as e: -- 2.43.0 From 78326047bf5a3cc246a71cdeb37c5828ff6fd587 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 3 Apr 2025 13:20:18 -0300 Subject: [PATCH 36/40] fix(reader.py): change sorting and the response for queries --- resolvers/reader.py | 16 ++++++---------- services/search.py | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index 48b3f695..33fe3d72 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -410,8 +410,8 @@ async def load_shouts_search(_, info, text, options): if not results: logger.info(f"No search results found for '{text}'") return [] - - # Build a map of document IDs to their search scores + + # Extract IDs and scores scores = {} hits_ids = [] for sr in results: @@ -426,18 +426,14 @@ async def load_shouts_search(_, info, text, options): q = q.filter(Shout.id.in_(hits_ids)) q = apply_filters(q, options.get("filters", {})) - # Fetch shout details - shouts = get_shouts_with_links(info, q, limit, offset) + # + shouts = get_shouts_with_links(info, q, len(hits_ids), 0) - # Populate search scores in results and sort by score + # Add scores and sort for shout in shouts: shout_id = str(shout['id']) - if shout_id in scores: - shout["score"] = scores[shout_id] - else: - shout["score"] = 0 # Default score if not found in search results + shout["score"] = scores.get(shout_id, 0) - # Sort by score (highest first) shouts.sort(key=lambda x: x["score"], reverse=True) return shouts diff --git a/services/search.py b/services/search.py index 90e127ba..f0907d53 100644 --- a/services/search.py +++ b/services/search.py @@ -574,7 +574,7 @@ class SearchService: try: search_limit = limit search_offset = offset - + # Always prefetch full results when caching is enabled if SEARCH_CACHE_ENABLED: search_limit = SEARCH_PREFETCH_SIZE # Always fetch a large set search_offset = 0 # Always start from beginning @@ -592,7 +592,7 @@ class SearchService: # logger.info(f"Raw 
search response: {response.text}") result = response.json() - logger.info(f"Parsed search response: {result}") + # logger.info(f"Parsed search response: {result}") formatted_results = result.get("results", []) -- 2.43.0 From c533241d1eb0a6f0443aa11a9c74c3d1b429f8e8 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Thu, 3 Apr 2025 13:51:13 -0300 Subject: [PATCH 37/40] fix(reader): sort by rank, not by id, in the cache --- resolvers/reader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index 33fe3d72..e83ff6d4 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -421,7 +421,7 @@ async def load_shouts_search(_, info, text, options): scores[shout_id] = sr.get("score") hits_ids.append(shout_id) - # Build query to fetch shout details + # Query DB for only the IDs in the current page q = query_with_stat(info) q = q.filter(Shout.id.in_(hits_ids)) q = apply_filters(q, options.get("filters", {})) @@ -429,12 +429,13 @@ async def load_shouts_search(_, info, text, options): # shouts = get_shouts_with_links(info, q, len(hits_ids), 0) - # Add scores and sort + # Add scores from search results for shout in shouts: shout_id = str(shout['id']) shout["score"] = scores.get(shout_id, 0) - - shouts.sort(key=lambda x: x["score"], reverse=True) + + # Re-sort by search score to maintain ranking + shouts.sort(key=lambda x: scores.get(str(x['id']), 0), reverse=True) return shouts return [] -- 2.43.0 From 106222b0e081b41e1fa662fb1716d5ff46df23ac Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Mon, 7 Apr 2025 11:41:48 -0300 Subject: [PATCH 38/40] debug: remove debug logging, clean up --- resolvers/reader.py | 10 ---- services/search.py | 135 ++++---------------------------------------- 2 files changed, 10 insertions(+), 135 deletions(-) diff --git a/resolvers/reader.py b/resolvers/reader.py index e83ff6d4..aeb60e50 100644 --- a/resolvers/reader.py +++ b/resolvers/reader.py @@ -187,12 +187,10 @@ def get_shouts_with_links(info, q, limit=20, offset=0): """ shouts = [] try: - # logger.info(f"Starting get_shouts_with_links with limit={limit}, offset={offset}") q = q.limit(limit).offset(offset) with local_session() as session: shouts_result = session.execute(q).all() - # logger.info(f"Got {len(shouts_result) if shouts_result else 0} shouts from query") if not shouts_result: logger.warning("No shouts found in query result") @@ -203,7 +201,6 @@ def get_shouts_with_links(info, q, limit=20, offset=0): shout = None if hasattr(row, "Shout"): shout = row.Shout - # logger.debug(f"Processing shout#{shout.id} at index {idx}") if shout: shout_id = int(f"{shout.id}") shout_dict = shout.dict() @@ -231,20 +228,16 @@ def get_shouts_with_links(info, q, limit=20, offset=0): topics = None if has_field(info, "topics") and hasattr(row, "topics"): topics = orjson.loads(row.topics) if isinstance(row.topics, str) else row.topics - # logger.debug(f"Shout#{shout_id} topics: {topics}") shout_dict["topics"] = topics if has_field(info, "main_topic"): main_topic = None if hasattr(row, "main_topic"): - # logger.debug(f"Raw main_topic for shout#{shout_id}: {row.main_topic}") main_topic = ( orjson.loads(row.main_topic) if isinstance(row.main_topic, str) else row.main_topic ) - # logger.debug(f"Parsed main_topic for shout#{shout_id}: {main_topic}") if not main_topic and topics and len(topics) > 0: - # logger.info(f"No main_topic found for shout#{shout_id}, using first topic from list") main_topic = { "id": topics[0]["id"], "title": topics[0]["title"], @@ -252,10 +245,8 
@@ def get_shouts_with_links(info, q, limit=20, offset=0): "is_main": True, } elif not main_topic: - logger.debug(f"No main_topic and no topics found for shout#{shout_id}") main_topic = {"id": 0, "title": "no topic", "slug": "notopic", "is_main": True} shout_dict["main_topic"] = main_topic - logger.debug(f"Final main_topic for shout#{shout_id}: {main_topic}") if has_field(info, "authors") and hasattr(row, "authors"): shout_dict["authors"] = ( @@ -282,7 +273,6 @@ def get_shouts_with_links(info, q, limit=20, offset=0): logger.error(f"Fatal error in get_shouts_with_links: {e}", exc_info=True) raise finally: - logger.info(f"Returning {len(shouts)} shouts from get_shouts_with_links") return shouts diff --git a/services/search.py b/services/search.py index f0907d53..b8036644 100644 --- a/services/search.py +++ b/services/search.py @@ -84,7 +84,6 @@ class SearchCache: if cached_data: all_results = json.loads(cached_data) logger.info(f"Retrieved search results for '{query}' from Redis") - # Redis TTL is auto-extended when setting the key with expiry except Exception as e: logger.error(f"Error retrieving search results from Redis: {e}") @@ -96,7 +95,7 @@ class SearchCache: # If not found in any cache if all_results is None: - logger.debug(f"Cache miss for query '{query}'") + logger.info(f"Cache miss for query '{query}'") return None # Return paginated subset @@ -314,7 +313,6 @@ class SearchService: # Media field processing remains the same media = getattr(shout, 'media', None) if media: - # Your existing media processing logic if isinstance(media, str): try: media_json = json.loads(media) @@ -334,7 +332,6 @@ class SearchService: text = " ".join(text_fields) if not text.strip(): - logger.debug(f"Skipping shout {shout.id}: no text content") total_skipped += 1 continue @@ -342,7 +339,6 @@ class SearchService: original_length = len(text) if original_length > MAX_TEXT_LENGTH: text = text[:MAX_TEXT_LENGTH] - logger.info(f"Truncated document {shout.id} from {original_length} to {MAX_TEXT_LENGTH} chars") total_truncated += 1 document = { @@ -403,10 +399,6 @@ class SearchService: # Process with retries while not success and retry_count < max_retries: try: - if batch: - sample = batch[0] - logger.info(f"Sample document in batch {batch_id}: id={sample['id']}, text_length={len(sample['text'])}") - logger.info(f"Sending batch {batch_id} of {len(batch)} documents to search service (attempt {retry_count+1})") response = await self.index_client.post( "/bulk-index", @@ -434,31 +426,25 @@ class SearchService: if retry_count < max_retries - 1: retry_count += 1 wait_time = (2 ** retry_count) + (random.random() * 0.5) # Exponential backoff with jitter - logger.warning(f"Server error for batch {batch_id}, retrying in {wait_time:.1f}s (attempt {retry_count+1}/{max_retries})") await asyncio.sleep(wait_time) continue # Final retry, split the batch elif len(batch) > 1: - logger.warning(f"Splitting batch {batch_id} after repeated failures") mid = len(batch) // 2 await self._process_single_batch(batch[:mid], f"{batch_id}-A") await self._process_single_batch(batch[mid:], f"{batch_id}-B") break else: # Can't split a single document - logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts") break # Normal success case response.raise_for_status() - result = response.json() - logger.info(f"Batch {batch_id} indexed successfully: {result}") success = True db_error_count = 0 # Reset error counter on success except Exception as e: - # Check if it looks like a database corruption error error_str = 
str(e).lower() if "duplicate key" in error_str or "unique constraint" in error_str or "nonetype" in error_str: db_error_count += 1 @@ -469,17 +455,12 @@ class SearchService: if retry_count < max_retries - 1: retry_count += 1 wait_time = (2 ** retry_count) + (random.random() * 0.5) - logger.warning(f"Error for batch {batch_id}, retrying in {wait_time:.1f}s: {str(e)[:200]}") await asyncio.sleep(wait_time) else: - # Last resort - try to split the batch if len(batch) > 1: - logger.warning(f"Splitting batch {batch_id} after exception: {str(e)[:200]}") mid = len(batch) // 2 await self._process_single_batch(batch[:mid], f"{batch_id}-A") await self._process_single_batch(batch[mid:], f"{batch_id}-B") - else: - logger.error(f"Failed to index document {batch[0]['id']} after {max_retries} attempts: {e}") break async def _process_single_batch(self, documents, batch_id): @@ -492,41 +473,30 @@ class SearchService: if not documents: return - logger.info(f"Processing sub-batch {batch_id} with {len(documents)} documents") response = await self.index_client.post( "/bulk-index", json=documents, timeout=90.0 ) response.raise_for_status() - result = response.json() - logger.info(f"Sub-batch {batch_id} indexed successfully: {result}") return # Success, exit the retry loop except Exception as e: error_str = str(e).lower() retry_count += 1 - # Check if it's a transient error that txtai might recover from internally if "dictionary changed size" in error_str or "transaction error" in error_str: wait_time = (2 ** retry_count) + (random.random() * 0.5) - logger.warning(f"Transient txtai error in sub-batch {batch_id}, waiting {wait_time:.1f}s for recovery: {str(e)[:200]}") await asyncio.sleep(wait_time) # Wait for txtai to recover - continue # Try again + continue - # For other errors or final retry failure - logger.error(f"Error indexing sub-batch {batch_id} (attempt {retry_count}/{max_retries}): {str(e)[:200]}") - - # Only try one-by-one on the final retry if retry_count >= max_retries and len(documents) > 1: - logger.info(f"Processing documents in sub-batch {batch_id} individually") for i, doc in enumerate(documents): try: resp = await self.index_client.post("/index", json=doc, timeout=30.0) resp.raise_for_status() - logger.info(f"Indexed document {doc['id']} individually") except Exception as e2: - logger.error(f"Failed to index document {doc['id']} individually: {str(e2)[:100]}") + pass return # Exit after individual processing attempt def _truncate_error_detail(self, error_detail): @@ -537,13 +507,11 @@ class SearchService: for i, item in enumerate(truncated_detail['detail']): if isinstance(item, dict) and 'input' in item: if isinstance(item['input'], dict) and any(k in item['input'] for k in ['documents', 'text']): - # Check for documents list if 'documents' in item['input'] and isinstance(item['input']['documents'], list): for j, doc in enumerate(item['input']['documents']): if 'text' in doc and isinstance(doc['text'], str) and len(doc['text']) > 100: item['input']['documents'][j]['text'] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]" - # Check for direct text field if 'text' in item['input'] and isinstance(item['input']['text'], str) and len(item['input']['text']) > 100: item['input']['text'] = f"{item['input']['text'][:100]}... 
[truncated, total {len(item['input']['text'])} chars]" @@ -552,11 +520,9 @@ class SearchService: async def search(self, text, limit, offset): """Search documents""" if not self.available: - logger.warning("Search not available") return [] if not isinstance(text, str) or not text.strip(): - logger.warning(f"Invalid search text: {text}") return [] logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset})") @@ -567,7 +533,6 @@ class SearchService: if has_cache: cached_results = await self.cache.get(text, limit, offset) if cached_results is not None: - logger.info(f"Serving search results for '{text}' from cache (offset={offset}, limit={limit})") return cached_results # Not in cache or cache disabled, perform new search @@ -575,61 +540,40 @@ class SearchService: search_limit = limit search_offset = offset - # Always prefetch full results when caching is enabled if SEARCH_CACHE_ENABLED: - search_limit = SEARCH_PREFETCH_SIZE # Always fetch a large set - search_offset = 0 # Always start from beginning + search_limit = SEARCH_PREFETCH_SIZE + search_offset = 0 else: search_limit = limit search_offset = offset - logger.info(f"Sending search request: text='{text}', limit={search_limit}, offset={search_offset}") response = await self.client.post( "/search", json={"text": text, "limit": search_limit, "offset": search_offset} ) response.raise_for_status() - # logger.info(f"Raw search response: {response.text}") result = response.json() - # logger.info(f"Parsed search response: {result}") formatted_results = result.get("results", []) - # Filter out non-numeric IDs to prevent database errors valid_results = [] for item in formatted_results: doc_id = item.get("id") if doc_id and doc_id.isdigit(): valid_results.append(item) - else: - logger.warning(f"Filtered out non-numeric document ID: {doc_id}") if len(valid_results) != len(formatted_results): - logger.info(f"Filtered {len(formatted_results) - len(valid_results)} results with non-numeric IDs") formatted_results = valid_results - # Filter out low-score results if SEARCH_MIN_SCORE > 0: initial_count = len(formatted_results) formatted_results = [r for r in formatted_results if r.get("score", 0) >= SEARCH_MIN_SCORE] - if len(formatted_results) != initial_count: - logger.info(f"Filtered {initial_count - len(formatted_results)} results with score < {SEARCH_MIN_SCORE}") - - logger.info(f"Search for '{text}' returned {len(formatted_results)} valid results") - - if formatted_results: - logger.info(f"Sample result: {formatted_results[0]}") - else: - logger.warning(f"No results found for '{text}'") - if SEARCH_CACHE_ENABLED: - logger.info(f"Storing {len(formatted_results)} results in cache for query '{text}'") - await self.cache.store(text, formatted_results) # Return the proper page slice from the full results stored in cache end_idx = offset + limit + await self.cache.store(text, formatted_results) end_idx = offset + limit page_results = formatted_results[offset:end_idx] - logger.info(f"Returning results from {offset} to {end_idx} (of {len(formatted_results)} total)") return page_results return formatted_results @@ -646,9 +590,7 @@ class SearchService: response = await self.client.get("/index-status") response.raise_for_status() result = response.json() - logger.info(f"Index status check: {result['status']}, {result['documents_count']} documents") - # Log warnings for any inconsistencies if result.get("consistency", {}).get("status") != "ok": null_count = result.get("consistency", {}).get("null_embeddings_count", 0) if null_count > 0: @@ -674,126 
+616,69 @@ async def get_search_count(text: str): if search_service.available and SEARCH_CACHE_ENABLED: if await search_service.cache.has_query(text): return await search_service.cache.get_total_count(text) - # If not cached, we'll need to perform the full search once results = await search_text(text, SEARCH_PREFETCH_SIZE, 0) return len(results) async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" if not SEARCH_ENABLED: - logger.info("Search indexing skipped (SEARCH_ENABLED=False)") return if not shouts_data: - logger.warning("No shouts data provided for search indexing") return - logger.info(f"Checking search index status for {len(shouts_data)} documents") - - # Get the current index info info = await search_service.info() if info.get("status") in ["error", "unavailable", "disabled"]: - logger.error(f"Cannot initialize search index: {info}") return - # Check if index has approximately right number of documents index_stats = info.get("index_stats", {}) indexed_doc_count = index_stats.get("document_count", 0) - # Add a more detailed status check index_status = await search_service.check_index_status() - if index_status.get("status") == "healthy": - logger.info("Index status check passed") - elif index_status.get("status") == "inconsistent": - logger.warning("Index status check found inconsistencies") - - # Get documents with null embeddings + if index_status.get("status") == "inconsistent": problem_ids = index_status.get("consistency", {}).get("null_embeddings_sample", []) if problem_ids: - logger.info(f"Repairing {len(problem_ids)} documents with NULL embeddings") problem_docs = [shout for shout in shouts_data if str(shout.id) in problem_ids] if problem_docs: await search_service.bulk_index(problem_docs) - # Log database document summary db_ids = [str(shout.id) for shout in shouts_data] - logger.info(f"Database contains {len(shouts_data)} documents. Sample IDs: {', '.join(db_ids[:5])}...") - # Calculate summary by ID range to understand the coverage try: - # Parse numeric IDs where possible to analyze coverage numeric_ids = [int(sid) for sid in db_ids if sid.isdigit()] if numeric_ids: min_id = min(numeric_ids) max_id = max(numeric_ids) id_range = max_id - min_id + 1 - coverage_pct = (len(numeric_ids) / id_range) * 100 if id_range > 0 else 0 - logger.info(f"ID range analysis: min_id={min_id}, max_id={max_id}, range={id_range}, " - f"coverage={coverage_pct:.1f}% ({len(numeric_ids)}/{id_range})") except Exception as e: - logger.warning(f"Could not analyze ID ranges: {e}") + pass - # If counts are significantly different, do verification if abs(indexed_doc_count - len(shouts_data)) > 10: - logger.info(f"Document count mismatch: {indexed_doc_count} in index vs {len(shouts_data)} in database. Verifying...") - - # Get all document IDs from your database doc_ids = [str(shout.id) for shout in shouts_data] - # Verify which ones are missing from the index verification = await search_service.verify_docs(doc_ids) if verification.get("status") == "error": - logger.error(f"Document verification failed: {verification.get('message')}") return - # Index only missing documents missing_ids = verification.get("missing", []) if missing_ids: - logger.info(f"Found {len(missing_ids)} documents missing from index. 
Indexing them...") - logger.info(f"Sample missing IDs: {', '.join(missing_ids[:10])}...") missing_docs = [shout for shout in shouts_data if str(shout.id) in missing_ids] await search_service.bulk_index(missing_docs) - else: - logger.info("All documents are already indexed.") else: - logger.info(f"Search index appears to be in sync ({indexed_doc_count} documents indexed).") + pass - # Optional sample verification (can be slow with large document sets) - # Uncomment if you want to periodically check a random sample even when counts match - """ - sample_size = 10 - if len(db_ids) > sample_size: - sample_ids = random.sample(db_ids, sample_size) - logger.info(f"Performing random sample verification on {sample_size} documents...") - verification = await search_service.verify_docs(sample_ids) - if verification.get("missing"): - missing_count = len(verification.get("missing", [])) - logger.warning(f"Random verification found {missing_count}/{sample_size} missing docs " - f"despite count match. Consider full verification.") - else: - logger.info("Random document sample verification passed.") - """ - - # Verify with test query try: test_query = "test" - logger.info(f"Verifying search index with query: '{test_query}'") test_results = await search_text(test_query, 5) if test_results: - logger.info(f"Search verification successful: found {len(test_results)} results") - # Log categories covered by search results categories = set() for result in test_results: result_id = result.get("id") matching_shouts = [s for s in shouts_data if str(s.id) == result_id] if matching_shouts and hasattr(matching_shouts[0], 'category'): categories.add(getattr(matching_shouts[0], 'category', 'unknown')) - if categories: - logger.info(f"Search results cover categories: {', '.join(categories)}") - else: - logger.warning("Search verification returned no results. Index may be empty or not working.") except Exception as e: - logger.error(f"Error verifying search index: {e}") + pass -- 2.43.0 From 4d965fb27be849661cbba544d1f81626924ee1d9 Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Sun, 20 Apr 2025 19:22:08 -0300 Subject: [PATCH 39/40] feat(search.py): separate indexing of Shout Title, shout Body and Authors --- orm/shout.py | 28 +++ services/db.py | 13 +- services/search.py | 510 +++++++++++++++++++++++++++------------------ 3 files changed, 340 insertions(+), 211 deletions(-) diff --git a/orm/shout.py b/orm/shout.py index d74e84d4..30a8dc69 100644 --- a/orm/shout.py +++ b/orm/shout.py @@ -71,6 +71,34 @@ class ShoutAuthor(Base): class Shout(Base): """ Публикация в системе. 
+ + Attributes: + body (str) + slug (str) + cover (str) : "Cover image url" + cover_caption (str) : "Cover image alt caption" + lead (str) + title (str) + subtitle (str) + layout (str) + media (dict) + authors (list[Author]) + topics (list[Topic]) + reactions (list[Reaction]) + lang (str) + version_of (int) + oid (str) + seo (str) : JSON + draft (int) + created_at (int) + updated_at (int) + published_at (int) + featured_at (int) + deleted_at (int) + created_by (int) + updated_by (int) + deleted_by (int) + community (int) """ __tablename__ = "shout" diff --git a/services/db.py b/services/db.py index ccca6c8a..935c25b4 100644 --- a/services/db.py +++ b/services/db.py @@ -19,7 +19,7 @@ from sqlalchemy import ( inspect, text, ) -from sqlalchemy.orm import Session, configure_mappers, declarative_base +from sqlalchemy.orm import Session, configure_mappers, declarative_base, joinedload from sqlalchemy.sql.schema import Table from settings import DB_URL @@ -260,8 +260,11 @@ def get_json_builder(): # Используем их в коде json_builder, json_array_builder, json_cast = get_json_builder() +# Fetch all shouts, with authors preloaded +# This function is used for search indexing + async def fetch_all_shouts(session=None): - """Fetch all published shouts for search indexing""" + """Fetch all published shouts for search indexing with authors preloaded""" from orm.shout import Shout close_session = False @@ -270,8 +273,10 @@ async def fetch_all_shouts(session=None): close_session = True try: - # Fetch only published and non-deleted shouts - query = session.query(Shout).filter( + # Fetch only published and non-deleted shouts with authors preloaded + query = session.query(Shout).options( + joinedload(Shout.authors) + ).filter( Shout.published_at.is_not(None), Shout.deleted_at.is_(None) ) diff --git a/services/search.py b/services/search.py index b8036644..75e56efe 100644 --- a/services/search.py +++ b/services/search.py @@ -216,8 +216,9 @@ class SearchService: """Check if service is available""" return self.available + async def verify_docs(self, doc_ids): - """Verify which documents exist in the search index""" + """Verify which documents exist in the search index across all content types""" if not self.available: return {"status": "disabled"} @@ -231,14 +232,36 @@ class SearchService: response.raise_for_status() result = response.json() - # Log summary of verification results - missing_count = len(result.get("missing", [])) - logger.info(f"Document verification complete: {missing_count} missing out of {len(doc_ids)} total") + # Process the more detailed response format + bodies_missing = set(result.get("bodies", {}).get("missing", [])) + titles_missing = set(result.get("titles", {}).get("missing", [])) - return result + # Combine missing IDs from both bodies and titles + # A document is considered missing if it's missing from either index + all_missing = list(bodies_missing.union(titles_missing)) + + # Log summary of verification results + bodies_missing_count = len(bodies_missing) + titles_missing_count = len(titles_missing) + total_missing_count = len(all_missing) + + logger.info(f"Document verification complete: {bodies_missing_count} bodies missing, {titles_missing_count} titles missing") + logger.info(f"Total unique missing documents: {total_missing_count} out of {len(doc_ids)} total") + + # Return in a backwards-compatible format plus the detailed breakdown + return { + "missing": all_missing, + "details": { + "bodies_missing": list(bodies_missing), + "titles_missing": list(titles_missing), + 
"bodies_missing_count": bodies_missing_count, + "titles_missing_count": titles_missing_count + } + } except Exception as e: logger.error(f"Document verification error: {e}") return {"status": "error", "message": str(e)} + def index(self, shout): """Index a single document""" @@ -249,68 +272,147 @@ class SearchService: asyncio.create_task(self.perform_index(shout)) async def perform_index(self, shout): - """Actually perform the indexing operation""" + """Index a single document across multiple endpoints""" if not self.available: return try: - # Combine all text fields - text = " ".join(filter(None, [ - shout.title or "", - shout.subtitle or "", - shout.lead or "", - shout.body or "", - shout.media or "" - ])) + logger.info(f"Indexing document {shout.id} to individual endpoints") + indexing_tasks = [] - if not text.strip(): - logger.warning(f"No text content to index for shout {shout.id}") - return + # 1. Index title if available + if hasattr(shout, 'title') and shout.title and isinstance(shout.title, str): + title_doc = { + "id": str(shout.id), + "title": shout.title.strip() + } + indexing_tasks.append( + self.index_client.post("/index-title", json=title_doc) + ) + + # 2. Index body content (subtitle, lead, body) + body_text_parts = [] + for field_name in ['subtitle', 'lead', 'body']: + field_value = getattr(shout, field_name, None) + if field_value and isinstance(field_value, str) and field_value.strip(): + body_text_parts.append(field_value.strip()) + + # Process media content if available + media = getattr(shout, 'media', None) + if media: + if isinstance(media, str): + try: + media_json = json.loads(media) + if isinstance(media_json, dict): + if 'title' in media_json: + body_text_parts.append(media_json['title']) + if 'body' in media_json: + body_text_parts.append(media_json['body']) + except json.JSONDecodeError: + body_text_parts.append(media) + elif isinstance(media, dict): + if 'title' in media: + body_text_parts.append(media['title']) + if 'body' in media: + body_text_parts.append(media['body']) + + if body_text_parts: + body_text = " ".join(body_text_parts) + # Truncate if too long + MAX_TEXT_LENGTH = 4000 + if len(body_text) > MAX_TEXT_LENGTH: + body_text = body_text[:MAX_TEXT_LENGTH] - logger.info(f"Indexing document: ID={shout.id}, Text length={len(text)}") + body_doc = { + "id": str(shout.id), + "body": body_text + } + indexing_tasks.append( + self.index_client.post("/index-body", json=body_doc) + ) - # Send to txtai service - response = await self.client.post( - "/index", - json={"id": str(shout.id), "text": text} - ) - response.raise_for_status() - result = response.json() - logger.info(f"Post {shout.id} successfully indexed: {result}") + # 3. 
Index authors + authors = getattr(shout, 'authors', []) + for author in authors: + author_id = str(getattr(author, 'id', 0)) + if not author_id or author_id == '0': + continue + + name = getattr(author, 'name', '') + + # Combine bio and about fields + bio_parts = [] + bio = getattr(author, 'bio', '') + if bio and isinstance(bio, str): + bio_parts.append(bio.strip()) + + about = getattr(author, 'about', '') + if about and isinstance(about, str): + bio_parts.append(about.strip()) + + combined_bio = " ".join(bio_parts) + + if name: + author_doc = { + "id": author_id, + "name": name, + "bio": combined_bio + } + indexing_tasks.append( + self.index_client.post("/index-author", json=author_doc) + ) + + # Run all indexing tasks in parallel + if indexing_tasks: + responses = await asyncio.gather(*indexing_tasks, return_exceptions=True) + + # Check for errors in responses + for i, response in enumerate(responses): + if isinstance(response, Exception): + logger.error(f"Error in indexing task {i}: {response}") + elif hasattr(response, 'status_code') and response.status_code >= 400: + logger.error(f"Error response in indexing task {i}: {response.status_code}, {await response.text()}") + + logger.info(f"Document {shout.id} indexed across {len(indexing_tasks)} endpoints") + else: + logger.warning(f"No content to index for shout {shout.id}") + except Exception as e: logger.error(f"Indexing error for shout {shout.id}: {e}") async def bulk_index(self, shouts): - """Index multiple documents at once with adaptive batch sizing""" + """Index multiple documents across three separate endpoints""" if not self.available or not shouts: logger.warning(f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}") return start_time = time.time() - logger.info(f"Starting bulk indexing of {len(shouts)} documents") + logger.info(f"Starting multi-endpoint bulk indexing of {len(shouts)} documents") - MAX_TEXT_LENGTH = 4000 # Maximum text length to send in a single request - max_batch_size = MAX_BATCH_SIZE - total_indexed = 0 + # Prepare documents for different endpoints + title_docs = [] + body_docs = [] + author_docs = {} # Use dict to prevent duplicate authors + total_skipped = 0 - total_truncated = 0 - total_retries = 0 - # Group documents by size to process smaller documents in larger batches - small_docs = [] - medium_docs = [] - large_docs = [] - - # First pass: prepare all documents and categorize by size for shout in shouts: try: - text_fields = [] - for field_name in ['title', 'subtitle', 'lead', 'body']: + # 1. Process title documents + if hasattr(shout, 'title') and shout.title and isinstance(shout.title, str): + title_docs.append({ + "id": str(shout.id), + "title": shout.title.strip() + }) + + # 2. 
Process body documents (subtitle, lead, body) + body_text_parts = [] + for field_name in ['subtitle', 'lead', 'body']: field_value = getattr(shout, field_name, None) if field_value and isinstance(field_value, str) and field_value.strip(): - text_fields.append(field_value.strip()) + body_text_parts.append(field_value.strip()) - # Media field processing remains the same + # Process media content if available media = getattr(shout, 'media', None) if media: if isinstance(media, str): @@ -318,186 +420,180 @@ class SearchService: media_json = json.loads(media) if isinstance(media_json, dict): if 'title' in media_json: - text_fields.append(media_json['title']) + body_text_parts.append(media_json['title']) if 'body' in media_json: - text_fields.append(media_json['body']) + body_text_parts.append(media_json['body']) except json.JSONDecodeError: - text_fields.append(media) + body_text_parts.append(media) elif isinstance(media, dict): if 'title' in media: - text_fields.append(media['title']) + body_text_parts.append(media['title']) if 'body' in media: - text_fields.append(media['body']) + body_text_parts.append(media['body']) - text = " ".join(text_fields) - - if not text.strip(): - total_skipped += 1 - continue - - # Truncate text if it exceeds the maximum length - original_length = len(text) - if original_length > MAX_TEXT_LENGTH: - text = text[:MAX_TEXT_LENGTH] - total_truncated += 1 - - document = { - "id": str(shout.id), - "text": text - } - - # Categorize by size - text_len = len(text) - if text_len > 5000: - large_docs.append(document) - elif text_len > 2000: - medium_docs.append(document) - else: - small_docs.append(document) - - total_indexed += 1 + # Only add body document if we have body text + if body_text_parts: + body_text = " ".join(body_text_parts) + # Truncate if too long + MAX_TEXT_LENGTH = 4000 + if len(body_text) > MAX_TEXT_LENGTH: + body_text = body_text[:MAX_TEXT_LENGTH] + + body_docs.append({ + "id": str(shout.id), + "body": body_text + }) + # 3. 
Process authors if available + authors = getattr(shout, 'authors', []) + for author in authors: + author_id = str(getattr(author, 'id', 0)) + if not author_id or author_id == '0': + continue + + # Skip if we've already processed this author + if author_id in author_docs: + continue + + name = getattr(author, 'name', '') + + # Combine bio and about fields + bio_parts = [] + bio = getattr(author, 'bio', '') + if bio and isinstance(bio, str): + bio_parts.append(bio.strip()) + + about = getattr(author, 'about', '') + if about and isinstance(about, str): + bio_parts.append(about.strip()) + + combined_bio = " ".join(bio_parts) + + # Only add if we have author data + if name: + author_docs[author_id] = { + "id": author_id, + "name": name, + "bio": combined_bio + } + except Exception as e: logger.error(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}") total_skipped += 1 - # Process each category with appropriate batch sizes - logger.info(f"Documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large") + # Convert author dict to list + author_docs_list = list(author_docs.values()) - # Process small documents (larger batches) - if small_docs: - batch_size = min(max_batch_size, 15) - await self._process_document_batches(small_docs, batch_size, "small") - - # Process medium documents (medium batches) - if medium_docs: - batch_size = min(max_batch_size, 10) - await self._process_document_batches(medium_docs, batch_size, "medium") - - # Process large documents (small batches) - if large_docs: - batch_size = min(max_batch_size, 3) - await self._process_document_batches(large_docs, batch_size, "large") + # Process each endpoint in parallel + indexing_tasks = [ + self._index_endpoint(title_docs, "/bulk-index-titles", "title"), + self._index_endpoint(body_docs, "/bulk-index-bodies", "body"), + self._index_endpoint(author_docs_list, "/bulk-index-authors", "author") + ] + + await asyncio.gather(*indexing_tasks) elapsed = time.time() - start_time - logger.info(f"Bulk indexing completed in {elapsed:.2f}s: {total_indexed} indexed, {total_skipped} skipped, {total_truncated} truncated, {total_retries} retries") - - async def _process_document_batches(self, documents, batch_size, size_category): - """Process document batches with retry logic""" - # Check for possible database corruption before starting - db_error_count = 0 - - for i in range(0, len(documents), batch_size): - batch = documents[i:i+batch_size] - batch_id = f"{size_category}-{i//batch_size + 1}" - logger.info(f"Processing {size_category} batch {batch_id} of {len(batch)} documents") - - retry_count = 0 - max_retries = 3 - success = False - - # Process with retries - while not success and retry_count < max_retries: - try: - logger.info(f"Sending batch {batch_id} of {len(batch)} documents to search service (attempt {retry_count+1})") - response = await self.index_client.post( - "/bulk-index", - json=batch, - timeout=120.0 # Explicit longer timeout for large batches - ) - - # Handle 422 validation errors - these won't be fixed by retrying - if response.status_code == 422: - error_detail = response.json() - truncated_error = self._truncate_error_detail(error_detail) - logger.error(f"Validation error from search service for batch {batch_id}: {truncated_error}") - break - - # Handle 500 server errors - these might be fixed by retrying with smaller batches - elif response.status_code == 500: - db_error_count += 1 - - # If we've seen multiple 500s, log a critical error - if db_error_count >= 3: 
- logger.critical(f"Multiple server errors detected (500). The search service may need manual intervention. Stopping batch {batch_id} processing.") - break - - # Try again with exponential backoff - if retry_count < max_retries - 1: - retry_count += 1 - wait_time = (2 ** retry_count) + (random.random() * 0.5) # Exponential backoff with jitter - await asyncio.sleep(wait_time) - continue - - # Final retry, split the batch - elif len(batch) > 1: - mid = len(batch) // 2 - await self._process_single_batch(batch[:mid], f"{batch_id}-A") - await self._process_single_batch(batch[mid:], f"{batch_id}-B") - break - else: - # Can't split a single document - break - - # Normal success case - response.raise_for_status() - success = True - db_error_count = 0 # Reset error counter on success - - except Exception as e: - error_str = str(e).lower() - if "duplicate key" in error_str or "unique constraint" in error_str or "nonetype" in error_str: - db_error_count += 1 - if db_error_count >= 2: - logger.critical(f"Potential database corruption detected: {error_str}. The search service may need manual intervention. Stopping batch {batch_id} processing.") - break - - if retry_count < max_retries - 1: - retry_count += 1 - wait_time = (2 ** retry_count) + (random.random() * 0.5) - await asyncio.sleep(wait_time) - else: - if len(batch) > 1: - mid = len(batch) // 2 - await self._process_single_batch(batch[:mid], f"{batch_id}-A") - await self._process_single_batch(batch[mid:], f"{batch_id}-B") - break - - async def _process_single_batch(self, documents, batch_id): - """Process a single batch with maximum reliability""" - max_retries = 3 - retry_count = 0 + logger.info( + f"Multi-endpoint indexing completed in {elapsed:.2f}s: " + f"{len(title_docs)} titles, {len(body_docs)} bodies, {len(author_docs_list)} authors, " + f"{total_skipped} shouts skipped" + ) - while retry_count < max_retries: - try: - if not documents: - return + async def _index_endpoint(self, documents, endpoint, doc_type): + """Process and index documents to a specific endpoint""" + if not documents: + logger.info(f"No {doc_type} documents to index") + return + + logger.info(f"Indexing {len(documents)} {doc_type} documents") + + # Categorize documents by size + small_docs, medium_docs, large_docs = self._categorize_by_size(documents, doc_type) + + # Process each category with appropriate batch sizes + batch_sizes = { + "small": min(MAX_BATCH_SIZE, 15), + "medium": min(MAX_BATCH_SIZE, 10), + "large": min(MAX_BATCH_SIZE, 3) + } + + for category, docs in [("small", small_docs), ("medium", medium_docs), ("large", large_docs)]: + if docs: + batch_size = batch_sizes[category] + await self._process_batches(docs, batch_size, endpoint, f"{doc_type}-{category}") + + def _categorize_by_size(self, documents, doc_type): + """Categorize documents by size for optimized batch processing""" + small_docs = [] + medium_docs = [] + large_docs = [] + + for doc in documents: + # Extract relevant text based on document type + if doc_type == "title": + text = doc.get("title", "") + elif doc_type == "body": + text = doc.get("body", "") + else: # author + # For authors, consider both name and bio length + text = doc.get("name", "") + " " + doc.get("bio", "") + + text_len = len(text) + + if text_len > 5000: + large_docs.append(doc) + elif text_len > 2000: + medium_docs.append(doc) + else: + small_docs.append(doc) + + logger.info(f"{doc_type.capitalize()} documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large") + return small_docs, 
medium_docs, large_docs + + async def _process_batches(self, documents, batch_size, endpoint, batch_prefix): + """Process document batches with retry logic""" + for i in range(0, len(documents), batch_size): + batch = documents[i:i+batch_size] + batch_id = f"{batch_prefix}-{i//batch_size + 1}" + + retry_count = 0 + max_retries = 3 + success = False + + while not success and retry_count < max_retries: + try: + logger.info(f"Sending batch {batch_id} ({len(batch)} docs) to {endpoint}") + response = await self.index_client.post( + endpoint, + json=batch, + timeout=90.0 + ) + + if response.status_code == 422: + error_detail = response.json() + logger.error(f"Validation error from search service for batch {batch_id}: {self._truncate_error_detail(error_detail)}") + break + + response.raise_for_status() + success = True + logger.info(f"Successfully indexed batch {batch_id}") + + except Exception as e: + retry_count += 1 + if retry_count >= max_retries: + if len(batch) > 1: + mid = len(batch) // 2 + logger.warning(f"Splitting batch {batch_id} into smaller batches for retry") + await self._process_batches(batch[:mid], batch_size // 2, endpoint, f"{batch_prefix}-{i//batch_size}-A") + await self._process_batches(batch[mid:], batch_size // 2, endpoint, f"{batch_prefix}-{i//batch_size}-B") + else: + logger.error(f"Failed to index single document in batch {batch_id} after {max_retries} attempts: {str(e)}") + break - response = await self.index_client.post( - "/bulk-index", - json=documents, - timeout=90.0 - ) - response.raise_for_status() - return # Success, exit the retry loop - - except Exception as e: - error_str = str(e).lower() - retry_count += 1 - - if "dictionary changed size" in error_str or "transaction error" in error_str: wait_time = (2 ** retry_count) + (random.random() * 0.5) - await asyncio.sleep(wait_time) # Wait for txtai to recover - continue - - if retry_count >= max_retries and len(documents) > 1: - for i, doc in enumerate(documents): - try: - resp = await self.index_client.post("/index", json=doc, timeout=30.0) - resp.raise_for_status() - except Exception as e2: - pass - return # Exit after individual processing attempt + logger.warning(f"Retrying batch {batch_id} in {wait_time:.1f}s... 
(attempt {retry_count+1}/{max_retries})") + await asyncio.sleep(wait_time) def _truncate_error_detail(self, error_detail): """Truncate error details for logging""" @@ -632,7 +728,7 @@ async def initialize_search_index(shouts_data): return index_stats = info.get("index_stats", {}) - indexed_doc_count = index_stats.get("document_count", 0) + indexed_doc_count = index_stats.get("total_count", 0) index_status = await search_service.check_index_status() if index_status.get("status") == "inconsistent": -- 2.43.0 From 11654dba68e9cdc6689bb13df32b861c097dedae Mon Sep 17 00:00:00 2001 From: Stepan Vladovskiy Date: Wed, 23 Apr 2025 18:24:00 -0300 Subject: [PATCH 40/40] feat: with three separate endpoints --- services/search.py | 227 +++++++++++++++++++++++++++++++++------------ 1 file changed, 167 insertions(+), 60 deletions(-) diff --git a/services/search.py b/services/search.py index 75e56efe..33fe9712 100644 --- a/services/search.py +++ b/services/search.py @@ -613,70 +613,132 @@ class SearchService: return truncated_detail - async def search(self, text, limit, offset): - """Search documents""" - if not self.available: - return [] - - if not isinstance(text, str) or not text.strip(): + + #******************* + # Specialized search methods for titles, bodies, and authors + + async def search_titles(self, text, limit=10, offset=0): + """Search only in titles using the specialized endpoint""" + if not self.available or not text.strip(): return [] - logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset})") + cache_key = f"title:{text}" - # Check if we can serve from cache + # Try cache first if enabled if SEARCH_CACHE_ENABLED: - has_cache = await self.cache.has_query(text) - if has_cache: - cached_results = await self.cache.get(text, limit, offset) - if cached_results is not None: - return cached_results + if await self.cache.has_query(cache_key): + return await self.cache.get(cache_key, limit, offset) - # Not in cache or cache disabled, perform new search try: - search_limit = limit - search_offset = offset - - if SEARCH_CACHE_ENABLED: - search_limit = SEARCH_PREFETCH_SIZE - search_offset = 0 - else: - search_limit = limit - search_offset = offset - + logger.info(f"Searching titles for: '{text}' (limit={limit}, offset={offset})") response = await self.client.post( - "/search", - json={"text": text, "limit": search_limit, "offset": search_offset} + "/search-title", + json={"text": text, "limit": limit + offset} ) response.raise_for_status() result = response.json() + title_results = result.get("results", []) - formatted_results = result.get("results", []) - - valid_results = [] - for item in formatted_results: - doc_id = item.get("id") - if doc_id and doc_id.isdigit(): - valid_results.append(item) - - if len(valid_results) != len(formatted_results): - formatted_results = valid_results - + # Apply score filtering if needed if SEARCH_MIN_SCORE > 0: - initial_count = len(formatted_results) - formatted_results = [r for r in formatted_results if r.get("score", 0) >= SEARCH_MIN_SCORE] - + title_results = [r for r in title_results if r.get("score", 0) >= SEARCH_MIN_SCORE] + + # Store in cache if enabled if SEARCH_CACHE_ENABLED: - await self.cache.store(text, formatted_results) - end_idx = offset + limit - page_results = formatted_results[offset:end_idx] - return page_results - - return formatted_results + await self.cache.store(cache_key, title_results) + + # Apply offset/limit (API might not support it directly) + return title_results[offset:offset+limit] + except Exception as e: - 
logger.error(f"Search error for '{text}': {e}", exc_info=True) + logger.error(f"Error searching titles for '{text}': {e}") return [] + async def search_bodies(self, text, limit=10, offset=0): + """Search only in document bodies using the specialized endpoint""" + if not self.available or not text.strip(): + return [] + + cache_key = f"body:{text}" + + # Try cache first if enabled + if SEARCH_CACHE_ENABLED: + if await self.cache.has_query(cache_key): + return await self.cache.get(cache_key, limit, offset) + + try: + logger.info(f"Searching bodies for: '{text}' (limit={limit}, offset={offset})") + response = await self.client.post( + "/search-body", + json={"text": text, "limit": limit + offset} + ) + response.raise_for_status() + + result = response.json() + body_results = result.get("results", []) + + # Apply score filtering if needed + if SEARCH_MIN_SCORE > 0: + body_results = [r for r in body_results if r.get("score", 0) >= SEARCH_MIN_SCORE] + + # Store in cache if enabled + if SEARCH_CACHE_ENABLED: + await self.cache.store(cache_key, body_results) + + # Apply offset/limit + return body_results[offset:offset+limit] + + except Exception as e: + logger.error(f"Error searching bodies for '{text}': {e}") + return [] + + async def search_authors(self, text, limit=10, offset=0): + """Search only for authors using the specialized endpoint""" + if not self.available or not text.strip(): + return [] + + cache_key = f"author:{text}" + + # Try cache first if enabled + if SEARCH_CACHE_ENABLED: + if await self.cache.has_query(cache_key): + return await self.cache.get(cache_key, limit, offset) + + try: + logger.info(f"Searching authors for: '{text}' (limit={limit}, offset={offset})") + response = await self.client.post( + "/search-author", + json={"text": text, "limit": limit + offset} + ) + response.raise_for_status() + + result = response.json() + author_results = result.get("results", []) + + # Apply score filtering if needed + if SEARCH_MIN_SCORE > 0: + author_results = [r for r in author_results if r.get("score", 0) >= SEARCH_MIN_SCORE] + + # Store in cache if enabled + if SEARCH_CACHE_ENABLED: + await self.cache.store(cache_key, author_results) + + # Apply offset/limit + return author_results[offset:offset+limit] + + except Exception as e: + logger.error(f"Error searching authors for '{text}': {e}") + return [] + + async def search(self, text, limit, offset): + """ + Legacy search method that searches only bodies for backward compatibility. + Consider using the specialized search methods instead. 
+ """ + logger.warning("Using deprecated search() method - consider using search_bodies(), search_titles(), or search_authors()") + return await self.search_bodies(text, limit, offset) + async def check_index_status(self): """Get detailed statistics about the search index health""" if not self.available: @@ -701,19 +763,63 @@ class SearchService: search_service = SearchService() # API-compatible function to perform a search -async def search_text(text: str, limit: int = 50, offset: int = 0): - payload = [] - if search_service.available: - payload = await search_service.search(text, limit, offset) - return payload -async def get_search_count(text: str): - """Get total count of results for a query without fetching all results""" - if search_service.available and SEARCH_CACHE_ENABLED: - if await search_service.cache.has_query(text): - return await search_service.cache.get_total_count(text) - results = await search_text(text, SEARCH_PREFETCH_SIZE, 0) - return len(results) +async def search_title_text(text: str, limit: int = 10, offset: int = 0): + """Search titles API helper function""" + if search_service.available: + return await search_service.search_titles(text, limit, offset) + return [] + +async def search_body_text(text: str, limit: int = 10, offset: int = 0): + """Search bodies API helper function""" + if search_service.available: + return await search_service.search_bodies(text, limit, offset) + return [] + +async def search_author_text(text: str, limit: int = 10, offset: int = 0): + """Search authors API helper function""" + if search_service.available: + return await search_service.search_authors(text, limit, offset) + return [] + +async def get_title_search_count(text: str): + """Get count of title search results""" + if not search_service.available: + return 0 + + if SEARCH_CACHE_ENABLED: + cache_key = f"title:{text}" + if await search_service.cache.has_query(cache_key): + return await search_service.cache.get_total_count(cache_key) + + # If not found in cache, fetch from endpoint + return len(await search_title_text(text, SEARCH_PREFETCH_SIZE, 0)) + +async def get_body_search_count(text: str): + """Get count of body search results""" + if not search_service.available: + return 0 + + if SEARCH_CACHE_ENABLED: + cache_key = f"body:{text}" + if await search_service.cache.has_query(cache_key): + return await search_service.cache.get_total_count(cache_key) + + # If not found in cache, fetch from endpoint + return len(await search_body_text(text, SEARCH_PREFETCH_SIZE, 0)) + +async def get_author_search_count(text: str): + """Get count of author search results""" + if not search_service.available: + return 0 + + if SEARCH_CACHE_ENABLED: + cache_key = f"author:{text}" + if await search_service.cache.has_query(cache_key): + return await search_service.cache.get_total_count(cache_key) + + # If not found in cache, fetch from endpoint + return len(await search_author_text(text, SEARCH_PREFETCH_SIZE, 0)) async def initialize_search_index(shouts_data): """Initialize search index with existing data during application startup""" @@ -767,7 +873,8 @@ async def initialize_search_index(shouts_data): try: test_query = "test" - test_results = await search_text(test_query, 5) + # Use body search since that's most likely to return results + test_results = await search_body_text(test_query, 5) if test_results: categories = set() -- 2.43.0
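A minimal usage sketch of the helpers this patch series introduces. The wrapper below, its name, and the merge strategy are illustrative assumptions and are not part of the patch; only search_title_text, search_body_text and search_author_text (with the signature text, limit, offset) come from services/search.py:

import asyncio

from services.search import search_author_text, search_body_text, search_title_text


async def load_search_results(text: str, limit: int = 10, offset: int = 0):
    # Fan the query out to the three specialized endpoints concurrently.
    titles, bodies, authors = await asyncio.gather(
        search_title_text(text, limit, offset),
        search_body_text(text, limit, offset),
        search_author_text(text, limit, offset),
    )

    # De-duplicate shout hits by id, keeping the best score from either index.
    best = {}
    for hit in titles + bodies:
        doc_id = hit.get("id")
        if doc_id and hit.get("score", 0) >= best.get(doc_id, {}).get("score", 0):
            best[doc_id] = hit

    shouts = sorted(best.values(), key=lambda h: h.get("score", 0), reverse=True)
    return {"shouts": shouts[:limit], "authors": authors}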