import asyncio
import concurrent.futures
import json
import logging
import os

from txtai.embeddings import Embeddings

from services.redis import redis
from utils.encoders import CustomJSONEncoder

# Set the search logger level to suppress DEBUG messages
logger = logging.getLogger("search")
logger.setLevel(logging.WARNING)

REDIS_TTL = 86400  # 1 day in seconds

# Configuration for txtai search
SEARCH_ENABLED = os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]

# Thread executor for non-blocking initialization
thread_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)


class SearchService:
    def __init__(self, index_name="search_index"):
        logger.info("Initializing search...")
        self.index_name = index_name
        self.embeddings = None
        self._initialization_future = None
        self.available = SEARCH_ENABLED

        if not self.available:
            logger.info("Search disabled (SEARCH_ENABLED = False)")
            return

        # Initialize embeddings in a background thread
        self._initialization_future = thread_executor.submit(self._init_embeddings)

    def _init_embeddings(self):
        """Initialize txtai embeddings in a background thread"""
        try:
            # Use the same model as in TopicClassifier
            model_path = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

            # Configure embeddings with content storage and quantization for lower memory usage
            self.embeddings = Embeddings({
                "path": model_path,
                "content": True,
                "quantize": True,
            })
            logger.info("txtai embeddings initialized successfully")
            return True
        except Exception as e:
            logger.error(f"Failed to initialize txtai embeddings: {e}")
            self.available = False
            return False

    async def info(self):
        """Return information about the search service"""
        if not self.available:
            return {"status": "disabled"}
        try:
            if not self.is_ready():
                return {"status": "initializing", "model": "paraphrase-multilingual-mpnet-base-v2"}
            return {
                "status": "active",
                # txtai Embeddings exposes count() for the number of indexed items
                "count": self.embeddings.count() if self.embeddings else 0,
                "model": "paraphrase-multilingual-mpnet-base-v2",
            }
        except Exception as e:
            logger.error(f"Failed to get search info: {e}")
            return {"status": "error", "message": str(e)}

    def is_ready(self):
        """Check if embeddings are fully initialized and ready"""
        return self.embeddings is not None and self.available

    def index(self, shout):
        """Index a single document"""
        if not self.available:
            return
        logger.info(f"Indexing shout {shout.id}")
        # Schedule in the background so the caller is not blocked
        asyncio.create_task(self.perform_index(shout))

    async def perform_index(self, shout):
        """Actually perform the indexing operation"""
        if not self.is_ready():
            # If embeddings are not ready, wait for initialization
            if self._initialization_future and not self._initialization_future.done():
                try:
                    # Wait for initialization to complete, with a timeout
                    await asyncio.get_running_loop().run_in_executor(
                        None, lambda: self._initialization_future.result(timeout=30))
                except Exception as e:
                    logger.error(f"Embeddings initialization failed: {e}")
                    return

            if not self.is_ready():
                logger.error(f"Cannot index shout {shout.id}: embeddings not ready")
                return

        try:
            # Combine all text fields
            text = " ".join(filter(None, [
                shout.title or "",
                shout.subtitle or "",
                shout.lead or "",
                shout.body or "",
                shout.media or "",
            ]))

            # Use upsert for individual documents
            await asyncio.get_running_loop().run_in_executor(
                None, lambda: self.embeddings.upsert([(str(shout.id), text, None)])
            )
            logger.info(f"Shout {shout.id} indexed successfully")
        except Exception as e:
            logger.error(f"Indexing error for shout {shout.id}: {e}")

    async def bulk_index(self, shouts):
        """Index multiple documents at once"""
        if not self.available or not shouts:
            return

        if not self.is_ready():
            # Wait for initialization if needed
            if self._initialization_future and not self._initialization_future.done():
                try:
                    await asyncio.get_running_loop().run_in_executor(
                        None, lambda: self._initialization_future.result(timeout=30))
                except Exception as e:
                    logger.error(f"Embeddings initialization failed: {e}")
                    return

            if not self.is_ready():
                logger.error("Cannot perform bulk indexing: embeddings not ready")
                return

        documents = []
        for shout in shouts:
            text = " ".join(filter(None, [
                shout.title or "",
                shout.subtitle or "",
                shout.lead or "",
                shout.body or "",
                shout.media or "",
            ]))
            documents.append((str(shout.id), text, None))

        try:
            await asyncio.get_running_loop().run_in_executor(
                None, lambda: self.embeddings.upsert(documents))
            logger.info(f"Bulk indexed {len(documents)} documents")
        except Exception as e:
            logger.error(f"Bulk indexing error: {e}")

    async def search(self, text, limit, offset):
        """Search documents"""
        if not self.available:
            return []

        # Check the Redis cache first
        redis_key = f"search:{text}:{offset}+{limit}"
        cached = await redis.get(redis_key)
        if cached:
            return json.loads(cached)

        logger.info(f"Searching: {text} {offset}+{limit}")

        if not self.is_ready():
            # Wait for initialization if needed
            if self._initialization_future and not self._initialization_future.done():
                try:
                    await asyncio.get_running_loop().run_in_executor(
                        None, lambda: self._initialization_future.result(timeout=30))
                except Exception as e:
                    logger.error(f"Embeddings initialization failed: {e}")
                    return []

            if not self.is_ready():
                logger.error("Cannot search: embeddings not ready")
                return []

        try:
            # Search with txtai (request offset + limit results so the offset can be applied)
            total = offset + limit
            results = await asyncio.get_running_loop().run_in_executor(
                None, lambda: self.embeddings.search(text, total))

            # Apply the offset and convert to the expected format.
            # With content storage enabled ("content": True), txtai returns
            # a list of dicts with "id", "text" and "score" keys.
            results = results[offset:offset + limit]
            formatted_results = [{"id": r["id"], "score": float(r["score"])} for r in results]

            # Cache the results
            if formatted_results:
                await redis.execute(
                    "SETEX",
                    redis_key,
                    REDIS_TTL,
                    json.dumps(formatted_results, cls=CustomJSONEncoder),
                )
            return formatted_results
        except Exception as e:
            logger.error(f"Search error: {e}")
            return []


# Create the search service singleton
search_service = SearchService()


# Keep the API exactly the same to maintain compatibility
async def search_text(text: str, limit: int = 50, offset: int = 0):
    payload = []
    if search_service.available:
        payload = await search_service.search(text, limit, offset)
    return payload


# Function to initialize search with existing data
async def initialize_search_index(shouts_data):
    """Initialize the search index with existing data during application startup"""
    if SEARCH_ENABLED:
        logger.info("Initializing search index with existing data...")
        await search_service.bulk_index(shouts_data)
        logger.info(f"Search index initialized with {len(shouts_data)} documents")
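

# --- Usage sketch ---
# A minimal smoke test of the intended startup flow: bulk-index some existing
# documents, then query through the compatibility wrapper. The FakeShout
# stand-in and _demo below are hypothetical, for illustration only; the real
# Shout model (with title/subtitle/lead/body/media fields) lives elsewhere in
# the repo. Running this requires a reachable Redis and the txtai model
# download, so treat it as a sketch rather than a test fixture.
if __name__ == "__main__":
    from dataclasses import dataclass

    @dataclass
    class FakeShout:
        # Mirrors only the fields this module reads when building index text
        id: int
        title: str = ""
        subtitle: str = ""
        lead: str = ""
        body: str = ""
        media: str = ""

    async def _demo():
        shouts = [
            FakeShout(id=1, title="Hello", body="first test document"),
            FakeShout(id=2, title="Semantics", body="semantic search with txtai"),
        ]
        # Same entry point the application would call at startup
        await initialize_search_index(shouts)
        # Query through the stable public API
        print(await search_text("semantic search", limit=10, offset=0))

    asyncio.run(_demo())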