hardcopy-search-service-code

This commit is contained in:
Untone 2025-06-03 02:10:08 +03:00
parent 1329aee1f1
commit 0375939e73


@@ -2,34 +2,35 @@ import asyncio
 import json
 import logging
 import os
-import random
+import secrets
 import time
-from typing import Any, Union
+from typing import Any, Optional

 import httpx

-from orm.shout import Shout
-from settings import TXTAI_SERVICE_URL
-from utils.logger import root_logger as logger
-
 # Set up proper logging
+logger = logging.getLogger("search")
 logger.setLevel(logging.INFO)  # Change to INFO to see more details
 # Disable noisy HTTP client logging
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("httpcore").setLevel(logging.WARNING)

 # Configuration for search service
 SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"])
+TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "none")
 MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "25"))

 # Search cache configuration
 SEARCH_CACHE_ENABLED = bool(os.environ.get("SEARCH_CACHE_ENABLED", "true").lower() in ["true", "1", "yes"])
 SEARCH_CACHE_TTL_SECONDS = int(os.environ.get("SEARCH_CACHE_TTL_SECONDS", "300"))  # Default: 5 minutes
 SEARCH_PREFETCH_SIZE = int(os.environ.get("SEARCH_PREFETCH_SIZE", "200"))
 SEARCH_USE_REDIS = bool(os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"])

 search_offset = 0

+# Global collection for background tasks
+background_tasks = []
+
 # Import Redis client if Redis caching is enabled
 if SEARCH_USE_REDIS:
     try:
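Note: the boolean settings above rely on `"..." in ["true", "1", "yes"]`, which already evaluates to a bool, so the extra `bool()` wrapper is redundant. A minimal sketch of a hypothetical `env_flag` helper (not part of this commit) that captures the same parsing rule:

    import os

    def env_flag(name: str, default: str = "true") -> bool:
        # The membership test already returns a bool; no bool() wrapper needed
        return os.environ.get(name, default).lower() in {"true", "1", "yes"}

    SEARCH_ENABLED = env_flag("SEARCH_ENABLED")
    SEARCH_USE_REDIS = env_flag("SEARCH_USE_REDIS")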
@@ -45,8 +46,8 @@ class SearchCache:
     """Cache for search results to enable efficient pagination"""

     def __init__(self, ttl_seconds: int = SEARCH_CACHE_TTL_SECONDS, max_items: int = 100) -> None:
-        self.cache: dict[str, list] = {}  # Maps search query to list of results
-        self.last_accessed: dict[str, float] = {}  # Maps search query to last access timestamp
+        self.cache = {}  # Maps search query to list of results
+        self.last_accessed = {}  # Maps search query to last access timestamp
         self.ttl = ttl_seconds
         self.max_items = max_items
         self._redis_prefix = "search_cache:"
@@ -58,7 +59,7 @@ class SearchCache:
         if SEARCH_USE_REDIS:
             try:
                 serialized_results = json.dumps(results)
-                await redis.serialize_and_set(
+                await redis.set(
                     f"{self._redis_prefix}{normalized_query}",
                     serialized_results,
                     ex=self.ttl,
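Note: the changed call stores the serialized result list under a key with a TTL, so stale entries expire on their own. A minimal sketch of the same idea, assuming the redis-py asyncio client; the project's own `redis` wrapper may expose a slightly different interface:

    import json
    from redis.asyncio import Redis

    async def cache_search_results(client: Redis, key: str, results: list, ttl: int = 300) -> None:
        # EX gives the key a time-to-live in seconds
        await client.set(key, json.dumps(results), ex=ttl)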
@@ -79,7 +80,7 @@ class SearchCache:
         logger.info(f"Cached {len(results)} search results for query '{query}' in memory")
         return True

-    async def get(self, query: str, limit: int = 10, offset: int = 0) -> list[dict] | None:
+    async def get(self, query: str, limit: int = 10, offset: int = 0) -> Optional[list]:
         """Get paginated results for a query"""
         normalized_query = self._normalize_query(query)
         all_results = None
@@ -169,7 +170,7 @@ class SearchCache:
             if key in self.last_accessed:
                 del self.last_accessed[key]
-        logger.info("Cleaned up %d expired search cache entries", len(expired_keys))
+        logger.info(f"Cleaned up {len(expired_keys)} expired search cache entries")

         # If still above max size, remove oldest entries
         if len(self.cache) >= self.max_items:
@ -182,12 +183,12 @@ class SearchCache:
del self.cache[key] del self.cache[key]
if key in self.last_accessed: if key in self.last_accessed:
del self.last_accessed[key] del self.last_accessed[key]
logger.info("Removed %d oldest search cache entries", remove_count) logger.info(f"Removed {remove_count} oldest search cache entries")
class SearchService: class SearchService:
def __init__(self) -> None: def __init__(self) -> None:
logger.info("Initializing search service with URL: %s", TXTAI_SERVICE_URL) logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}")
self.available = SEARCH_ENABLED self.available = SEARCH_ENABLED
# Use different timeout settings for indexing and search requests # Use different timeout settings for indexing and search requests
self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL) self.client = httpx.AsyncClient(timeout=30.0, base_url=TXTAI_SERVICE_URL)
@@ -202,69 +203,81 @@ class SearchService:
             cache_location = "Redis" if SEARCH_USE_REDIS else "Memory"
             logger.info(f"Search caching enabled using {cache_location} cache with TTL={SEARCH_CACHE_TTL_SECONDS}s")

-    async def info(self) -> dict[str, Any]:
-        """Check search service info"""
-        if not SEARCH_ENABLED:
-            return {"status": "disabled", "message": "Search is disabled"}
+    async def info(self) -> dict:
+        """Return information about search service"""
+        if not self.available:
+            return {"status": "disabled"}
         try:
-            async with httpx.AsyncClient() as client:
-                response = await client.get(f"{TXTAI_SERVICE_URL}/info")
-                response.raise_for_status()
-                result = response.json()
-                logger.info(f"Search service info: {result}")
-                return result
-        except (httpx.ConnectError, httpx.ConnectTimeout) as e:
-            # Use debug level for connection errors
-            logger.debug("Search service connection failed: %s", str(e))
-            return {"status": "error", "message": str(e)}
-        except Exception as e:
-            # Log other errors at debug level as well
-            logger.debug("Failed to get search info: %s", str(e))
-            return {"status": "error", "message": str(e)}
+            response = await self.client.get("/info")
+            response.raise_for_status()
+            result = response.json()
+            logger.info(f"Search service info: {result}")
+            return result
+        except Exception:
+            logger.exception("Failed to get search info")
+            return {"status": "error", "message": "Failed to get search info"}

     def is_ready(self) -> bool:
         """Check if service is available"""
         return self.available

-    async def verify_docs(self, doc_ids: list[int]) -> dict[str, Any]:
+    async def verify_docs(self, doc_ids: list) -> dict:
         """Verify which documents exist in the search index across all content types"""
         if not self.available:
-            return {"status": "error", "message": "Search service not available"}
+            return {"status": "disabled"}
         try:
-            # Check documents across all content types
-            results = {}
-            for content_type in ["shouts", "authors", "topics"]:
-                endpoint = f"{TXTAI_SERVICE_URL}/exists/{content_type}"
-                async with httpx.AsyncClient() as client:
-                    response = await client.post(endpoint, json={"ids": doc_ids})
-                    response.raise_for_status()
-                    results[content_type] = response.json()
+            logger.info(f"Verifying {len(doc_ids)} documents in search index")
+            response = await self.client.post(
+                "/verify-docs",
+                json={"doc_ids": doc_ids},
+                timeout=60.0,  # Longer timeout for potentially large ID lists
+            )
+            response.raise_for_status()
+            result = response.json()
+
+            # Process the more detailed response format
+            bodies_missing = set(result.get("bodies", {}).get("missing", []))
+            titles_missing = set(result.get("titles", {}).get("missing", []))
+
+            # Combine missing IDs from both bodies and titles
+            # A document is considered missing if it's missing from either index
+            all_missing = list(bodies_missing.union(titles_missing))
+
+            # Log summary of verification results
+            bodies_missing_count = len(bodies_missing)
+            titles_missing_count = len(titles_missing)
+            total_missing_count = len(all_missing)
+            logger.info(
+                f"Document verification complete: {bodies_missing_count} bodies missing, {titles_missing_count} titles missing"
+            )
+            logger.info(f"Total unique missing documents: {total_missing_count} out of {len(doc_ids)} total")
+
+            # Return in a backwards-compatible format plus the detailed breakdown
             return {
-                "status": "success",
-                "verified": results,
-                "total_docs": len(doc_ids),
+                "missing": all_missing,
+                "details": {
+                    "bodies_missing": list(bodies_missing),
+                    "titles_missing": list(titles_missing),
+                    "bodies_missing_count": bodies_missing_count,
+                    "titles_missing_count": titles_missing_count,
+                },
             }
-        except Exception as e:
+        except Exception:
             logger.exception("Document verification error")
-            return {"status": "error", "message": str(e)}
+            return {"status": "error", "message": "Document verification error"}

-    def index(self, shout: Shout) -> None:
+    def index(self, shout: Any) -> None:
         """Index a single document"""
         if not self.available:
             return
         logger.info(f"Indexing post {shout.id}")
-        # Start in background to not block
-        task = asyncio.create_task(self.perform_index(shout))
-        # Store task reference to prevent garbage collection
-        self._background_tasks: set[asyncio.Task[None]] = getattr(self, "_background_tasks", set())
-        self._background_tasks.add(task)
-        task.add_done_callback(self._background_tasks.discard)
+        # Start in background to not block - store reference in a background collection
+        # to prevent garbage collection while keeping the method non-blocking
+        background_tasks.append(asyncio.create_task(self.perform_index(shout)))

-    async def perform_index(self, shout: Shout) -> None:
+    async def perform_index(self, shout: Any) -> None:
         """Index a single document across multiple endpoints"""
         if not self.available:
             return
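Note: both sides of this hunk keep a reference to the task returned by `asyncio.create_task`, because the event loop holds only a weak reference to running tasks and an otherwise unreferenced task can be garbage-collected before it completes. A minimal sketch of the set-based variant being removed here (names are illustrative):

    import asyncio

    _background_tasks: set[asyncio.Task] = set()

    def fire_and_forget(coro) -> asyncio.Task:
        task = asyncio.create_task(coro)
        _background_tasks.add(task)
        # Drop the reference once the task finishes so the set does not grow unboundedly
        task.add_done_callback(_background_tasks.discard)
        return task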
@@ -298,10 +311,8 @@ class SearchService:
                         body_text_parts.append(media_json["body"])
                     except json.JSONDecodeError:
                         body_text_parts.append(media)
-                elif isinstance(media, dict):
-                    if "title" in media:
-                        body_text_parts.append(media["title"])
-                    if "body" in media:
-                        body_text_parts.append(media["body"])
+                elif isinstance(media, dict) and (media.get("title") or media.get("body")):
+                    body_text_parts.append(media["title"])
+                    body_text_parts.append(media["body"])

             if body_text_parts:
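Note: on the added side, the collapsed condition `media.get("title") or media.get("body")` is followed by unconditional `media["title"]` / `media["body"]` lookups, which raise KeyError when only one of the two keys is present. A hedged sketch of a safer extraction helper (hypothetical, not part of the commit):

    def extract_media_text(media: dict) -> list[str]:
        # Append only the fields that are actually present and non-empty
        return [media[key] for key in ("title", "body") if media.get(key)]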
@@ -346,36 +357,32 @@ class SearchService:
                 # Check for errors in responses
                 for i, response in enumerate(responses):
                     if isinstance(response, Exception):
-                        logger.error("Error in indexing task %d: %s", i, response)
+                        logger.error(f"Error in indexing task {i}: {response}")
                     elif hasattr(response, "status_code") and response.status_code >= 400:
-                        error_text = ""
-                        if hasattr(response, "text") and callable(response.text):
-                            try:
-                                error_text = await response.text()
-                            except (Exception, httpx.HTTPError):
-                                error_text = str(response)
-                        logger.error("Error response in indexing task %d: %d, %s", i, response.status_code, error_text)
+                        logger.error(
+                            f"Error response in indexing task {i}: {response.status_code}, {await response.text()}"
+                        )

-                logger.info("Document %s indexed across %d endpoints", shout.id, len(indexing_tasks))
+                logger.info(f"Document {shout.id} indexed across {len(indexing_tasks)} endpoints")
             else:
-                logger.warning("No content to index for shout %s", shout.id)
+                logger.warning(f"No content to index for shout {shout.id}")
         except Exception:
-            logger.exception("Indexing error for shout %s", shout.id)
+            logger.exception(f"Indexing error for shout {shout.id}")

-    async def bulk_index(self, shouts: list[Shout]) -> None:
+    async def bulk_index(self, shouts: list) -> None:
         """Index multiple documents across three separate endpoints"""
         if not self.available or not shouts:
             logger.warning(
-                "Bulk indexing skipped: available=%s, shouts_count=%d", self.available, len(shouts) if shouts else 0
+                f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}"
             )
             return

         start_time = time.time()
-        logger.info("Starting multi-endpoint bulk indexing of %d documents", len(shouts))
+        logger.info(f"Starting multi-endpoint bulk indexing of {len(shouts)} documents")

         # Prepare documents for different endpoints
-        title_docs: list[dict[str, Any]] = []
+        title_docs = []
         body_docs = []
         author_docs = {}  # Use dict to prevent duplicate authors
@@ -407,10 +414,8 @@ class SearchService:
                             body_text_parts.append(media_json["body"])
                         except json.JSONDecodeError:
                             body_text_parts.append(media)
-                    elif isinstance(media, dict):
-                        if "title" in media:
-                            body_text_parts.append(media["title"])
-                        if "body" in media:
-                            body_text_parts.append(media["body"])
+                    elif isinstance(media, dict) and (media.get("title") or media.get("body")):
+                        body_text_parts.append(media["title"])
+                        body_text_parts.append(media["body"])

                 # Only add body document if we have body text
@@ -457,7 +462,7 @@ class SearchService:
                     }
             except Exception:
-                logger.exception("Error processing shout %s for indexing", getattr(shout, "id", "unknown"))
+                logger.exception(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing")
                 total_skipped += 1

         # Convert author dict to list
@@ -477,21 +482,18 @@ class SearchService:
         elapsed = time.time() - start_time
         logger.info(
-            "Multi-endpoint indexing completed in %.2fs: %d titles, %d bodies, %d authors, %d shouts skipped",
-            elapsed,
-            len(title_docs),
-            len(body_docs),
-            len(author_docs_list),
-            total_skipped,
+            f"Multi-endpoint indexing completed in {elapsed:.2f}s: "
+            f"{len(title_docs)} titles, {len(body_docs)} bodies, {len(author_docs_list)} authors, "
+            f"{total_skipped} shouts skipped"
         )

-    async def _index_endpoint(self, documents: list[dict], endpoint: str, doc_type: str) -> None:
+    async def _index_endpoint(self, documents: list, endpoint: str, doc_type: str) -> None:
         """Process and index documents to a specific endpoint"""
         if not documents:
-            logger.info("No %s documents to index", doc_type)
+            logger.info(f"No {doc_type} documents to index")
             return

-        logger.info("Indexing %d %s documents", len(documents), doc_type)
+        logger.info(f"Indexing {len(documents)} {doc_type} documents")

         # Categorize documents by size
         small_docs, medium_docs, large_docs = self._categorize_by_size(documents, doc_type)
@@ -512,7 +514,7 @@ class SearchService:
             batch_size = batch_sizes[category]
             await self._process_batches(docs, batch_size, endpoint, f"{doc_type}-{category}")

-    def _categorize_by_size(self, documents: list[dict], doc_type: str) -> tuple[list[dict], list[dict], list[dict]]:
+    def _categorize_by_size(self, documents: list, doc_type: str) -> tuple[list, list, list]:
         """Categorize documents by size for optimized batch processing"""
         small_docs = []
         medium_docs = []
@@ -538,15 +540,11 @@ class SearchService:
                 small_docs.append(doc)

         logger.info(
-            "%s documents categorized: %d small, %d medium, %d large",
-            doc_type.capitalize(),
-            len(small_docs),
-            len(medium_docs),
-            len(large_docs),
+            f"{doc_type.capitalize()} documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large"
         )
         return small_docs, medium_docs, large_docs

-    async def _process_batches(self, documents: list[dict], batch_size: int, endpoint: str, batch_prefix: str) -> None:
+    async def _process_batches(self, documents: list, batch_size: int, endpoint: str, batch_prefix: str) -> None:
         """Process document batches with retry logic"""
         for i in range(0, len(documents), batch_size):
             batch = documents[i : i + batch_size]
@@ -563,9 +561,7 @@ class SearchService:
                     if response.status_code == 422:
                         error_detail = response.json()
                         logger.error(
-                            "Validation error from search service for batch %s: %s",
-                            batch_id,
-                            self._truncate_error_detail(error_detail),
+                            f"Validation error from search service for batch {batch_id}: {self._truncate_error_detail(error_detail)}"
                         )
                         break
@@ -591,14 +587,14 @@ class SearchService:
                         )
                     else:
                         logger.exception(
-                            "Failed to index single document in batch %s after %d attempts", batch_id, max_retries
+                            f"Failed to index single document in batch {batch_id} after {max_retries} attempts"
                         )
                         break

-                wait_time = (2**retry_count) + (random.SystemRandom().random() * 0.5)
+                wait_time = (2**retry_count) + (secrets.random() * 0.5)
                 await asyncio.sleep(wait_time)

-    def _truncate_error_detail(self, error_detail: Union[dict, str, int]) -> Union[dict, str, int]:
+    def _truncate_error_detail(self, error_detail: Any) -> Any:
         """Truncate error details for logging"""
         truncated_detail = error_detail.copy() if isinstance(error_detail, dict) else error_detail
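Note: the added line calls `secrets.random()`, but the standard-library `secrets` module has no module-level `random()` function, so this backoff path would raise AttributeError as soon as a retry happens. A minimal sketch of jittered exponential backoff that keeps the `secrets` import (the module re-exports `SystemRandom`):

    import asyncio
    import secrets

    _rng = secrets.SystemRandom()

    async def backoff(retry_count: int) -> None:
        # Exponential backoff plus up to 0.5 s of jitter, as the diff intends
        await asyncio.sleep((2**retry_count) + _rng.random() * 0.5)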
@@ -632,174 +628,115 @@ class SearchService:
         return truncated_detail

-    async def search(self, text: str, limit: int, offset: int) -> list[dict]:
+    async def search(self, text: str, limit: int, offset: int) -> list:
         """Search documents"""
         if not self.available:
-            logger.warning("Search service not available")
             return []

-        if not text or not text.strip():
-            logger.warning("Empty search query provided")
-            return []
+        # Check if we can serve from cache
+        if SEARCH_CACHE_ENABLED:
+            has_cache = await self.cache.has_query(text)
+            if has_cache:
+                cached_results = await self.cache.get(text, limit, offset)
+                if cached_results is not None:
+                    return cached_results

-        # Set the overall search fetch size
-        search_limit = SEARCH_PREFETCH_SIZE if SEARCH_CACHE_ENABLED else limit
-        logger.info("Searching for: '%s' (limit=%d, offset=%d, search_limit=%d)", text, limit, offset, search_limit)
-        try:
+        # Not in cache or cache disabled, perform new search
+        try:
+            # Decide whether to prefetch and cache or just get what we need
+            search_limit = SEARCH_PREFETCH_SIZE if SEARCH_CACHE_ENABLED else limit
+            logger.info(f"Searching for: '{text}' (limit={limit}, offset={offset}, search_limit={search_limit})")
             response = await self.client.post(
                 "/search-combined",
                 json={"text": text, "limit": search_limit},
             )
-            logger.debug(f"Search service response status: {response.status_code}")
-            if response.status_code != 200:
-                logger.error(f"Search service returned status {response.status_code}: {response.text}")
-                return []
-            results = response.json()
-            logger.debug(f"Raw search results: {len(results) if results else 0} items")
-            if not results or not isinstance(results, list):
-                logger.warning(f"No search results or invalid format for query '{text}'")
-                return []
-            # Process each result
-            formatted_results = []
-            for i, item in enumerate(results):
-                if isinstance(item, dict):
-                    formatted_result = self._format_search_result(item)
-                    formatted_results.append(formatted_result)
-                    logger.debug(
-                        f"Formatted result {i}: id={formatted_result.get('id')}, title={formatted_result.get('title', '')[:50]}..."
-                    )
-                else:
-                    logger.warning(f"Invalid search result item {i}: {type(item)}")
-            logger.info(f"Successfully formatted {len(formatted_results)} search results for '{text}'")
-            # Save the results in the cache
-            if SEARCH_CACHE_ENABLED and self.cache:
-                await self.cache.store(text, formatted_results)
-                logger.debug(f"Stored {len(formatted_results)} results in cache for '{text}'")
-            # If caching is enabled and there are extra results, page them from the cache
-            if SEARCH_CACHE_ENABLED and self.cache and await self.cache.has_query(text):
-                cached_result = await self.cache.get(text, limit, offset)
-                logger.debug(f"Retrieved {len(cached_result) if cached_result else 0} results from cache for '{text}'")
-                return cached_result or []
+            response.raise_for_status()
+            result = response.json()
+            formatted_results = result.get("results", [])
+
+            # filter out nonnumeric IDs
+            valid_results = [r for r in formatted_results if r.get("id", "").isdigit()]
+            if len(valid_results) != len(formatted_results):
+                formatted_results = valid_results
+
+            if SEARCH_CACHE_ENABLED:
+                # Store the full prefetch batch, then page it
+                await self.cache.store(text, formatted_results)
+                return await self.cache.get(text, limit, offset)
             return formatted_results
-        except Exception as e:
-            logger.error(f"Search error for '{text}': {e}", exc_info=True)
+        except Exception:
+            logger.exception(f"Search error for '{text}'")
             return []

-    async def search_authors(self, text: str, limit: int = 10, offset: int = 0) -> list[dict]:
+    async def search_authors(self, text: str, limit: int = 10, offset: int = 0) -> list:
         """Search only for authors using the specialized endpoint"""
         if not self.available or not text.strip():
             return []

-        # Cache for authors
         cache_key = f"author:{text}"
-        if SEARCH_CACHE_ENABLED and self.cache and await self.cache.has_query(cache_key):
-            cached_results = await self.cache.get(cache_key, limit, offset)
-            if cached_results:
-                return cached_results
+
+        # Check if we can serve from cache
+        if SEARCH_CACHE_ENABLED:
+            has_cache = await self.cache.has_query(cache_key)
+            if has_cache:
+                cached_results = await self.cache.get(cache_key, limit, offset)
+                if cached_results is not None:
+                    return cached_results

+        # Not in cache or cache disabled, perform new search
         try:
-            # Set the overall search fetch size
             search_limit = SEARCH_PREFETCH_SIZE if SEARCH_CACHE_ENABLED else limit
             logger.info(
-                "Searching authors for: '%s' (limit=%d, offset=%d, search_limit=%d)", text, limit, offset, search_limit
+                f"Searching authors for: '{text}' (limit={limit}, offset={offset}, search_limit={search_limit})"
             )
             response = await self.client.post("/search-author", json={"text": text, "limit": search_limit})
-            results = await response.json()
-            if not results or not isinstance(results, list):
-                return []
-            # Format author search results
-            author_results = []
-            for item in results:
-                if isinstance(item, dict):
-                    formatted_author = self._format_author_result(item)
-                    author_results.append(formatted_author)
-            # Save the results in the cache
-            if SEARCH_CACHE_ENABLED and self.cache:
-                await self.cache.store(cache_key, author_results)
-            # Return the requested slice of results
+            response.raise_for_status()
+            result = response.json()
+            author_results = result.get("results", [])
+
+            # Filter out any invalid results if necessary
+            valid_results = [r for r in author_results if r.get("id", "").isdigit()]
+            if len(valid_results) != len(author_results):
+                author_results = valid_results
+
+            if SEARCH_CACHE_ENABLED:
+                # Store the full prefetch batch, then page it
+                await self.cache.store(cache_key, author_results)
+                return await self.cache.get(cache_key, limit, offset)
             return author_results[offset : offset + limit]
         except Exception:
-            logger.exception("Error searching authors for '%s'", text)
+            logger.exception(f"Error searching authors for '{text}'")
             return []

     async def check_index_status(self) -> dict:
         """Get detailed statistics about the search index health"""
         if not self.available:
-            return {"status": "unavailable", "message": "Search service not available"}
+            return {"status": "disabled"}
         try:
-            response = await self.client.post("/check-index")
-            result = await response.json()
-            if isinstance(result, dict):
-                # Check for NULL embeddings
-                null_count = result.get("consistency", {}).get("null_embeddings_count", 0)
-                if null_count > 0:
-                    logger.warning("Found %d documents with NULL embeddings", null_count)
-        except Exception as e:
-            logger.exception("Failed to check index status")
-            return {"status": "error", "message": str(e)}
-        else:
-            return result
+            response = await self.client.get("/index-status")
+            response.raise_for_status()
+            result = response.json()
+
+            if result.get("consistency", {}).get("status") != "ok":
+                null_count = result.get("consistency", {}).get("null_embeddings_count", 0)
+                if null_count > 0:
+                    logger.warning(f"Found {null_count} documents with NULL embeddings")
+
+            return result
+        except Exception:
+            logger.exception("Failed to check index status")
+            return {"status": "error", "message": "Failed to check index status"}

-    def _format_search_result(self, item: dict) -> dict:
-        """Format search result item"""
-        formatted_result = {}
-        # Required fields
-        if "id" in item:
-            formatted_result["id"] = item["id"]
-        if "title" in item:
-            formatted_result["title"] = item["title"]
-        if "body" in item:
-            formatted_result["body"] = item["body"]
-        # Optional fields
-        for field in ["subtitle", "lead", "author_id", "author_name", "created_at", "stat"]:
-            if field in item:
-                formatted_result[field] = item[field]
-        return formatted_result
-
-    def _format_author_result(self, item: dict) -> dict:
-        """Format author search result item"""
-        formatted_result = {}
-        # Required author fields
-        if "id" in item:
-            formatted_result["id"] = item["id"]
-        if "name" in item:
-            formatted_result["name"] = item["name"]
-        if "username" in item:
-            formatted_result["username"] = item["username"]
-        # Optional author fields
-        for field in ["slug", "bio", "pic", "created_at", "stat"]:
-            if field in item:
-                formatted_result[field] = item[field]
-        return formatted_result
-
-    def close(self) -> None:
-        """Close the search service"""


 # Create the search service singleton
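Note: the rewritten `search` prefetches up to SEARCH_PREFETCH_SIZE hits, stores the whole batch in the cache, and then serves the requested page from that cache, so later pages of the same query avoid another round trip to txtai. A hypothetical usage sketch (module path and query text are illustrative):

    from services.search import search_service  # assumed module path

    async def demo_paged_search() -> None:
        # First call hits /search-combined and caches the prefetch batch;
        # the request for the next page is then answered from the cache.
        page_one = await search_service.search("digital archives", limit=10, offset=0)
        page_two = await search_service.search("digital archives", limit=10, offset=10)
        print(len(page_one), len(page_two))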
@@ -808,14 +745,14 @@ search_service = SearchService()

 # API-compatible function to perform a search
-async def search_text(text: str, limit: int = 200, offset: int = 0) -> list[dict]:
+async def search_text(text: str, limit: int = 200, offset: int = 0) -> list:
     payload = []
     if search_service.available:
         payload = await search_service.search(text, limit, offset)
     return payload


-async def search_author_text(text: str, limit: int = 10, offset: int = 0) -> list[dict]:
+async def search_author_text(text: str, limit: int = 10, offset: int = 0) -> list:
     """Search authors API helper function"""
     if search_service.available:
         return await search_service.search_authors(text, limit, offset)
@@ -827,11 +764,11 @@ async def get_search_count(text: str) -> int:
     if not search_service.available:
         return 0

-    if SEARCH_CACHE_ENABLED and search_service.cache is not None and await search_service.cache.has_query(text):
+    if SEARCH_CACHE_ENABLED and await search_service.cache.has_query(text):
         return await search_service.cache.get_total_count(text)

-    # Return approximate count for active search
-    return 42  # Placeholder implementation
+    # If not found in cache, fetch from endpoint
+    return len(await search_text(text, SEARCH_PREFETCH_SIZE, 0))


 async def get_author_search_count(text: str) -> int:
@@ -841,31 +778,48 @@ async def get_author_search_count(text: str) -> int:
     if SEARCH_CACHE_ENABLED:
         cache_key = f"author:{text}"
-        if search_service.cache is not None and await search_service.cache.has_query(cache_key):
+        if await search_service.cache.has_query(cache_key):
             return await search_service.cache.get_total_count(cache_key)

-    return 0  # Placeholder implementation
+    # If not found in cache, fetch from endpoint
+    return len(await search_author_text(text, SEARCH_PREFETCH_SIZE, 0))


 async def initialize_search_index(shouts_data: list) -> None:
     """Initialize search index with existing data during application startup"""
     if not SEARCH_ENABLED:
-        logger.info("Search is disabled, skipping index initialization")
         return

-    if not search_service.available:
-        logger.warning("Search service not available, skipping index initialization")
+    if not shouts_data:
         return

+    info = await search_service.info()
+    if info.get("status") in ["error", "unavailable", "disabled"]:
+        return
+
+    index_stats = info.get("index_stats", {})
+    indexed_doc_count = index_stats.get("total_count", 0)
+
+    index_status = await search_service.check_index_status()
+    if index_status.get("status") == "inconsistent":
+        problem_ids = index_status.get("consistency", {}).get("null_embeddings_sample", [])
+        if problem_ids:
+            problem_docs = [shout for shout in shouts_data if str(shout.id) in problem_ids]
+            if problem_docs:
+                await search_service.bulk_index(problem_docs)
+
     # Only consider shouts with body content for body verification
-    def has_body_content(shout: dict) -> bool:
+    def has_body_content(shout: Any) -> bool:
         for field in ["subtitle", "lead", "body"]:
-            if hasattr(shout, field) and getattr(shout, field) and getattr(shout, field).strip():
+            if (
+                getattr(shout, field, None)
+                and isinstance(getattr(shout, field, None), str)
+                and getattr(shout, field).strip()
+            ):
                 return True

-        # Check media JSON for content
-        if hasattr(shout, "media") and shout.media:
-            media = shout.media
+        media = getattr(shout, "media", None)
+        if media:
             if isinstance(media, str):
                 try:
                     media_json = json.loads(media)
@@ -877,26 +831,36 @@ async def initialize_search_index(shouts_data: list) -> None:
                     return True
         return False

-    total_count = len(shouts_data)
-    processed_count = 0
-
-    # Collect categories while we're at it for informational purposes
-    categories: set = set()
-
-    try:
-        for shout in shouts_data:
-            # Skip items that lack meaningful text content
-            if not has_body_content(shout):
-                continue
-
-            # Track categories
-            matching_shouts = [s for s in shouts_data if getattr(s, "id", None) == getattr(shout, "id", None)]
-            if matching_shouts and hasattr(matching_shouts[0], "category"):
-                categories.add(getattr(matching_shouts[0], "category", "unknown"))
-    except (AttributeError, TypeError):
-        pass
-
-    logger.info("Search index initialization completed: %d/%d items", processed_count, total_count)
+    shouts_with_body = [shout for shout in shouts_data if has_body_content(shout)]
+    body_ids = [str(shout.id) for shout in shouts_with_body]
+
+    if abs(indexed_doc_count - len(shouts_data)) > 10:
+        doc_ids = [str(shout.id) for shout in shouts_data]
+        verification = await search_service.verify_docs(doc_ids)
+        if verification.get("status") == "error":
+            return
+        # Only reindex missing docs that actually have body content
+        missing_ids = [mid for mid in verification.get("missing", []) if mid in body_ids]
+        if missing_ids:
+            missing_docs = [shout for shout in shouts_with_body if str(shout.id) in missing_ids]
+            await search_service.bulk_index(missing_docs)
+    else:
+        pass
+
+    try:
+        test_query = "test"
+        # Use body search since that's most likely to return results
+        test_results = await search_text(test_query, 5)
+
+        if test_results:
+            categories = set()
+            for result in test_results:
+                result_id = result.get("id")
+                matching_shouts = [s for s in shouts_data if str(s.id) == result_id]
+                if matching_shouts and hasattr(matching_shouts[0], "category"):
+                    categories.add(getattr(matching_shouts[0], "category", "unknown"))
+    except Exception as ex:
+        logger.warning(f"Test search failed during initialization: {ex}")


 async def check_search_service() -> None: