style(search.py): with indexing message

Stepan Vladovskiy 2025-04-24 18:45:00 -03:00
parent 3062a2b7de
commit e7facf8d87


@@ -15,16 +15,24 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("httpcore").setLevel(logging.WARNING)
 
 # Configuration for search service
-SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"])
+SEARCH_ENABLED = bool(
+    os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]
+)
 TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "none")
 MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "25"))
 
 # Search cache configuration
-SEARCH_CACHE_ENABLED = bool(os.environ.get("SEARCH_CACHE_ENABLED", "true").lower() in ["true", "1", "yes"])
-SEARCH_CACHE_TTL_SECONDS = int(os.environ.get("SEARCH_CACHE_TTL_SECONDS", "900"))  # Default: 15 minutes
+SEARCH_CACHE_ENABLED = bool(
+    os.environ.get("SEARCH_CACHE_ENABLED", "true").lower() in ["true", "1", "yes"]
+)
+SEARCH_CACHE_TTL_SECONDS = int(
+    os.environ.get("SEARCH_CACHE_TTL_SECONDS", "900")
+)  # Default: 15 minutes
 SEARCH_MIN_SCORE = float(os.environ.get("SEARCH_MIN_SCORE", "0.1"))
 SEARCH_PREFETCH_SIZE = int(os.environ.get("SEARCH_PREFETCH_SIZE", "200"))
-SEARCH_USE_REDIS = bool(os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"])
+SEARCH_USE_REDIS = bool(
+    os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"]
+)
 
 search_offset = 0
@@ -32,11 +40,13 @@ search_offset = 0
 if SEARCH_USE_REDIS:
     try:
         from services.redis import redis
         logger.info("Redis client imported for search caching")
     except ImportError:
         logger.warning("Redis client import failed, falling back to memory cache")
         SEARCH_USE_REDIS = False
 
+
 class SearchCache:
     """Cache for search results to enable efficient pagination"""
@@ -57,9 +67,11 @@ class SearchCache:
                 await redis.set(
                     f"{self._redis_prefix}{normalized_query}",
                     serialized_results,
-                    ex=self.ttl
+                    ex=self.ttl,
                 )
-                logger.info(f"Stored {len(results)} search results for query '{query}' in Redis")
+                logger.info(
+                    f"Stored {len(results)} search results for query '{query}' in Redis"
+                )
                 return True
             except Exception as e:
                 logger.error(f"Error storing search results in Redis: {e}")
@@ -72,7 +84,9 @@ class SearchCache:
         # Store results and update timestamp
         self.cache[normalized_query] = results
         self.last_accessed[normalized_query] = time.time()
-        logger.info(f"Cached {len(results)} search results for query '{query}' in memory")
+        logger.info(
+            f"Cached {len(results)} search results for query '{query}' in memory"
+        )
         return True
 
     async def get(self, query, limit=10, offset=0):
@@ -104,10 +118,14 @@ class SearchCache:
         # Return paginated subset
         end_idx = min(offset + limit, len(all_results))
         if offset >= len(all_results):
-            logger.warning(f"Requested offset {offset} exceeds result count {len(all_results)}")
+            logger.warning(
+                f"Requested offset {offset} exceeds result count {len(all_results)}"
+            )
             return []
 
-        logger.info(f"Cache hit for '{query}': serving {offset}:{end_idx} of {len(all_results)} results")
+        logger.info(
+            f"Cache hit for '{query}': serving {offset}:{end_idx} of {len(all_results)} results"
+        )
         return all_results[offset:end_idx]
 
     async def has_query(self, query):
@@ -158,7 +176,8 @@ class SearchCache:
         now = time.time()
 
         # First remove expired entries
         expired_keys = [
-            key for key, last_access in self.last_accessed.items()
+            key
+            for key, last_access in self.last_accessed.items()
             if now - last_access > self.ttl
         ]
@@ -183,6 +202,7 @@ class SearchCache:
             del self.last_accessed[key]
         logger.info(f"Removed {remove_count} oldest search cache entries")
 
+
 class SearchService:
     def __init__(self):
         logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}")
@@ -198,8 +218,12 @@ class SearchService:
         if SEARCH_CACHE_ENABLED:
             cache_location = "Redis" if SEARCH_USE_REDIS else "Memory"
-            logger.info(f"Search caching enabled using {cache_location} cache with TTL={SEARCH_CACHE_TTL_SECONDS}s")
-            logger.info(f"Minimum score filter: {SEARCH_MIN_SCORE}, prefetch size: {SEARCH_PREFETCH_SIZE}")
+            logger.info(
+                f"Search caching enabled using {cache_location} cache with TTL={SEARCH_CACHE_TTL_SECONDS}s"
+            )
+            logger.info(
+                f"Minimum score filter: {SEARCH_MIN_SCORE}, prefetch size: {SEARCH_PREFETCH_SIZE}"
+            )
 
     async def info(self):
         """Return information about search service"""
@@ -219,7 +243,6 @@ class SearchService:
         """Check if service is available"""
         return self.available
 
-
     async def verify_docs(self, doc_ids):
         """Verify which documents exist in the search index across all content types"""
         if not self.available:
@@ -230,7 +253,7 @@ class SearchService:
             response = await self.client.post(
                 "/verify-docs",
                 json={"doc_ids": doc_ids},
-                timeout=60.0  # Longer timeout for potentially large ID lists
+                timeout=60.0,  # Longer timeout for potentially large ID lists
             )
             response.raise_for_status()
             result = response.json()
@@ -248,8 +271,12 @@ class SearchService:
             titles_missing_count = len(titles_missing)
             total_missing_count = len(all_missing)
 
-            logger.info(f"Document verification complete: {bodies_missing_count} bodies missing, {titles_missing_count} titles missing")
-            logger.info(f"Total unique missing documents: {total_missing_count} out of {len(doc_ids)} total")
+            logger.info(
+                f"Document verification complete: {bodies_missing_count} bodies missing, {titles_missing_count} titles missing"
+            )
+            logger.info(
+                f"Total unique missing documents: {total_missing_count} out of {len(doc_ids)} total"
+            )
 
             # Return in a backwards-compatible format plus the detailed breakdown
             return {
@@ -258,14 +285,13 @@ class SearchService:
                     "bodies_missing": list(bodies_missing),
                     "titles_missing": list(titles_missing),
                     "bodies_missing_count": bodies_missing_count,
-                    "titles_missing_count": titles_missing_count
-                }
+                    "titles_missing_count": titles_missing_count,
+                },
             }
         except Exception as e:
             logger.error(f"Document verification error: {e}")
             return {"status": "error", "message": str(e)}
 
-
     def index(self, shout):
         """Index a single document"""
         if not self.available:
@@ -284,40 +310,37 @@ class SearchService:
         indexing_tasks = []
 
         # 1. Index title if available
-        if hasattr(shout, 'title') and shout.title and isinstance(shout.title, str):
-            title_doc = {
-                "id": str(shout.id),
-                "title": shout.title.strip()
-            }
+        if hasattr(shout, "title") and shout.title and isinstance(shout.title, str):
+            title_doc = {"id": str(shout.id), "title": shout.title.strip()}
             indexing_tasks.append(
                 self.index_client.post("/index-title", json=title_doc)
             )
 
         # 2. Index body content (subtitle, lead, body)
         body_text_parts = []
-        for field_name in ['subtitle', 'lead', 'body']:
+        for field_name in ["subtitle", "lead", "body"]:
             field_value = getattr(shout, field_name, None)
             if field_value and isinstance(field_value, str) and field_value.strip():
                 body_text_parts.append(field_value.strip())
 
         # Process media content if available
-        media = getattr(shout, 'media', None)
+        media = getattr(shout, "media", None)
         if media:
             if isinstance(media, str):
                 try:
                     media_json = json.loads(media)
                     if isinstance(media_json, dict):
-                        if 'title' in media_json:
-                            body_text_parts.append(media_json['title'])
-                        if 'body' in media_json:
-                            body_text_parts.append(media_json['body'])
+                        if "title" in media_json:
+                            body_text_parts.append(media_json["title"])
+                        if "body" in media_json:
+                            body_text_parts.append(media_json["body"])
                 except json.JSONDecodeError:
                     body_text_parts.append(media)
             elif isinstance(media, dict):
-                if 'title' in media:
-                    body_text_parts.append(media['title'])
-                if 'body' in media:
-                    body_text_parts.append(media['body'])
+                if "title" in media:
+                    body_text_parts.append(media["title"])
+                if "body" in media:
+                    body_text_parts.append(media["body"])
 
         if body_text_parts:
             body_text = " ".join(body_text_parts)
@@ -326,57 +349,58 @@ class SearchService:
             if len(body_text) > MAX_TEXT_LENGTH:
                 body_text = body_text[:MAX_TEXT_LENGTH]
 
-            body_doc = {
-                "id": str(shout.id),
-                "body": body_text
-            }
+            body_doc = {"id": str(shout.id), "body": body_text}
             indexing_tasks.append(
                 self.index_client.post("/index-body", json=body_doc)
            )
 
         # 3. Index authors
-        authors = getattr(shout, 'authors', [])
+        authors = getattr(shout, "authors", [])
         for author in authors:
-            author_id = str(getattr(author, 'id', 0))
-            if not author_id or author_id == '0':
+            author_id = str(getattr(author, "id", 0))
+            if not author_id or author_id == "0":
                 continue
 
-            name = getattr(author, 'name', '')
+            name = getattr(author, "name", "")
 
             # Combine bio and about fields
             bio_parts = []
-            bio = getattr(author, 'bio', '')
+            bio = getattr(author, "bio", "")
             if bio and isinstance(bio, str):
                 bio_parts.append(bio.strip())
-            about = getattr(author, 'about', '')
+            about = getattr(author, "about", "")
            if about and isinstance(about, str):
                 bio_parts.append(about.strip())
             combined_bio = " ".join(bio_parts)
 
             if name:
-                author_doc = {
-                    "id": author_id,
-                    "name": name,
-                    "bio": combined_bio
-                }
+                author_doc = {"id": author_id, "name": name, "bio": combined_bio}
                 indexing_tasks.append(
                     self.index_client.post("/index-author", json=author_doc)
                 )
 
         # Run all indexing tasks in parallel
         if indexing_tasks:
-            responses = await asyncio.gather(*indexing_tasks, return_exceptions=True)
+            responses = await asyncio.gather(
+                *indexing_tasks, return_exceptions=True
+            )
 
             # Check for errors in responses
             for i, response in enumerate(responses):
                 if isinstance(response, Exception):
                     logger.error(f"Error in indexing task {i}: {response}")
-                elif hasattr(response, 'status_code') and response.status_code >= 400:
-                    logger.error(f"Error response in indexing task {i}: {response.status_code}, {await response.text()}")
+                elif (
+                    hasattr(response, "status_code") and response.status_code >= 400
+                ):
+                    logger.error(
+                        f"Error response in indexing task {i}: {response.status_code}, {await response.text()}"
+                    )
 
-            logger.info(f"Document {shout.id} indexed across {len(indexing_tasks)} endpoints")
+            logger.info(
+                f"Document {shout.id} indexed across {len(indexing_tasks)} endpoints"
+            )
         else:
             logger.warning(f"No content to index for shout {shout.id}")
@@ -386,7 +410,9 @@ class SearchService:
     async def bulk_index(self, shouts):
         """Index multiple documents across three separate endpoints"""
         if not self.available or not shouts:
-            logger.warning(f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}")
+            logger.warning(
+                f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}"
+            )
             return
 
         start_time = time.time()
@@ -402,37 +428,44 @@ class SearchService:
         for shout in shouts:
             try:
                 # 1. Process title documents
-                if hasattr(shout, 'title') and shout.title and isinstance(shout.title, str):
-                    title_docs.append({
-                        "id": str(shout.id),
-                        "title": shout.title.strip()
-                    })
+                if (
+                    hasattr(shout, "title")
+                    and shout.title
+                    and isinstance(shout.title, str)
+                ):
+                    title_docs.append(
+                        {"id": str(shout.id), "title": shout.title.strip()}
+                    )
 
                 # 2. Process body documents (subtitle, lead, body)
                 body_text_parts = []
-                for field_name in ['subtitle', 'lead', 'body']:
+                for field_name in ["subtitle", "lead", "body"]:
                     field_value = getattr(shout, field_name, None)
-                    if field_value and isinstance(field_value, str) and field_value.strip():
+                    if (
+                        field_value
+                        and isinstance(field_value, str)
+                        and field_value.strip()
+                    ):
                         body_text_parts.append(field_value.strip())
 
                 # Process media content if available
-                media = getattr(shout, 'media', None)
+                media = getattr(shout, "media", None)
                 if media:
                     if isinstance(media, str):
                         try:
                             media_json = json.loads(media)
                             if isinstance(media_json, dict):
-                                if 'title' in media_json:
-                                    body_text_parts.append(media_json['title'])
-                                if 'body' in media_json:
-                                    body_text_parts.append(media_json['body'])
+                                if "title" in media_json:
+                                    body_text_parts.append(media_json["title"])
+                                if "body" in media_json:
+                                    body_text_parts.append(media_json["body"])
                         except json.JSONDecodeError:
                             body_text_parts.append(media)
                     elif isinstance(media, dict):
-                        if 'title' in media:
-                            body_text_parts.append(media['title'])
-                        if 'body' in media:
-                            body_text_parts.append(media['body'])
+                        if "title" in media:
+                            body_text_parts.append(media["title"])
+                        if "body" in media:
+                            body_text_parts.append(media["body"])
 
                 # Only add body document if we have body text
                 if body_text_parts:
@@ -442,31 +475,28 @@ class SearchService:
                     if len(body_text) > MAX_TEXT_LENGTH:
                         body_text = body_text[:MAX_TEXT_LENGTH]
 
-                    body_docs.append({
-                        "id": str(shout.id),
-                        "body": body_text
-                    })
+                    body_docs.append({"id": str(shout.id), "body": body_text})
 
                 # 3. Process authors if available
-                authors = getattr(shout, 'authors', [])
+                authors = getattr(shout, "authors", [])
                 for author in authors:
-                    author_id = str(getattr(author, 'id', 0))
-                    if not author_id or author_id == '0':
+                    author_id = str(getattr(author, "id", 0))
+                    if not author_id or author_id == "0":
                         continue
 
                     # Skip if we've already processed this author
                     if author_id in author_docs:
                         continue
 
-                    name = getattr(author, 'name', '')
+                    name = getattr(author, "name", "")
 
                     # Combine bio and about fields
                     bio_parts = []
-                    bio = getattr(author, 'bio', '')
+                    bio = getattr(author, "bio", "")
                     if bio and isinstance(bio, str):
                         bio_parts.append(bio.strip())
-                    about = getattr(author, 'about', '')
+                    about = getattr(author, "about", "")
                     if about and isinstance(about, str):
                         bio_parts.append(about.strip())
@@ -477,21 +507,26 @@ class SearchService:
                        author_docs[author_id] = {
                            "id": author_id,
                            "name": name,
-                           "bio": combined_bio
+                           "bio": combined_bio,
                        }
            except Exception as e:
-                logger.error(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}")
+                logger.error(
+                    f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}"
+                )
                total_skipped += 1
 
        # Convert author dict to list
        author_docs_list = list(author_docs.values())
 
+        # Log indexing started message
+        logger.info("indexing started...")
+
        # Process each endpoint in parallel
        indexing_tasks = [
            self._index_endpoint(title_docs, "/bulk-index-titles", "title"),
            self._index_endpoint(body_docs, "/bulk-index-bodies", "body"),
-            self._index_endpoint(author_docs_list, "/bulk-index-authors", "author")
+            self._index_endpoint(author_docs_list, "/bulk-index-authors", "author"),
        ]
 
        await asyncio.gather(*indexing_tasks)
@@ -512,19 +547,27 @@ class SearchService:
        logger.info(f"Indexing {len(documents)} {doc_type} documents")
 
        # Categorize documents by size
-        small_docs, medium_docs, large_docs = self._categorize_by_size(documents, doc_type)
+        small_docs, medium_docs, large_docs = self._categorize_by_size(
+            documents, doc_type
+        )
 
        # Process each category with appropriate batch sizes
        batch_sizes = {
            "small": min(MAX_BATCH_SIZE, 15),
            "medium": min(MAX_BATCH_SIZE, 10),
-            "large": min(MAX_BATCH_SIZE, 3)
+            "large": min(MAX_BATCH_SIZE, 3),
        }
 
-        for category, docs in [("small", small_docs), ("medium", medium_docs), ("large", large_docs)]:
+        for category, docs in [
+            ("small", small_docs),
+            ("medium", medium_docs),
+            ("large", large_docs),
+        ]:
            if docs:
                batch_size = batch_sizes[category]
-                await self._process_batches(docs, batch_size, endpoint, f"{doc_type}-{category}")
+                await self._process_batches(
+                    docs, batch_size, endpoint, f"{doc_type}-{category}"
+                )
 
    def _categorize_by_size(self, documents, doc_type):
        """Categorize documents by size for optimized batch processing"""
@@ -551,13 +594,15 @@ class SearchService:
            else:
                small_docs.append(doc)
 
-        logger.info(f"{doc_type.capitalize()} documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large")
+        logger.info(
+            f"{doc_type.capitalize()} documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large"
+        )
        return small_docs, medium_docs, large_docs
 
    async def _process_batches(self, documents, batch_size, endpoint, batch_prefix):
        """Process document batches with retry logic"""
        for i in range(0, len(documents), batch_size):
-            batch = documents[i:i+batch_size]
+            batch = documents[i : i + batch_size]
            batch_id = f"{batch_prefix}-{i//batch_size + 1}"
            retry_count = 0
@@ -567,14 +612,14 @@ class SearchService:
            while not success and retry_count < max_retries:
                try:
                    response = await self.index_client.post(
-                        endpoint,
-                        json=batch,
-                        timeout=90.0
+                        endpoint, json=batch, timeout=90.0
                    )
 
                    if response.status_code == 422:
                        error_detail = response.json()
-                        logger.error(f"Validation error from search service for batch {batch_id}: {self._truncate_error_detail(error_detail)}")
+                        logger.error(
+                            f"Validation error from search service for batch {batch_id}: {self._truncate_error_detail(error_detail)}"
+                        )
                        break
 
                    response.raise_for_status()
@@ -585,30 +630,64 @@ class SearchService:
                    if retry_count >= max_retries:
                        if len(batch) > 1:
                            mid = len(batch) // 2
-                            await self._process_batches(batch[:mid], batch_size // 2, endpoint, f"{batch_prefix}-{i//batch_size}-A")
-                            await self._process_batches(batch[mid:], batch_size // 2, endpoint, f"{batch_prefix}-{i//batch_size}-B")
+                            await self._process_batches(
+                                batch[:mid],
+                                batch_size // 2,
+                                endpoint,
+                                f"{batch_prefix}-{i//batch_size}-A",
+                            )
+                            await self._process_batches(
+                                batch[mid:],
+                                batch_size // 2,
+                                endpoint,
+                                f"{batch_prefix}-{i//batch_size}-B",
+                            )
                        else:
-                            logger.error(f"Failed to index single document in batch {batch_id} after {max_retries} attempts: {str(e)}")
+                            logger.error(
+                                f"Failed to index single document in batch {batch_id} after {max_retries} attempts: {str(e)}"
+                            )
                        break
 
-                    wait_time = (2 ** retry_count) + (random.random() * 0.5)
+                    wait_time = (2**retry_count) + (random.random() * 0.5)
                    await asyncio.sleep(wait_time)
 
    def _truncate_error_detail(self, error_detail):
        """Truncate error details for logging"""
-        truncated_detail = error_detail.copy() if isinstance(error_detail, dict) else error_detail
+        truncated_detail = (
+            error_detail.copy() if isinstance(error_detail, dict) else error_detail
+        )
 
-        if isinstance(truncated_detail, dict) and 'detail' in truncated_detail and isinstance(truncated_detail['detail'], list):
-            for i, item in enumerate(truncated_detail['detail']):
-                if isinstance(item, dict) and 'input' in item:
-                    if isinstance(item['input'], dict) and any(k in item['input'] for k in ['documents', 'text']):
-                        if 'documents' in item['input'] and isinstance(item['input']['documents'], list):
-                            for j, doc in enumerate(item['input']['documents']):
-                                if 'text' in doc and isinstance(doc['text'], str) and len(doc['text']) > 100:
-                                    item['input']['documents'][j]['text'] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]"
+        if (
+            isinstance(truncated_detail, dict)
+            and "detail" in truncated_detail
+            and isinstance(truncated_detail["detail"], list)
+        ):
+            for i, item in enumerate(truncated_detail["detail"]):
+                if isinstance(item, dict) and "input" in item:
+                    if isinstance(item["input"], dict) and any(
+                        k in item["input"] for k in ["documents", "text"]
+                    ):
+                        if "documents" in item["input"] and isinstance(
+                            item["input"]["documents"], list
+                        ):
+                            for j, doc in enumerate(item["input"]["documents"]):
+                                if (
+                                    "text" in doc
+                                    and isinstance(doc["text"], str)
+                                    and len(doc["text"]) > 100
+                                ):
+                                    item["input"]["documents"][j][
+                                        "text"
+                                    ] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]"
 
-                        if 'text' in item['input'] and isinstance(item['input']['text'], str) and len(item['input']['text']) > 100:
-                            item['input']['text'] = f"{item['input']['text'][:100]}... [truncated, total {len(item['input']['text'])} chars]"
+                        if (
+                            "text" in item["input"]
+                            and isinstance(item["input"]["text"], str)
+                            and len(item["input"]["text"]) > 100
+                        ):
+                            item["input"][
+                                "text"
+                            ] = f"{item['input']['text'][:100]}... [truncated, total {len(item['input']['text'])} chars]"
 
        return truncated_detail
@@ -644,7 +723,7 @@ class SearchService:
            response = await self.client.post(
                "/search-combined",
-                json={"text": text, "limit": search_limit, "offset": search_offset}
+                json={"text": text, "limit": search_limit, "offset": search_offset},
            )
            response.raise_for_status()
@@ -663,7 +742,11 @@ class SearchService:
            if SEARCH_MIN_SCORE > 0:
                initial_count = len(formatted_results)
-                formatted_results = [r for r in formatted_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
+                formatted_results = [
+                    r
+                    for r in formatted_results
+                    if r.get("score", 0) >= SEARCH_MIN_SCORE
+                ]
 
            if SEARCH_CACHE_ENABLED:
                await self.cache.store(text, formatted_results)
@@ -689,10 +772,11 @@ class SearchService:
            return await self.cache.get(cache_key, limit, offset)
 
        try:
-            logger.info(f"Searching authors for: '{text}' (limit={limit}, offset={offset})")
+            logger.info(
+                f"Searching authors for: '{text}' (limit={limit}, offset={offset})"
+            )
            response = await self.client.post(
-                "/search-author",
-                json={"text": text, "limit": limit + offset}
+                "/search-author", json={"text": text, "limit": limit + offset}
            )
            response.raise_for_status()
@@ -701,14 +785,16 @@ class SearchService:
            # Apply score filtering if needed
            if SEARCH_MIN_SCORE > 0:
-                author_results = [r for r in author_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
+                author_results = [
+                    r for r in author_results if r.get("score", 0) >= SEARCH_MIN_SCORE
+                ]
 
            # Store in cache if enabled
            if SEARCH_CACHE_ENABLED:
                await self.cache.store(cache_key, author_results)
 
            # Apply offset/limit
-            return author_results[offset:offset+limit]
+            return author_results[offset : offset + limit]
 
        except Exception as e:
            logger.error(f"Error searching authors for '{text}': {e}")
@@ -725,7 +811,9 @@ class SearchService:
            result = response.json()
 
            if result.get("consistency", {}).get("status") != "ok":
-                null_count = result.get("consistency", {}).get("null_embeddings_count", 0)
+                null_count = result.get("consistency", {}).get(
+                    "null_embeddings_count", 0
+                )
                if null_count > 0:
                    logger.warning(f"Found {null_count} documents with NULL embeddings")
@@ -734,23 +822,27 @@ class SearchService:
            logger.error(f"Failed to check index status: {e}")
            return {"status": "error", "message": str(e)}
 
+
# Create the search service singleton
search_service = SearchService()
 
+
# API-compatible function to perform a search
async def search_text(text: str, limit: int = 50, offset: int = 0):
    payload = []
    if search_service.available:
        payload = await search_service.search(text, limit, offset)
    return payload
 
+
async def search_author_text(text: str, limit: int = 10, offset: int = 0):
    """Search authors API helper function"""
    if search_service.available:
        return await search_service.search_authors(text, limit, offset)
    return []
 
+
async def get_search_count(text: str):
    """Get count of title search results"""
    if not search_service.available:
@@ -764,6 +856,7 @@ async def get_search_count(text: str):
    # If not found in cache, fetch from endpoint
    return len(await search_text(text, SEARCH_PREFETCH_SIZE, 0))
 
+
async def get_author_search_count(text: str):
    """Get count of author search results"""
    if not search_service.available:
@@ -777,6 +870,7 @@ async def get_author_search_count(text: str):
    # If not found in cache, fetch from endpoint
    return len(await search_author_text(text, SEARCH_PREFETCH_SIZE, 0))
 
+
async def initialize_search_index(shouts_data):
    """Initialize search index with existing data during application startup"""
    if not SEARCH_ENABLED:
@@ -794,29 +888,39 @@ async def initialize_search_index(shouts_data):
    index_status = await search_service.check_index_status()
    if index_status.get("status") == "inconsistent":
-        problem_ids = index_status.get("consistency", {}).get("null_embeddings_sample", [])
+        problem_ids = index_status.get("consistency", {}).get(
+            "null_embeddings_sample", []
+        )
 
        if problem_ids:
-            problem_docs = [shout for shout in shouts_data if str(shout.id) in problem_ids]
+            problem_docs = [
+                shout for shout in shouts_data if str(shout.id) in problem_ids
+            ]
            if problem_docs:
                await search_service.bulk_index(problem_docs)
 
    # Only consider shouts with body content for body verification
    def has_body_content(shout):
-        for field in ['subtitle', 'lead', 'body']:
-            if getattr(shout, field, None) and isinstance(getattr(shout, field, None), str) and getattr(shout, field).strip():
+        for field in ["subtitle", "lead", "body"]:
+            if (
+                getattr(shout, field, None)
+                and isinstance(getattr(shout, field, None), str)
+                and getattr(shout, field).strip()
+            ):
                return True
-        media = getattr(shout, 'media', None)
+        media = getattr(shout, "media", None)
        if media:
            if isinstance(media, str):
                try:
                    media_json = json.loads(media)
-                    if isinstance(media_json, dict) and (media_json.get('title') or media_json.get('body')):
+                    if isinstance(media_json, dict) and (
+                        media_json.get("title") or media_json.get("body")
+                    ):
                        return True
                except Exception:
                    return True
            elif isinstance(media, dict):
-                if media.get('title') or media.get('body'):
+                if media.get("title") or media.get("body"):
                    return True
        return False
@@ -829,9 +933,13 @@ async def initialize_search_index(shouts_data):
        if verification.get("status") == "error":
            return
 
        # Only reindex missing docs that actually have body content
-        missing_ids = [mid for mid in verification.get("missing", []) if mid in body_ids]
+        missing_ids = [
+            mid for mid in verification.get("missing", []) if mid in body_ids
+        ]
        if missing_ids:
-            missing_docs = [shout for shout in shouts_with_body if str(shout.id) in missing_ids]
+            missing_docs = [
+                shout for shout in shouts_with_body if str(shout.id) in missing_ids
+            ]
            await search_service.bulk_index(missing_docs)
        else:
            pass
@@ -846,7 +954,7 @@ async def initialize_search_index(shouts_data):
        for result in test_results:
            result_id = result.get("id")
            matching_shouts = [s for s in shouts_data if str(s.id) == result_id]
-            if matching_shouts and hasattr(matching_shouts[0], 'category'):
-                categories.add(getattr(matching_shouts[0], 'category', 'unknown'))
+            if matching_shouts and hasattr(matching_shouts[0], "category"):
+                categories.add(getattr(matching_shouts[0], "category", "unknown"))
    except Exception as e:
        pass
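
Not part of the commit, but for orientation: a minimal usage sketch of the module-level helpers touched by this reformat. It assumes the file is importable as services.search, that the txtai backend behind TXTAI_SERVICE_URL is reachable and already indexed, and that result dicts carry the id and score keys used by the filtering code above; the query strings and script layout here are illustrative only.

# Hypothetical usage sketch; not part of this commit.
import asyncio

from services.search import get_search_count, search_author_text, search_text


async def demo():
    # Combined title/body search; returns [] when the service is unavailable.
    results = await search_text("climate", limit=10, offset=0)
    for r in results:
        print(r.get("id"), r.get("score"))

    # Author search goes through its own endpoint and cache key.
    authors = await search_author_text("ivanov", limit=5)

    # Count helper prefetches up to SEARCH_PREFETCH_SIZE results and counts them.
    total = await get_search_count("climate")
    print(f"{total} results, {len(authors)} author matches")


if __name__ == "__main__":
    asyncio.run(demo())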