style(search.py): with indexing message
All checks were successful
Deploy on push / deploy (push) Successful in 42s
This commit is contained in:
parent 3062a2b7de
commit e7facf8d87
@@ -15,16 +15,24 @@ logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)

# Configuration for search service
SEARCH_ENABLED = bool(os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"])
SEARCH_ENABLED = bool(
os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]
)
TXTAI_SERVICE_URL = os.environ.get("TXTAI_SERVICE_URL", "none")
MAX_BATCH_SIZE = int(os.environ.get("SEARCH_MAX_BATCH_SIZE", "25"))

# Search cache configuration
SEARCH_CACHE_ENABLED = bool(os.environ.get("SEARCH_CACHE_ENABLED", "true").lower() in ["true", "1", "yes"])
SEARCH_CACHE_TTL_SECONDS = int(os.environ.get("SEARCH_CACHE_TTL_SECONDS", "900")) # Default: 15 minutes
SEARCH_CACHE_ENABLED = bool(
os.environ.get("SEARCH_CACHE_ENABLED", "true").lower() in ["true", "1", "yes"]
)
SEARCH_CACHE_TTL_SECONDS = int(
os.environ.get("SEARCH_CACHE_TTL_SECONDS", "900")
) # Default: 15 minutes
SEARCH_MIN_SCORE = float(os.environ.get("SEARCH_MIN_SCORE", "0.1"))
SEARCH_PREFETCH_SIZE = int(os.environ.get("SEARCH_PREFETCH_SIZE", "200"))
SEARCH_USE_REDIS = bool(os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"])
SEARCH_USE_REDIS = bool(
os.environ.get("SEARCH_USE_REDIS", "true").lower() in ["true", "1", "yes"]
)

search_offset = 0
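All of the boolean flags above share one parsing rule: the environment value is lowercased and checked against a small allow-list, so only "true", "1" or "yes" (in any letter case) enable a feature and anything else disables it. A minimal standalone sketch of that evaluation (illustrative only; the variable names are not from search.py):

import os

# Only "true", "1" or "yes" (case-insensitive) enable the flag.
os.environ["SEARCH_ENABLED"] = "TRUE"
enabled = os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]
print(enabled)  # True

# Any other value, e.g. "off", disables it.
os.environ["SEARCH_ENABLED"] = "off"
enabled = os.environ.get("SEARCH_ENABLED", "true").lower() in ["true", "1", "yes"]
print(enabled)  # False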
@@ -32,11 +40,13 @@ search_offset = 0
if SEARCH_USE_REDIS:
try:
from services.redis import redis
logger.info("Redis client imported for search caching")
except ImportError:
logger.warning("Redis client import failed, falling back to memory cache")
SEARCH_USE_REDIS = False

class SearchCache:
"""Cache for search results to enable efficient pagination"""
@@ -57,9 +67,11 @@ class SearchCache:
await redis.set(
f"{self._redis_prefix}{normalized_query}",
serialized_results,
ex=self.ttl
ex=self.ttl,
)
logger.info(
f"Stored {len(results)} search results for query '{query}' in Redis"
)
logger.info(f"Stored {len(results)} search results for query '{query}' in Redis")
return True
except Exception as e:
logger.error(f"Error storing search results in Redis: {e}")
@@ -72,7 +84,9 @@ class SearchCache:
# Store results and update timestamp
self.cache[normalized_query] = results
self.last_accessed[normalized_query] = time.time()
logger.info(f"Cached {len(results)} search results for query '{query}' in memory")
logger.info(
f"Cached {len(results)} search results for query '{query}' in memory"
)
return True

async def get(self, query, limit=10, offset=0):
@@ -104,10 +118,14 @@ class SearchCache:
# Return paginated subset
end_idx = min(offset + limit, len(all_results))
if offset >= len(all_results):
logger.warning(f"Requested offset {offset} exceeds result count {len(all_results)}")
logger.warning(
f"Requested offset {offset} exceeds result count {len(all_results)}"
)
return []

logger.info(f"Cache hit for '{query}': serving {offset}:{end_idx} of {len(all_results)} results")
logger.info(
f"Cache hit for '{query}': serving {offset}:{end_idx} of {len(all_results)} results"
)
return all_results[offset:end_idx]

async def has_query(self, query):
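The pagination logic in SearchCache.get serves slices out of one prefetched result list: end_idx is clamped to the list length, and an offset past the end yields an empty page rather than an error. A small self-contained sketch of that arithmetic (illustrative data, not taken from the diff):

all_results = [f"doc-{i}" for i in range(12)]  # pretend these came from the cache
limit, offset = 10, 10

end_idx = min(offset + limit, len(all_results))  # clamp to the cached size
if offset >= len(all_results):
    page = []  # offset beyond the cached set -> empty page
else:
    page = all_results[offset:end_idx]

print(page)  # ['doc-10', 'doc-11']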
@@ -158,7 +176,8 @@ class SearchCache:
now = time.time()
# First remove expired entries
expired_keys = [
key for key, last_access in self.last_accessed.items()
key
for key, last_access in self.last_accessed.items()
if now - last_access > self.ttl
]
@@ -183,6 +202,7 @@ class SearchCache:
del self.last_accessed[key]
logger.info(f"Removed {remove_count} oldest search cache entries")

class SearchService:
def __init__(self):
logger.info(f"Initializing search service with URL: {TXTAI_SERVICE_URL}")
@@ -198,8 +218,12 @@ class SearchService:
if SEARCH_CACHE_ENABLED:
cache_location = "Redis" if SEARCH_USE_REDIS else "Memory"
logger.info(f"Search caching enabled using {cache_location} cache with TTL={SEARCH_CACHE_TTL_SECONDS}s")
logger.info(f"Minimum score filter: {SEARCH_MIN_SCORE}, prefetch size: {SEARCH_PREFETCH_SIZE}")
logger.info(
f"Search caching enabled using {cache_location} cache with TTL={SEARCH_CACHE_TTL_SECONDS}s"
)
logger.info(
f"Minimum score filter: {SEARCH_MIN_SCORE}, prefetch size: {SEARCH_PREFETCH_SIZE}"
)

async def info(self):
"""Return information about search service"""
@@ -219,7 +243,6 @@ class SearchService:
"""Check if service is available"""
return self.available

async def verify_docs(self, doc_ids):
"""Verify which documents exist in the search index across all content types"""
if not self.available:
@@ -230,7 +253,7 @@ class SearchService:
response = await self.client.post(
"/verify-docs",
json={"doc_ids": doc_ids},
timeout=60.0 # Longer timeout for potentially large ID lists
timeout=60.0, # Longer timeout for potentially large ID lists
)
response.raise_for_status()
result = response.json()
@@ -248,8 +271,12 @@ class SearchService:
titles_missing_count = len(titles_missing)
total_missing_count = len(all_missing)

logger.info(f"Document verification complete: {bodies_missing_count} bodies missing, {titles_missing_count} titles missing")
logger.info(f"Total unique missing documents: {total_missing_count} out of {len(doc_ids)} total")
logger.info(
f"Document verification complete: {bodies_missing_count} bodies missing, {titles_missing_count} titles missing"
)
logger.info(
f"Total unique missing documents: {total_missing_count} out of {len(doc_ids)} total"
)

# Return in a backwards-compatible format plus the detailed breakdown
return {
@@ -258,14 +285,13 @@ class SearchService:
"bodies_missing": list(bodies_missing),
"titles_missing": list(titles_missing),
"bodies_missing_count": bodies_missing_count,
"titles_missing_count": titles_missing_count
}
"titles_missing_count": titles_missing_count,
},
}
except Exception as e:
logger.error(f"Document verification error: {e}")
return {"status": "error", "message": str(e)}

def index(self, shout):
"""Index a single document"""
if not self.available:
@@ -284,40 +310,37 @@ class SearchService:
indexing_tasks = []

# 1. Index title if available
if hasattr(shout, 'title') and shout.title and isinstance(shout.title, str):
title_doc = {
"id": str(shout.id),
"title": shout.title.strip()
}
if hasattr(shout, "title") and shout.title and isinstance(shout.title, str):
title_doc = {"id": str(shout.id), "title": shout.title.strip()}
indexing_tasks.append(
self.index_client.post("/index-title", json=title_doc)
)

# 2. Index body content (subtitle, lead, body)
body_text_parts = []
for field_name in ['subtitle', 'lead', 'body']:
for field_name in ["subtitle", "lead", "body"]:
field_value = getattr(shout, field_name, None)
if field_value and isinstance(field_value, str) and field_value.strip():
body_text_parts.append(field_value.strip())

# Process media content if available
media = getattr(shout, 'media', None)
media = getattr(shout, "media", None)
if media:
if isinstance(media, str):
try:
media_json = json.loads(media)
if isinstance(media_json, dict):
if 'title' in media_json:
body_text_parts.append(media_json['title'])
if 'body' in media_json:
body_text_parts.append(media_json['body'])
if "title" in media_json:
body_text_parts.append(media_json["title"])
if "body" in media_json:
body_text_parts.append(media_json["body"])
except json.JSONDecodeError:
body_text_parts.append(media)
elif isinstance(media, dict):
if 'title' in media:
body_text_parts.append(media['title'])
if 'body' in media:
body_text_parts.append(media['body'])
if "title" in media:
body_text_parts.append(media["title"])
if "body" in media:
body_text_parts.append(media["body"])

if body_text_parts:
body_text = " ".join(body_text_parts)
@@ -326,57 +349,58 @@ class SearchService:
if len(body_text) > MAX_TEXT_LENGTH:
body_text = body_text[:MAX_TEXT_LENGTH]

body_doc = {
"id": str(shout.id),
"body": body_text
}
body_doc = {"id": str(shout.id), "body": body_text}
indexing_tasks.append(
self.index_client.post("/index-body", json=body_doc)
)

# 3. Index authors
authors = getattr(shout, 'authors', [])
authors = getattr(shout, "authors", [])
for author in authors:
author_id = str(getattr(author, 'id', 0))
if not author_id or author_id == '0':
author_id = str(getattr(author, "id", 0))
if not author_id or author_id == "0":
continue

name = getattr(author, 'name', '')
name = getattr(author, "name", "")

# Combine bio and about fields
bio_parts = []
bio = getattr(author, 'bio', '')
bio = getattr(author, "bio", "")
if bio and isinstance(bio, str):
bio_parts.append(bio.strip())

about = getattr(author, 'about', '')
about = getattr(author, "about", "")
if about and isinstance(about, str):
bio_parts.append(about.strip())

combined_bio = " ".join(bio_parts)

if name:
author_doc = {
"id": author_id,
"name": name,
"bio": combined_bio
}
author_doc = {"id": author_id, "name": name, "bio": combined_bio}
indexing_tasks.append(
self.index_client.post("/index-author", json=author_doc)
)

# Run all indexing tasks in parallel
if indexing_tasks:
responses = await asyncio.gather(*indexing_tasks, return_exceptions=True)
responses = await asyncio.gather(
*indexing_tasks, return_exceptions=True
)

# Check for errors in responses
for i, response in enumerate(responses):
if isinstance(response, Exception):
logger.error(f"Error in indexing task {i}: {response}")
elif hasattr(response, 'status_code') and response.status_code >= 400:
logger.error(f"Error response in indexing task {i}: {response.status_code}, {await response.text()}")
elif (
hasattr(response, "status_code") and response.status_code >= 400
):
logger.error(
f"Error response in indexing task {i}: {response.status_code}, {await response.text()}"
)

logger.info(f"Document {shout.id} indexed across {len(indexing_tasks)} endpoints")
logger.info(
f"Document {shout.id} indexed across {len(indexing_tasks)} endpoints"
)
else:
logger.warning(f"No content to index for shout {shout.id}")
@@ -386,7 +410,9 @@ class SearchService:
async def bulk_index(self, shouts):
"""Index multiple documents across three separate endpoints"""
if not self.available or not shouts:
logger.warning(f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}")
logger.warning(
f"Bulk indexing skipped: available={self.available}, shouts_count={len(shouts) if shouts else 0}"
)
return

start_time = time.time()
@@ -402,37 +428,44 @@ class SearchService:
for shout in shouts:
try:
# 1. Process title documents
if hasattr(shout, 'title') and shout.title and isinstance(shout.title, str):
title_docs.append({
"id": str(shout.id),
"title": shout.title.strip()
})
if (
hasattr(shout, "title")
and shout.title
and isinstance(shout.title, str)
):
title_docs.append(
{"id": str(shout.id), "title": shout.title.strip()}
)

# 2. Process body documents (subtitle, lead, body)
body_text_parts = []
for field_name in ['subtitle', 'lead', 'body']:
for field_name in ["subtitle", "lead", "body"]:
field_value = getattr(shout, field_name, None)
if field_value and isinstance(field_value, str) and field_value.strip():
if (
field_value
and isinstance(field_value, str)
and field_value.strip()
):
body_text_parts.append(field_value.strip())

# Process media content if available
media = getattr(shout, 'media', None)
media = getattr(shout, "media", None)
if media:
if isinstance(media, str):
try:
media_json = json.loads(media)
if isinstance(media_json, dict):
if 'title' in media_json:
body_text_parts.append(media_json['title'])
if 'body' in media_json:
body_text_parts.append(media_json['body'])
if "title" in media_json:
body_text_parts.append(media_json["title"])
if "body" in media_json:
body_text_parts.append(media_json["body"])
except json.JSONDecodeError:
body_text_parts.append(media)
elif isinstance(media, dict):
if 'title' in media:
body_text_parts.append(media['title'])
if 'body' in media:
body_text_parts.append(media['body'])
if "title" in media:
body_text_parts.append(media["title"])
if "body" in media:
body_text_parts.append(media["body"])

# Only add body document if we have body text
if body_text_parts:
@@ -442,31 +475,28 @@ class SearchService:
if len(body_text) > MAX_TEXT_LENGTH:
body_text = body_text[:MAX_TEXT_LENGTH]

body_docs.append({
"id": str(shout.id),
"body": body_text
})
body_docs.append({"id": str(shout.id), "body": body_text})

# 3. Process authors if available
authors = getattr(shout, 'authors', [])
authors = getattr(shout, "authors", [])
for author in authors:
author_id = str(getattr(author, 'id', 0))
if not author_id or author_id == '0':
author_id = str(getattr(author, "id", 0))
if not author_id or author_id == "0":
continue

# Skip if we've already processed this author
if author_id in author_docs:
continue

name = getattr(author, 'name', '')
name = getattr(author, "name", "")

# Combine bio and about fields
bio_parts = []
bio = getattr(author, 'bio', '')
bio = getattr(author, "bio", "")
if bio and isinstance(bio, str):
bio_parts.append(bio.strip())

about = getattr(author, 'about', '')
about = getattr(author, "about", "")
if about and isinstance(about, str):
bio_parts.append(about.strip())
@@ -477,21 +507,26 @@ class SearchService:
author_docs[author_id] = {
"id": author_id,
"name": name,
"bio": combined_bio
"bio": combined_bio,
}

except Exception as e:
logger.error(f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}")
logger.error(
f"Error processing shout {getattr(shout, 'id', 'unknown')} for indexing: {e}"
)
total_skipped += 1

# Convert author dict to list
author_docs_list = list(author_docs.values())

# Log indexing started message
logger.info("indexing started...")

# Process each endpoint in parallel
indexing_tasks = [
self._index_endpoint(title_docs, "/bulk-index-titles", "title"),
self._index_endpoint(body_docs, "/bulk-index-bodies", "body"),
self._index_endpoint(author_docs_list, "/bulk-index-authors", "author")
self._index_endpoint(author_docs_list, "/bulk-index-authors", "author"),
]

await asyncio.gather(*indexing_tasks)
@@ -512,19 +547,27 @@ class SearchService:
logger.info(f"Indexing {len(documents)} {doc_type} documents")

# Categorize documents by size
small_docs, medium_docs, large_docs = self._categorize_by_size(documents, doc_type)
small_docs, medium_docs, large_docs = self._categorize_by_size(
documents, doc_type
)

# Process each category with appropriate batch sizes
batch_sizes = {
"small": min(MAX_BATCH_SIZE, 15),
"medium": min(MAX_BATCH_SIZE, 10),
"large": min(MAX_BATCH_SIZE, 3)
"large": min(MAX_BATCH_SIZE, 3),
}

for category, docs in [("small", small_docs), ("medium", medium_docs), ("large", large_docs)]:
for category, docs in [
("small", small_docs),
("medium", medium_docs),
("large", large_docs),
]:
if docs:
batch_size = batch_sizes[category]
await self._process_batches(docs, batch_size, endpoint, f"{doc_type}-{category}")
await self._process_batches(
docs, batch_size, endpoint, f"{doc_type}-{category}"
)

def _categorize_by_size(self, documents, doc_type):
"""Categorize documents by size for optimized batch processing"""
@@ -551,13 +594,15 @@ class SearchService:
else:
small_docs.append(doc)

logger.info(f"{doc_type.capitalize()} documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large")
logger.info(
f"{doc_type.capitalize()} documents categorized: {len(small_docs)} small, {len(medium_docs)} medium, {len(large_docs)} large"
)
return small_docs, medium_docs, large_docs

async def _process_batches(self, documents, batch_size, endpoint, batch_prefix):
"""Process document batches with retry logic"""
for i in range(0, len(documents), batch_size):
batch = documents[i:i+batch_size]
batch = documents[i : i + batch_size]
batch_id = f"{batch_prefix}-{i//batch_size + 1}"

retry_count = 0
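_process_batches walks the document list in fixed-size steps, so the last batch may be smaller than batch_size, and the batch id encodes the 1-based batch number. A quick sketch of that slicing (hypothetical data and prefix, not from the diff):

documents = list(range(7))   # hypothetical documents
batch_size = 3
batch_prefix = "body-small"  # hypothetical prefix

for i in range(0, len(documents), batch_size):
    batch = documents[i : i + batch_size]
    batch_id = f"{batch_prefix}-{i//batch_size + 1}"
    print(batch_id, batch)
# body-small-1 [0, 1, 2]
# body-small-2 [3, 4, 5]
# body-small-3 [6]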
@@ -567,14 +612,14 @@ class SearchService:
while not success and retry_count < max_retries:
try:
response = await self.index_client.post(
endpoint,
json=batch,
timeout=90.0
endpoint, json=batch, timeout=90.0
)

if response.status_code == 422:
error_detail = response.json()
logger.error(f"Validation error from search service for batch {batch_id}: {self._truncate_error_detail(error_detail)}")
logger.error(
f"Validation error from search service for batch {batch_id}: {self._truncate_error_detail(error_detail)}"
)
break

response.raise_for_status()
@@ -585,30 +630,64 @@ class SearchService:
if retry_count >= max_retries:
if len(batch) > 1:
mid = len(batch) // 2
await self._process_batches(batch[:mid], batch_size // 2, endpoint, f"{batch_prefix}-{i//batch_size}-A")
await self._process_batches(batch[mid:], batch_size // 2, endpoint, f"{batch_prefix}-{i//batch_size}-B")
await self._process_batches(
batch[:mid],
batch_size // 2,
endpoint,
f"{batch_prefix}-{i//batch_size}-A",
)
await self._process_batches(
batch[mid:],
batch_size // 2,
endpoint,
f"{batch_prefix}-{i//batch_size}-B",
)
else:
logger.error(f"Failed to index single document in batch {batch_id} after {max_retries} attempts: {str(e)}")
logger.error(
f"Failed to index single document in batch {batch_id} after {max_retries} attempts: {str(e)}"
)
break

wait_time = (2 ** retry_count) + (random.random() * 0.5)
wait_time = (2**retry_count) + (random.random() * 0.5)
await asyncio.sleep(wait_time)

def _truncate_error_detail(self, error_detail):
"""Truncate error details for logging"""
truncated_detail = error_detail.copy() if isinstance(error_detail, dict) else error_detail
truncated_detail = (
error_detail.copy() if isinstance(error_detail, dict) else error_detail
)

if isinstance(truncated_detail, dict) and 'detail' in truncated_detail and isinstance(truncated_detail['detail'], list):
for i, item in enumerate(truncated_detail['detail']):
if isinstance(item, dict) and 'input' in item:
if isinstance(item['input'], dict) and any(k in item['input'] for k in ['documents', 'text']):
if 'documents' in item['input'] and isinstance(item['input']['documents'], list):
for j, doc in enumerate(item['input']['documents']):
if 'text' in doc and isinstance(doc['text'], str) and len(doc['text']) > 100:
item['input']['documents'][j]['text'] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]"
if (
isinstance(truncated_detail, dict)
and "detail" in truncated_detail
and isinstance(truncated_detail["detail"], list)
):
for i, item in enumerate(truncated_detail["detail"]):
if isinstance(item, dict) and "input" in item:
if isinstance(item["input"], dict) and any(
k in item["input"] for k in ["documents", "text"]
):
if "documents" in item["input"] and isinstance(
item["input"]["documents"], list
):
for j, doc in enumerate(item["input"]["documents"]):
if (
"text" in doc
and isinstance(doc["text"], str)
and len(doc["text"]) > 100
):
item["input"]["documents"][j][
"text"
] = f"{doc['text'][:100]}... [truncated, total {len(doc['text'])} chars]"

if 'text' in item['input'] and isinstance(item['input']['text'], str) and len(item['input']['text']) > 100:
item['input']['text'] = f"{item['input']['text'][:100]}... [truncated, total {len(item['input']['text'])} chars]"
if (
"text" in item["input"]
and isinstance(item["input"]["text"], str)
and len(item["input"]["text"]) > 100
):
item["input"][
"text"
] = f"{item['input']['text'][:100]}... [truncated, total {len(item['input']['text'])} chars]"

return truncated_detail
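The retry path above backs off exponentially with a little random jitter before the next attempt: 2**retry_count seconds plus up to half a second. A sketch of the resulting schedule (assuming retry_count steps through 1, 2, 3 across attempts, which is not shown in this hunk):

import random

for retry_count in (1, 2, 3):  # assumed progression of the counter
    wait_time = (2**retry_count) + (random.random() * 0.5)
    print(f"retry {retry_count}: sleep ~{wait_time:.2f}s")
# roughly 2s, 4s, 8s, each with up to 0.5s of jitter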
@@ -644,7 +723,7 @@ class SearchService:
response = await self.client.post(
"/search-combined",
json={"text": text, "limit": search_limit, "offset": search_offset}
json={"text": text, "limit": search_limit, "offset": search_offset},
)
response.raise_for_status()
@@ -663,7 +742,11 @@ class SearchService:
if SEARCH_MIN_SCORE > 0:
initial_count = len(formatted_results)
formatted_results = [r for r in formatted_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
formatted_results = [
r
for r in formatted_results
if r.get("score", 0) >= SEARCH_MIN_SCORE
]

if SEARCH_CACHE_ENABLED:
await self.cache.store(text, formatted_results)
@@ -689,10 +772,11 @@ class SearchService:
return await self.cache.get(cache_key, limit, offset)

try:
logger.info(f"Searching authors for: '{text}' (limit={limit}, offset={offset})")
logger.info(
f"Searching authors for: '{text}' (limit={limit}, offset={offset})"
)
response = await self.client.post(
"/search-author",
json={"text": text, "limit": limit + offset}
"/search-author", json={"text": text, "limit": limit + offset}
)
response.raise_for_status()
@@ -701,14 +785,16 @@ class SearchService:
# Apply score filtering if needed
if SEARCH_MIN_SCORE > 0:
author_results = [r for r in author_results if r.get("score", 0) >= SEARCH_MIN_SCORE]
author_results = [
r for r in author_results if r.get("score", 0) >= SEARCH_MIN_SCORE
]

# Store in cache if enabled
if SEARCH_CACHE_ENABLED:
await self.cache.store(cache_key, author_results)

# Apply offset/limit
return author_results[offset:offset+limit]
return author_results[offset : offset + limit]

except Exception as e:
logger.error(f"Error searching authors for '{text}': {e}")
@@ -725,7 +811,9 @@ class SearchService:
result = response.json()

if result.get("consistency", {}).get("status") != "ok":
null_count = result.get("consistency", {}).get("null_embeddings_count", 0)
null_count = result.get("consistency", {}).get(
"null_embeddings_count", 0
)
if null_count > 0:
logger.warning(f"Found {null_count} documents with NULL embeddings")
@@ -734,23 +822,27 @@ class SearchService:
logger.error(f"Failed to check index status: {e}")
return {"status": "error", "message": str(e)}

# Create the search service singleton
search_service = SearchService()

# API-compatible function to perform a search

async def search_text(text: str, limit: int = 50, offset: int = 0):
payload = []
if search_service.available:
payload = await search_service.search(text, limit, offset)
return payload

async def search_author_text(text: str, limit: int = 10, offset: int = 0):
"""Search authors API helper function"""
if search_service.available:
return await search_service.search_authors(text, limit, offset)
return []

async def get_search_count(text: str):
"""Get count of title search results"""
if not search_service.available:
@@ -764,6 +856,7 @@ async def get_search_count(text: str):
# If not found in cache, fetch from endpoint
return len(await search_text(text, SEARCH_PREFETCH_SIZE, 0))

async def get_author_search_count(text: str):
"""Get count of author search results"""
if not search_service.available:
@@ -777,6 +870,7 @@ async def get_author_search_count(text: str):
# If not found in cache, fetch from endpoint
return len(await search_author_text(text, SEARCH_PREFETCH_SIZE, 0))

async def initialize_search_index(shouts_data):
"""Initialize search index with existing data during application startup"""
if not SEARCH_ENABLED:
@@ -794,29 +888,39 @@ async def initialize_search_index(shouts_data):
index_status = await search_service.check_index_status()
if index_status.get("status") == "inconsistent":
problem_ids = index_status.get("consistency", {}).get("null_embeddings_sample", [])
problem_ids = index_status.get("consistency", {}).get(
"null_embeddings_sample", []
)

if problem_ids:
problem_docs = [shout for shout in shouts_data if str(shout.id) in problem_ids]
problem_docs = [
shout for shout in shouts_data if str(shout.id) in problem_ids
]
if problem_docs:
await search_service.bulk_index(problem_docs)

# Only consider shouts with body content for body verification
def has_body_content(shout):
for field in ['subtitle', 'lead', 'body']:
if getattr(shout, field, None) and isinstance(getattr(shout, field, None), str) and getattr(shout, field).strip():
for field in ["subtitle", "lead", "body"]:
if (
getattr(shout, field, None)
and isinstance(getattr(shout, field, None), str)
and getattr(shout, field).strip()
):
return True
media = getattr(shout, 'media', None)
media = getattr(shout, "media", None)
if media:
if isinstance(media, str):
try:
media_json = json.loads(media)
if isinstance(media_json, dict) and (media_json.get('title') or media_json.get('body')):
if isinstance(media_json, dict) and (
media_json.get("title") or media_json.get("body")
):
return True
except Exception:
return True
elif isinstance(media, dict):
if media.get('title') or media.get('body'):
if media.get("title") or media.get("body"):
return True
return False
@@ -829,9 +933,13 @@ async def initialize_search_index(shouts_data):
if verification.get("status") == "error":
return
# Only reindex missing docs that actually have body content
missing_ids = [mid for mid in verification.get("missing", []) if mid in body_ids]
missing_ids = [
mid for mid in verification.get("missing", []) if mid in body_ids
]
if missing_ids:
missing_docs = [shout for shout in shouts_with_body if str(shout.id) in missing_ids]
missing_docs = [
shout for shout in shouts_with_body if str(shout.id) in missing_ids
]
await search_service.bulk_index(missing_docs)
else:
pass
@@ -846,7 +954,7 @@ async def initialize_search_index(shouts_data):
for result in test_results:
result_id = result.get("id")
matching_shouts = [s for s in shouts_data if str(s.id) == result_id]
if matching_shouts and hasattr(matching_shouts[0], 'category'):
categories.add(getattr(matching_shouts[0], 'category', 'unknown'))
if matching_shouts and hasattr(matching_shouts[0], "category"):
categories.add(getattr(matching_shouts[0], "category", "unknown"))
except Exception as e:
pass