core/services/search.py

162 lines
6.2 KiB
Python
Raw Normal View History

2024-02-29 11:04:24 +00:00
import asyncio
2022-11-17 19:53:58 +00:00
import json
2024-01-29 00:27:30 +00:00
import os
2023-12-17 20:30:20 +00:00
2024-01-29 02:00:54 +00:00
from opensearchpy import OpenSearch
2023-12-17 20:30:20 +00:00
2024-03-06 18:57:04 +00:00
from services.encoders import CustomJSONEncoder
2024-02-20 16:19:46 +00:00
from services.logger import root_logger as logger
2024-01-29 01:09:54 +00:00
from services.rediscache import redis
2022-10-04 00:32:29 +00:00
2024-02-21 16:14:58 +00:00
# Connection settings for the OpenSearch backend, read from the environment.
# Any 'https://' prefix is stripped from the host because the client takes
# a bare hostname and handles TLS via use_ssl.
ELASTIC_HOST = os.environ.get('ELASTIC_HOST', '').replace('https://', '')
ELASTIC_USER = os.environ.get('ELASTIC_USER', '')
ELASTIC_PASSWORD = os.environ.get('ELASTIC_PASSWORD', '')
# FIX: coerce to int — os.environ values are strings, so with ELASTIC_PORT set
# this used to be 'str' while the unset default was 'int'; the client expects
# a numeric port either way.
ELASTIC_PORT = int(os.environ.get('ELASTIC_PORT', 9200))
ELASTIC_AUTH = f'{ELASTIC_USER}:{ELASTIC_PASSWORD}' if ELASTIC_USER else ''
ELASTIC_URL = os.environ.get(
    'ELASTIC_URL', f'https://{ELASTIC_AUTH}@{ELASTIC_HOST}:{ELASTIC_PORT}'
)
REDIS_TTL = 86400  # 1 day in seconds
2024-01-29 00:27:30 +00:00
2024-01-29 08:09:10 +00:00
# Index configuration: a single shard with replicas auto-expanded to every
# node, plus a Russian-language analyzer 'ru' (standard tokenizer ->
# lowercase -> built-in Russian stopwords -> Russian stemmer).
index_settings = {
    'settings': {
        'index': {'number_of_shards': 1, 'auto_expand_replicas': '0-all'},
        'analysis': {
            'analyzer': {
                'ru': {
                    'tokenizer': 'standard',
                    'filter': ['lowercase', 'ru_stop', 'ru_stemmer'],
                }
            },
            'filter': {
                'ru_stemmer': {'type': 'stemmer', 'language': 'russian'},
                'ru_stop': {'type': 'stop', 'stopwords': '_russian_'},
            },
        },
    },
    # All searchable text fields go through the 'ru' analyzer above.
    'mappings': {
        'properties': {
            'body': {'type': 'text', 'analyzer': 'ru'},
            'title': {'type': 'text', 'analyzer': 'ru'},
            'subtitle': {'type': 'text', 'analyzer': 'ru'},
            'lead': {'type': 'text', 'analyzer': 'ru'},
            # 'author': {'type': 'text'},
        }
    },
}

# The mapping the live index is expected to carry; SearchService compares the
# server-reported mapping against this to decide whether to reindex.
expected_mapping = index_settings['mappings']
2024-01-29 08:09:10 +00:00
2024-02-29 11:09:50 +00:00
# Module-level event loop used by SearchService.__init__ to schedule the
# initial index check as a background task.
# NOTE(review): asyncio.get_event_loop() is deprecated outside a running loop
# (Python 3.10+) — confirm this module is imported before the app's loop
# starts, or obtain the loop explicitly at startup instead.
search_loop = asyncio.get_event_loop()
2024-01-29 01:09:54 +00:00
class SearchService:
    """Thin wrapper over an OpenSearch index with a Redis-cached search API.

    The client stays None (and every method degrades to a no-op) unless
    ELASTIC_HOST is configured; creation failures also leave it None.
    """

    def __init__(self, index_name='search_index'):
        self.index_name = index_name
        self.client = None
        self.lock = asyncio.Lock()  # serializes index recreation
        # Only initialize the client if the backend host is configured
        if ELASTIC_HOST:
            try:
                self.client = OpenSearch(
                    hosts=[{'host': ELASTIC_HOST, 'port': ELASTIC_PORT}],
                    http_compress=True,
                    http_auth=(ELASTIC_USER, ELASTIC_PASSWORD),
                    use_ssl=True,
                    verify_certs=False,
                    ssl_assert_hostname=False,
                    ssl_show_warn=False,
                    # ca_certs = ca_certs_path
                )
                logger.info(' Клиент OpenSearch.org подключен')
                # Schedule the index check on the module-level event loop
                search_loop.create_task(self.check_index())
            except Exception as exc:
                logger.error(f' {exc}')
                self.client = None

    def info(self):
        """Log whether the search backend is connected."""
        if isinstance(self.client, OpenSearch):
            logger.info(' Поиск подключен')  # : {self.client.info()}')
        else:
            logger.info(' * Задайте переменные среды для подключения к серверу поиска')

    def delete_index(self):
        """Drop the index; a missing index is tolerated (ignore_unavailable)."""
        if self.client:
            logger.debug(f' Удаляем индекс {self.index_name}')
            self.client.indices.delete(index=self.index_name, ignore_unavailable=True)

    def create_index(self):
        """Delete any existing index and create it fresh with index_settings."""
        if self.client:
            logger.debug(f'Создается индекс: {self.index_name}')
            self.delete_index()
            self.client.indices.create(index=self.index_name, body=index_settings)
            logger.debug(f'Индекс {self.index_name} создан')

    async def check_index(self):
        """Ensure the index exists and its mapping matches expected_mapping."""
        if self.client:
            logger.debug(f' Проверяем индекс {self.index_name}...')
            if not self.client.indices.exists(index=self.index_name):
                self.create_index()
                self.client.indices.put_mapping(
                    index=self.index_name, body=expected_mapping
                )
            else:
                logger.info(f'найден существующий индекс {self.index_name}')
                # Check if the mapping is correct, and recreate the index if needed
                result = self.client.indices.get_mapping(index=self.index_name)
                if isinstance(result, str):
                    result = json.loads(result)
                if isinstance(result, dict):
                    # FIX: get_mapping() returns {index_name: {'mappings': {...}}};
                    # the old lookup used a non-existent top-level 'mapping' key,
                    # so the drift check below could never fire.
                    mapping = result.get(self.index_name, {}).get('mappings')
                    if mapping and mapping != expected_mapping:
                        logger.debug(f' найдена структура индексации: {mapping}')
                        # FIX: logger.warn is a deprecated alias of warning
                        logger.warning(
                            ' требуется другая структура индексации, переиндексация'
                        )
                        await self.recreate_index()

    async def recreate_index(self):
        """Drop the index under the lock, then rebuild it via check_index."""
        if self.client:
            async with self.lock:
                self.client.indices.delete(index=self.index_name, ignore_unavailable=True)
                await self.check_index()

    def index(self, shout):
        """Index a single document, keyed by its id.

        FIX: the old code passed the *synchronous* client call's return value
        (a dict) to asyncio.create_task(), which raises TypeError because it
        requires a coroutine. The call already ran to completion by then, so
        invoking it directly preserves the intended effect without the crash.
        """
        if self.client:
            id_ = str(shout.id)
            logger.debug(f' Индексируем пост {id_}')
            self.client.index(index=self.index_name, id=id_, body=shout.dict())

    async def search(self, text, limit, offset):
        """Full-text match query with pagination; caches results in Redis.

        Returns a list of hit sources annotated with their scores, or []
        when the backend is not configured.
        """
        logger.debug(f' Ищем: {text}')
        search_body = {'query': {'match': {'_all': text}}}
        if self.client:
            search_response = self.client.search(
                index=self.index_name, body=search_body, size=limit, from_=offset
            )
            hits = search_response['hits']['hits']
            results = [{**hit['_source'], 'score': hit['_score']} for hit in hits]
            # Use Redis as cache with TTL
            redis_key = f'search:{text}'
            await redis.execute('SETEX', redis_key, REDIS_TTL, json.dumps(results, cls=CustomJSONEncoder))
            # FIX: return the computed results — the old code always returned []
            return results
        return []
2024-01-29 00:27:30 +00:00
2024-02-29 11:09:50 +00:00
2024-01-29 03:42:02 +00:00
# Module-level singleton; constructed at import time (connects only if
# ELASTIC_HOST is set in the environment).
search_service = SearchService()
2024-01-29 01:41:46 +00:00
2024-02-29 11:09:50 +00:00
2024-01-29 01:41:46 +00:00
async def search_text(text: str, limit: int = 50, offset: int = 0):
    """Run a full-text search through the module singleton.

    Returns the list of matching documents, or an empty list when the
    search backend is not connected.
    """
    # Guard clause: without a live client there is nothing to query.
    if not search_service.client:
        return []
    return await search_service.search(text, limit, offset)