From bcbfdd76e9c74bde192e35171d309055b1b15fef Mon Sep 17 00:00:00 2001 From: Untone Date: Sun, 27 Apr 2025 12:53:49 +0300 Subject: [PATCH] html wrap fix --- resolvers/draft.py | 11 ++++++++--- resolvers/editor.py | 9 ++++++--- utils/__init__.py | 1 + utils/html_wrapper.py | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 utils/__init__.py create mode 100644 utils/html_wrapper.py diff --git a/resolvers/draft.py b/resolvers/draft.py index 9a4d9fe4..57df9330 100644 --- a/resolvers/draft.py +++ b/resolvers/draft.py @@ -21,6 +21,7 @@ from services.db import local_session from services.notify import notify_shout from services.schema import mutation, query from services.search import search_service +from utils.html_wrapper import wrap_html_fragment from utils.logger import root_logger as logger def create_shout_from_draft(session, draft, author_id): @@ -183,7 +184,8 @@ async def create_draft(_, info, draft_input): return {"error": f"Failed to create draft: {str(e)}"} def generate_teaser(body, limit=300): - body_text = trafilatura.extract(body, include_comments=False, include_tables=False) + body_html = wrap_html_fragment(body) + body_text = trafilatura.extract(body_html, include_comments=False, include_tables=False) body_teaser = ". ".join(body_text[:limit].split(". ")[:-1]) return body_teaser @@ -270,10 +272,12 @@ async def update_draft(_, info, draft_id: int, draft_input): if "seo" not in filtered_input and not draft.seo: body_src = filtered_input.get("body", draft.body) lead_src = filtered_input.get("lead", draft.lead) + body_html = wrap_html_fragment(body_src) + lead_html = wrap_html_fragment(lead_src) try: - body_text = trafilatura.extract(body_src, include_comments=False, include_tables=False) if body_src else None - lead_text = trafilatura.extract(lead_src, include_comments=False, include_tables=False) if lead_src else None + body_text = trafilatura.extract(body_html, include_comments=False, include_tables=False) if body_src else None + lead_text = trafilatura.extract(lead_html, include_comments=False, include_tables=False) if lead_src else None body_teaser = generate_teaser(body_text, 300) if body_text else "" filtered_input["seo"] = lead_text if lead_text else body_teaser @@ -347,6 +351,7 @@ def validate_html_content(html_content: str) -> tuple[bool, str]: return False, "Content is empty" try: + html_content = wrap_html_fragment(html_content) extracted = trafilatura.extract(html_content) if not extracted: return False, "Invalid HTML structure or empty content" diff --git a/resolvers/editor.py b/resolvers/editor.py index 8bf65817..c9cd969c 100644 --- a/resolvers/editor.py +++ b/resolvers/editor.py @@ -23,6 +23,7 @@ from services.db import local_session from services.notify import notify_shout from services.schema import mutation, query from services.search import search_service +from utils.html_wrapper import wrap_html_fragment from utils.logger import root_logger as logger @@ -180,9 +181,11 @@ async def create_shout(_, info, inp): # Создаем публикацию без topics body = inp.get("body", "") lead = inp.get("lead", "") - body_text = trafilatura.extract(body) - lead_text = trafilatura.extract(lead) - seo = inp.get("seo", lead_text or body_text[:300].split(". ")[:-1].join(". ")) + body_html = wrap_html_fragment(body) + lead_html = wrap_html_fragment(lead) + body_text = trafilatura.extract(body_html) + lead_text = trafilatura.extract(lead_html) + seo = inp.get("seo", lead_text.strip() or body_text.strip()[:300].split(". ")[:-1].join(". ")) new_shout = Shout( slug=slug, body=body, diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..0519ecba --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/utils/html_wrapper.py b/utils/html_wrapper.py new file mode 100644 index 00000000..fb9e4ba4 --- /dev/null +++ b/utils/html_wrapper.py @@ -0,0 +1,38 @@ +""" +Модуль для обработки HTML-фрагментов +""" + +def wrap_html_fragment(fragment: str) -> str: + """ + Оборачивает HTML-фрагмент в полную HTML-структуру для корректной обработки. + + Args: + fragment: HTML-фрагмент для обработки + + Returns: + str: Полный HTML-документ + + Example: + >>> wrap_html_fragment("

Текст параграфа

") + '

Текст параграфа

' + """ + if not fragment or not fragment.strip(): + return fragment + + # Проверяем, является ли контент полным HTML-документом + is_full_html = fragment.strip().startswith(' + + + + + + +{fragment} + +""" + + return fragment \ No newline at end of file