diff --git a/migration/__init__.py b/migration/__init__.py index 2d195e06..4a25931d 100644 --- a/migration/__init__.py +++ b/migration/__init__.py @@ -314,9 +314,6 @@ async def handle_auto(): async def main(): if len(sys.argv) > 1: - cmd = sys.argv[1] - if type(cmd) == str: - print("[migration] command: " + cmd) init_tables() await handle_auto() else: diff --git a/migration/export.py b/migration/export.py index a9340dc5..102cfb14 100644 --- a/migration/export.py +++ b/migration/export.py @@ -4,7 +4,7 @@ from datetime import datetime, timezone import frontmatter -from .extract import extract_html, prepare_html_body, extract_media +from .extract import extract_html, extract_media from .utils import DateTimeEncoder OLD_DATE = "2016-03-05 22:22:00.350000" @@ -50,11 +50,12 @@ def export_mdx(r): def export_body(shout, storage): entry = storage["content_items"]["by_oid"][shout["oid"]] if entry: - shout["body"] = prepare_html_body(entry) # prepare_md_body(entry) - shout["media"] = extract_media(entry) + body = extract_html(entry) + media = extract_media(entry) + shout["body"] = body # prepare_html_body(entry) # prepare_md_body(entry) + shout["media"] = media export_mdx(shout) print("[export] html for %s" % shout["slug"]) - body = extract_html(entry) open(contentDir + shout["slug"] + ".html", "w").write(body) else: raise Exception("no content_items entry found") diff --git a/migration/extract.py b/migration/extract.py index 62199dcf..ccadb7e2 100644 --- a/migration/extract.py +++ b/migration/extract.py @@ -3,6 +3,9 @@ import os import re import uuid +from bs4 import BeautifulSoup + + TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)" contentDir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content" @@ -343,59 +346,7 @@ def prepare_html_body(entry): def extract_html(entry): body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')') - media = entry.get("media", []) - kind = entry.get("type") or "" - print("[extract] kind: " + kind) - mbodies = set([]) - if media: - # print('[extract] media is found') - for m in media: - mbody = m.get("body", "") - addon = "" - if kind == "Literature": - mbody = m.get("literatureBody") or m.get("body", "") - elif kind == "Image": - cover = "" - if "thumborId" in entry: - cover = cdn + "/unsafe/1600x/" + entry["thumborId"] - if not cover: - if "image" in entry: - cover = entry["image"].get("url", "") - if "cloudinary" in cover: - cover = "" - # else: print('[extract] cover: ' + cover) - title = m.get("title", "").replace("\n", " ").replace(" ", " ") - u = m.get("thumborId") or cover or "" - if title: - addon += "

" + title + "

\n" - if not u.startswith("http"): - u = s3 + u - if not u: - print("[extract] no image url for " + str(m)) - if "cloudinary" in u: - u = "img/lost.svg" - if u != cover or (u == cover and media.index(m) == 0): - addon += '' + title + '\n' - if addon: - body_orig += addon - # print('[extract] item addon: ' + addon) - # if addon: print('[extract] addon: %s' % addon) - if mbody and mbody not in mbodies: - mbodies.add(mbody) - body_orig += mbody - if len(list(mbodies)) != len(media): - print( - "[extract] %d/%d media item bodies appended" - % (len(list(mbodies)), len(media)) - ) - # print('[extract] media items body: \n' + body_orig) - if not body_orig: - for up in entry.get("bodyHistory", []) or []: - body_orig = up.get("text", "") or "" - if body_orig: - print("[extract] got html body from history") - break if not body_orig: print("[extract] empty HTML body") - # body_html = str(BeautifulSoup(body_orig, features="html.parser")) - return body_orig + body_html = str(BeautifulSoup(body_orig, features="html.parser")) + return body_html diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py index af5f99d5..1c33ea51 100644 --- a/migration/tables/content_items.py +++ b/migration/tables/content_items.py @@ -4,7 +4,7 @@ from dateutil.parser import parse as date_parse from sqlalchemy.exc import IntegrityError from transliterate import translit from base.orm import local_session -from migration.extract import prepare_html_body +from migration.extract import extract_html, extract_media from orm.reaction import Reaction, ReactionKind from orm.shout import Shout, ShoutTopic, ShoutReactionsFollower from orm.user import User @@ -195,7 +195,8 @@ async def migrate(entry, storage): entry["cover"] = r["cover"] # body - r["body"], media = prepare_html_body(entry) + r["body"] = extract_html(entry) + media = extract_media(entry) if media: r["media"] = json.dumps(media, ensure_ascii=True) # save shout to db diff --git a/migration/tables/topics.py b/migration/tables/topics.py index 4b563716..15fcf245 100644 --- a/migration/tables/topics.py +++ b/migration/tables/topics.py @@ -1,5 +1,6 @@ from base.orm import local_session -from migration.extract import extract_md, html2text +from migration.extract import extract_md +from migration.html2text import html2text from orm import Topic