diff --git a/migration/export.py b/migration/export.py index 988ab35d..a9340dc5 100644 --- a/migration/export.py +++ b/migration/export.py @@ -4,7 +4,7 @@ from datetime import datetime, timezone import frontmatter -from .extract import extract_html, prepare_html_body +from .extract import extract_html, prepare_html_body, extract_media from .utils import DateTimeEncoder OLD_DATE = "2016-03-05 22:22:00.350000" @@ -50,11 +50,11 @@ def export_mdx(r): def export_body(shout, storage): entry = storage["content_items"]["by_oid"][shout["oid"]] if entry: - shout["body"], media = prepare_html_body(entry) # prepare_md_body(entry) - shout["media"] = media + shout["body"] = prepare_html_body(entry) # prepare_md_body(entry) + shout["media"] = extract_media(entry) export_mdx(shout) print("[export] html for %s" % shout["slug"]) - body, _media = extract_html(entry) + body = extract_html(entry) open(contentDir + shout["slug"] + ".html", "w").write(body) else: raise Exception("no content_items entry found") diff --git a/migration/extract.py b/migration/extract.py index 4ea44d04..62199dcf 100644 --- a/migration/extract.py +++ b/migration/extract.py @@ -3,8 +3,6 @@ import os import re import uuid -from .html2text import html2text - TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)" contentDir = os.path.join( os.path.dirname(os.path.realpath(__file__)), "..", "..", "discoursio-web", "content" @@ -258,47 +256,44 @@ def extract_md(body, oid=""): return newbody -def prepare_md_body(entry): - # body modifications - body = "" - kind = entry.get("type") - addon = "" - if kind == "Video": - addon = "" - for m in entry.get("media", []): - if "youtubeId" in m: - addon += "\n" +def extract_media(entry): + ''' normalized media extraction method ''' + # media [ { title pic url body } ]} + kind = entry.get("layout") + media = [] + for m in entry.get("media", []): + # title + title = m.get("title", "").replace("\n", " ").replace(" ", " ") + artist = m.get("performer") or m.get("artist") + if artist: + title = 
artist + " - " + title + + # pic + url = m.get("fileUrl") or m.get("url", "") + pic = "" + if "thumborId" in m: + pic = cdn + "/unsafe/1600x/" + m["thumborId"] + + # url + if not url: + if kind == "Image": + url = pic + elif "youtubeId" in m: + url = "https://youtube.com/watch?v=" + m["youtubeId"] elif "vimeoId" in m: - addon += "\n" + url = "https://vimeo.com/" + m["vimeoId"] else: print("[extract] media is not supported") - print(m) - body = "import VideoPlayer from '$/components/Article/VideoPlayer'\n\n" + addon + # body + body = m.get("body") or m.get("literatureBody") - elif kind == "Music": - addon = "" - for m in entry.get("media", []): - artist = m.get("performer") - trackname = "" - if artist: - trackname += artist + " - " - if "title" in m: - trackname += m.get("title", "") - addon += ( - '\n' - ) - body = "import AudioPlayer from '$/components/Article/AudioPlayer'\n\n" + addon - - body_orig, media = extract_html(entry) - if body_orig: - body += extract_md(html2text(body_orig), entry["_id"]) - if not body: - print("[extract] empty MDX body") - return body, media + media.append({ + "url": url, + "pic": pic, + "title": title, + "body": body + }) + return media def prepare_html_body(entry): @@ -339,11 +334,11 @@ def prepare_html_body(entry): addon += '">' body += addon - body, media = extract_html(entry) + body = extract_html(entry) # if body_orig: body += extract_md(html2text(body_orig), entry['_id']) if not body: print("[extract] empty HTML body") - return body, media + return body def extract_html(entry): @@ -403,4 +398,4 @@ def extract_html(entry): if not body_orig: print("[extract] empty HTML body") # body_html = str(BeautifulSoup(body_orig, features="html.parser")) - return body_orig, media + return body_orig diff --git a/schema.graphql b/schema.graphql index e08342ec..4b5987c3 100644 --- a/schema.graphql +++ b/schema.graphql @@ -439,7 +439,7 @@ type Shout { deletedBy: User publishedBy: User publishedAt: DateTime - media: String + media: String # json [ 
{ title pic url body }, .. ] stat: Stat }