diff --git a/migration/extract.py b/migration/extract.py index 965d4acc..90d3a5df 100644 --- a/migration/extract.py +++ b/migration/extract.py @@ -1,8 +1,14 @@ +import json import re import base64 +from migration.html2text import html2text + TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)' +s3 = 'https://discours-io.s3.amazonaws.com/' +cdn = 'https://assets.discours.io' +retopics = json.loads(open('migration/tables/replacements.json', 'r').read()) def replace_tooltips(body): newbody = body @@ -29,9 +35,9 @@ def place_tooltips(body): fn = 'a class="footnote-url" href="' link = part.split(fn,1)[1].split('"', 1)[0] extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1] - newparts[i] = '' + newparts[i] = '' + extracted_part + '' else: - newparts[i] = '' % part + newparts[i] = '%s' % part # print('[extract] tooltip: ' + newparts[i]) else: # print('[extract] pass: ' + part[:10] + '..') @@ -42,7 +48,6 @@ def place_tooltips(body): IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)" public = '../discoursio-web/public' -cdn = 'https://assets.discours.io' cache = {} @@ -81,7 +86,7 @@ def extract_images(body, oid): body = body.replace(' [](data:image', '![](data:image').replace('\n[](data:image', '![](data:image') oldparts = body.split(sep) newparts = list(oldparts) - print() + # print() if len(oldparts) > 1: print('[extract] images for %s' % oid) print('[extract] %d candidates' % (len(oldparts)-1)) @@ -95,15 +100,12 @@ def extract_images(body, oid): if end: continue else: # start or between - # print('[extract_images] have next') for mime in IMAGES.keys(): if mime in current[-15:]: - # print('[extract_images] found proper mime type') print('[extract] ' + current[-15:]) if ')' in next: b64encoded = next.split(')')[0] print('[extract] '+str(i+1)+': %d bytes' % len(b64encoded)) - # print(meta) ext = IMAGES[mime] print('[extract] type: ' + mime) name = oid + '-' + str(i) @@ -122,8 +124,8 @@ def extract_images(body, oid): newparts[i] = current.split('![](' + mime)[0] + '![](' + link + ')' newparts[i+1] = next.replace(b64encoded + ')', '') else: - print('[extract] not b64encoded') - print(current[-15:]) + print('[extract] ERROR: no b64encoded') + # print(current[-15:]) i += 1 newbody = ''.join(newparts) return newbody @@ -146,9 +148,91 @@ def cleanup(body): # .replace('\u2212', '-') return newbody - def extract(body, oid): newbody = extract_images(body, oid) newbody = cleanup(newbody) newbody = place_tooltips(newbody) - return newbody \ No newline at end of file + return newbody + +def prepare_body(entry): + # body modifications + body = '' + body_orig = entry.get('body', '') + if not body_orig: body_orig = '' + + if entry.get('type') == 'Literature': + for m in entry.get('media', []): + t = m.get('title', '') + if t: body_orig += '
' + t + '
\n' + body_orig += (m.get('body', '') or '') + body_orig += '\n' + m.get('literatureBody', '') + '\n' + + elif entry.get('type') == 'Video': + providers = set([]) + video_url = '' + require = False + for m in entry.get('media', []): + yt = m.get('youtubeId', '') + vm = m.get('vimeoId', '') + if yt: + require = True + providers.add('YouTube') + video_url = 'https://www.youtube.com/watch?v=' + yt + body += '\n' + if vm: + require = True + providers.add('Vimeo') + video_url = 'https://vimeo.com/' + vm + body += '\n' + body += extract(html2text(m.get('body', '')), entry['_id']) + if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!')) + if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n' + + elif entry.get('type') == 'Music': + for m in entry.get('media', []): + artist = m.get('performer') + trackname = '' + if artist: trackname += artist + ' - ' + if 'title' in m: trackname += m.get('title','') + body += '\n' + body += extract(html2text(m.get('body', '')), entry['_id']) + body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n' + + elif entry.get('type') == 'Image': + cover = '' + if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId'] + if not cover and 'image' in entry: + cover = entry['image'].get('url', '') + if 'cloudinary' in cover: cover = '' + images = {} + for m in entry.get('media', []): + t = m.get('title', '') + if t: body += '#### ' + t + '\n' + u = m.get('image', {}).get('url', '') + if 'cloudinary' in u: + u = m.get('thumborId') + if not u: u = cover + u = str(u) + if u not in images.keys(): + if u.startswith('production'): u = s3 + u + body += '![' + m.get('title','').replace('\n', ' ') + '](' + u + ')\n' # TODO: gallery here + images[u] = u + body += extract(html2text(m.get('body', '')), entry['_id']) + '\n' + + if not body_orig: + print('[prepare] using body history...') + # print(entry.get('bodyHistory', '')) + try: + for up in entry.get('bodyHistory', []): + body_orig = up.get('text', '') or '' + if body_orig: break + except: pass + + # body_html = str(BeautifulSoup(body_orig, features="html.parser")) + body += extract(html2text(body_orig), entry['_id']) + + # replace some topics + for oldtopicslug, newtopicslug in retopics.items(): + body.replace(oldtopicslug, newtopicslug) + + return body diff --git a/migration/tables/content_item_categories.py b/migration/tables/content_item_categories.py index 310fc231..3009d522 100644 --- a/migration/tables/content_item_categories.py +++ b/migration/tables/content_item_categories.py @@ -35,7 +35,7 @@ def migrate(entry, topics_by_oid): if not topic: del topic_dict['oid'] topic = Topic.create(**topic_dict) - print('created') + # print('created') else: if len(topic.title) > len(topic_dict['title']) or \ len(topic.body) < len(topic_dict['body']): diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py index a529d02f..be2d0dc5 100644 --- a/migration/tables/content_items.py +++ b/migration/tables/content_items.py @@ -1,15 +1,12 @@ from dateutil.parser import parse as date_parse import frontmatter import json -import sqlalchemy from orm import Shout, ShoutTopic, ShoutRating, ShoutViewByDay, User, shout -# from bs4 import BeautifulSoup -from migration.html2text import html2text from transliterate import translit from datetime import datetime from orm.base import local_session from orm.community import Community -from migration.extract import extract +from migration.extract import prepare_body import os DISCOURS_USER = { @@ -19,9 +16,9 @@ DISCOURS_USER = { 'userpic': 'https://discours.io/images/logo-mini.svg', 'createdAt': '2016-03-05 22:22:00.350000' } - +OLD_DATE = '2016-03-05 22:22:00.350000' +retopics = json.loads(open('migration/tables/replacements.json').read()) ts = datetime.now() - type2layout = { 'Article': 'article', 'Literature': 'prose', @@ -42,44 +39,17 @@ def get_metadata(r): metadata['cover'] = r.get('cover') return metadata - -retopics = json.loads(open('migration/tables/replacements.json').read()) - def migrate(entry, users_by_oid, topics_by_oid): - ''' - type Shout { - slug: String! - author: Int! - body: String! - createdAt: DateTime! - updatedAt: DateTime! - deletedAt: DateTime - deletedBy: Int - rating: Int - ratings: [Rating] - published: Bool! - publishedAt: DateTime # if there is no published field - it is not published - replyTo: String # another shout - tags: [String] # actual values - topics: [String] # topic-slugs, order has matter - title: String - versionOf: String - visibleForRoles: [String] # role ids are strings - visibleForUsers: [Int] - views: Int - } - ''' - # print(entry) - content = '' + # init, set title and layout r = { 'layout': type2layout[entry['type']], 'title': entry['title'], 'community': Community.default_community.id, 'authors': [], 'topics': [], - 'rating': entry.get('rating', 0), + 'rating': 0, 'ratings': [], - 'createdAt': entry.get('createdAt', '2016-03-05 22:22:00.350000') + 'createdAt': [] } # slug @@ -92,126 +62,33 @@ def migrate(entry, users_by_oid, topics_by_oid): except: raise Exception if s: r['slug'] = s else: raise Exception - - # topics - - category = entry['category'] - mainTopic = topics_by_oid.get(category) - if mainTopic: - r['mainTopic'] = mainTopic["slug"] - topic_oids = [category, ] - topic_errors = [] - topic_oids.extend(entry.get('tags', [])) - for oid in topic_oids: - if oid in topics_by_oid: - r['topics'].append(topics_by_oid[oid]) - else: - # print('ERROR: unknown old topic id: ' + oid) - topic_errors.append(oid) # cover - - if entry.get('image') is not None: - r['cover'] = entry['image']['url'] - if entry.get('thumborId') is not None: - r['cover'] = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId'] - if entry.get('updatedAt') is not None: - r['updatedAt'] = date_parse(entry['updatedAt']) - - # body - - body = '' - body_orig = entry.get('body') - if not body_orig: body_orig = '' - - # body modifications - - if entry.get('type') == 'Literature': - for m in entry.get('media', []): - t = m.get('title', '') - if t: body_orig += '### ' + t + '\n' - body_orig += (m.get('body', '') or '') - body_orig += '\n' + m.get('literatureBody', '') + '\n' - - - elif entry.get('type') == 'Video': - providers = set([]) - video_url = '' - require = False - for m in entry.get('media', []): - yt = m.get('youtubeId', '') - vm = m.get('vimeoId', '') - if yt: - require = True - providers.add('YouTube') - video_url = 'https://www.youtube.com/watch?v=' + yt - body += '\n' - if vm: - require = True - providers.add('Vimeo') - video_url = 'https://vimeo.com/' + vm - body += '\n' - body += extract(html2text(m.get('body', '')), entry['_id']) - if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!')) - if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n' - body += extract(html2text(body_orig), entry['_id']) - - elif entry.get('type') == 'Music': - require = False - for m in entry.get('media', []): - if 'fileUrl' in m: - require = True - artist = m.get('performer') - trackname = '' - if artist: trackname += artist + ' - ' - trackname += m.get('title','') - body += '\n' - body += extract(html2text(m.get('body', '')), entry['_id']) - else: - print(m) - if require: body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n' - body += extract(html2text(body_orig), entry['_id']) - - elif entry.get('type') == 'Image': - cover = r.get('cover') - images = {} - for m in entry.get('media', []): - t = m.get('title', '') - if t: body += '#### ' + t + '\n' - u = m.get('image', {}).get('url', '') - if 'cloudinary' in u: - u = m.get('thumborId') - if not u: u = cover - if u not in images.keys(): - if u.startswith('production'): u = 'https://discours-io.s3.amazonaws.com/' + u - body += '![' + m.get('title','').replace('\n', ' ') + '](' + u + ')\n' # TODO: gallery here - images[u] = u - body += extract(html2text(m.get('body', '')), entry['_id']) + '\n' - body += extract(html2text(body_orig), entry['_id']) - - # simple post or no body stored - if body == '': - if not body_orig: - print('[migration] using body history...') - try: body_orig += entry.get('bodyHistory', [{'body': ''}])[0].get('body', '') - except: pass - # need to extract - # body_html = str(BeautifulSoup(body_orig, features="html.parser")) - body += extract(html2text(body_orig), entry['_id']) + c = '' + if entry.get('thumborId'): + c = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId'] else: - # EVERYTHING IS FINE HERE - pass - - # replace some topics - for oldtopicslug, newtopicslug in retopics.items(): - body.replace(oldtopicslug, newtopicslug) + c = entry.get('image', {}).get('url') + if not c or 'cloudinary' in c: + c = '' + r['cover'] = c - # authors + # timestamps - # get author data - userdata = {} - try: userdata = users_by_oid[entry['createdBy']] - except KeyError: + r['createdAt'] = date_parse(entry.get('createdAt', OLD_DATE)) + r['updatedAt'] = date_parse(entry['updatedAt']) if 'updatedAt' in entry else ts + if entry.get('published'): + r['publishedAt'] = date_parse(entry.get('publishedAt', OLD_DATE)) + if r['publishedAt'] == OLD_DATE: r['publishedAt'] = ts + if 'deletedAt' in entry: r['deletedAt'] = date_parse(entry['deletedAt']) + + # connected users' data + + # r['deletedBy'] = entry.get('deletedBy', '0') # TypeError: 'deletedBy' is an invalid keyword argument for Shout + + oid = entry.get('createdBy', '') + userdata = users_by_oid.get(oid, {}) + if not userdata.get('slug'): app = entry.get('application') if app: userslug = translit(app['name'], 'ru', reversed=True).replace(' ', '-').replace('\'', '').replace('.', '-').lower() @@ -232,20 +109,22 @@ def migrate(entry, users_by_oid, topics_by_oid): 'userpic': 'https://discours.io/image/logo-mini.svg' } - # set author data - r['body'] = body - shout_dict = r.copy() author = { # a short version for public listings 'slug': userdata.get('slug', 'discours'), 'name': userdata.get('name', 'Дискурс'), 'userpic': userdata.get('userpic', '') } - shout_dict['authors'] = [ author, ] + r['authors'] = [ author, ] + # body + + body = prepare_body(entry) # save mdx for prerender if published - if entry['published']: - metadata = get_metadata(shout_dict) + r['body'] = body + if entry.get('published'): + content = '' + metadata = get_metadata(r) content = frontmatter.dumps(frontmatter.Post(r['body'], **metadata)) ext = 'mdx' parentDir = '/'.join(os.getcwd().split('/')[:-1]) @@ -255,90 +134,116 @@ def migrate(entry, users_by_oid, topics_by_oid): open(filepath + '.' + ext, 'w').write(bc) # open(filepath + '.html', 'w').write(body_orig) + + # topics + + category = entry['category'] + mainTopic = topics_by_oid.get(category) + if mainTopic: + r['mainTopic'] = mainTopic["slug"] + topic_oids = [category, ] + topic_errors = [] + topic_oids.extend(entry.get('tags', [])) + for oid in topic_oids: + if oid in topics_by_oid: + r['topics'].append(topics_by_oid[oid]) + else: + # print('ERROR: unknown old topic id: ' + oid) + topic_errors.append(oid) + + # set prepared shout data + + shout_dict = r.copy() + del shout_dict['topics'] # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state' + del shout_dict['rating'] # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout + del shout_dict['ratings'] + + # get author + + user = None + email = userdata.get('email') + authorslug = userdata.get('slug') + with local_session() as session: + try: + if email: user = session.query(User).filter(User.email == email).first() + if not user and authorslug: user = session.query(User).filter(User.slug == authorslug).first() + if not user and userdata: user = User.create(**userdata) + except: + print('[migration] shout author error: \n%r' % entry) + raise Exception + assert user, 'could not get a user' + shout_dict['authors'] = [ user, ] + # save shout to db + s = object() + try: s = Shout.create(**shout_dict) + except: print('[migration] shout create error: \n%r' % shout_dict) + + + # shout ratings try: - shout_dict['createdAt'] = date_parse(r.get('createdAt')) if entry.get('createdAt') else ts - shout_dict['publishedAt'] = date_parse(entry.get('publishedAt')) if entry.get('published') else None - - if entry.get('deletedAt') is not None: - shout_dict['deletedAt'] = date_parse(entry.get('deletedAt')) - shout_dict['deletedBy'] = entry.get('deletedBy', '0') - - del shout_dict['topics'] # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state' - del shout_dict['rating'] # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout - del shout_dict['ratings'] - - # get user - - user = None - email = userdata.get('email') - slug = userdata.get('slug') - with local_session() as session: - try: - if email: user = session.query(User).filter(User.email == email).first() - if not user and slug: user = session.query(User).filter(User.slug == slug).first() - if not user and userdata: user = User.create(**userdata) - except: - print('[migration] content_items error: \n%r' % entry) - assert user, 'could not get a user' - shout_dict['authors'] = [ user, ] - - # create shout - - s = object() - try: s = Shout.create(**shout_dict) - except: print('[migration] content_items error: \n%r' % entry) - - # shout ratings - shout_dict['ratings'] = [] for shout_rating_old in entry.get('ratings',[]): with local_session() as session: - rater = session.query(User).\ - filter(User.old_id == shout_rating_old['createdBy']).first() - if rater: - shout_rating_dict = { - 'value': shout_rating_old['value'], - 'rater': rater.slug, - 'shout': s.slug - } - cts = shout_rating_old.get('createdAt') - if cts: shout_rating_dict['ts'] = date_parse(cts) - try: + rater = session.query(User).filter(User.old_id == shout_rating_old['createdBy']).first() + if rater: + shout_rating_dict = { + 'value': shout_rating_old['value'], + 'rater': rater.slug, + 'shout': s.slug + } + cts = shout_rating_old.get('createdAt') + if cts: shout_rating_dict['ts'] = date_parse(cts) shout_rating = session.query(ShoutRating).\ filter(ShoutRating.shout == s.slug).\ filter(ShoutRating.rater == rater.slug).first() if shout_rating: - shout_rating_dict['value'] += int(shout_rating.value or 0) + shout_rating_dict['value'] = int(shout_rating_dict['value'] or 0) + int(shout_rating.value or 0) shout_rating.update(shout_rating_dict) else: ShoutRating.create(**shout_rating_dict) shout_dict['ratings'].append(shout_rating_dict) - except sqlalchemy.exc.IntegrityError: - print('[migration] shout_rating error: \n%r' % shout_rating_dict) - pass - - # shout topics + except: + print('[migration] shout rating error: \n%r' % shout_rating_old) + # raise Exception + # shout topics + try: shout_dict['topics'] = [] for topic in r['topics']: - try: - tpc = topics_by_oid[topic['oid']] - slug = retopics.get(tpc['slug'], tpc['slug']) - ShoutTopic.create(**{ 'shout': s.slug, 'topic': slug }) - shout_dict['topics'].append(slug) - except sqlalchemy.exc.IntegrityError: - pass - - # shout views + tpc = topics_by_oid[topic['oid']] + oldslug = tpc['slug'] + newslug = retopics.get(oldslug, oldslug) + need_create_topic = False + if newslug: + with local_session() as session: + shout_topic_new = session.query(ShoutTopic)\ + .filter(ShoutTopic.shout == s.slug)\ + .filter(ShoutTopic.topic == newslug).first() + shout_topic_old = session.query(ShoutTopic)\ + .filter(ShoutTopic.shout == s.slug)\ + .filter(ShoutTopic.topic == oldslug).first() + if not shout_topic_new: + if shout_topic_old: + shout_topic_old.update({ 'slug': newslug }) + else: + need_create_topic = True + if need_create_topic: + ShoutTopic.create(**{ 'shout': s.slug, 'topic': newslug }) + shout_dict['topics'].append(newslug) + except: + print('[migration] shout topic error: \n%r' % entry) + raise Exception + # shout views + try: views = entry.get('views', 1) ShoutViewByDay.create( shout = s.slug, value = views ) - - except Exception as e: - raise e + except: + print('[migration] shout view error: \n%r' % entry) + # raise Exception shout_dict['old_id'] = entry.get('_id') return shout_dict, topic_errors diff --git a/migration/tables/tags.py b/migration/tables/tags.py index 8954a71d..affc79ae 100644 --- a/migration/tables/tags.py +++ b/migration/tables/tags.py @@ -40,7 +40,7 @@ def migrate(entry, topics_by_oid): del topic_dict['oid'] topic = Topic.create(**topic_dict) except Exception as e: - print(e) + # print(e) raise e topic_dict['oid'] = entry['_id'] return topic_dict