migration: content_items refactored

This commit is contained in:
tonyrewin 2022-07-03 04:01:59 +03:00
parent 36f26aaa1c
commit 1ae64e8732

View File

@@ -1,19 +1,16 @@
from dateutil.parser import parse as date_parse from dateutil.parser import parse as date_parse
import frontmatter import frontmatter
import json import json
import sqlite3
import sqlalchemy import sqlalchemy
from orm import Shout, Comment, Topic, ShoutTopic, ShoutRating, ShoutViewByDay, User from orm import Shout, ShoutTopic, ShoutRating, ShoutViewByDay, User, shout
from bs4 import BeautifulSoup # from bs4 import BeautifulSoup
from migration.html2text import html2text from migration.html2text import html2text
from migration.tables.comments import migrate as migrateComment
from transliterate import translit from transliterate import translit
from datetime import datetime from datetime import datetime
from sqlalchemy.exc import IntegrityError
from orm.base import local_session from orm.base import local_session
from orm.community import Community from orm.community import Community
from migration.extract import extract
import os import os
import string
DISCOURS_USER = { DISCOURS_USER = {
'id': 9999999, 'id': 9999999,
@@ -35,7 +32,7 @@ type2layout = {
def get_metadata(r): def get_metadata(r):
metadata = {} metadata = {}
metadata['title'] = r.get('title') metadata['title'] = r.get('title', '').replace('{', '(').replace('}', ')')
metadata['authors'] = r.get('authors') metadata['authors'] = r.get('authors')
metadata['createdAt'] = r.get('createdAt', ts) metadata['createdAt'] = r.get('createdAt', ts)
metadata['layout'] = r['layout'] metadata['layout'] = r['layout']
@@ -84,15 +81,19 @@ def migrate(entry, users_by_oid, topics_by_oid):
'ratings': [], 'ratings': [],
'createdAt': entry.get('createdAt', '2016-03-05 22:22:00.350000') 'createdAt': entry.get('createdAt', '2016-03-05 22:22:00.350000')
} }
r['slug'] = entry.get('slug', '')
if not r['slug'] and entry.get('friendlySlugs') is not None: # slug
r['slug'] = entry['friendlySlugs']['slug'][0]['slug']
if(r['slug'] is None): s = entry.get('slug', '')
r['slug'] = entry['friendlySlugs'][0]['slug'] fslugs = entry.get('friendlySlugs')
if not r['slug']: if not s and fslugs:
print('NO SLUG ERROR') if type(fslugs) != 'list': fslugs = fslugs.get('slug', [])
# print(entry) try: s = fslugs.pop(0).get('slug')
raise Exception except: raise Exception
if s: r['slug'] = s
else: raise Exception
# topics
category = entry['category'] category = entry['category']
mainTopic = topics_by_oid.get(category) mainTopic = topics_by_oid.get(category)
@@ -107,68 +108,106 @@ def migrate(entry, users_by_oid, topics_by_oid):
else: else:
# print('ERROR: unknown old topic id: ' + oid) # print('ERROR: unknown old topic id: ' + oid)
topic_errors.append(oid) topic_errors.append(oid)
# cover
if entry.get('image') is not None: if entry.get('image') is not None:
r['cover'] = entry['image']['url'] r['cover'] = entry['image']['url']
if entry.get('thumborId') is not None: if entry.get('thumborId') is not None:
r['cover'] = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId'] r['cover'] = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
if entry.get('updatedAt') is not None: if entry.get('updatedAt') is not None:
r['updatedAt'] = date_parse(entry['updatedAt']) r['updatedAt'] = date_parse(entry['updatedAt'])
# body
body = ''
body_orig = entry.get('body')
if not body_orig: body_orig = ''
# body modifications
if entry.get('type') == 'Literature': if entry.get('type') == 'Literature':
media = entry.get('media', '') for m in entry.get('media', []):
# print(media[0]['literatureBody']) t = m.get('title', '')
if type(media) == list and media: if t: body_orig += '### ' + t + '\n'
body_orig = media[0].get('literatureBody', '') body_orig += (m.get('body', '') or '')
if body_orig == '': body_orig += '\n' + m.get('literatureBody', '') + '\n'
print('EMPTY BODY!')
else:
# body_html = str(BeautifulSoup(
# body_orig, features="html.parser"))
r['body'] = html2text(body_orig)
else:
print(r['slug'] + ': literature has no media')
elif entry.get('type') == 'Video': elif entry.get('type') == 'Video':
m = entry['media'][0] providers = set([])
video_url = ''
require = False
for m in entry.get('media', []):
yt = m.get('youtubeId', '') yt = m.get('youtubeId', '')
vm = m.get('vimeoId', '') vm = m.get('vimeoId', '')
video_url = 'https://www.youtube.com/watch?v=' + yt if yt else '#' if yt:
therestof = html2text(m.get('body', entry.get('body', ''))) require = True
r['body'] = 'import { YouTube } from \'solid-social\'\n\n' + \ providers.add('YouTube')
'<YouTube youtubeId=\'' + yt + '\' />\n\n' + therestof video_url = 'https://www.youtube.com/watch?v=' + yt
if video_url == '#': body += '<YouTube youtubeId=\'' + yt + '\' />\n'
video_url = 'https://vimeo.com/' + vm if vm else '#' if vm:
r['body'] = 'import { Vimeo } from \'solid-social\'\n\n' + \ require = True
'<Vimeo vimeoId=\'' + vm + '\' />\n\n' + therestof providers.add('Vimeo')
if video_url == '#': video_url = 'https://vimeo.com/' + vm
print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!')) body += '<Vimeo vimeoId=\'' + vm + '\' />\n'
# raise Exception body += extract(html2text(m.get('body', '')), entry['_id'])
if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n'
body += extract(html2text(body_orig), entry['_id'])
elif entry.get('type') == 'Music': elif entry.get('type') == 'Music':
r['body'] = '' require = False
for m in entry['media']: for m in entry.get('media', []):
if m == { 'main': 'true' } or m == { 'main': True } or m == {}: if 'fileUrl' in m:
continue require = True
artist = m.get('performer')
trackname = ''
if artist: trackname += artist + ' - '
trackname += m.get('title','')
body += '<MusicPlayer src=\"' + m['fileUrl'] + '\" title=\"' + trackname + '\" />\n'
body += extract(html2text(m.get('body', '')), entry['_id'])
else: else:
# TODO: mark highlighted track isMain == True
fileUrl = m.get('fileUrl', '')
if not fileUrl:
print(m) print(m)
continue if require: body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n'
else: body += extract(html2text(body_orig), entry['_id'])
r['body'] = 'import MusicPlayer from \'../src/components/MusicPlayer\'\n\n'
r['body'] += '<MusicPlayer src=\'' + fileUrl + '\' title=\'' + m.get('title','') + '\' />\n'
r['body'] += html2text(entry.get('body', ''))
elif entry.get('type') == 'Image': elif entry.get('type') == 'Image':
r['body'] = '' cover = r.get('cover')
if 'cover' in r: r['body'] = '<img src=\"' + r.get('cover', '') + '\" />' images = {}
mbody = r.get('media', [{'body': ''},])[0].get('body', '') for m in entry.get('media', []):
r['body'] += mbody + entry.get('body', '') t = m.get('title', '')
if r['body'] == '': print(entry) if t: body += '#### ' + t + '\n'
if r.get('body') is None: u = m.get('image', {}).get('url', '')
body_orig = entry.get('body', entry.get('bodyHistory', [{ 'text': '' }, ])[0].get('text', '')) if 'cloudinary' in u:
u = m.get('thumborId')
if not u: u = cover
if u not in images.keys():
if u.startswith('production'): u = 'https://discours-io.s3.amazonaws.com/' + u
body += '![' + m.get('title','').replace('\n', ' ') + '](' + u + ')\n' # TODO: gallery here
images[u] = u
body += extract(html2text(m.get('body', '')), entry['_id']) + '\n'
body += extract(html2text(body_orig), entry['_id'])
# simple post or no body stored
if body == '':
if not body_orig:
print('[migration] using body history...')
try: body_orig += entry.get('bodyHistory', [{'body': ''}])[0].get('body', '')
except: pass
# need to extract
# body_html = str(BeautifulSoup(body_orig, features="html.parser")) # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
r['body'] = html2text(body_orig) body += extract(html2text(body_orig), entry['_id'])
body = r.get('body', '') else:
# EVERYTHING IS FINE HERE
pass
# replace some topics
for oldtopicslug, newtopicslug in retopics.items(): for oldtopicslug, newtopicslug in retopics.items():
body.replace(oldtopicslug, newtopicslug) body.replace(oldtopicslug, newtopicslug)
# authors
# get author data # get author data
userdata = {} userdata = {}
try: userdata = users_by_oid[entry['createdBy']] try: userdata = users_by_oid[entry['createdBy']]
@@ -194,6 +233,7 @@ def migrate(entry, users_by_oid, topics_by_oid):
} }
# set author data # set author data
r['body'] = body
shout_dict = r.copy() shout_dict = r.copy()
author = { # a short version for public listings author = { # a short version for public listings
'slug': userdata.get('slug', 'discours'), 'slug': userdata.get('slug', 'discours'),
@@ -202,15 +242,21 @@ def migrate(entry, users_by_oid, topics_by_oid):
} }
shout_dict['authors'] = [ author, ] shout_dict['authors'] = [ author, ]
# save mdx for prerender if published
if entry['published']: if entry['published']:
metadata = get_metadata(shout_dict) metadata = get_metadata(shout_dict)
content = frontmatter.dumps(frontmatter.Post(body, **metadata)) content = frontmatter.dumps(frontmatter.Post(r['body'], **metadata))
ext = 'mdx' ext = 'mdx'
parentDir = '/'.join(os.getcwd().split('/')[:-1]) parentDir = '/'.join(os.getcwd().split('/')[:-1])
filepath = parentDir + '/discoursio-web/content/' + r['slug'] + '.' + ext filepath = parentDir + '/discoursio-web/content/' + r['slug']
# print(filepath) # print(filepath)
bc = bytes(content,'utf-8').decode('utf-8','ignore') bc = bytes(content,'utf-8').decode('utf-8','ignore')
open(filepath, 'w').write(bc) open(filepath + '.' + ext, 'w').write(bc)
# open(filepath + '.html', 'w').write(body_orig)
# save shout to db
try: try:
shout_dict['createdAt'] = date_parse(r.get('createdAt')) if entry.get('createdAt') else ts shout_dict['createdAt'] = date_parse(r.get('createdAt')) if entry.get('createdAt') else ts
shout_dict['publishedAt'] = date_parse(entry.get('publishedAt')) if entry.get('published') else None shout_dict['publishedAt'] = date_parse(entry.get('publishedAt')) if entry.get('published') else None
@@ -234,14 +280,18 @@ def migrate(entry, users_by_oid, topics_by_oid):
if not user and slug: user = session.query(User).filter(User.slug == slug).first() if not user and slug: user = session.query(User).filter(User.slug == slug).first()
if not user and userdata: user = User.create(**userdata) if not user and userdata: user = User.create(**userdata)
except: except:
print(userdata) print('[migration] content_items error: \n%r' % entry)
assert user, 'could not get a user' assert user, 'could not get a user'
shout_dict['authors'] = [ user, ] shout_dict['authors'] = [ user, ]
try:
s = Shout.create(**shout_dict) # create shout
s = object()
try: s = Shout.create(**shout_dict)
except: print('[migration] content_items error: \n%r' % entry)
# shout ratings # shout ratings
shout_dict['ratings'] = [] shout_dict['ratings'] = []
for shout_rating_old in entry.get('ratings',[]): for shout_rating_old in entry.get('ratings',[]):
with local_session() as session: with local_session() as session:
@@ -255,11 +305,21 @@ def migrate(entry, users_by_oid, topics_by_oid):
} }
cts = shout_rating_old.get('createdAt') cts = shout_rating_old.get('createdAt')
if cts: shout_rating_dict['ts'] = date_parse(cts) if cts: shout_rating_dict['ts'] = date_parse(cts)
try: shout_rating = ShoutRating.create(**shout_rating_dict) try:
except sqlalchemy.exc.IntegrityError: pass shout_rating = session.query(ShoutRating).\
filter(ShoutRating.shout == s.slug).\
filter(ShoutRating.rater == rater.slug).first()
if shout_rating:
shout_rating_dict['value'] += int(shout_rating.value or 0)
shout_rating.update(shout_rating_dict)
else: ShoutRating.create(**shout_rating_dict)
shout_dict['ratings'].append(shout_rating_dict) shout_dict['ratings'].append(shout_rating_dict)
except sqlalchemy.exc.IntegrityError:
print('[migration] shout_rating error: \n%r' % shout_rating_dict)
pass
# shout topics # shout topics
shout_dict['topics'] = [] shout_dict['topics'] = []
for topic in r['topics']: for topic in r['topics']:
try: try:
@@ -270,6 +330,8 @@ def migrate(entry, users_by_oid, topics_by_oid):
except sqlalchemy.exc.IntegrityError: except sqlalchemy.exc.IntegrityError:
pass pass
# shout views
views = entry.get('views', 1) views = entry.get('views', 1)
ShoutViewByDay.create( ShoutViewByDay.create(
shout = s.slug, shout = s.slug,
@@ -278,7 +340,5 @@ def migrate(entry, users_by_oid, topics_by_oid):
except Exception as e: except Exception as e:
raise e raise e
except Exception as e:
raise e
shout_dict['old_id'] = entry.get('_id') shout_dict['old_id'] = entry.get('_id')
return shout_dict, topic_errors return shout_dict, topic_errors