migration-orm-fixes

2022-07-07 16:55:13 +03:00
parent bd4221e9af
commit 56dcd7ecbc
23 changed files with 706 additions and 799 deletions

View File

@@ -1,3 +1,4 @@
import os
import bson
import json
@@ -17,10 +18,11 @@ def json_tables():
lc = []
with open('migration/data/'+table+'.bson', 'rb') as f:
bs = f.read()
base = 0
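# the dump is a concatenation of BSON documents; bson.decode_document(buffer, offset) returns (next_offset, doc), so we walk until the buffer is exhausted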
while base < len(bs):
base, d = bson.decode_document(bs, base)
lc.append(d)
data[table] = lc
open(os.getcwd() + '/dump/discours/'+table+'.json', 'w').write(json.dumps(lc,cls=DateTimeEncoder))

migration/export.py (Normal file, +100 lines)
View File

@@ -0,0 +1,100 @@
from datetime import datetime
import json
import os
import frontmatter
from migration.extract import prepare_body
from migration.tables.users import migrate_email_subscription
from migration.utils import DateTimeEncoder
OLD_DATE = '2016-03-05 22:22:00.350000'
EXPORT_DEST = '../discoursio-web/data/'
parentDir = '/'.join(os.getcwd().split('/')[:-1])
contentDir = parentDir + '/discoursio-web/content/'
ts = datetime.now()
def get_metadata(r):
authors = []
for a in r['authors']:
authors.append({ # a short version for public listings
'slug': a.slug or 'discours',
'name': a.name or 'Дискурс',
'userpic': a.userpic or 'https://discours.io/static/img/discours.png'
})
metadata = {}
metadata['title'] = r.get('title', '').replace('{', '(').replace('}', ')')
metadata['authors'] = authors
metadata['createdAt'] = r.get('createdAt', ts)
metadata['layout'] = r['layout']
metadata['topics'] = sorted(r['topics'])
if r.get('cover', False): metadata['cover'] = r.get('cover')
return metadata
def export_mdx(r):
# print('[export] mdx %s' % r['slug'])
content = ''
metadata = get_metadata(r)
content = frontmatter.dumps(frontmatter.Post(r['body'], **metadata))
ext = 'mdx'
filepath = contentDir + r['slug']
bc = bytes(content,'utf-8').decode('utf-8','ignore')
open(filepath + '.' + ext, 'w').write(bc)
def export_body(shout, storage):
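# rebuilds the markdown body from the raw mongo content item, writes the .mdx via export_mdx, then dumps the original html alongside it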
shout['body'] = prepare_body(storage['content_items']['by_oid'][shout['oid']])
export_mdx(shout)
print('[export] trying to save html %s' % shout['slug'])
open(contentDir + shout['slug'] + '.html', 'w').write(storage['content_items']['by_oid'][shout['oid']]['body'])
def export_slug(slug, storage):
shout = storage['shouts']['by_slug'].get(slug)
assert shout, '[export] no shout found by slug: %s ' % slug
author = storage['users']['by_slug'].get(shout['authors'][0]['slug'])
assert author, '[export] no author error'
export_body(shout, storage)
def export_email_subscriptions():
email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
for data in email_subscriptions_data:
migrate_email_subscription(data)
print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions exported')
def export_shouts(storage):
# update what was just migrated or load json again
if len(storage['users']['by_slug'].keys()) == 0:
storage['users']['by_slug'] = json.loads(open(EXPORT_DEST + 'authors.json').read())
print('[migration] ' + str(len(storage['users']['by_slug'].keys())) + ' exported authors loaded')
if len(storage['shouts']['by_slug'].keys()) == 0:
storage['shouts']['by_slug'] = json.loads(open(EXPORT_DEST + 'articles.json').read())
print('[migration] ' + str(len(storage['shouts']['by_slug'].keys())) + ' exported articles loaded')
for slug in storage['shouts']['by_slug'].keys(): export_slug(slug, storage)
def export_json(export_articles = {}, export_authors = {}, export_topics = {}, export_comments = {}):
open(EXPORT_DEST + 'authors.json', 'w').write(json.dumps(export_authors,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print('[migration] ' + str(len(export_authors.items())) + ' authors exported')
open(EXPORT_DEST + 'topics.json', 'w').write(json.dumps(export_topics,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print('[migration] ' + str(len(export_topics.keys())) + ' topics exported')
open(EXPORT_DEST + 'articles.json', 'w').write(json.dumps(export_articles,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print('[migration] ' + str(len(export_articles.items())) + ' articles exported')
open(EXPORT_DEST + 'comments.json', 'w').write(json.dumps(export_comments,
cls=DateTimeEncoder,
indent=4,
sort_keys=True,
ensure_ascii=False))
print('[migration] ' + str(len(export_comments.items())) + ' articles with comments exported')

View File

@@ -1,16 +1,16 @@
import json
import os
import re
import base64
import sys
from migration.html2text import html2text
TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
contentDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'discoursio-web', 'content')
s3 = 'https://discours-io.s3.amazonaws.com/'
cdn = 'https://assets.discours.io'
retopics = json.loads(open('migration/tables/replacements.json', 'r').read())
def replace_tooltips(body):
# FIXME: if you prefer regexp
newbody = body
matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
for match in matches:
@@ -21,37 +21,40 @@ def replace_tooltips(body):
def place_tooltips(body):
parts = body.split('&&&')
l = len(parts)
newparts = list(parts)
placed = False
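# splitting on '&&&' leaves tooltip bodies at odd indexes and plain text at even ones; an odd part count means the markers are balanced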
if l & 1:
if l > 1:
i = 1
print('[extract] found %d tooltips' % (l-1))
for part in parts[1:]:
if i & 1:
# print([ len(p) for p in parts ])
# print('[extract] tooltip: ' + part)
if 'a class="footnote-url" href=' in part:
print('[extract] footnote: ' + part)
fn = 'a class="footnote-url" href="'
link = part.split(fn,1)[1].split('"', 1)[0]
extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1]
newparts[i] = '<Tooltip' + (' link="' + link + '" ' if link else '') + '>' + extracted_part + '</Tooltip>'
else:
newparts[i] = '<Tooltip>%s</Tooltip>' % part
# print('[extract] tooltip: ' + newparts[i])
else:
# print('[extract] pass: ' + part[:10] + '..')
newparts[i] = part
i += 1
placed = True
return (''.join(newparts), placed)
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
public = '../discoursio-web/public'
cache = {}
def reextract_images(body, oid):
# FIXME: if you prefer regexp
matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
i = 0
for match in matches:
@@ -80,54 +83,50 @@ IMAGES = {
sep = ';base64,'
def extract_imageparts(bodyparts, prefix):
# recursive loop
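# each call inlines one base64 payload as an uploaded file link, then recurses on the parts that follow it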
for current in bodyparts:
i = bodyparts.index(current)
for mime in IMAGES.keys():
if mime == current[-len(mime):] and (i + 1 < len(bodyparts)):
print('[extract] ' + mime)
next = bodyparts[i+1]
ext = IMAGES[mime]
b64end = next.index(')')
b64encoded = next[:b64end]
name = prefix + '-' + str(len(cache))
link = '/upload/image-' + name + '.' + ext
print('[extract] name: ' + name)
print('[extract] link: ' + link)
print('[extract] %d bytes' % len(b64encoded))
if b64encoded not in cache:
try:
content = base64.b64decode(b64encoded + '==')
open(public + link, 'wb').write(content)
print('[extract] ' +str(len(content)) + ' image bytes been written')
cache[b64encoded] = name
except Exception:
raise Exception('[extract] error decoding image %r' % b64encoded)
else:
print('[extract] cached: ' + cache[b64encoded])
name = cache[b64encoded]
link = cdn + '/upload/image-' + name + '.' + ext
bodyparts[i] = current[:-len(mime)] + current[-len(mime):] + link + next[-b64end:]
bodyparts[i+1] = next[:-b64end]
break
return extract_imageparts(bodyparts[i+1:], prefix) \
if len(bodyparts) > (i + 1) else ''.join(bodyparts)
def extract_images(body, oid):
body = body\
.replace(' [](data:image', '![](data:image')\
.replace('\n[](data:image', '![](data:image')
parts = body.split(sep)
i = 0
if len(parts) > 1: newbody = extract_imageparts(parts, oid)
else: newbody = body
return newbody
@@ -149,25 +148,34 @@ def cleanup(body):
return newbody
def extract(body, oid):
if body:
newbody = extract_images(body, oid)
if not newbody: raise Exception('extract_images error')
newbody = cleanup(newbody)
if not newbody: raise Exception('cleanup error')
newbody, placed = place_tooltips(newbody)
if not newbody: raise Exception('place_tooltips error')
if placed:
newbody = 'import Tooltip from \'$/components/Article/Tooltip\'\n\n' + newbody
return newbody
return body
def prepare_body(entry):
# print('[migration] preparing body %s' % entry.get('slug',''))
# body modifications
body = ''
body_orig = entry.get('body', '')
if not body_orig: body_orig = ''
if entry.get('type') == 'Literature':
print('[extract] literature')
for m in entry.get('media', []):
t = m.get('title', '')
if t: body_orig += '<h5>' + t + '</h5>\n'
body_orig += (m.get('body') or '').replace((m.get('literatureBody') or ''), '') + m.get('literatureBody', '') + '\n'
elif entry.get('type') == 'Video':
print('[extract] embedding video')
providers = set([])
video_url = ''
require = False
@@ -187,8 +195,10 @@ def prepare_body(entry):
body += extract(html2text(m.get('body', '')), entry['_id'])
if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n'
# already body_orig = entry.get('body', '')
elif entry.get('type') == 'Music':
print('[extract] music album')
for m in entry.get('media', []):
artist = m.get('performer')
trackname = ''
@@ -197,42 +207,46 @@ def prepare_body(entry):
body += '<MusicPlayer src=\"' + m.get('fileUrl','') + '\" title=\"' + trackname + '\" />\n'
body += extract(html2text(m.get('body', '')), entry['_id'])
body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n'
# already body_orig = entry.get('body', '')
elif entry.get('type') == 'Image':
print('[extract] image gallery')
cover = ''
if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId']
if not cover:
if 'image' in entry: cover = entry['image'].get('url', '')
if 'cloudinary' in cover: cover = ''
else:
print('[migration] cover: ' + cover)
images = {}
for m in entry.get('media', []):
b = ''
title = m.get('title','').replace('\n', ' ').replace('&nbsp;', ' ')
u = m.get('image', {}).get('url', '') or m.get('thumborId') or cover
u = str(u)
b += '<h4>' + title + '</h4>\n' + body_orig
if not u.startswith('http'): u = s3 + u
if not u: print('[extract] no image for ' + str(m))
if 'cloudinary' in u: u = 'img/lost.svg'
if u not in images.keys():
# print('[extract] image: ' + u)
images[u] = title
b += '<img src=\"' + u + '\" alt=\"'+ title +'\" />\n'
b += m.get('body', '') + '\n'
body += extract(html2text(b), entry['_id'])
elif not body_orig:
for up in entry.get('bodyHistory', []) or []:
body_orig = up.get('text', '') or ''
if body_orig:
print('[extract] body from history!')
break
if not body and not body_orig: print('[extract] error: EMPTY BODY')
# body_html = str(BeautifulSoup(body_orig, features="html.parser"))
# print('[extract] adding original body')
if body_orig: body += extract(html2text(body_orig), entry['_id'])
if entry['slug'] in sys.argv:
open(contentDir + '/' + entry['slug'] + '.html', 'w')\
.write(entry.get('body',''))
return body

View File

@@ -535,8 +535,7 @@ class HTML2Text(html.parser.HTMLParser):
if start:
if 'data-original-title' in attrs:
# WARNING: old discours specific code
self.o('&&&%s&&&' % attrs['data-original-title'])
else:
if (
"href" in attrs
@@ -1033,10 +1032,10 @@ class HTML2Text(html.parser.HTMLParser):
return result
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = config.BODY_WIDTH) -> str:
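# NOTE: config.BODY_WIDTH is evaluated once, at import time, when the default argument is bound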
h = html.strip() or ''
if h:
h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
h = h.handle(html.strip())
print('[html2text] %d bytes' % len(html))
return h

View File

@@ -7,7 +7,7 @@ UNICODE_SNOB = True
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = True
# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = False
@@ -46,10 +46,10 @@ IMAGES_AS_HTML = False
IMAGES_TO_ALT = False
IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = True
DECODE_ERRORS = "strict"
DEFAULT_IMAGE_ALT = ""
PAD_TABLES = True
# Convert links with same href and text to <href> format
# if they are absolute links

View File

@@ -1,12 +1,13 @@
from datetime import datetime
from dateutil.parser import parse as date_parse
from orm import Comment, CommentRating, User
from orm.base import local_session
from migration.html2text import html2text
from orm.shout import Shout
ts = datetime.now()
def migrate(entry, storage):
'''
{
"_id": "hdtwS8fSyFLxXCgSC",
@@ -28,60 +29,70 @@ def migrate(entry, shouts_by_oid):
type Comment {
id: Int!
createdBy: User!
body: String!
replyTo: Comment!
createdAt: DateTime!
updatedAt: DateTime
shout: Shout!
deletedAt: DateTime
deletedBy: User
ratings: [CommentRating]
views: Int
}
'''
if entry.get('deleted'): return
comment_dict = {}
# FIXME: comment_dict['createdAt'] = ts if not entry.get('createdAt') else date_parse(entry.get('createdAt'))
# print('[migration] comment original date %r' % entry.get('createdAt'))
# print('[migration] comment date %r ' % comment_dict['createdAt'])
comment_dict['body'] = html2text(entry.get('body', ''))
comment_dict['oid'] = entry['_id']
if entry.get('createdAt'): comment_dict['createdAt'] = date_parse(entry.get('createdAt'))
shout_oid = entry.get('contentItem')
if not shout_oid in storage['shouts']['by_oid']:
print('[migration] no shout for comment', entry)
else:
with local_session() as session:
author = session.query(User).filter(User.oid == entry['createdBy']).first()
shout_dict = storage['shouts']['by_oid'][shout_oid]
if shout_dict:
comment_dict['shout'] = shout_dict['oid']
comment_dict['createdBy'] = author.slug if author else 'discours'
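# comments are linked to users by slug; anything without a resolvable author is attributed to the 'discours' service account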
# FIXME if entry.get('deleted'): comment_dict['deletedAt'] = date_parse(entry['updatedAt']) or ts
# comment_dict['deletedBy'] = session.query(User).filter(User.oid == (entry.get('updatedBy') or dd['oid'])).first()
# FIXME if entry.get('updatedAt'): comment_dict['updatedAt'] = date_parse(entry['updatedAt']) or ts
#for [k, v] in comment_dict.items():
# if not v: del comment_dict[f]
# if k.endswith('At'):
# try: comment_dict[k] = datetime(comment_dict[k])
# except: print(k)
# # print('[migration] comment keys:', f)
comment = Comment.create(**comment_dict)
comment_dict['id'] = comment.id
comment_dict['ratings'] = []
comment_dict['oid'] = entry['_id']
# print(comment)
for comment_rating_old in entry.get('ratings',[]):
rater = session.query(User).filter(User.oid == comment_rating_old['createdBy']).first()
if rater and comment:
comment_rating_dict = {
'value': comment_rating_old['value'],
'createdBy': rater.slug,
'comment_id': comment.id
}
cts = comment_rating_old.get('createdAt')
if cts: comment_rating_dict['createdAt'] = date_parse(cts)
try:
CommentRating.create(**comment_rating_dict)
comment_dict['ratings'].append(comment_rating_dict)
except Exception as e:
print('[migration] comment rating error: %r' % comment_rating_dict)
raise e
else:
print('[migration] error: cannot find shout for comment %r' % comment_dict)
return comment_dict
def migrate_2stage(cmt, old_new_id):

View File

@@ -1,52 +0,0 @@
from orm.base import local_session
from orm import Topic, Community
from dateutil.parser import parse as date_parse
import json
from migration.html2text import html2text
import sqlalchemy
def migrate(entry, topics_by_oid):
'''
type Topic {
slug: String! # ID
createdBy: Int! # User
createdAt: DateTime!
value: String
children: [String] # children topic
}
'''
topic_dict = {
'slug': entry['slug'],
'oid': entry['_id'],
# 'createdBy': entry['createdBy'],
# 'createdAt': date_parse(entry['createdAt']),
'title': entry['title'].replace('&nbsp;', ' '), #.lower(),
'children': [],
'community' : Community.default_community.slug,
'body' : html2text(entry.get('description', '').replace('&nbsp;', ' '))
}
retopics = json.loads(open('migration/tables/replacements.json').read())
with local_session() as session:
slug = topics_by_oid.get(topic_dict['oid'], topic_dict)['slug']
if slug:
slug = retopics.get(slug, slug)
try:
topic = session.query(Topic).filter(Topic.slug == slug).first()
if not topic:
del topic_dict['oid']
topic = Topic.create(**topic_dict)
# print('created')
else:
if len(topic.title) > len(topic_dict['title']) or \
len(topic.body) < len(topic_dict['body']):
topic.update({
'slug': slug,
'title': topic_dict['title'] if len(topic.title) > len(topic_dict['title']) else topic.title,
'body': topic_dict['body'] if len(topic.body) < len(topic_dict['body']) else topic.body
})
except Exception as e:
print('not found old topic: ' + slug)
else:
raise Exception
topic_dict['oid'] = entry['_id']
return topic_dict

View File

@@ -1,23 +1,21 @@
from dateutil.parser import parse as date_parse
import sqlalchemy
from orm import Shout, ShoutTopic, ShoutRating, ShoutViewByDay, User
from transliterate import translit
from datetime import datetime
from orm.base import local_session
from orm.community import Community
from migration.extract import prepare_body
DISCOURS_USER = {
'id': 9999999,
'slug': 'discours',
'name': 'Дискурс',
'email': 'welcome@discours.io',
'userpic': 'https://discours.io/images/logo-mini.svg',
'createdAt': '2016-03-05 22:22:00.350000'
}
OLD_DATE = '2016-03-05 22:22:00.350000'
ts = datetime.now()
type2layout = {
'Article': 'article',
@@ -27,18 +25,6 @@ type2layout = {
'Image': 'image'
}
def get_shout_slug(entry):
slug = entry.get('slug', '')
if not slug:
@@ -47,18 +33,51 @@ def get_shout_slug(entry):
if slug: break
return slug
def migrate(entry, storage):
# init, set title and layout
r = {
'layout': type2layout[entry['type']],
'title': entry['title'],
'community': 0,
'authors': [],
'topics': [],
'rating': 0,
'ratings': [],
'createdAt': []
}
topics_by_oid = storage['topics']['by_oid']
users_by_oid = storage['users']['by_oid']
# author
oid = entry.get('createdBy', entry.get('_id', entry.get('oid')))
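# author resolution order: mongo creator oid, then the submitted application form, then the 'discours' service account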
userdata = users_by_oid.get(oid)
if not userdata:
app = entry.get('application')
if app:
userslug = translit(app['name'], 'ru', reversed=True)\
.replace(' ', '-')\
.replace('\'', '')\
.replace('.', '-').lower()
userdata = {
'username': app['email'],
'email': app['email'],
'name': app['name'],
'bio': app.get('bio', ''),
'emailConfirmed': False,
'slug': userslug,
'createdAt': ts,
'wasOnlineAt': ts
}
else:
userdata = {
'name': 'Дискурс',
'slug': 'discours',
'email': 'welcome@discours.io',
'userpic': 'https://discours.io/image/logo-mini.svg'
}
assert userdata, 'no user found for %s among %d users' % (oid, len(users_by_oid))
r['authors'] = [userdata, ]
# slug
@@ -72,8 +91,7 @@ def migrate(entry, users_by_oid, topics_by_oid):
c = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
else:
c = entry.get('image', {}).get('url')
if not c or 'cloudinary' in c: c = ''
r['cover'] = c
# timestamps
@@ -85,111 +103,105 @@ def migrate(entry, users_by_oid, topics_by_oid):
if r['publishedAt'] == OLD_DATE: r['publishedAt'] = ts
if 'deletedAt' in entry: r['deletedAt'] = date_parse(entry['deletedAt'])
# topics
category = entry['category']
mainTopic = topics_by_oid.get(category)
if mainTopic:
r['mainTopic'] = storage['replacements'].get(mainTopic["slug"], mainTopic["slug"])
topic_oids = [category, ]
topic_oids.extend(entry.get('tags', []))
for oid in topic_oids:
if oid in topics_by_oid:
r['topics'].append(topics_by_oid[oid])
if oid in storage['topics']['by_oid']:
r['topics'].append(storage['topics']['by_oid'][oid]['slug'])
else:
print('[migration] unknown old topic id: ' + oid)
entry['topics'] = r['topics']
entry['cover'] = r['cover']
entry['authors'] = r['authors']
# body
r['body'] = prepare_body(entry)
# save shout to db
shout_dict = r.copy()
user = None
del shout_dict['topics'] # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state'
del shout_dict['rating'] # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout
del shout_dict['ratings']
email = userdata.get('email')
slug = userdata.get('slug')
with local_session() as session:
# c = session.query(Community).all().pop()
if email: user = session.query(User).filter(User.email == email).first()
if not user and slug: user = session.query(User).filter(User.slug == slug).first()
if not user and userdata:
try: user = User.create(**userdata)
except sqlalchemy.exc.IntegrityError:
print('[migration] user error: %r' % userdata)
# assumed recovery (mirrors users.py): reuse the existing row so user is not None below
user = session.query(User).filter(User.slug == slug).first()
userdata['id'] = user.id
userdata['createdAt'] = user.createdAt
storage['users']['by_slug'][userdata['slug']] = userdata
storage['users']['by_oid'][entry['_id']] = userdata
assert user, 'could not get a user'
shout_dict['authors'] = [ user, ]
try:
s = Shout.create(**shout_dict)
except sqlalchemy.exc.IntegrityError:
with local_session() as session:
s = session.query(Shout).filter(Shout.slug == shout_dict['slug']).first()
bump = False
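# diff the migrated dict against the existing row; any changed or missing column bumps the record to be updated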
if s:
for key in shout_dict:
if key in s.__dict__:
if s.__dict__[key] != shout_dict[key]:
print('[migration] shout already exists, but differs in %s' % key)
bump = True
else:
print('[migration] shout already exists, but lacks %s' % key)
bump = True
if bump:
s.update(shout_dict)
else:
print('[migration] something went wrong with shout: \n%r' % shout_dict)
session.commit()
except:
print(s)
raise Exception
# shout topics aftermath
shout_dict['topics'] = []
for tpc in r['topics']:
oldslug = tpc
newslug = storage['replacements'].get(oldslug, oldslug)
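# storage['replacements'] is presumably loaded from migration/tables/replacements.json, the same legacy-slug-to-canonical-slug mapping used as retopics elsewhere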
if newslug:
with local_session() as session:
shout_topic_old = session.query(ShoutTopic)\
.filter(ShoutTopic.shout == s.slug)\
.filter(ShoutTopic.topic == oldslug).first()
if shout_topic_old:
shout_topic_old.update({ 'slug': newslug })
else:
shout_topic_new = session.query(ShoutTopic)\
.filter(ShoutTopic.shout == s.slug)\
.filter(ShoutTopic.topic == newslug).first()
if not shout_topic_new: ShoutTopic.create(**{ 'shout': s.slug, 'topic': newslug })
session.commit()
shout_dict['topics'].append(newslug)
else:
print('[migration] ignored topic slug: \n%r' % tpc)
# raise Exception
# shout ratings
try:
shout_dict['ratings'] = []
for shout_rating_old in entry.get('ratings',[]):
with local_session() as session:
rater = session.query(User).filter(User.oid == shout_rating_old['createdBy']).first()
if rater:
shout_rating_dict = {
'value': shout_rating_old['value'],
@@ -210,43 +222,10 @@ def migrate(entry, users_by_oid, topics_by_oid):
print('[migration] shout rating error: \n%r' % shout_rating_old)
# raise Exception
# shout views
ShoutViewByDay.create( shout = s.slug, value = entry.get('views', 1) )
del shout_dict['ratings']
shout_dict['oid'] = entry.get('_id')
storage['shouts']['by_oid'][entry['_id']] = shout_dict
storage['shouts']['by_slug'][slug] = shout_dict
return shout_dict

View File

@@ -1,46 +0,0 @@
import json
from datetime import datetime
from orm.base import local_session
from orm import Topic, Community
from dateutil.parser import parse as date_parse
def migrate(entry, topics_by_oid):
'''
type Topic {
slug: String! # ID
createdBy: Int! # User
createdAt: DateTime!
title: String
parents: [String] # NOTE: topic can have parent topics
children: [String] # and children
}
'''
if type(entry['createdAt']) == type(''):
ts = date_parse(entry['createdAt'])
else:
ts = datetime.fromtimestamp(entry['createdAt']/1000)
topic_dict = {
'slug': entry['slug'],
'oid': entry['_id'],
# 'createdBy': entry['createdBy'],
# 'createdAt': ts,
'title': entry['title'].replace('&nbsp;', ' '), # .lower(),
'children': [],
'community' : Community.default_community.slug,
'body' : entry.get('description','').replace('&nbsp;', ' ')
}
try:
retopics = json.loads(open('migration/tables/replacements.json').read())
with local_session() as session:
slug = topics_by_oid.get(topic_dict['oid'], topic_dict)['slug']
slug = retopics.get(slug, slug)
if slug:
topic = session.query(Topic).filter(Topic.slug == slug).first()
if not topic:
del topic_dict['oid']
topic = Topic.create(**topic_dict)
except Exception as e:
# print(e)
raise e
topic_dict['oid'] = entry['_id']
return topic_dict

View File

@@ -0,0 +1,28 @@
from migration.extract import extract, html2text
from orm.base import local_session
from orm import Topic, Community
def migrate(entry):
body_orig = entry.get('description', '').replace('&nbsp;', ' ')
topic_dict = {
'slug': entry['slug'],
'oid': entry['_id'],
'title': entry['title'].replace('&nbsp;', ' '), #.lower(),
'children': [],
'community' : Community.default_community.slug
}
topic_dict['body'] = extract(html2text(body_orig), entry['_id'])
with local_session() as session:
slug = topic_dict['slug']
topic = session.query(Topic).filter(Topic.slug == slug).first()
if not topic:
topic = Topic.create(**topic_dict)
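# merge policy for duplicates: keep the shorter (cleaner) title and the longer (richer) body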
if len(topic.title) > len(topic_dict['title']):
topic.update({ 'title': topic_dict['title'] })
if len(topic.body) < len(topic_dict['body']):
topic.update({ 'body': topic_dict['body'] })
session.commit()
# print(topic.__dict__)
rt = topic.__dict__.copy()
del rt['_sa_instance_state']
return rt

View File

@@ -1,124 +1,114 @@
import sqlalchemy
from orm import User, UserRating
from orm.user import EmailSubscription
from dateutil.parser import parse
from migration.html2text import html2text
from orm.base import local_session
def migrate(entry):
'''
type User {
username: String! # email
createdAt: DateTime!
email: String
password: String
oauth: String # provider:token
name: String # to display
userpic: String
links: [String]
emailConfirmed: Boolean # should contain all emails too
id: Int!
muted: Boolean
roles: [Role]
updatedAt: DateTime
wasOnlineAt: DateTime
ratings: [Rating]
slug: String
bio: String
notifications: [Int]
}
'''
if 'subscribedTo' in entry: del entry['subscribedTo']
email = entry['emails'][0]['address']
user_dict = {
'oid': entry['_id'],
'username': email,
'email': email,
'password': entry['services']['password'].get('bcrypt', ''),
'createdAt': parse(entry['createdAt']),
'emailConfirmed': bool(entry['emails'][0]['verified']),
'muted': False, # amnesty
'bio': entry['profile'].get('bio', ''),
'notifications': [],
'roles': [], # entry['roles'] # roles by community
'ratings': [], # entry['ratings']
'links': [],
'name': 'anonymous'
}
if 'updatedAt' in entry: user_dict['updatedAt'] = parse(entry['updatedAt'])
if 'wasOnlineAt' in entry: user_dict['wasOnlineAt'] = parse(entry['wasOnlineAt'])
if entry.get('profile'):
# slug
user_dict['slug'] = entry['profile'].get('path')
user_dict['bio'] = entry['profile'].get('bio','')
# userpic
try: user_dict['userpic'] = 'https://assets.discours.io/unsafe/100x/' + entry['profile']['thumborId']
except KeyError:
try: user_dict['userpic'] = entry['profile']['image']['url']
except KeyError: user_dict['userpic'] = ''
# name
fn = entry['profile'].get('firstName', '')
ln = entry['profile'].get('lastName', '')
name = user_dict['slug'] if user_dict['slug'] else 'anonymous'
name = fn if fn else name
name = (name + ' ' + ln) if ln else name
name = entry['profile']['path'].lower().replace(' ', '-') if len(name) < 2 else name
user_dict['name'] = name
# links
fb = entry['profile'].get('facebook', False)
if fb: user_dict['links'].append(fb)
vk = entry['profile'].get('vkontakte', False)
if vk: user_dict['links'].append(vk)
tr = entry['profile'].get('twitter', False)
if tr: user_dict['links'].append(tr)
ws = entry['profile'].get('website', False)
if ws: user_dict['links'].append(ws)
# some checks
if not user_dict['slug'] and len(user_dict['links']) > 0:
user_dict['slug'] = user_dict['links'][0].split('/')[-1]
user_dict['slug'] = user_dict.get('slug', user_dict['email'].split('@')[0])
oid = user_dict['oid']
try: user = User.create(**user_dict.copy())
except sqlalchemy.exc.IntegrityError:
print('[migration] cannot create user ' + user_dict['slug'])
with local_session() as session:
old_user = session.query(User).filter(User.slug == user_dict['slug']).first()
old_user.oid = oid
user = old_user
if not user:
print('[migration] ERROR: cannot find user ' + user_dict['slug'])
raise Exception
user_dict['id'] = user.id
return user_dict
def migrate_email_subscription(entry):
res = {}
res["email"] = entry["email"]
res["createdAt"] = parse(entry["createdAt"])
EmailSubscription.create(**res)
def migrate_2stage(entry, id_map):
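# second pass, run after all users exist: id_map maps old mongo ids to slugs, so ratings can reference both rater and ratee by slug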
ce = 0
for rating_entry in entry.get('ratings',[]):
rater_oid = rating_entry['createdBy']
rater_slug = id_map.get(rater_oid)
if not rater_slug:
ce +=1
# print(rating_entry)
continue
oid = entry['_id']
author_slug = id_map.get(oid)
user_rating_dict = {
'value': rating_entry['value'],
'rater': rater_slug,
'user': author_slug
}
with local_session() as session:
try:
user_rating = UserRating.create(**user_rating_dict)
except sqlalchemy.exc.IntegrityError:
old_rating = session.query(UserRating).filter(UserRating.rater == rater_slug).first()
print('[migration] cannot create ' + author_slug + '`s rate from ' + rater_slug)
print('[migration] concat rating value %d+%d=%d' % (old_rating.value, rating_entry['value'], old_rating.value + rating_entry['value']))
old_rating.update({ 'value': old_rating.value + rating_entry['value'] })
session.commit()
except Exception as e:
print(e)
return ce