diff --git a/migration/extract.py b/migration/extract.py
index 965d4acc..90d3a5df 100644
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -1,8 +1,14 @@
+import json
import re
import base64
+from migration.html2text import html2text
+
TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
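+# legacy S3 bucket and new assets CDN, used below when rewriting media urls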
+s3 = 'https://discours-io.s3.amazonaws.com/'
+cdn = 'https://assets.discours.io'
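+# replacements.json maps renamed topic slugs (old slug -> new slug)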
+retopics = json.loads(open('migration/tables/replacements.json', 'r').read())
def replace_tooltips(body):
newbody = body
@@ -29,9 +35,9 @@ def place_tooltips(body):
fn = 'a class="footnote-url" href="'
link = part.split(fn,1)[1].split('"', 1)[0]
extracted_part = part.split(fn,1)[0] + ' ' + part.split('/', 1)[-1]
- newparts[i] = '<Tooltip text="' + extracted_part + '" link="' + link + '" />'
+ newparts[i] = '<Tooltip link="' + link + '">' + extracted_part + '</Tooltip>'
else:
- newparts[i] = '<Tooltip text="%s" />' % part
+ newparts[i] = '<Tooltip>%s</Tooltip>' % part
# print('[extract] tooltip: ' + newparts[i])
else:
# print('[extract] pass: ' + part[:10] + '..')
@@ -42,7 +48,6 @@ def place_tooltips(body):
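+# markdown image with an inline base64 data-uri: captures alt text, mime subtype and payload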
IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"
public = '../discoursio-web/public'
-cdn = 'https://assets.discours.io'
cache = {}
@@ -81,7 +86,7 @@ def extract_images(body, oid):
body = body.replace(' [](data:image', '![](data:image').replace('\n[](data:image', '![](data:image')
oldparts = body.split(sep)
newparts = list(oldparts)
- print()
+ # print()
if len(oldparts) > 1:
print('[extract] images for %s' % oid)
print('[extract] %d candidates' % (len(oldparts)-1))
@@ -95,15 +100,12 @@ def extract_images(body, oid):
if end:
continue
else: # start or between
- # print('[extract_images] have next')
for mime in IMAGES.keys():
if mime in current[-15:]:
- # print('[extract_images] found proper mime type')
print('[extract] ' + current[-15:])
if ')' in next:
b64encoded = next.split(')')[0]
print('[extract] '+str(i+1)+': %d bytes' % len(b64encoded))
- # print(meta)
ext = IMAGES[mime]
print('[extract] type: ' + mime)
name = oid + '-' + str(i)
@@ -122,8 +124,8 @@ def extract_images(body, oid):
newparts[i] = current.split('![')[0] + '![' + name + '](' + link + ')'
newparts[i+1] = next.replace(b64encoded + ')', '')
else:
- print('[extract] not b64encoded')
- print(current[-15:])
+ print('[extract] ERROR: no b64encoded')
+ # print(current[-15:])
i += 1
newbody = ''.join(newparts)
return newbody
@@ -146,9 +148,91 @@ def cleanup(body):
# .replace('\u2212', '-')
return newbody
-
def extract(body, oid):
newbody = extract_images(body, oid)
newbody = cleanup(newbody)
newbody = place_tooltips(newbody)
- return newbody
\ No newline at end of file
+ return newbody
+
+def prepare_body(entry):
+ # assemble the mdx body from the entry's media blocks and its stored html body
+ body = ''
+ body_orig = entry.get('body', '')
+ if not body_orig: body_orig = ''
+
+ if entry.get('type') == 'Literature':
+ for m in entry.get('media', []):
+ t = m.get('title', '')
+ if t: body_orig += '<h5>' + t + '</h5>\n'
+ body_orig += (m.get('body', '') or '')
+ body_orig += '\n' + m.get('literatureBody', '') + '\n'
+
+ elif entry.get('type') == 'Video':
+ providers = set([])
+ video_url = ''
+ require = False
+ for m in entry.get('media', []):
+ yt = m.get('youtubeId', '')
+ vm = m.get('vimeoId', '')
+ if yt:
+ require = True
+ providers.add('YouTube')
+ video_url = 'https://www.youtube.com/watch?v=' + yt
+ body += '<YouTube youtubeId=\'' + yt + '\' />\n'
+ if vm:
+ require = True
+ providers.add('Vimeo')
+ video_url = 'https://vimeo.com/' + vm
+ body += '<Vimeo vimeoId=\'' + vm + '\' />\n'
+ body += extract(html2text(m.get('body', '')), entry['_id'])
+ if not video_url: print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
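+ # add the solid-social import only when an embed component was actually emitted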
+ if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n'
+
+ elif entry.get('type') == 'Music':
+ for m in entry.get('media', []):
+ artist = m.get('performer')
+ trackname = ''
+ if artist: trackname += artist + ' - '
+ if 'title' in m: trackname += m.get('title','')
+ body += '<MusicPlayer src=\'' + m.get('fileUrl', '') + '\' title=\'' + trackname + '\' />\n'
+ body += extract(html2text(m.get('body', '')), entry['_id'])
+ body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n'
+
+ elif entry.get('type') == 'Image':
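+ # cover: prefer thumborId via cdn, fall back to image.url, drop cloudinary urls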
+ cover = ''
+ if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId']
+ if not cover and 'image' in entry:
+ cover = entry['image'].get('url', '')
+ if 'cloudinary' in cover: cover = ''
+ images = {}
+ for m in entry.get('media', []):
+ t = m.get('title', '')
+ if t: body += '#### ' + t + '\n'
+ u = m.get('image', {}).get('url', '')
+ if 'cloudinary' in u:
+ u = m.get('thumborId')
+ if not u: u = cover
+ u = str(u)
+ if u not in images.keys():
+ if u.startswith('production'): u = s3 + u
+ body += '<img src=\'' + u + '\' />\n' # TODO: gallery here
+ images[u] = u
+ body += extract(html2text(m.get('body', '')), entry['_id']) + '\n'
+
+ if not body_orig:
+ print('[prepare] using body history...')
+ # print(entry.get('bodyHistory', ''))
+ try:
+ for up in entry.get('bodyHistory', []):
+ body_orig = up.get('text', '') or ''
+ if body_orig: break
+ except: pass
+
+ # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
+ body += extract(html2text(body_orig), entry['_id'])
+
+ # replace some topics
+ for oldtopicslug, newtopicslug in retopics.items():
+ body = body.replace(oldtopicslug, newtopicslug)
+
+ return body
diff --git a/migration/tables/content_item_categories.py b/migration/tables/content_item_categories.py
index 310fc231..3009d522 100644
--- a/migration/tables/content_item_categories.py
+++ b/migration/tables/content_item_categories.py
@@ -35,7 +35,7 @@ def migrate(entry, topics_by_oid):
if not topic:
del topic_dict['oid']
topic = Topic.create(**topic_dict)
- print('created')
+ # print('created')
else:
if len(topic.title) > len(topic_dict['title']) or \
len(topic.body) < len(topic_dict['body']):
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
index a529d02f..be2d0dc5 100644
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -1,15 +1,12 @@
from dateutil.parser import parse as date_parse
import frontmatter
import json
-import sqlalchemy
from orm import Shout, ShoutTopic, ShoutRating, ShoutViewByDay, User, shout
-# from bs4 import BeautifulSoup
-from migration.html2text import html2text
from transliterate import translit
from datetime import datetime
from orm.base import local_session
from orm.community import Community
-from migration.extract import extract
+from migration.extract import prepare_body
import os
DISCOURS_USER = {
@@ -19,9 +16,9 @@ DISCOURS_USER = {
'userpic': 'https://discours.io/images/logo-mini.svg',
'createdAt': '2016-03-05 22:22:00.350000'
}
-
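+# default timestamp for legacy entries that lack dates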
+OLD_DATE = '2016-03-05 22:22:00.350000'
+retopics = json.loads(open('migration/tables/replacements.json').read())
ts = datetime.now()
-
type2layout = {
'Article': 'article',
'Literature': 'prose',
@@ -42,44 +39,17 @@ def get_metadata(r):
metadata['cover'] = r.get('cover')
return metadata
-
-retopics = json.loads(open('migration/tables/replacements.json').read())
-
def migrate(entry, users_by_oid, topics_by_oid):
- '''
- type Shout {
- slug: String!
- author: Int!
- body: String!
- createdAt: DateTime!
- updatedAt: DateTime!
- deletedAt: DateTime
- deletedBy: Int
- rating: Int
- ratings: [Rating]
- published: Bool!
- publishedAt: DateTime # if there is no published field - it is not published
- replyTo: String # another shout
- tags: [String] # actual values
- topics: [String] # topic-slugs, order has matter
- title: String
- versionOf: String
- visibleForRoles: [String] # role ids are strings
- visibleForUsers: [Int]
- views: Int
- }
- '''
- # print(entry)
- content = ''
+ # init, set title and layout
r = {
'layout': type2layout[entry['type']],
'title': entry['title'],
'community': Community.default_community.id,
'authors': [],
'topics': [],
- 'rating': entry.get('rating', 0),
+ 'rating': 0,
'ratings': [],
- 'createdAt': entry.get('createdAt', '2016-03-05 22:22:00.350000')
+ 'createdAt': ''
}
# slug
@@ -92,126 +62,33 @@ def migrate(entry, users_by_oid, topics_by_oid):
except: raise Exception
if s: r['slug'] = s
else: raise Exception
-
- # topics
-
- category = entry['category']
- mainTopic = topics_by_oid.get(category)
- if mainTopic:
- r['mainTopic'] = mainTopic["slug"]
- topic_oids = [category, ]
- topic_errors = []
- topic_oids.extend(entry.get('tags', []))
- for oid in topic_oids:
- if oid in topics_by_oid:
- r['topics'].append(topics_by_oid[oid])
- else:
- # print('ERROR: unknown old topic id: ' + oid)
- topic_errors.append(oid)
# cover
-
- if entry.get('image') is not None:
- r['cover'] = entry['image']['url']
- if entry.get('thumborId') is not None:
- r['cover'] = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
- if entry.get('updatedAt') is not None:
- r['updatedAt'] = date_parse(entry['updatedAt'])
-
- # body
-
- body = ''
- body_orig = entry.get('body')
- if not body_orig: body_orig = ''
-
- # body modifications
-
- if entry.get('type') == 'Literature':
- for m in entry.get('media', []):
- t = m.get('title', '')
- if t: body_orig += '### ' + t + '\n'
- body_orig += (m.get('body', '') or '')
- body_orig += '\n' + m.get('literatureBody', '') + '\n'
-
-
- elif entry.get('type') == 'Video':
- providers = set([])
- video_url = ''
- require = False
- for m in entry.get('media', []):
- yt = m.get('youtubeId', '')
- vm = m.get('vimeoId', '')
- if yt:
- require = True
- providers.add('YouTube')
- video_url = 'https://www.youtube.com/watch?v=' + yt
- body += '<YouTube youtubeId=\'' + yt + '\' />\n'
- if vm:
- require = True
- providers.add('Vimeo')
- video_url = 'https://vimeo.com/' + vm
- body += '<Vimeo vimeoId=\'' + vm + '\' />\n'
- body += extract(html2text(m.get('body', '')), entry['_id'])
- if video_url == '#': print(entry.get('media', 'UNKNOWN MEDIA PROVIDER!'))
- if require: body = 'import { ' + ','.join(list(providers)) + ' } from \'solid-social\'\n\n' + body + '\n'
- body += extract(html2text(body_orig), entry['_id'])
-
- elif entry.get('type') == 'Music':
- require = False
- for m in entry.get('media', []):
- if 'fileUrl' in m:
- require = True
- artist = m.get('performer')
- trackname = ''
- if artist: trackname += artist + ' - '
- trackname += m.get('title','')
- body += '<MusicPlayer src=\'' + m.get('fileUrl', '') + '\' title=\'' + trackname + '\' />\n'
- body += extract(html2text(m.get('body', '')), entry['_id'])
- else:
- print(m)
- if require: body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + body + '\n'
- body += extract(html2text(body_orig), entry['_id'])
-
- elif entry.get('type') == 'Image':
- cover = r.get('cover')
- images = {}
- for m in entry.get('media', []):
- t = m.get('title', '')
- if t: body += '#### ' + t + '\n'
- u = m.get('image', {}).get('url', '')
- if 'cloudinary' in u:
- u = m.get('thumborId')
- if not u: u = cover
- if u not in images.keys():
- if u.startswith('production'): u = 'https://discours-io.s3.amazonaws.com/' + u
- body += '<img src=\'' + u + '\' />\n' # TODO: gallery here
- images[u] = u
- body += extract(html2text(m.get('body', '')), entry['_id']) + '\n'
- body += extract(html2text(body_orig), entry['_id'])
-
- # simple post or no body stored
- if body == '':
- if not body_orig:
- print('[migration] using body history...')
- try: body_orig += entry.get('bodyHistory', [{'body': ''}])[0].get('body', '')
- except: pass
- # need to extract
- # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
- body += extract(html2text(body_orig), entry['_id'])
+ c = ''
+ if entry.get('thumborId'):
+ c = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
else:
- # EVERYTHING IS FINE HERE
- pass
-
- # replace some topics
- for oldtopicslug, newtopicslug in retopics.items():
- body.replace(oldtopicslug, newtopicslug)
+ c = entry.get('image', {}).get('url')
+ if not c or 'cloudinary' in c:
+ c = ''
+ r['cover'] = c
- # authors
+ # timestamps
- # get author data
- userdata = {}
- try: userdata = users_by_oid[entry['createdBy']]
- except KeyError:
+ r['createdAt'] = date_parse(entry.get('createdAt', OLD_DATE))
+ r['updatedAt'] = date_parse(entry['updatedAt']) if 'updatedAt' in entry else ts
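+ # publishedAt falls back to OLD_DATE; that placeholder is then replaced with migration time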
+ if entry.get('published'):
+ r['publishedAt'] = date_parse(entry.get('publishedAt', OLD_DATE))
+ if r['publishedAt'] == date_parse(OLD_DATE): r['publishedAt'] = ts
+ if 'deletedAt' in entry: r['deletedAt'] = date_parse(entry['deletedAt'])
+
+ # connected users' data
+
+ # r['deletedBy'] = entry.get('deletedBy', '0') # TypeError: 'deletedBy' is an invalid keyword argument for Shout
+
+ oid = entry.get('createdBy', '')
+ userdata = users_by_oid.get(oid, {})
+ if not userdata.get('slug'):
app = entry.get('application')
if app:
userslug = translit(app['name'], 'ru', reversed=True).replace(' ', '-').replace('\'', '').replace('.', '-').lower()
@@ -232,20 +109,22 @@ def migrate(entry, users_by_oid, topics_by_oid):
'userpic': 'https://discours.io/image/logo-mini.svg'
}
- # set author data
- r['body'] = body
- shout_dict = r.copy()
author = { # a short version for public listings
'slug': userdata.get('slug', 'discours'),
'name': userdata.get('name', 'Дискурс'),
'userpic': userdata.get('userpic', '')
}
- shout_dict['authors'] = [ author, ]
+ r['authors'] = [ author, ]
+ # body
+
+ body = prepare_body(entry)
# save mdx for prerender if published
- if entry['published']:
- metadata = get_metadata(shout_dict)
+ r['body'] = body
+ if entry.get('published'):
+ content = ''
+ metadata = get_metadata(r)
content = frontmatter.dumps(frontmatter.Post(r['body'], **metadata))
ext = 'mdx'
parentDir = '/'.join(os.getcwd().split('/')[:-1])
@@ -255,90 +134,116 @@ def migrate(entry, users_by_oid, topics_by_oid):
open(filepath + '.' + ext, 'w').write(bc)
# open(filepath + '.html', 'w').write(body_orig)
+
+ # topics
+
+ category = entry['category']
+ mainTopic = topics_by_oid.get(category)
+ if mainTopic:
+ r['mainTopic'] = mainTopic["slug"]
+ topic_oids = [category, ]
+ topic_errors = []
+ topic_oids.extend(entry.get('tags', []))
+ for oid in topic_oids:
+ if oid in topics_by_oid:
+ r['topics'].append(topics_by_oid[oid])
+ else:
+ # print('ERROR: unknown old topic id: ' + oid)
+ topic_errors.append(oid)
+
+ # set prepared shout data
+
+ shout_dict = r.copy()
+ del shout_dict['topics'] # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state'
+ del shout_dict['rating'] # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout
+ del shout_dict['ratings']
+
+ # get author
+
+ user = None
+ email = userdata.get('email')
+ authorslug = userdata.get('slug')
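+ # resolve the author: by email first, then by slug, else create from migrated userdata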
+ with local_session() as session:
+ try:
+ if email: user = session.query(User).filter(User.email == email).first()
+ if not user and authorslug: user = session.query(User).filter(User.slug == authorslug).first()
+ if not user and userdata: user = User.create(**userdata)
+ except:
+ print('[migration] shout author error: \n%r' % entry)
+ raise Exception
+ assert user, 'could not get a user'
+ shout_dict['authors'] = [ user, ]
+
# save shout to db
+ s = object()
+ try: s = Shout.create(**shout_dict)
+ except: print('[migration] shout create error: \n%r' % shout_dict)
+
+
+ # shout ratings
try:
- shout_dict['createdAt'] = date_parse(r.get('createdAt')) if entry.get('createdAt') else ts
- shout_dict['publishedAt'] = date_parse(entry.get('publishedAt')) if entry.get('published') else None
-
- if entry.get('deletedAt') is not None:
- shout_dict['deletedAt'] = date_parse(entry.get('deletedAt'))
- shout_dict['deletedBy'] = entry.get('deletedBy', '0')
-
- del shout_dict['topics'] # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state'
- del shout_dict['rating'] # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout
- del shout_dict['ratings']
-
- # get user
-
- user = None
- email = userdata.get('email')
- slug = userdata.get('slug')
- with local_session() as session:
- try:
- if email: user = session.query(User).filter(User.email == email).first()
- if not user and slug: user = session.query(User).filter(User.slug == slug).first()
- if not user and userdata: user = User.create(**userdata)
- except:
- print('[migration] content_items error: \n%r' % entry)
- assert user, 'could not get a user'
- shout_dict['authors'] = [ user, ]
-
- # create shout
-
- s = object()
- try: s = Shout.create(**shout_dict)
- except: print('[migration] content_items error: \n%r' % entry)
-
- # shout ratings
-
shout_dict['ratings'] = []
for shout_rating_old in entry.get('ratings',[]):
with local_session() as session:
- rater = session.query(User).\
- filter(User.old_id == shout_rating_old['createdBy']).first()
- if rater:
- shout_rating_dict = {
- 'value': shout_rating_old['value'],
- 'rater': rater.slug,
- 'shout': s.slug
- }
- cts = shout_rating_old.get('createdAt')
- if cts: shout_rating_dict['ts'] = date_parse(cts)
- try:
+ rater = session.query(User).filter(User.old_id == shout_rating_old['createdBy']).first()
+ if rater:
+ shout_rating_dict = {
+ 'value': shout_rating_old['value'],
+ 'rater': rater.slug,
+ 'shout': s.slug
+ }
+ cts = shout_rating_old.get('createdAt')
+ if cts: shout_rating_dict['ts'] = date_parse(cts)
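+ # merge with an existing rating by the same rater instead of creating a duplicate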
shout_rating = session.query(ShoutRating).\
filter(ShoutRating.shout == s.slug).\
filter(ShoutRating.rater == rater.slug).first()
if shout_rating:
- shout_rating_dict['value'] += int(shout_rating.value or 0)
+ shout_rating_dict['value'] = int(shout_rating_dict['value'] or 0) + int(shout_rating.value or 0)
shout_rating.update(shout_rating_dict)
else: ShoutRating.create(**shout_rating_dict)
shout_dict['ratings'].append(shout_rating_dict)
- except sqlalchemy.exc.IntegrityError:
- print('[migration] shout_rating error: \n%r' % shout_rating_dict)
- pass
-
- # shout topics
+ except:
+ print('[migration] shout rating error: \n%r' % shout_rating_old)
+ # raise Exception
+ # shout topics
+ try:
shout_dict['topics'] = []
for topic in r['topics']:
- try:
- tpc = topics_by_oid[topic['oid']]
- slug = retopics.get(tpc['slug'], tpc['slug'])
- ShoutTopic.create(**{ 'shout': s.slug, 'topic': slug })
- shout_dict['topics'].append(slug)
- except sqlalchemy.exc.IntegrityError:
- pass
-
- # shout views
+ tpc = topics_by_oid[topic['oid']]
+ oldslug = tpc['slug']
+ newslug = retopics.get(oldslug, oldslug)
+ need_create_topic = False
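+ # if the topic slug was renamed, relink the existing ShoutTopic row instead of duplicating it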
+ if newslug:
+ with local_session() as session:
+ shout_topic_new = session.query(ShoutTopic)\
+ .filter(ShoutTopic.shout == s.slug)\
+ .filter(ShoutTopic.topic == newslug).first()
+ shout_topic_old = session.query(ShoutTopic)\
+ .filter(ShoutTopic.shout == s.slug)\
+ .filter(ShoutTopic.topic == oldslug).first()
+ if not shout_topic_new:
+ if shout_topic_old:
+ shout_topic_old.update({ 'topic': newslug })
+ else:
+ need_create_topic = True
+ if need_create_topic:
+ ShoutTopic.create(**{ 'shout': s.slug, 'topic': newslug })
+ shout_dict['topics'].append(newslug)
+ except:
+ print('[migration] shout topic error: \n%r' % entry)
+ raise Exception
+ # shout views
+ try:
views = entry.get('views', 1)
ShoutViewByDay.create(
shout = s.slug,
value = views
)
-
- except Exception as e:
- raise e
+ except:
+ print('[migration] shout view error: \n%r' % entry)
+ # raise Exception
shout_dict['old_id'] = entry.get('_id')
return shout_dict, topic_errors
diff --git a/migration/tables/tags.py b/migration/tables/tags.py
index 8954a71d..affc79ae 100644
--- a/migration/tables/tags.py
+++ b/migration/tables/tags.py
@@ -40,7 +40,7 @@ def migrate(entry, topics_by_oid):
del topic_dict['oid']
topic = Topic.create(**topic_dict)
except Exception as e:
- print(e)
+ # print(e)
raise e
topic_dict['oid'] = entry['_id']
return topic_dict