migration-is-back
parent 83f5f280b2, commit 65532ea1a3

migrate.py (new file, 303 lines)
@@ -0,0 +1,303 @@
''' cmd managed migration '''
from datetime import datetime
import json
import subprocess
import sys
import os

# from migration.export import export_email_subscriptions
from migration.export import export_mdx, export_slug
from migration.tables.users import migrate as migrateUser
from migration.tables.users import migrate_2stage as migrateUser_2stage
from migration.tables.content_items import get_shout_slug, migrate as migrateShout
from migration.tables.topics import migrate as migrateTopic
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
from orm.reaction import Reaction
from settings import DB_URL

TODAY = datetime.strftime(datetime.now(), '%Y%m%d')

OLD_DATE = '2016-03-05 22:22:00.350000'
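# Pipeline overview (see handle_auto below): mongodump pulls the live MongoDB,
# bson2json unpacks dump/discours/*.bson into migration/data/*.json, data_load
# builds the in-memory `storage` index, the *_handle functions migrate users,
# topics, shouts and comments into the ORM, and create_pgdump snapshots the
# resulting Postgres database.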
def users_handle(storage):
    ''' migrating users first '''
    counter = 0
    id_map = {}
    print('[migration] migrating %d users' % (len(storage['users']['data'])))
    for entry in storage['users']['data']:
        oid = entry['_id']
        user = migrateUser(entry)
        storage['users']['by_oid'][oid] = user  # full
        del user['password']
        del user['notifications']
        del user['emailConfirmed']
        del user['username']
        del user['email']
        storage['users']['by_slug'][user['slug']] = user  # public
        id_map[user['oid']] = user['slug']
        counter += 1
    ce = 0
    for entry in storage['users']['data']:
        ce += migrateUser_2stage(entry, id_map)
    return storage
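# Note: user migration is two-stage — migrateUser creates each row first, then
# migrateUser_2stage runs once all users exist and id_map (old Mongo oid ->
# new slug) is complete, presumably to resolve cross-user references.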
def topics_handle(storage):
    ''' topics from categories and tags '''
    counter = 0
    for t in (storage['topics']['tags'] + storage['topics']['cats']):
        if t['slug'] in storage['replacements']:
            t['slug'] = storage['replacements'][t['slug']]
            topic = migrateTopic(t)
            storage['topics']['by_oid'][t['_id']] = topic
            storage['topics']['by_slug'][t['slug']] = topic
            counter += 1
        else:
            print('[migration] topic ' + t['slug'] + ' ignored')
    for oldslug, newslug in storage['replacements'].items():
        if oldslug != newslug and oldslug in storage['topics']['by_slug']:
            oid = storage['topics']['by_slug'][oldslug]['_id']
            del storage['topics']['by_slug'][oldslug]
            storage['topics']['by_oid'][oid] = storage['topics']['by_slug'][newslug]
    print('[migration] ' + str(counter) + ' topics migrated')
    print('[migration] ' + str(len(storage['topics']['by_oid'].values())) + ' topics by oid')
    print('[migration] ' + str(len(storage['topics']['by_slug'].values())) + ' topics by slug')
    # raise Exception
    return storage
def shouts_handle(storage, args):
    ''' migrating content items one by one '''
    counter = 0
    discours_author = 0
    pub_counter = 0
    for entry in storage['shouts']['data']:
        # slug
        slug = get_shout_slug(entry)

        # single slug mode
        if '-' in args and slug not in args: continue

        # migrate
        shout = migrateShout(entry, storage)
        storage['shouts']['by_oid'][entry['_id']] = shout
        storage['shouts']['by_slug'][shout['slug']] = shout
        # shouts.topics
        if not shout['topics']: print('[migration] no topics!')

        # with author
        author = shout['authors'][0].slug
        if author == 'discours': discours_author += 1
        # print('[migration] ' + shout['slug'] + ' with author ' + author)

        if entry.get('published'):
            if 'mdx' in args: export_mdx(shout)
            pub_counter += 1

        # print main counter
        counter += 1
        line = str(counter) + ': ' + shout['slug'] + " @" + author
        print(line)

    print('[migration] ' + str(counter) + ' content items were migrated')
    print('[migration] ' + str(pub_counter) + ' have been published')
    print('[migration] ' + str(discours_author) + ' authored by @discours')
    return storage
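# Note: "single slug mode" above is triggered by `python migrate.py - <slug>`;
# sys.argv then contains '-', so only the shout whose slug is also present in
# args gets migrated.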
def comments_handle(storage):
    id_map = {}
    ignored_counter = 0
    missed_shouts = {}
    for oldcomment in storage['reactions']['data']:
        if not oldcomment.get('deleted'):
            reaction = migrateComment(oldcomment, storage)
            if type(reaction) == str:
                missed_shouts[reaction] = oldcomment
            elif type(reaction) == Reaction:
                reaction = reaction.dict()
                id = reaction['id']
                oid = reaction['oid']
                id_map[oid] = id
            else:
                ignored_counter += 1

    for reaction in storage['reactions']['data']: migrateComment_2stage(reaction, id_map)
    print('[migration] ' + str(len(id_map)) + ' comments migrated')
    print('[migration] ' + str(ignored_counter) + ' comments ignored')
    print('[migration] ' + str(len(missed_shouts.keys())) + ' commented shouts missed')
    missed_counter = 0
    for missed in missed_shouts.values():
        missed_counter += len(missed)
    print('[migration] ' + str(missed_counter) + ' comments dropped')
    return storage
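# Note: migrateComment returns a shout slug (str) when the commented shout is
# missing, a Reaction when migration succeeded, or something falsy otherwise;
# the id_map of old oid -> new id then feeds migrateComment_2stage, presumably
# to rewire reply threads once all reactions have database ids.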
def bson_handle():
    # decode bson # preparing data
    from migration import bson2json
    bson2json.json_tables()


def export_one(slug, storage):
    topics_handle(storage)
    users_handle(storage)
    shouts_handle(storage, [])  # no slug filter here; export_slug below picks the slug
    export_slug(slug, storage)


def all_handle(storage, args):
    print('[migration] handle everything')
    users_handle(storage)
    topics_handle(storage)
    shouts_handle(storage, args)
    comments_handle(storage)
    # export_email_subscriptions()
    print('[migration] done!')
def data_load():
    storage = {
        'content_items': {
            'by_oid': {},
            'by_slug': {},
        },
        'shouts': {
            'by_oid': {},
            'by_slug': {},
            'data': []
        },
        'reactions': {
            'by_oid': {},
            'by_slug': {},
            'by_content': {},
            'data': []
        },
        'topics': {
            'by_oid': {},
            'by_slug': {},
            'cats': [],
            'tags': [],
        },
        'users': {
            'by_oid': {},
            'by_slug': {},
            'data': []
        },
        'replacements': json.loads(open('migration/tables/replacements.json').read())
    }
    users_data = []
    tags_data = []
    cats_data = []
    comments_data = []
    content_data = []
    try:
        users_data = json.loads(open('migration/data/users.json').read())
        print('[migration] ' + str(len(users_data)) + ' users ')
        tags_data = json.loads(open('migration/data/tags.json').read())
        storage['topics']['tags'] = tags_data
        print('[migration] ' + str(len(tags_data)) + ' tags ')
        cats_data = json.loads(
            open('migration/data/content_item_categories.json').read())
        storage['topics']['cats'] = cats_data
        print('[migration] ' + str(len(cats_data)) + ' cats ')
        comments_data = json.loads(open('migration/data/comments.json').read())
        storage['reactions']['data'] = comments_data
        print('[migration] ' + str(len(comments_data)) + ' comments ')
        content_data = json.loads(open('migration/data/content_items.json').read())
        storage['shouts']['data'] = content_data
        print('[migration] ' + str(len(content_data)) + ' content items ')
        # fill out storage
        for x in users_data:
            storage['users']['by_oid'][x['_id']] = x
            # storage['users']['by_slug'][x['slug']] = x
            # no user.slug yet
        print('[migration] ' + str(len(storage['users']['by_oid'].keys())) + ' users by oid')
        for x in tags_data:
            storage['topics']['by_oid'][x['_id']] = x
            storage['topics']['by_slug'][x['slug']] = x
        for x in cats_data:
            storage['topics']['by_oid'][x['_id']] = x
            storage['topics']['by_slug'][x['slug']] = x
        print('[migration] ' + str(len(storage['topics']['by_slug'].keys())) + ' topics by slug')
        for item in content_data:
            slug = get_shout_slug(item)
            storage['content_items']['by_slug'][slug] = item
            storage['content_items']['by_oid'][item['_id']] = item
        print('[migration] ' + str(len(content_data)) + ' content items')
        for x in comments_data:
            storage['reactions']['by_oid'][x['_id']] = x
            cid = x['contentItem']
            storage['reactions']['by_content'][cid] = x
            ci = storage['content_items']['by_oid'].get(cid, {})
            if 'slug' in ci: storage['reactions']['by_slug'][ci['slug']] = x
        print('[migration] ' + str(len(storage['reactions']['by_content'].keys())) + ' with comments')
    except Exception as e: raise e
    storage['users']['data'] = users_data
    storage['topics']['tags'] = tags_data
    storage['topics']['cats'] = cats_data
    storage['shouts']['data'] = content_data
    storage['reactions']['data'] = comments_data
    return storage
def mongo_download(url):
    if not url: raise Exception('\n\nYou should set MONGODB_URL environment variable\n')
    print('[migration] mongodump ' + url)
    subprocess.call([
        'mongodump',
        '--uri', url + '/?authSource=admin',
        '--forceTableScan',
    ], stderr=subprocess.STDOUT)


def create_pgdump():
    pgurl = DB_URL
    if not pgurl: raise Exception('\n\nYou should set DATABASE_URL environment variable\n')
    subprocess.call(
        ['pg_dump', pgurl, '-f', TODAY + '-pgdump.sql'],
        stderr=subprocess.STDOUT
    )
    subprocess.call([
        'scp',
        TODAY + '-pgdump.sql',
        'root@build.discours.io:/root/.'
    ])
def handle_auto():
    print('[migration] no command given, auto mode')
    mongo_download(os.getenv('MONGODB_URL'))
    bson_handle()
    all_handle(data_load(), sys.argv)
    create_pgdump()


def migrate():
    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        if type(cmd) == str: print('[migration] command: ' + cmd)
        if cmd == 'mongodb':
            mongo_download(sys.argv[2])
        elif cmd == 'bson':
            bson_handle()
        else:
            storage = data_load()
            if cmd == '-': export_one(sys.argv[2], storage)
            else: all_handle(storage, sys.argv)
    elif len(sys.argv) == 1:
        handle_auto()
    else:
        print('[migration] usage: python migrate.py <command>')
        print('[migration] commands: mongodb, bson, all, all mdx, - <slug>')


if __name__ == '__main__':
    migrate()
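# Example invocations (derived from the usage message and dispatch above):
#   python migrate.py mongodb <MONGODB_URL>   # dump MongoDB only
#   python migrate.py bson                    # unpack dump/*.bson to JSON
#   python migrate.py all                     # run every handler
#   python migrate.py all mdx                 # also export .mdx while migrating
#   python migrate.py - <slug>                # migrate and export one slug
#   python migrate.py                         # auto mode: the full pipeline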
migration/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
__all__ = ["tables", "bson2json", "html2md"]
migration/bson2json.py (new file, 28 lines)
@@ -0,0 +1,28 @@
import os
import bson
import json

from migration.utils import DateTimeEncoder


def json_tables():
    print('[migration] unpack dump/discours/*.bson to migration/data/*.json')
    data = {
        "content_items": [],
        "content_item_categories": [],
        "tags": [],
        "email_subscriptions": [],
        "users": [],
        "comments": []
    }
    for table in data.keys():
        lc = []
        with open('dump/discours/' + table + '.bson', 'rb') as f:
            bs = f.read()
        base = 0
        while base < len(bs):
            base, d = bson.decode_document(bs, base)
            lc.append(d)
        data[table] = lc
        open(os.getcwd() + '/migration/data/' + table + '.json', 'w').write(json.dumps(lc, cls=DateTimeEncoder))
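# Note: bson.decode_document here is assumed to be the standalone `bson`
# package's API, which takes the raw byte buffer and an offset and returns
# (next_offset, decoded_dict); the while-loop walks the concatenated BSON
# dump document by document without going through mongorestore.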
migration/export.py (new file, 105 lines)
@@ -0,0 +1,105 @@

from datetime import datetime
import json
import os
import frontmatter
from migration.extract import extract_html, prepare_html_body
from migration.utils import DateTimeEncoder

OLD_DATE = '2016-03-05 22:22:00.350000'
EXPORT_DEST = '../discoursio-web/data/'
parentDir = '/'.join(os.getcwd().split('/')[:-1])
contentDir = parentDir + '/discoursio-web/content/'
ts = datetime.now()


def get_metadata(r):
    authors = []
    for a in r['authors']:
        authors.append({  # a short version for public listings
            'slug': a.slug or 'discours',
            'name': a.name or 'Дискурс',
            'userpic': a.userpic or 'https://discours.io/static/img/discours.png'
        })
    metadata = {}
    metadata['title'] = r.get('title', '').replace('{', '(').replace('}', ')')
    metadata['authors'] = authors
    metadata['createdAt'] = r.get('createdAt', ts)
    metadata['layout'] = r['layout']
    metadata['topics'] = [topic for topic in r['topics']]
    metadata['topics'].sort()
    if r.get('cover', False): metadata['cover'] = r.get('cover')
    return metadata
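# get_metadata builds the dict that export_mdx below passes to
# frontmatter.Post, i.e. the frontmatter of the exported .mdx file; curly
# braces in titles are swapped for parentheses, presumably because literal
# { } would be parsed as JSX expressions by MDX.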
def export_mdx(r):
    # print('[export] mdx %s' % r['slug'])
    content = ''
    metadata = get_metadata(r)
    content = frontmatter.dumps(frontmatter.Post(r['body'], **metadata))
    ext = 'mdx'
    filepath = contentDir + r['slug']
    bc = bytes(content, 'utf-8').decode('utf-8', 'ignore')
    open(filepath + '.' + ext, 'w').write(bc)


def export_body(shout, storage):
    entry = storage['content_items']['by_oid'][shout['oid']]
    if entry:
        shout['body'] = prepare_html_body(entry)  # prepare_md_body(entry)
        export_mdx(shout)
        print('[export] html for %s' % shout['slug'])
        body = extract_html(entry)
        open(contentDir + shout['slug'] + '.html', 'w').write(body)
    else:
        raise Exception('no content_items entry found')
def export_slug(slug, storage):
    shout = storage['shouts']['by_slug'].get(slug)
    assert shout, '[export] no shout found by slug: %s ' % slug
    author = shout['authors'][0]
    assert author, '[export] no author error'
    export_body(shout, storage)


def export_email_subscriptions():
    email_subscriptions_data = json.loads(open('migration/data/email_subscriptions.json').read())
    for data in email_subscriptions_data:
        # migrate_email_subscription(data)
        pass
    print('[migration] ' + str(len(email_subscriptions_data)) + ' email subscriptions exported')


def export_shouts(storage):
    # update what was just migrated or load json again
    if len(storage['users']['by_slugs'].keys()) == 0:
        storage['users']['by_slugs'] = json.loads(open(EXPORT_DEST + 'authors.json').read())
        print('[migration] ' + str(len(storage['users']['by_slugs'].keys())) + ' exported authors ')
    if len(storage['shouts']['by_slugs'].keys()) == 0:
        storage['shouts']['by_slugs'] = json.loads(open(EXPORT_DEST + 'articles.json').read())
        print('[migration] ' + str(len(storage['shouts']['by_slugs'].keys())) + ' exported articles ')
    for slug in storage['shouts']['by_slugs'].keys(): export_slug(slug, storage)


def export_json(export_articles={}, export_authors={}, export_topics={}, export_comments={}):
    open(EXPORT_DEST + 'authors.json', 'w').write(json.dumps(export_authors,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print('[migration] ' + str(len(export_authors.items())) + ' authors exported')
    open(EXPORT_DEST + 'topics.json', 'w').write(json.dumps(export_topics,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print('[migration] ' + str(len(export_topics.keys())) + ' topics exported')

    open(EXPORT_DEST + 'articles.json', 'w').write(json.dumps(export_articles,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print('[migration] ' + str(len(export_articles.items())) + ' articles exported')
    open(EXPORT_DEST + 'comments.json', 'w').write(json.dumps(export_comments,
        cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False))
    print('[migration] ' + str(len(export_comments.items())) + ' exported articles with comments')
migration/extract.py (new file, 324 lines)
@@ -0,0 +1,324 @@
import os
import re
import base64
from migration.html2text import html2text

TOOLTIP_REGEX = r'(\/\/\/(.+)\/\/\/)'
contentDir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'discoursio-web', 'content')
s3 = 'https://discours-io.s3.amazonaws.com/'
cdn = 'https://assets.discours.io'


def replace_tooltips(body):
    # FIXME: if you prefer regexp
    newbody = body
    matches = list(re.finditer(TOOLTIP_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
    for match in matches:
        newbody = body.replace(match.group(1), '<Tooltip text="' + match.group(2) + '" />')  # FIXME: doesn't work
    if len(matches) > 0:
        print('[extract] found %d tooltips' % len(matches))
    return newbody
def place_tooltips(body):
    parts = body.split('&&&')
    l = len(parts)
    newparts = list(parts)
    placed = False
    if l & 1:
        if l > 1:
            i = 1
            print('[extract] found %d tooltips' % (l - 1))
            for part in parts[1:]:
                if i & 1:
                    placed = True
                    if 'a class="footnote-url" href=' in part:
                        print('[extract] footnote: ' + part)
                        fn = 'a class="footnote-url" href="'
                        link = part.split(fn, 1)[1].split('"', 1)[0]
                        extracted_part = part.split(fn, 1)[0] + ' ' + part.split('/', 1)[-1]
                        newparts[i] = '<Tooltip' + (' link="' + link + '" ' if link else '') + '>' + extracted_part + '</Tooltip>'
                    else:
                        newparts[i] = '<Tooltip>%s</Tooltip>' % part
                    # print('[extract] ' + newparts[i])
                else:
                    # print('[extract] ' + part[:10] + '..')
                    newparts[i] = part
                i += 1
    return (''.join(newparts), placed)


IMG_REGEX = r"\!\[(.*?)\]\((data\:image\/(png|jpeg|jpg);base64\,((?:[A-Za-z\d+\/]{4})*(?:[A-Za-z\d+\/]{3}=|[A-Za-z\d+\/]{2}==)))\)"

parentDir = '/'.join(os.getcwd().split('/')[:-1])
public = parentDir + '/discoursio-web/public'
cache = {}
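# Note: place_tooltips treats '&&&' as a paired delimiter, so a well-formed
# body splits into an odd number of parts (hence the `l & 1` check); the
# odd-indexed parts are tooltip bodies and the even-indexed parts are the
# plain text between them.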
def reextract_images(body, oid):
    # FIXME: if you prefer regexp
    matches = list(re.finditer(IMG_REGEX, body, re.IGNORECASE | re.MULTILINE))[1:]
    i = 0
    for match in matches:
        print('[extract] image ' + match.group(1))
        ext = match.group(3)
        name = oid + str(i)
        link = public + '/upload/image-' + name + '.' + ext
        img = match.group(4)
        title = match.group(1)  # FIXME: this is not the title
        if img not in cache:
            content = base64.b64decode(img + '==')
            print(str(len(img)) + ' image bytes been written')
            open('../' + link, 'wb').write(content)
            cache[img] = name
            i += 1
        else:
            print('[extract] image cached ' + cache[img])
        body.replace(str(match), '')  # FIXME: this does not work
    return body


IMAGES = {
    'data:image/png': 'png',
    'data:image/jpg': 'jpg',
    'data:image/jpeg': 'jpg',
}

b64 = ';base64,'
def extract_imageparts(bodyparts, prefix):
    # recursive loop
    newparts = list(bodyparts)
    for current in bodyparts:
        i = bodyparts.index(current)
        for mime in IMAGES.keys():
            if mime == current[-len(mime):] and (i + 1 < len(bodyparts)):
                print('[extract] ' + mime)
                next = bodyparts[i + 1]
                ext = IMAGES[mime]
                b64end = next.index(')')
                b64encoded = next[:b64end]
                name = prefix + '-' + str(len(cache))
                link = '/upload/image-' + name + '.' + ext
                print('[extract] name: ' + name)
                print('[extract] link: ' + link)
                print('[extract] %d bytes' % len(b64encoded))
                if b64encoded not in cache:
                    try:
                        content = base64.b64decode(b64encoded + '==')
                        open(public + link, 'wb').write(content)
                        print('[extract] ' + str(len(content)) + ' image bytes been written')
                        cache[b64encoded] = name
                    except:
                        raise Exception
                        # raise Exception('[extract] error decoding image %r' % b64encoded)
                else:
                    print('[extract] cached link ' + cache[b64encoded])
                    name = cache[b64encoded]
                    link = cdn + '/upload/image-' + name + '.' + ext
                newparts[i] = current[:-len(mime)] + current[-len(mime):] + link + next[-b64end:]
                newparts[i + 1] = next[:-b64end]
                break
    return extract_imageparts(newparts[i] + newparts[i + 1] + b64.join(bodyparts[i + 2:]), prefix) \
        if len(bodyparts) > (i + 1) else ''.join(newparts)
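# Note: the '==' appended before base64.b64decode appears to compensate for
# padding characters stripped from the stored data URLs; CPython's b64decode
# is lenient about excess padding (outside strict_mode), which is presumably
# what this relies on.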
def extract_dataimages(parts, prefix):
    newparts = list(parts)
    for part in parts:
        i = parts.index(part)
        if part.endswith(']('):
            [ext, rest] = parts[i + 1].split(b64)
            name = prefix + '-' + str(len(cache))
            if ext == '/jpeg': ext = 'jpg'
            else: ext = ext.replace('/', '')
            link = '/upload/image-' + name + '.' + ext
            print('[extract] filename: ' + link)
            b64end = rest.find(')')
            if b64end != -1:
                b64encoded = rest[:b64end]
                print('[extract] %d text bytes' % len(b64encoded))
                # write if not cached
                if b64encoded not in cache:
                    try:
                        content = base64.b64decode(b64encoded + '==')
                        open(public + link, 'wb').write(content)
                        print('[extract] ' + str(len(content)) + ' image bytes')
                        cache[b64encoded] = name
                    except:
                        raise Exception
                        # raise Exception('[extract] error decoding image %r' % b64encoded)
                else:
                    print('[extract] 0 image bytes, cached for ' + cache[b64encoded])
                    name = cache[b64encoded]

                # update link with CDN
                link = cdn + '/upload/image-' + name + '.' + ext

                # patch newparts
                newparts[i + 1] = link + rest[b64end:]
            else:
                raise Exception('cannot find the end of base64 encoded string')
        else:
            print('[extract] dataimage skipping part ' + str(i))
            continue
    return ''.join(newparts)


di = 'data:image'
def extract_md_images(body, oid):
    newbody = ''
    body = body\
        .replace('\n! [](' + di, '\n ![](' + di)\
        .replace('\n[](' + di, '\n![](' + di)\
        .replace(' [](' + di, ' ![](' + di)
    parts = body.split(di)
    i = 0
    if len(parts) > 1: newbody = extract_dataimages(parts, oid)
    else: newbody = body
    return newbody
def cleanup(body):
    newbody = body\
        .replace('<', '').replace('>', '')\
        .replace('{', '(').replace('}', ')')\
        .replace('…', '...')\
        .replace(' __ ', ' ')\
        .replace('_ _', ' ')\
        .replace('****', '')\
        .replace('\u00a0', ' ')\
        .replace('\u02c6', '^')\
        .replace('\ufeff', '')\
        .replace('\u200b', '')\
        .replace('\u200c', '')
    # .replace('\u2212', '-')
    return newbody
def extract_md(body, oid):
    newbody = body
    if newbody:
        newbody = extract_md_images(newbody, oid)
        if not newbody: raise Exception('extract_images error')
        newbody = cleanup(newbody)
        if not newbody: raise Exception('cleanup error')
        newbody, placed = place_tooltips(newbody)
        if not newbody: raise Exception('place_tooltips error')
        if placed:
            newbody = 'import Tooltip from \'$/components/Article/Tooltip\'\n\n' + newbody
    return newbody
def prepare_md_body(entry):
    # body modifications
    body = ''
    kind = entry.get('type')
    addon = ''
    if kind == 'Video':
        addon = ''
        for m in entry.get('media', []):
            if 'youtubeId' in m: addon += '<VideoPlayer youtubeId=\'' + m['youtubeId'] + '\' />\n'
            elif 'vimeoId' in m: addon += '<VideoPlayer vimeoId=\'' + m['vimeoId'] + '\' />\n'
            else:
                print('[extract] media is not supported')
                print(m)
        body = 'import VideoPlayer from \'$/components/Article/VideoPlayer\'\n\n' + addon

    elif kind == 'Music':
        addon = ''
        for m in entry.get('media', []):
            artist = m.get('performer')
            trackname = ''
            if artist: trackname += artist + ' - '
            if 'title' in m: trackname += m.get('title', '')
            addon += '<MusicPlayer src=\"' + m.get('fileUrl', '') + '\" title=\"' + trackname + '\" />\n'
        body = 'import MusicPlayer from \'$/components/Article/MusicPlayer\'\n\n' + addon

    body_orig = extract_html(entry)
    if body_orig: body += extract_md(html2text(body_orig), entry['_id'])
    if not body: print('[extract] empty MDX body')
    return body
def prepare_html_body(entry):
    # body modifications
    body = ''
    kind = entry.get('type')
    addon = ''
    if kind == 'Video':
        addon = ''
        for m in entry.get('media', []):
            if 'youtubeId' in m:
                addon += '<iframe width="420" height="345" src="http://www.youtube.com/embed/'
                addon += m['youtubeId']
                addon += '?autoplay=1" frameborder="0" allowfullscreen></iframe>\n'
            elif 'vimeoId' in m:
                addon += '<iframe src="https://player.vimeo.com/video/'
                addon += m['vimeoId']
                addon += ' width="420" height="345" frameborder="0" allow="autoplay; fullscreen" allowfullscreen></iframe>'
            else:
                print('[extract] media is not supported')
                print(m)
        body += addon

    elif kind == 'Music':
        addon = ''
        for m in entry.get('media', []):
            artist = m.get('performer')
            trackname = ''
            if artist: trackname += artist + ' - '
            if 'title' in m: trackname += m.get('title', '')
            addon += '<figure><figcaption>'
            addon += trackname
            addon += '</figcaption><audio controls src="'
            addon += m.get('fileUrl', '')
            addon += '"></audio></figure>'
        body += addon

    body = extract_html(entry)  # FIXME: this assignment discards the media addon built above
    # if body_orig: body += extract_md(html2text(body_orig), entry['_id'])
    if not body: print('[extract] empty HTML body')
    return body
def extract_html(entry):
    body_orig = entry.get('body') or ''
    media = entry.get('media', [])
    kind = entry.get('type') or ''
    print('[extract] kind: ' + kind)
    mbodies = set([])
    if media:
        # print('[extract] media is found')
        for m in media:
            mbody = m.get('body', '')
            addon = ''
            if kind == 'Literature':
                mbody = m.get('literatureBody') or m.get('body', '')
            elif kind == 'Image':
                cover = ''
                if 'thumborId' in entry: cover = cdn + '/unsafe/1600x/' + entry['thumborId']
                if not cover:
                    if 'image' in entry: cover = entry['image'].get('url', '')
                    if 'cloudinary' in cover: cover = ''
                # else: print('[extract] cover: ' + cover)
                title = m.get('title', '').replace('\n', ' ').replace('\u00a0', ' ')
                u = m.get('thumborId') or cover or ''
                if title: addon += '<h4>' + title + '</h4>\n'
                if not u.startswith('http'): u = s3 + u
                if not u: print('[extract] no image url for ' + str(m))
                if 'cloudinary' in u: u = 'img/lost.svg'
                if u != cover or (u == cover and media.index(m) == 0):
                    addon += '<img src=\"' + u + '\" alt=\"' + title + '\" />\n'
            if addon:
                body_orig += addon
                # print('[extract] item addon: ' + addon)
            # if addon: print('[extract] addon: %s' % addon)
            if mbody and mbody not in mbodies:
                mbodies.add(mbody)
                body_orig += mbody
        if len(list(mbodies)) != len(media):
            print('[extract] %d/%d media item bodies appended' % (len(list(mbodies)), len(media)))
        # print('[extract] media items body: \n' + body_orig)
    if not body_orig:
        for up in entry.get('bodyHistory', []) or []:
            body_orig = up.get('text', '') or ''
            if body_orig:
                print('[extract] got html body from history')
                break
    if not body_orig: print('[extract] empty HTML body')
    # body_html = str(BeautifulSoup(body_orig, features="html.parser"))
    return body_orig
migration/html2text/__init__.py (new file, 1041 lines)
File diff suppressed because it is too large.
migration/html2text/__main__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .cli import main

main()
migration/html2text/cli.py (new file, 322 lines)
@@ -0,0 +1,322 @@
import argparse
import sys

from . import HTML2Text, __version__, config


def main() -> None:
    baseurl = ""

    class bcolors:
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    p = argparse.ArgumentParser()
    p.add_argument("--default-image-alt", dest="default_image_alt",
                   default=config.DEFAULT_IMAGE_ALT,
                   help="The default alt string for images with missing ones")
    p.add_argument("--pad-tables", dest="pad_tables", action="store_true",
                   default=config.PAD_TABLES,
                   help="pad the cells to equal column width in tables")
    p.add_argument("--no-wrap-links", dest="wrap_links", action="store_false",
                   default=config.WRAP_LINKS,
                   help="don't wrap links during conversion")
    p.add_argument("--wrap-list-items", dest="wrap_list_items", action="store_true",
                   default=config.WRAP_LIST_ITEMS,
                   help="wrap list items during conversion")
    p.add_argument("--wrap-tables", dest="wrap_tables", action="store_true",
                   default=config.WRAP_TABLES, help="wrap tables")
    p.add_argument("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
                   default=config.IGNORE_EMPHASIS,
                   help="don't include any formatting for emphasis")
    p.add_argument("--reference-links", dest="inline_links", action="store_false",
                   default=config.INLINE_LINKS,
                   help="use reference style links instead of inline links")
    p.add_argument("--ignore-links", dest="ignore_links", action="store_true",
                   default=config.IGNORE_ANCHORS,
                   help="don't include any formatting for links")
    p.add_argument("--ignore-mailto-links", action="store_true",
                   dest="ignore_mailto_links", default=config.IGNORE_MAILTO_LINKS,
                   help="don't include mailto: links")
    p.add_argument("--protect-links", dest="protect_links", action="store_true",
                   default=config.PROTECT_LINKS,
                   help="protect links from line breaks surrounding them with angle brackets")
    p.add_argument("--ignore-images", dest="ignore_images", action="store_true",
                   default=config.IGNORE_IMAGES,
                   help="don't include any formatting for images")
    p.add_argument("--images-as-html", dest="images_as_html", action="store_true",
                   default=config.IMAGES_AS_HTML,
                   help="Always write image tags as raw html; preserves `height`, `width` and `alt` if possible.")
    p.add_argument("--images-to-alt", dest="images_to_alt", action="store_true",
                   default=config.IMAGES_TO_ALT,
                   help="Discard image data, only keep alt text")
    p.add_argument("--images-with-size", dest="images_with_size", action="store_true",
                   default=config.IMAGES_WITH_SIZE,
                   help="Write image tags with height and width attrs as raw html to retain dimensions")
    p.add_argument("-g", "--google-doc", action="store_true", dest="google_doc",
                   default=False,
                   help="convert an html-exported Google Document")
    p.add_argument("-d", "--dash-unordered-list", action="store_true",
                   dest="ul_style_dash", default=False,
                   help="use a dash rather than a star for unordered list items")
    p.add_argument("-e", "--asterisk-emphasis", action="store_true",
                   dest="em_style_asterisk", default=False,
                   help="use an asterisk rather than an underscore for emphasized text")
    p.add_argument("-b", "--body-width", dest="body_width", type=int,
                   default=config.BODY_WIDTH,
                   help="number of characters per output line, 0 for no wrap")
    p.add_argument("-i", "--google-list-indent", dest="list_indent", type=int,
                   default=config.GOOGLE_LIST_INDENT,
                   help="number of pixels Google indents nested lists")
    p.add_argument("-s", "--hide-strikethrough", action="store_true",
                   dest="hide_strikethrough", default=False,
                   help="hide strike-through text. only relevant when -g is specified as well")
    p.add_argument("--escape-all", action="store_true", dest="escape_snob",
                   default=False,
                   help="Escape all special characters. Output is less readable, but avoids corner case formatting issues.")
    p.add_argument("--bypass-tables", action="store_true", dest="bypass_tables",
                   default=config.BYPASS_TABLES,
                   help="Format tables in HTML rather than Markdown syntax.")
    p.add_argument("--ignore-tables", action="store_true", dest="ignore_tables",
                   default=config.IGNORE_TABLES,
                   help="Ignore table-related tags (table, th, td, tr) while keeping rows.")
    p.add_argument("--single-line-break", action="store_true",
                   dest="single_line_break", default=config.SINGLE_LINE_BREAK,
                   help="Use a single line break after a block element rather than two line breaks. NOTE: Requires --body-width=0")
    p.add_argument("--unicode-snob", action="store_true", dest="unicode_snob",
                   default=config.UNICODE_SNOB,
                   help="Use unicode throughout document")
    p.add_argument("--no-automatic-links", action="store_false",
                   dest="use_automatic_links", default=config.USE_AUTOMATIC_LINKS,
                   help="Do not use automatic links wherever applicable")
    p.add_argument("--no-skip-internal-links", action="store_false",
                   dest="skip_internal_links", default=config.SKIP_INTERNAL_LINKS,
                   help="Do not skip internal links")
    p.add_argument("--links-after-para", action="store_true",
                   dest="links_each_paragraph", default=config.LINKS_EACH_PARAGRAPH,
                   help="Put links after each paragraph instead of document")
    p.add_argument("--mark-code", action="store_true", dest="mark_code",
                   default=config.MARK_CODE,
                   help="Mark program code blocks with [code]...[/code]")
    p.add_argument("--decode-errors", dest="decode_errors",
                   default=config.DECODE_ERRORS,
                   help="What to do in case of decode errors. 'ignore', 'strict' and 'replace' are acceptable values")
    p.add_argument("--open-quote", dest="open_quote", default=config.OPEN_QUOTE,
                   help="The character used to open quotes")
    p.add_argument("--close-quote", dest="close_quote", default=config.CLOSE_QUOTE,
                   help="The character used to close quotes")
    p.add_argument("--version", action="version", version=".".join(map(str, __version__)))
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    args = p.parse_args()

    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        html = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError as err:
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        print(warning)
        raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.ignore_mailto_links = args.ignore_mailto_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.wrap_tables = args.wrap_tables
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote

    sys.stdout.write(h.handle(html))
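# Example (assuming the package is importable from the repo root):
#   echo '<h1>Hello</h1>' | python -m migration.html2text
# reads HTML from stdin and writes the Markdown conversion to stdout.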
migration/html2text/config.py (new file, 164 lines)
@@ -0,0 +1,164 @@
import re

# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = True

# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = True

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = False

# Wrap long lines at position. 0 for no wrapping.
BODY_WIDTH = 0

# Don't show internal links (href="#local-anchor") -- corresponding link
# targets won't be visible in the plain text file anyway.
SKIP_INTERNAL_LINKS = False

# Use inline, rather than reference, formatting for images and links
INLINE_LINKS = True

# Protect links from line breaks surrounding them with angle brackets (in
# addition to their square brackets)
PROTECT_LINKS = True
WRAP_LINKS = True

# Wrap list items.
WRAP_LIST_ITEMS = False

# Wrap tables
WRAP_TABLES = False

# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36

# Values Google and others may use to indicate bold text
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")

IGNORE_ANCHORS = False
IGNORE_MAILTO_LINKS = False
IGNORE_IMAGES = False
IMAGES_AS_HTML = False
IMAGES_TO_ALT = False
IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = True
DECODE_ERRORS = "strict"
DEFAULT_IMAGE_ALT = ""
PAD_TABLES = True

# Convert links with same href and text to <href> format
# if they are absolute links
USE_AUTOMATIC_LINKS = True

# For checking space-only lines on line 771
RE_SPACE = re.compile(r"\s\+")

RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")

# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")

# to find table separators
RE_TABLE = re.compile(r" \| ")

RE_MD_DOT_MATCHER = re.compile(
    r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
    (\.)          # dot
    (?=\s)        # lookahead assert whitespace
    """,
    re.MULTILINE | re.VERBOSE,
)
RE_MD_PLUS_MATCHER = re.compile(
    r"""
    ^
    (\s*)
    (\+)
    (?=\s)
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
RE_MD_DASH_MATCHER = re.compile(
    r"""
    ^
    (\s*)
    (-)
    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
                  # or another dash (header or hr)
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
RE_MD_BACKSLASH_MATCHER = re.compile(
    r"""
    (\\)          # match one slash
    (?=[%s])      # followed by a char that requires escaping
    """
    % re.escape(RE_SLASH_CHARS),
    flags=re.VERBOSE,
)

UNIFIABLE = {
    "rsquo": "'",
    "lsquo": "'",
    "rdquo": '"',
    "ldquo": '"',
    "copy": "(C)",
    "mdash": "--",
    "nbsp": " ",
    "rarr": "->",
    "larr": "<-",
    "middot": "*",
    "ndash": "-",
    "oelig": "oe",
    "aelig": "ae",
    "agrave": "a",
    "aacute": "a",
    "acirc": "a",
    "atilde": "a",
    "auml": "a",
    "aring": "a",
    "egrave": "e",
    "eacute": "e",
    "ecirc": "e",
    "euml": "e",
    "igrave": "i",
    "iacute": "i",
    "icirc": "i",
    "iuml": "i",
    "ograve": "o",
    "oacute": "o",
    "ocirc": "o",
    "otilde": "o",
    "ouml": "o",
    "ugrave": "u",
    "uacute": "u",
    "ucirc": "u",
    "uuml": "u",
    "lrm": "",
    "rlm": "",
}

# Format tables in HTML rather than Markdown syntax
BYPASS_TABLES = False
# Ignore table-related tags (table, th, td, tr) while keeping rows
IGNORE_TABLES = False


# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.
SINGLE_LINE_BREAK = False


# Use double quotation marks when converting the <q> tag.
OPEN_QUOTE = '"'
CLOSE_QUOTE = '"'
migration/html2text/elements.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from typing import Dict, Optional


class AnchorElement:
    __slots__ = ["attrs", "count", "outcount"]

    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
        self.attrs = attrs
        self.count = count
        self.outcount = outcount


class ListElement:
    __slots__ = ["name", "num"]

    def __init__(self, name: str, num: int):
        self.name = name
        self.num = num
migration/html2text/py.typed (new file, empty)
migration/html2text/typing.py (new file, 3 lines)
@@ -0,0 +1,3 @@
class OutCallback:
    def __call__(self, s: str) -> None:
        ...
migration/html2text/utils.py (new file, 290 lines)
@@ -0,0 +1,290 @@
import html.entities
from typing import Dict, List, Optional

from . import config

unifiable_n = {
    html.entities.name2codepoint[k]: v
    for k, v in config.UNIFIABLE.items()
    if k != "nbsp"
}


def hn(tag: str) -> int:
    if tag[0] == "h" and len(tag) == 2:
        n = tag[1]
        if "0" < n <= "9":
            return int(n)
    return 0
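# Example: hn("h3") returns 3, while hn("hr"), hn("h0") and hn("html") all
# return 0 — the string comparison "0" < n <= "9" only admits the digits 1-9.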
def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    :returns: A hash of css attributes
    """
    return {
        x.strip().lower(): y.strip().lower()
        for x, y in [z.split(":", 1) for z in style.split(";") if ":" in z]
    }


def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.
    :rtype: dict
    """
    # remove @import sentences
    data += ";"
    importIndex = data.find("@import")
    while importIndex != -1:
        data = data[0:importIndex] + data[data.find(";", importIndex) + 1 :]
        importIndex = data.find("@import")

    # parse the css. reverted from dictionary comprehension in order to
    # support older pythons
    pairs = [x.split("{") for x in data.split("}") if "{" in x.strip()]
    try:
        elements = {a.strip(): dumb_property_dict(b) for a, b in pairs}
    except ValueError:
        elements = {}  # not that important

    return elements
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()
    if "class" in attrs:
        assert attrs["class"] is not None
        for css_class in attrs["class"].split():
            css_style = style_def.get("." + css_class, {})
            style.update(css_style)
    if "style" in attrs:
        assert attrs["style"] is not None
        immediate_style = dumb_property_dict(attrs["style"])
        style.update(immediate_style)

    return style
def google_list_style(style: Dict[str, str]) -> str:
    """
    Finds out whether this is an ordered or unordered list

    :type style: dict

    :rtype: str
    """
    if "list-style-type" in style:
        list_style = style["list-style-type"]
        if list_style in ["disc", "circle", "square", "none"]:
            return "ul"

    return "ol"


def google_has_height(style: Dict[str, str]) -> bool:
    """
    Check if the style of the element has the 'height' attribute
    explicitly defined

    :type style: dict

    :rtype: bool
    """
    return "height" in style


def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    :type style: dict

    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    emphasis = []
    if "text-decoration" in style:
        emphasis.append(style["text-decoration"])
    if "font-style" in style:
        emphasis.append(style["font-style"])
    if "font-weight" in style:
        emphasis.append(style["font-weight"])

    return emphasis


def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Check if the css of the current element defines a fixed width font

    :type style: dict

    :rtype: bool
    """
    font_family = ""
    if "font-family" in style:
        font_family = style["font-family"]
    return "courier new" == font_family or "consolas" == font_family
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :rtype: int or None
    """
    if "start" in attrs:
        assert attrs["start"] is not None
        try:
            return int(attrs["start"]) - 1
        except ValueError:
            pass

    return 0
def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap
    if para[0:4] == "    " or para[0] == "\t":
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally proceeded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
def escape_md(text: str) -> str:
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.
    """
    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)


def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Escapes markdown-sensitive characters across whole document sections.
    """
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
    text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
    text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)

    return text
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
|
||||||
|
"""
|
||||||
|
Given the lines of a table
|
||||||
|
padds the cells and returns the new lines
|
||||||
|
"""
|
||||||
|
# find the maximum width of the columns
|
||||||
|
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
|
||||||
|
max_cols = len(max_width)
|
||||||
|
for line in lines:
|
||||||
|
cols = [x.rstrip() for x in line.split("|")]
|
||||||
|
num_cols = len(cols)
|
||||||
|
|
||||||
|
# don't drop any data if colspan attributes result in unequal lengths
|
||||||
|
if num_cols < max_cols:
|
||||||
|
cols += [""] * (max_cols - num_cols)
|
||||||
|
elif max_cols < num_cols:
|
||||||
|
max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
|
||||||
|
max_cols = num_cols
|
||||||
|
|
||||||
|
max_width = [
|
||||||
|
max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
|
||||||
|
]
|
||||||
|
|
||||||
|
# reformat
|
||||||
|
new_lines = []
|
||||||
|
for line in lines:
|
||||||
|
cols = [x.rstrip() for x in line.split("|")]
|
||||||
|
if set(line.strip()) == set("-|"):
|
||||||
|
filler = "-"
|
||||||
|
new_cols = [
|
||||||
|
x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||||
|
for x, M in zip(cols, max_width)
|
||||||
|
]
|
||||||
|
new_lines.append("|-" + "|".join(new_cols) + "|")
|
||||||
|
else:
|
||||||
|
filler = " "
|
||||||
|
new_cols = [
|
||||||
|
x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||||
|
for x, M in zip(cols, max_width)
|
||||||
|
]
|
||||||
|
new_lines.append("| " + "|".join(new_cols) + "|")
|
||||||
|
return new_lines
|
||||||
|
|
||||||
|
|
||||||
|
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
|
||||||
|
"""
|
||||||
|
Provide padding for tables in the text
|
||||||
|
"""
|
||||||
|
lines = text.split("\n")
|
||||||
|
table_buffer = [] # type: List[str]
|
||||||
|
table_started = False
|
||||||
|
new_lines = []
|
||||||
|
for line in lines:
|
||||||
|
# Toggle table started
|
||||||
|
if config.TABLE_MARKER_FOR_PAD in line:
|
||||||
|
table_started = not table_started
|
||||||
|
if not table_started:
|
||||||
|
table = reformat_table(table_buffer, right_margin)
|
||||||
|
new_lines.extend(table)
|
||||||
|
table_buffer = []
|
||||||
|
new_lines.append("")
|
||||||
|
continue
|
||||||
|
# Process lines
|
||||||
|
if table_started:
|
||||||
|
table_buffer.append(line)
|
||||||
|
else:
|
||||||
|
new_lines.append(line)
|
||||||
|
return "\n".join(new_lines)
|
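The table and list helpers above are pure functions, so they can be sanity-checked without the rest of the converter. A minimal sketch, assuming the definitions above are in scope; the sample table and margin are invented for illustration:

# exercise list_numbering_start and reformat_table directly
print(list_numbering_start({"start": "3"}))    # 2: HTML numbering becomes a zero-based offset
print(list_numbering_start({"start": "abc"}))  # 0: non-numeric start falls back

rows = ["name |value", "-----|-----", "a    |1"]
print("\n".join(reformat_table(rows, right_margin=1)))
# | name  |value |
# |-------|------|
# | a     |1     |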
1
migration/tables/__init__.py
Normal file
@@ -0,0 +1 @@
__all__ = ["users", "topics", "content_items", "comments"]
108
migration/tables/comments.py
Normal file
@@ -0,0 +1,108 @@
from datetime import datetime
from dateutil.parser import parse as date_parse
from orm import Reaction, User
from orm.base import local_session
from migration.html2text import html2text
from orm.reaction import ReactionKind
from orm.shout import Shout

ts = datetime.now()

def migrate(entry, storage):
    '''
    {
      "_id": "hdtwS8fSyFLxXCgSC",
      "body": "<p>",
      "contentItem": "mnK8KsJHPRi8DrybQ",
      "createdBy": "bMFPuyNg6qAD2mhXe",
      "thread": "01/",
      "createdAt": "2016-04-19 04:33:53+00:00",
      "ratings": [
        { "createdBy": "AqmRukvRiExNpAe8C", "value": 1 },
        { "createdBy": "YdE76Wth3yqymKEu5", "value": 1 }
      ],
      "rating": 2,
      "updatedAt": "2020-05-27 19:22:57.091000+00:00",
      "updatedBy": "0"
    }

    ->

    type Reaction {
      id: Int!
      shout: Shout!
      createdAt: DateTime!
      createdBy: User!
      updatedAt: DateTime
      deletedAt: DateTime
      deletedBy: User
      range: String # full / 0:2340
      kind: ReactionKind!
      body: String
      replyTo: Reaction
      stat: Stat
      old_id: String
      old_thread: String
    }
    '''
    reaction_dict = {}
    # FIXME: comment_dict['createdAt'] = ts if not entry.get('createdAt') else date_parse(entry.get('createdAt'))
    # print('[migration] comment original date %r' % entry.get('createdAt'))
    # print('[migration] comment date %r ' % comment_dict['createdAt'])
    reaction_dict['body'] = html2text(entry.get('body', ''))
    reaction_dict['oid'] = entry['_id']
    if entry.get('createdAt'): reaction_dict['createdAt'] = date_parse(entry.get('createdAt'))
    shout_oid = entry.get('contentItem')
    if shout_oid not in storage['shouts']['by_oid']:
        if len(storage['shouts']['by_oid']) > 0:
            return shout_oid
        else:
            print('[migration] no shouts migrated yet')
            raise Exception
    else:
        reaction = None
        with local_session() as session:
            author = session.query(User).filter(User.oid == entry['createdBy']).first()
            shout_dict = storage['shouts']['by_oid'][shout_oid]
            if shout_dict:
                reaction_dict['shout'] = shout_dict['slug']
                reaction_dict['createdBy'] = author.slug if author else 'discours'
                reaction_dict['kind'] = ReactionKind.COMMENT

                # creating reaction from old comment
                reaction = Reaction.create(**reaction_dict)

                reaction_dict['id'] = reaction.id
                for comment_rating_old in entry.get('ratings', []):
                    rater = session.query(User).filter(User.oid == comment_rating_old['createdBy']).first()
                    reactedBy = rater if rater else session.query(User).filter(User.slug == 'noname').first()
                    re_reaction_dict = {
                        'shout': reaction_dict['shout'],
                        'replyTo': reaction.id,
                        'kind': ReactionKind.LIKE if comment_rating_old['value'] > 0 else ReactionKind.DISLIKE,
                        'createdBy': reactedBy.slug if reactedBy else 'discours'
                    }
                    cts = comment_rating_old.get('createdAt')
                    if cts: re_reaction_dict['createdAt'] = date_parse(cts)
                    try:
                        # creating reaction from old rating
                        Reaction.create(**re_reaction_dict)
                    except Exception as e:
                        print('[migration] comment rating error: %r' % re_reaction_dict)
                        raise e
            else:
                print('[migration] error: cannot find shout for comment %r' % reaction_dict)
        return reaction

def migrate_2stage(rr, old_new_id):
    reply_oid = rr.get('replyTo')
    if not reply_oid: return
    new_id = old_new_id.get(rr.get('oid'))
    if not new_id: return
    with local_session() as session:
        comment = session.query(Reaction).filter(Reaction.id == new_id).first()
        comment.replyTo = old_new_id.get(reply_oid)
        comment.save()
        session.commit()
    if not rr['body']: raise Exception(rr)
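migrate() yields each comment's new id keyed by its old oid, and migrate_2stage() resolves replyTo through that map once every comment exists. A stand-in for that mapping step with plain dicts instead of the ORM; the ids and oids here are invented:

old_new_id = {'oldA': 1, 'oldB': 2}  # filled in while comments are migrated
rr = {'oid': 'oldB', 'replyTo': 'oldA', 'body': 'reply text'}

new_id = old_new_id.get(rr['oid'])        # 2, the migrated comment itself
reply_id = old_new_id.get(rr['replyTo'])  # 1, what comment.replyTo gets set to
print(new_id, reply_id)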
226
migration/tables/content_items.py
Normal file
@@ -0,0 +1,226 @@
from dateutil.parser import parse as date_parse
import sqlalchemy
from orm.shout import Shout, ShoutTopic, User
from storages.viewed import ViewedByDay
from transliterate import translit
from datetime import datetime
from orm.base import local_session
from migration.extract import prepare_html_body
from orm.community import Community
from orm.reaction import Reaction, ReactionKind

OLD_DATE = '2016-03-05 22:22:00.350000'
ts = datetime.now()
type2layout = {
    'Article': 'article',
    'Literature': 'prose',
    'Music': 'music',
    'Video': 'video',
    'Image': 'image'
}

def get_shout_slug(entry):
    slug = entry.get('slug', '')
    if not slug:
        for friend in entry.get('friendlySlugs', []):
            slug = friend.get('slug', '')
            if slug: break
    return slug

def migrate(entry, storage):
    # init, set title and layout
    r = {
        'layout': type2layout[entry['type']],
        'title': entry['title'],
        'community': Community.default_community.id,
        'authors': [],
        'topics': set([]),
        # 'rating': 0,
        # 'ratings': [],
        'createdAt': []
    }
    topics_by_oid = storage['topics']['by_oid']
    users_by_oid = storage['users']['by_oid']

    # author
    oid = entry.get('createdBy', entry.get('_id', entry.get('oid')))
    userdata = users_by_oid.get(oid)
    if not userdata:
        app = entry.get('application')
        if app:
            userslug = translit(app['name'], 'ru', reversed=True)\
                .replace(' ', '-')\
                .replace('\'', '')\
                .replace('.', '-').lower()
            userdata = {
                'username': app['email'],
                'email': app['email'],
                'name': app['name'],
                'bio': app.get('bio', ''),
                'emailConfirmed': False,
                'slug': userslug,
                'createdAt': ts,
                'wasOnlineAt': ts
            }
        else:
            userdata = User.default_user.dict()
    assert userdata, 'no user found for %s among %d users' % (oid, len(users_by_oid.keys()))
    r['authors'] = [userdata, ]

    # slug
    slug = get_shout_slug(entry)
    if slug: r['slug'] = slug
    else: raise Exception

    # cover
    c = ''
    if entry.get('thumborId'):
        c = 'https://assets.discours.io/unsafe/1600x/' + entry['thumborId']
    else:
        c = entry.get('image', {}).get('url')
        if not c or 'cloudinary' in c: c = ''
    r['cover'] = c

    # timestamps
    r['createdAt'] = date_parse(entry.get('createdAt', OLD_DATE))
    r['updatedAt'] = date_parse(entry['updatedAt']) if 'updatedAt' in entry else ts
    if entry.get('published'):
        r['publishedAt'] = date_parse(entry.get('publishedAt', OLD_DATE))
        if r['publishedAt'] == date_parse(OLD_DATE): r['publishedAt'] = ts
    if 'deletedAt' in entry: r['deletedAt'] = date_parse(entry['deletedAt'])

    # topics
    category = entry['category']
    mainTopic = topics_by_oid.get(category)
    if mainTopic:
        r['mainTopic'] = storage['replacements'].get(mainTopic["slug"], mainTopic["slug"])
    topic_oids = [category, ]
    topic_oids.extend(entry.get('tags', []))
    for oid in topic_oids:
        if oid in storage['topics']['by_oid']:
            r['topics'].add(storage['topics']['by_oid'][oid]['slug'])
        else:
            print('[migration] unknown old topic id: ' + oid)
    r['topics'] = list(r['topics'])

    entry['topics'] = r['topics']
    entry['cover'] = r['cover']
    entry['authors'] = r['authors']

    # body
    r['body'] = prepare_html_body(entry)

    # save shout to db
    s = object()
    shout_dict = r.copy()
    user = None
    del shout_dict['topics']  # FIXME: AttributeError: 'str' object has no attribute '_sa_instance_state'
    # del shout_dict['rating']  # FIXME: TypeError: 'rating' is an invalid keyword argument for Shout
    # del shout_dict['ratings']
    email = userdata.get('email')
    slug = userdata.get('slug')
    with local_session() as session:
        # c = session.query(Community).all().pop()
        if email: user = session.query(User).filter(User.email == email).first()
        if not user and slug: user = session.query(User).filter(User.slug == slug).first()
        if not user and userdata:
            try: user = User.create(**userdata)
            except sqlalchemy.exc.IntegrityError:
                print('[migration] user error: %r' % userdata)
        assert user, 'could not get a user'
        userdata['id'] = user.id
        userdata['createdAt'] = user.createdAt
        storage['users']['by_slug'][userdata['slug']] = userdata
        storage['users']['by_oid'][entry['_id']] = userdata
    shout_dict['authors'] = [user, ]

    try:
        s = Shout.create(**shout_dict)
    except sqlalchemy.exc.IntegrityError as e:
        with local_session() as session:
            s = session.query(Shout).filter(Shout.slug == shout_dict['slug']).first()
            bump = False
            if s:
                for key in shout_dict:
                    if key in s.__dict__:
                        if s.__dict__[key] != shout_dict[key]:
                            print('[migration] shout already exists, but differs in %s' % key)
                            bump = True
                    else:
                        print('[migration] shout already exists, but lacks %s' % key)
                        bump = True
                if bump:
                    s.update(shout_dict)
            else:
                print('[migration] something went wrong with shout: \n%r' % shout_dict)
                raise e
            session.commit()
    except:
        print(s)
        raise Exception

    # shout topics aftermath
    shout_dict['topics'] = []
    for tpc in r['topics']:
        oldslug = tpc
        newslug = storage['replacements'].get(oldslug, oldslug)
        if newslug:
            with local_session() as session:
                shout_topic_old = session.query(ShoutTopic)\
                    .filter(ShoutTopic.shout == shout_dict['slug'])\
                    .filter(ShoutTopic.topic == oldslug).first()
                if shout_topic_old:
                    shout_topic_old.update({ 'slug': newslug })
                else:
                    shout_topic_new = session.query(ShoutTopic)\
                        .filter(ShoutTopic.shout == shout_dict['slug'])\
                        .filter(ShoutTopic.topic == newslug).first()
                    if not shout_topic_new:
                        try: ShoutTopic.create(**{ 'shout': shout_dict['slug'], 'topic': newslug })
                        except: print('[migration] shout topic error: ' + newslug)
                session.commit()
            if newslug not in shout_dict['topics']:
                shout_dict['topics'].append(newslug)
        else:
            print('[migration] ignored topic slug: \n%r' % tpc)
            # raise Exception

    # content_item ratings to reactions
    try:
        for content_rating in entry.get('ratings', []):
            with local_session() as session:
                rater = session.query(User).filter(User.oid == content_rating['createdBy']).first()
                reactedBy = rater if rater else session.query(User).filter(User.slug == 'noname').first()
                if rater:
                    reaction_dict = {
                        'kind': ReactionKind.LIKE if content_rating['value'] > 0 else ReactionKind.DISLIKE,
                        'createdBy': reactedBy.slug,
                        'shout': shout_dict['slug']
                    }
                    cts = content_rating.get('createdAt')
                    if cts: reaction_dict['createdAt'] = date_parse(cts)
                    reaction = session.query(Reaction).\
                        filter(Reaction.shout == reaction_dict['shout']).\
                        filter(Reaction.createdBy == reaction_dict['createdBy']).\
                        filter(Reaction.kind == reaction_dict['kind']).first()
                    if reaction:
                        reaction_dict['kind'] = ReactionKind.AGREE if content_rating['value'] > 0 else ReactionKind.DISAGREE
                        reaction.update(reaction_dict)
                    else: Reaction.create(**reaction_dict)
                    # shout_dict['ratings'].append(reaction_dict)
    except:
        print('[migration] content_item.ratings error: \n%r' % content_rating)
        raise Exception

    # shout views
    ViewedByDay.create(shout=shout_dict['slug'], value=entry.get('views', 1))
    # del shout_dict['ratings']
    shout_dict['oid'] = entry.get('_id')
    storage['shouts']['by_oid'][entry['_id']] = shout_dict
    storage['shouts']['by_slug'][shout_dict['slug']] = shout_dict
    return shout_dict
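get_shout_slug() prefers the entry's own slug and falls back to the first non-empty friendlySlugs entry. A quick check; the import path follows this commit's layout and the sample entries are invented:

from migration.tables.content_items import get_shout_slug

print(get_shout_slug({'slug': 'direct-slug'}))                                  # 'direct-slug'
print(get_shout_slug({'friendlySlugs': [{'slug': ''}, {'slug': 'fallback'}]}))  # 'fallback'
print(get_shout_slug({}))                                                       # ''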
768
migration/tables/replacements.json
Normal file
@@ -0,0 +1,768 @@
{
  "1990-e": "90s",
  "2000-e": "2000s",
  "90-e": "90s",
  "207": "207",
  "kartochki-rubinshteyna": "rubinstein-cards",
  "Georgia": "georgia",
  "Japan": "japan",
  "Sweden": "sweden",
  "abstraktsiya": "abstract",
  "absurdism": "absurdism",
  "acclimatization": "acclimatisation",
  "activism": "activism",
  "adolf-gitler": "adolf-hitler",
  "afrika": "africa",
  "agata-kristi": "agatha-christie",
  "agressiya": "agression",
  "agressivnoe-povedenie": "agression",
  "aktsii": "actions",
  "aktsionizm": "actionism",
  "alber-kamyu": "albert-kamus",
  "albomy": "albums",
  "aleksandr-griboedov": "aleksander-griboedov",
  "aleksandr-pushkin": "aleksander-pushkin",
  "aleksandr-solzhenitsyn": "aleksander-solzhenitsyn",
  "aleksandr-vvedenskiy": "aleksander-vvedensky",
  "aleksey-navalnyy": "alexey-navalny",
  "alfavit": "alphabet",
  "alkogol": "alcohol",
  "alternativa": "alternative",
  "alternative": "alternative",
  "alternativnaya-istoriya": "alternative-history",
  "amerika": "america",
  "anarhizm": "anarchism",
  "anatoliy-mariengof": "anatoly-mariengof",
  "ancient-russia": "ancient-russia",
  "andegraund": "underground",
  "andrey-platonov": "andrey-platonov",
  "andrey-rodionov": "andrey-rodionov",
  "andrey-tarkovskiy": "andrey-tarkovsky",
  "angliyskie-istorii": "english-stories",
  "angliyskiy-yazyk": "english-langugae",
  "animation": "animation",
  "animatsiya": "animation",
  "anime": "anime",
  "anri-volohonskiy": "anri-volohonsky",
  "antifashizm": "anti-faschism",
  "antiquity": "antiquity",
  "antiutopiya": "dystopia",
  "antropology": "antropology",
  "antropotsen": "antropocenus",
  "architecture": "architecture",
  "arheologiya": "archeology",
  "arhetipy": "archetypes",
  "arhiv": "archive",
  "aristokraty": "aristocracy",
  "aristotel": "aristotle",
  "arktika": "arctic",
  "armiya": "army",
  "art": "art",
  "art-is": "art-is",
  "artists": "artists",
  "ateizm": "atheism",
  "audiopoeziya": "audio-poetry",
  "audio-poetry": "audio-poetry",
  "audiospektakl": "audio-spectacles",
  "auktsyon": "auktsyon",
  "avangard": "avantgarde",
  "avtofikshn": "autofiction",
  "avtorskaya-pesnya": "bardsongs",
  "azbuka-immigratsii": "immigration-basics",
  "aziatskiy-kinematograf": "asian-cinema",
  "b-movie": "b-movie",
  "bannye-chteniya": "sauna-reading",
  "bardsongs": "bardsongs",
  "bdsm": "bdsm",
  "belarus": "belarus",
  "belgiya": "belgium",
  "bertold-breht": "berttold-brecht",
  "bezumie": "madness",
  "biography": "biography",
  "biologiya": "biology",
  "bipolyarnoe-rasstroystvo": "bipolar-disorder",
  "bitniki": "beatnics",
  "biznes": "business",
  "blizhniy-vostok": "middle-east",
  "blizost": "closeness",
  "blokada": "blockade",
  "bob-dilan": "bob-dylan",
  "bog": "god",
  "bol": "pain",
  "bolotnoe-delo": "bolotnaya-case",
  "books": "books",
  "boris-eltsin": "boris-eltsin",
  "boris-godunov": "boris-godunov",
  "boris-grebenschikov": "boris-grebenschikov",
  "boris-nemtsov": "boris-nemtsov",
  "boris-pasternak": "boris-pasternak",
  "brak": "marriage",
  "bret-iston-ellis": "bret-iston-ellis",
  "buddizm": "buddhism",
  "bullying": "bullying",
  "bunt": "riot",
  "burning-man": "burning-man",
  "bytie": "being",
  "byurokratiya": "bureaucracy",
  "capitalism": "capitalism",
  "censored-in-russia": "censored-in-russia",
  "ch-rno-beloe": "black-and-white",
  "ch-rnyy-yumor": "black-humour",
  "chapters": "chapters",
  "charity": "charity",
  "chayldfri": "childfree",
  "chechenskaya-voyna": "chechen-war",
  "chechnya": "chechnya",
  "chelovek": "male",
  "chernobyl": "chernobyl",
  "chernyy-yumor": "black-humour",
  "children": "children",
  "china": "china",
  "chinovniki": "bureaucracy",
  "chukotka": "chukotka",
  "chuma": "plague",
  "church": "church",
  "cinema": "cinema",
  "city": "city",
  "civil-position": "civil-position",
  "clips": "clips",
  "collage": "collage",
  "comics": "comics",
  "conspiracy-theory": "conspiracy-theory",
  "contemporary-art": "contemporary-art",
  "contemporary-poetry": "poetry",
  "contemporary-prose": "prose",
  "coronavirus": "coronavirus",
  "corruption": "corruption",
  "creative-writing-school": "creative-writing-school",
  "crime": "crime",
  "criticism": "criticism",
  "critiques": "reviews",
  "culture": "culture",
  "dadaizm": "dadaism",
  "daniel-defo": "daniel-defoe",
  "daniil-harms": "daniil-kharms",
  "dante-aligeri": "dante-alighieri",
  "darkveyv": "darkwave",
  "death": "death",
  "debaty": "debats",
  "delo-seti": "seti-case",
  "democracy": "democracy",
  "demografiya": "demographics",
  "demonstrations": "demonstrations",
  "depression": "depression",
  "derevnya": "village",
  "design": "design",
  "detskie-doma": "orphanages",
  "detstvo": "childhood",
  "digital": "digital",
  "digital-art": "digital-art",
  "directing": "directing",
  "diskurs": "discours",
  "diskurs-1": "discourse",
  "dissidenty": "dissidents",
  "diy": "diy",
  "dmitriy-donskoy": "dmitriy-donskoy",
  "dmitriy-prigov": "dmitriy-prigov",
  "dnevniki": "dairies",
  "documentary": "documentary",
  "dokumenty": "doсuments",
  "domashnee-nasilie": "home-terror",
  "donald-tramp": "donald-trump",
  "donbass": "donbass",
  "donorstvo": "donation",
  "drama": "drama",
  "dramaturgy": "dramaturgy",
  "drawing": "drawing",
  "drevo-zhizni": "tree-of-life",
  "drugs": "drugs",
  "dzhaz": "jazz",
  "dzhek-keruak": "jack-keruak",
  "dzhim-morrison": "jim-morrison",
  "dzhordzh-romero": "george-romero",
  "dzhordzho-agamben": "giorgio-agamben",
  "ecology": "ecology",
  "economics": "economics",
  "eda": "food",
  "editing": "editing",
  "editorial-statements": "editorial-statements",
  "eduard-limonov": "eduard-limonov",
  "education": "education",
  "egor-letov": "egor-letov",
  "eksperiment": "experiments",
  "eksperimentalnaya-muzyka": "experimental-music",
  "ekspressionizm": "expressionism",
  "ekstremizm": "extremism",
  "ekzistentsializm-1": "existentialism",
  "elections": "elections",
  "electronic": "electronics",
  "electronics": "electronics",
  "elena-glinskaya": "elena-glinskaya",
  "elena-guro": "elena-guro",
  "elizaveta-mnatsakanova": "elizaveta-mnatsakanova",
  "embient": "ambient",
  "emigration": "emigration",
  "emil-dyurkgeym": "emile-durkheim",
  "emotsii": "emotions",
  "empiric": "empiric",
  "epidemiya": "pandemic",
  "erich-von-neff": "erich-von-neff",
  "erotika": "erotics",
  "essay": "essay",
  "estetika": "aestetics",
  "etika": "ethics",
  "etnos": "ethnics",
  "everyday-life": "everyday-life",
  "evgeniy-onegin": "eugene-onegin",
  "evolyutsiya": "evolution",
  "exhibitions": "exhibitions",
  "experience": "experiences",
  "experimental": "experimental",
  "experimental-music": "experimental-music",
  "explanation": "explanation",
  "faktcheking": "fact-checking",
  "falsifikatsii": "falsifications",
  "family": "family",
  "fanfiki": "fan-fiction",
  "fantastika": "sci-fi",
  "fatalizm": "fatalism",
  "fedor-dostoevskiy": "fedor-dostoevsky",
  "fedor-ioannovich": "fedor-ioannovich",
  "feleton": "feuilleton",
  "feminism": "feminism",
  "fenomenologiya": "phenomenology",
  "fentezi": "fantasy",
  "festival": "festival",
  "festival-territoriya": "festival-territory",
  "folk": "folk",
  "folklor": "folklore",
  "fotoreportazh": "photoreports",
  "france": "france",
  "frants-kafka": "franz-kafka",
  "frederik-begbeder": "frederick-begbeder",
  "freedom": "freedom",
  "friendship": "friendship",
  "fsb": "fsb",
  "futbol": "footbool",
  "future": "future",
  "futuristy": "futurists",
  "futurizm": "futurism",
  "galereya": "gallery",
  "gdr": "gdr",
  "gender": "gender",
  "gendernyy-diskurs": "gender",
  "gennadiy-aygi": "gennadiy-aygi",
  "gerhard-rihter": "gerhard-rihter",
  "germaniya": "germany",
  "germenevtika": "hermeneutics",
  "geroi": "heroes",
  "girls": "girls",
  "gkchp": "gkchp",
  "glitch": "glitch",
  "globalizatsiya": "globalisation",
  "gollivud": "hollywood",
  "gonzo": "gonzo",
  "gore-ot-uma": "woe-from-wit",
  "graffiti": "graffiti",
  "graphics": "graphics",
  "gravyura": "engraving",
  "grazhdanskaya-oborona": "grazhdanskaya-oborona",
  "gretsiya": "greece",
  "gulag": "gulag",
  "han-batyy": "khan-batyy",
  "health": "health",
  "himiya": "chemistry",
  "hip-hop": "hip-hop",
  "history": "history",
  "history-of-russia": "history-of-russia",
  "holokost": "holocaust",
  "horeografiya": "choreography",
  "horror": "horror",
  "hospis": "hospice",
  "hristianstvo": "christianity",
  "humans": "humans",
  "humour": "humour",
  "ideologiya": "ideology",
  "idm": "idm",
  "igil": "isis",
  "igor-pomerantsev": "igor-pomerantsev",
  "igra-prestolov": "game-of-throne",
  "igry": "games",
  "iisus-hristos": "jesus-christ",
  "illness": "illness",
  "illustration-history": "illustration-history",
  "illustrations": "illustrations",
  "imazhinizm": "imagism",
  "immanuil-kant": "immanuel-kant",
  "impressionizm": "impressionism",
  "improvizatsiya": "improvisation",
  "indi": "indie",
  "individualizm": "individualism",
  "infografika": "infographics",
  "informatsiya": "information",
  "ingmar-bergman": "ingmar-bergman",
  "inklyuziya": "inclusion",
  "installyatsiya": "installation",
  "internet": "internet",
  "interview": "interview",
  "invalidnost": "disability",
  "investigations": "investigations",
  "iosif-brodskiy": "joseph-brodsky",
  "iosif-stalin": "joseph-stalin",
  "iskusstvennyy-intellekt": "artificial-intelligence",
  "islam": "islam",
  "istoriya-moskvy": "moscow-history",
  "istoriya-teatra": "theatre-history",
  "italiya": "italy",
  "italyanskiy-yazyk": "italian-language",
  "iudaika": "judaica",
  "ivan-groznyy": "ivan-grozny",
  "ivan-iii-gorbatyy": "ivan-iii-gorbaty",
  "ivan-kalita": "ivan-kalita",
  "ivan-krylov": "ivan-krylov",
  "izobreteniya": "inventions",
  "izrail-1": "israel",
  "jazz": "jazz",
  "john-lennon": "john-lennon",
  "journalism": "journalism",
  "justice": "justice",
  "k-pop": "k-pop",
  "kalligrafiya": "calligraphy",
  "karikatura": "caricatures",
  "katrin-nenasheva": "katrin-nenasheva",
  "kavkaz": "caucasus",
  "kazan": "kazan",
  "kiberbezopasnost": "cybersecurity",
  "kinoklub": "cinema-club",
  "kirill-serebrennikov": "kirill-serebrennikov",
  "klassika": "classic",
  "kollektivnoe-bessoznatelnoe": "сollective-unconscious",
  "komediya": "comedy",
  "kommunikatsii": "communications",
  "kommunizm": "communism",
  "kommuny": "communes",
  "kompyuternye-igry": "computer-games",
  "konservatizm": "conservatism",
  "kontrkultura": "counter-culture",
  "kontseptualizm": "conceptualism",
  "korotkometrazhka": "cinema-shorts",
  "kosmos": "cosmos",
  "kraudfanding": "crowdfunding",
  "krizis": "crisis",
  "krov": "blood",
  "krym": "crimea",
  "kulturologiya": "culturology",
  "kulty": "cults",
  "kurdistan": "kurdistan",
  "kurt-kobeyn": "kurt-cobain",
  "kurt-vonnegut": "kurt-vonnegut",
  "kvir": "queer",
  "laboratoriya": "lab",
  "language": "languages",
  "lars-fon-trier": "lars-fon-trier",
  "laws": "laws",
  "lectures": "lectures",
  "leto": "summer",
  "lev-tolstoy": "leo-tolstoy",
  "lgbt": "lgbt",
  "liberalizm": "liberalism",
  "libertarianstvo": "libertarianism",
  "life": "life",
  "likbez": "likbez",
  "lingvistika": "linguistics",
  "lirika": "lirics",
  "literary-studies": "literary-studies",
  "literature": "literature",
  "lo-fi": "lo-fi",
  "love": "love",
  "luzha-goluboy-krovi": "luzha-goluboy-krovi",
  "lyudvig-vitgenshteyn": "ludwig-wittgenstein",
  "lzhedmitriy": "false-dmitry",
  "lzhenauka": "pseudoscience",
  "maks-veber": "max-weber",
  "manifests": "manifests",
  "manipulyatsii-soznaniem": "mind-manipulation",
  "marina-abramovich": "marina-abramovich",
  "marketing": "marketing",
  "marksizm": "marxism",
  "marsel-dyushan": "marchel-duchamp",
  "martin-haydegger": "martin-hidegger",
  "matematika": "maths",
  "vladimir-mayakovskiy": "vladimir-mayakovsky",
  "mayakovskiy": "vladimir-mayakovsky",
  "ekzistentsiya": "existence",
  "media": "media",
  "medicine": "medicine",
  "memuary": "memoirs",
  "menedzhment": "management",
  "merab-mamardashvili": "merab-mamardashvili",
  "mest": "revenge",
  "metamodernizm": "metamodern",
  "metavselennaya": "metaverse",
  "metro": "metro",
  "mifologiya": "mythology",
  "mify": "myth",
  "mihael-haneke": "michael-haneke",
  "mihail-baryshnikov": "mihail-baryshnikov",
  "mihail-bulgakov": "mihail-bulgakov",
  "mikrotonalnaya-muzyka": "mikrotone-muzyka",
  "minimalizm": "minimalism",
  "minkult-privet": "minkult-privet",
  "mir": "world",
  "mirovozzrenie": "mindsets",
  "mishel-fuko": "michel-foucault",
  "mistika": "mystics",
  "mitropolit-makariy": "mitropolit-makariy",
  "mlm": "mlm",
  "moda": "fashion",
  "modernizm": "modernism",
  "mokyumentari": "mockumentary",
  "moloko-plus": "moloko-plus",
  "money": "money",
  "monologs": "monologues",
  "monstratsiya": "monstration",
  "moralnaya-otvetstvennost": "moral-responsibility",
  "more": "sea",
  "moscow": "moscow",
  "moshennichestvo": "frauds",
  "moskovskiy-romanticheskiy-kontseptualizm": "moscow-romantic-conceptualism",
  "moskovskoe-delo": "moscow-case",
  "movies": "movies",
  "mozg": "brain",
  "multiplikatsiya": "animation",
  "music": "music",
  "muzei": "museum",
  "muzey": "museum",
  "muzhchiny": "man",
  "myshlenie": "thinking",
  "nagornyy-karabah": "nagorno-karabakh",
  "natsionalizm": "nationalism",
  "natsionalnaya-ideya": "national-idea",
  "natsizm": "nazism",
  "natyurmort": "nature-morte",
  "nauchpop": "pop-science",
  "nbp": "nbp",
  "nenavist": "hate",
  "neofitsialnaya-literatura": "unofficial-literature",
  "neoklassika": "neoclassic",
  "neprozrachnye-smysly": "hidden-meanings",
  "neravenstvo": "inequality",
  "new-year": "new-year",
  "neyronauka": "neuro-science",
  "neyroseti": "neural-networks",
  "niu-vshe": "hse",
  "nizhniy-novgorod": "nizhny-novgorod",
  "nko": "nonprofits",
  "nlo": "ufo",
  "nobelevskaya-premiya": "nobel-prize",
  "noize-mc": "noize-mc",
  "nonkonformizm": "nonconformism",
  "novaya-drama": "new-drama",
  "novosti": "news",
  "noyz": "noise",
  "oberiu": "oberiu",
  "ocherk": "etudes",
  "ochevidnyy-nuar": "ochevidnyy-nuar",
  "odinochestvo": "loneliness",
  "odna-kniga-odna-istoriya": "one-book-one-story",
  "okrainy": "outskirts",
  "opinions": "opinions",
  "oppozitsiya": "opposition",
  "orhan-pamuk": "orhan-pamuk",
  "ornitologiya": "ornitology",
  "osip-mandelshtam": "osip-mandelshtam",
  "oskar-uayld": "oscar-wilde",
  "osoznanie": "awareness",
  "otnosheniya": "relationship",
  "pablo-pikasso": "pablo-picasso",
  "painting": "painting",
  "paintings": "painting",
  "pamyat": "memory",
  "pandemiya": "pandemic",
  "parizh": "paris",
  "patriotizm": "patriotism",
  "paul-tselan": "paul-tselan",
  "per-burd": "pierre-bourdieu",
  "performance": "performance",
  "peyzazh": "landscape",
  "philology": "philology",
  "philosophy": "philosophy",
  "photo": "photography",
  "photography": "photography",
  "photoprojects": "photoprojects",
  "plakaty": "posters",
  "plastilin": "plasticine",
  "plays": "plays",
  "podrostki": "teenagers",
  "poema": "poem",
  "poems": "poems",
  "poeticheskaya-proza": "poetic-prose",
  "poetry": "poetry",
  "poetry-of-squares": "poetry-of-squares",
  "poetry-slam": "poetry-slam",
  "police": "police",
  "politics": "politics",
  "polsha": "poland",
  "pop-art": "pop-art",
  "pop-culture": "pop-culture",
  "pornografiya": "pornography",
  "portret": "portrait",
  "poslovitsy": "proverbs",
  "post-pank": "post-punk",
  "post-rok": "post-rock",
  "postmodernism": "postmodernism",
  "povest": "novells",
  "povsednevnost": "everyday-life",
  "power": "power",
  "pravo": "right",
  "pravoslavie": "orthodox",
  "pravozaschitniki": "human-rights-activism",
  "prazdnik": "holidays",
  "predatelstvo": "betrayal",
  "predprinimatelstvo": "entrepreneurship",
  "premera": "premier",
  "premiya-oskar": "oscar-prize",
  "pribaltika-1": "baltic",
  "priroda": "nature",
  "prison": "prison",
  "pritcha": "parable",
  "privatnost": "privacy",
  "progress": "progress",
  "projects": "projects",
  "prokrastinatsiya": "procrastination",
  "propaganda": "propaganda",
  "proschenie": "forgiveness",
  "prose": "prose",
  "proshloe": "past",
  "prostitutsiya": "prostitution",
  "prosveschenie": "enlightenment",
  "protests": "protests",
  "psalmy": "psalms",
  "psihoanaliz": "psychoanalysis",
  "psihodeliki": "psychodelics",
  "pskov": "pskov",
  "psychiatry": "psychiatry",
  "psychology": "psychology",
  "punk": "punk",
  "r-b": "rnb",
  "realizm": "realism",
  "redaktura": "editorial",
  "refleksiya": "reflection",
  "reggi": "reggae",
  "religion": "religion",
  "rene-zhirar": "rene-girard",
  "renesanss": "renessance",
  "renovatsiya": "renovation",
  "rep": "rap",
  "reportage": "reportage",
  "repressions": "repressions",
  "research": "research",
  "retroveyv": "retrowave",
  "review": "review",
  "revolution": "revolution",
  "rezo-gabriadze": "rezo-gabriadze",
  "risunki": "painting",
  "roboty": "robots",
  "rock": "rock",
  "roditeli": "parents",
  "romantizm": "romantism",
  "romany": "novell",
  "ronald-reygan": "ronald-reygan",
  "roskomnadzor": "roskomnadzor",
  "rossiyskoe-kino": "russian-cinema",
  "rozhava": "rojava",
  "rpts": "rpts",
  "rus-na-grani-sryva": "rus-na-grani-sryva",
  "russia": "russia",
  "russian-language": "russian-language",
  "russian-literature": "russian-literature",
  "russkiy-mir": "russkiy-mir",
  "salvador-dali": "salvador-dali",
  "samoidentifikatsiya": "self-identity",
  "samoopredelenie": "self-definition",
  "sankt-peterburg": "saint-petersburg",
  "sasha-skochilenko": "sasha-skochilenko",
  "satira": "satiric",
  "saund-art": "sound-art",
  "schaste": "hapiness",
  "school": "school",
  "science": "science",
  "sculpture": "sculpture",
  "second-world-war": "second-world-war",
  "sekond-hend": "second-hand",
  "seksprosvet": "sex-education",
  "sekty": "sects",
  "semiotics": "semiotics",
  "serbiya": "serbia",
  "serialy": "series",
  "sever": "north",
  "severnaya-koreya": "north-korea",
  "sex": "sex",
  "shotlandiya": "scotland",
  "shugeyz": "shoegaze",
  "siloviki": "siloviki",
  "simeon-bekbulatovich": "simeon-bekbulatovich",
  "simvolizm": "simbolism",
  "siriya": "siria",
  "skulptura": "sculpture",
  "slavoy-zhizhek": "slavoj-zizek",
  "smysl": "meaning",
  "sny": "dreams",
  "sobytiya": "events",
  "social": "society",
  "society": "society",
  "sociology": "sociology",
  "sofya-paleolog": "sofya-paleolog",
  "sofya-vitovtovna": "sofya-vitovtovna",
  "soobschestva": "communities",
  "soprotivlenie": "resistence",
  "sotsializm": "socialism",
  "sotsialnaya-filosofiya": "social-philosophy",
  "sotsseti": "social-networks",
  "sotvorenie-tretego-rima": "third-rome",
  "sovremennost": "modernity",
  "spaces": "spaces",
  "spektakl": "spectacles",
  "spetseffekty": "special-fx",
  "spetsoperatsiya": "special-operation",
  "spetssluzhby": "special-services",
  "sport": "sport",
  "srednevekove": "middle-age",
  "state": "state",
  "statistika": "statistics",
  "stendap": "stand-up",
  "stoitsizm": "stoicism",
  "stories": "stories",
  "stoyanie-na-ugre": "stoyanie-na-ugre",
  "strah": "fear",
  "street-art": "street-art",
  "stsenarii": "scenarios",
  "summary": "summary",
  "supergeroi": "superheroes",
  "svetlana-aleksievich": "svetlana-aleksievich",
  "svobodu-ivanu-golunovu": "free-ivan-golunov",
  "syurrealizm": "surrealism",
  "tales": "tales",
  "tanets": "dance",
  "tataro-mongolskoe-igo": "mongol-tatar-yoke",
  "tatuirovki": "tattoo",
  "technology": "technology",
  "televidenie": "tv",
  "telo": "body",
  "telo-kak-iskusstvo": "body-as-art",
  "terrorizm": "terrorism",
  "tests": "tests",
  "text": "texts",
  "the-beatles": "the-beatles",
  "theater": "theater",
  "theory": "theory",
  "tokio": "tokio",
  "torture": "torture",
  "totalitarizm": "totalitarism",
  "traditions": "traditions",
  "tragicomedy": "tragicomedy",
  "transgendernost": "transgender",
  "translation": "translation",
  "transport": "transport",
  "travel": "travel",
  "travma": "trauma",
  "trendy": "trends",
  "tretiy-reyh": "third-reich",
  "triller": "thriller",
  "tsar": "central-african-republic",
  "tsar-edip": "oedipus",
  "tsarevich-dmitriy": "tsarevich-dmitry",
  "tsennosti": "values",
  "tsenzura": "censorship",
  "tseremonii": "ceremonies",
  "turizm": "tourism",
  "tvorchestvo": "creativity",
  "ugnetennyy-zhilischnyy-klass": "oppressed-housing-class",
  "uilyam-shekspir": "william-shakespeare",
  "ukraine": "ukraine",
  "university": "university",
  "urban-studies": "urban-studies",
  "uroki-literatury": "literature-lessons",
  "usa": "usa",
  "ussr": "ussr",
  "utopiya": "utopia",
  "valter-benyamin": "valter-benyamin",
  "varlam-shalamov": "varlam-shalamov",
  "vasiliy-ii-temnyy": "basil-ii-temnyy",
  "vasiliy-iii": "basil-iii",
  "vdnh": "vdnh",
  "vechnost": "ethernety",
  "velikobritaniya": "great-britain",
  "velimir-hlebnikov": "velimir-hlebnikov",
  "velkom-tu-greyt-britn": "welcome-to-great-britain",
  "venedikt-erofeev": "venedikt-erofeev",
  "venetsiya": "veneece",
  "vengriya": "hungary",
  "verlibry": "free-verse",
  "veschi": "things",
  "vessels": "vessels",
  "veterany": "veterans",
  "video": "video",
  "videoart": "videoart",
  "videoklip": "clips",
  "videopoeziya": "video-poetry",
  "viktor-astafev": "viktor-astafev",
  "viktor-pelevin": "viktor-pelevin",
  "vilgelm-rayh": "wilhelm-reich",
  "vinzavod": "vinzavod",
  "violence": "violence",
  "visual-culture": "visual-culture",
  "vizualnaya-poeziya": "visual-poetry",
  "vladimir-lenin": "vladimir-lenin",
  "vladimir-nabokov": "vladimir-nabokov",
  "vladimir-putin": "vladimir-putin",
  "vladimir-sorokin": "vladimir-sorokin",
  "vladimir-voynovich": "vladimir-voynovich",
  "volga": "volga",
  "volontery": "volonteurs",
  "vong-karvay": "wong-karwai",
  "vospominaniya": "memories",
  "vostok": "east",
  "vremya": "time",
  "vudi-allen": "woody-allen",
  "vynuzhdennye-otnosheniya": "forced-relationship",
  "war": "war",
  "war-in-ukraine-images": "war-in-ukrahine-images",
  "women": "women",
  "work": "work",
  "writers": "writers",
  "xx-century": "xx-century",
  "yakob-yordans": "yakob-yordans",
  "yan-vermeer": "yan-vermeer",
  "yanka-dyagileva": "yanka-dyagileva",
  "yaponskaya-literatura": "japan-literature",
  "youth": "youth",
  "yozef-rot": "yozef-rot",
  "yurgen-habermas": "jorgen-habermas",
  "za-liniey-mannergeyma": "behind-mannerheim-line",
  "zahar-prilepin": "zahar-prilepin",
  "zakonodatelstvo": "laws",
  "zakony-mira": "world-laws",
  "zametki": "notes",
  "zhelanie": "wish",
  "konets-vesny": "end-of-spring",
  "zhivotnye": "animals",
  "zhoze-saramago": "jose-saramago",
  "zigmund-freyd": "sigmund-freud",
  "zolotaya-orda": "golden-horde",
  "zombi": "zombie",
  "zombi-simpsony": "zombie-simpsons",
  "rouling": "rowling",
  "diskurs-analiz": "discourse-analytics",
  "menty": "police",
  "ptitsy": "birds",
  "salo": "lard",
  "rasizm": "racism",
  "griby": "mushrooms",
  "politzaklyuchennye": "political-prisoners",
  "molodezh": "youth",
  "blocked-in-russia": "blocked-in-russia",
  "kavarga": "kavarga",
  "galereya-anna-nova": "gallery-anna-nova",
  "derrida": "derrida"
}
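The replacements map is consumed as plain JSON: topics_handle() and the content migration look an old slug up and keep it unchanged when no replacement exists. A minimal sketch of that lookup; the file path matches this commit and the sample slug is arbitrary:

import json

with open('migration/tables/replacements.json') as f:
    replacements = json.load(f)

old_slug = 'avtorskaya-pesnya'
print(replacements.get(old_slug, old_slug))        # 'bardsongs'
print(replacements.get('unknown-slug', 'unknown-slug'))  # unchanged fallback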
28
migration/tables/topics.py
Normal file
@@ -0,0 +1,28 @@
from migration.extract import extract_md, html2text
from orm.base import local_session
from orm import Topic, Community

def migrate(entry):
    body_orig = entry.get('description', '').replace('\xa0', ' ')  # strip non-breaking spaces
    topic_dict = {
        'slug': entry['slug'],
        'oid': entry['_id'],
        'title': entry['title'].replace('\xa0', ' '),  # .lower(),
        'children': [],
        'community': Community.default_community.slug
    }
    topic_dict['body'] = extract_md(html2text(body_orig), entry['_id'])
    with local_session() as session:
        slug = topic_dict['slug']
        topic = session.query(Topic).filter(Topic.slug == slug).first()
        if not topic:
            topic = Topic.create(**topic_dict)
        if len(topic.title) > len(topic_dict['title']):
            topic.update({ 'title': topic_dict['title'] })
        if len(topic.body) < len(topic_dict['body']):
            topic.update({ 'body': topic_dict['body'] })
        session.commit()
    # print(topic.__dict__)
    rt = topic.__dict__.copy()
    del rt['_sa_instance_state']
    return rt
106
migration/tables/users.py
Normal file
@@ -0,0 +1,106 @@
import sqlalchemy
from migration.html2text import html2text
from orm import User, UserRating
from dateutil.parser import parse
from orm.base import local_session

def migrate(entry):
    if 'subscribedTo' in entry: del entry['subscribedTo']
    email = entry['emails'][0]['address']
    user_dict = {
        'oid': entry['_id'],
        'roles': [],  # entry['roles'] # roles by community
        'ratings': [],  # entry['ratings']
        'username': email,
        'email': email,
        'password': entry['services']['password'].get('bcrypt', ''),
        'createdAt': parse(entry['createdAt']),
        'emailConfirmed': bool(entry['emails'][0]['verified']),
        'muted': False,  # amnesty
        'bio': entry['profile'].get('bio', ''),
        'notifications': [],
        'links': [],
        'name': 'anonymous'
    }
    if 'updatedAt' in entry: user_dict['updatedAt'] = parse(entry['updatedAt'])
    if 'wasOnlineAt' in entry: user_dict['wasOnlineAt'] = parse(entry['wasOnlineAt'])
    if entry.get('profile'):
        # slug
        user_dict['slug'] = entry['profile'].get('path')
        user_dict['bio'] = html2text(entry.get('profile').get('bio') or '')

        # userpic
        try: user_dict['userpic'] = 'https://assets.discours.io/unsafe/100x/' + entry['profile']['thumborId']
        except KeyError:
            try: user_dict['userpic'] = entry['profile']['image']['url']
            except KeyError: user_dict['userpic'] = ''

        # name
        fn = entry['profile'].get('firstName', '')
        ln = entry['profile'].get('lastName', '')
        name = user_dict['slug'] if user_dict['slug'] else 'noname'
        name = fn if fn else name
        name = (name + ' ' + ln) if ln else name
        name = entry['profile']['path'].lower().replace(' ', '-') if len(name) < 2 else name
        user_dict['name'] = name

        # links
        fb = entry['profile'].get('facebook', False)
        if fb: user_dict['links'].append(fb)
        vk = entry['profile'].get('vkontakte', False)
        if vk: user_dict['links'].append(vk)
        tr = entry['profile'].get('twitter', False)
        if tr: user_dict['links'].append(tr)
        ws = entry['profile'].get('website', False)
        if ws: user_dict['links'].append(ws)

    # some checks
    if not user_dict.get('slug') and len(user_dict['links']) > 0:
        user_dict['slug'] = user_dict['links'][0].split('/')[-1]

    user_dict['slug'] = user_dict.get('slug') or user_dict['email'].split('@')[0]
    oid = user_dict['oid']
    try: user = User.create(**user_dict.copy())
    except sqlalchemy.exc.IntegrityError:
        print('[migration] cannot create user ' + user_dict['slug'])
        with local_session() as session:
            old_user = session.query(User).filter(User.slug == user_dict['slug']).first()
            old_user.oid = oid
            user = old_user
    if not user:
        print('[migration] ERROR: cannot find user ' + user_dict['slug'])
        raise Exception
    user_dict['id'] = user.id
    return user_dict

def migrate_2stage(entry, id_map):
    ce = 0
    for rating_entry in entry.get('ratings', []):
        rater_oid = rating_entry['createdBy']
        rater_slug = id_map.get(rater_oid)
        if not rater_slug:
            ce += 1
            # print(rating_entry)
            continue
        oid = entry['_id']
        author_slug = id_map.get(oid)
        user_rating_dict = {
            'value': rating_entry['value'],
            'rater': rater_slug,
            'user': author_slug
        }
        with local_session() as session:
            try:
                user_rating = UserRating.create(**user_rating_dict)
            except sqlalchemy.exc.IntegrityError:
                old_rating = session.query(UserRating).filter(UserRating.rater == rater_slug).first()
                print('[migration] cannot create ' + author_slug + '`s rate from ' + rater_slug)
                print('[migration] concat rating value %d+%d=%d' % (old_rating.value, rating_entry['value'], old_rating.value + rating_entry['value']))
                old_rating.update({ 'value': old_rating.value + rating_entry['value'] })
                session.commit()
            except Exception as e:
                print(e)
    return ce
9
migration/utils.py
Normal file
@@ -0,0 +1,9 @@
from datetime import datetime
from json import JSONEncoder

class DateTimeEncoder(JSONEncoder):
    def default(self, z):
        if isinstance(z, datetime):
            return str(z)
        else:
            return super().default(z)
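DateTimeEncoder stringifies datetime values that json.dumps would otherwise reject, which the dump and export steps rely on. A quick usage check; the payload is invented:

import json
from datetime import datetime
from migration.utils import DateTimeEncoder

payload = {'slug': 'example', 'createdAt': datetime(2016, 3, 5, 22, 22)}
print(json.dumps(payload, cls=DateTimeEncoder))
# {"slug": "example", "createdAt": "2016-03-05 22:22:00"}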