From 2a6baa7404e7548a845718337d658c35ffc32670 Mon Sep 17 00:00:00 2001 From: Untone Date: Sat, 16 Oct 2021 10:19:39 +0300 Subject: [PATCH] discours content decode --- migrate.py | 66 ++++++++++++++++++--------------- migration/html2text/__init__.py | 27 ++++++++++++-- migration/html2text/config.py | 3 +- 3 files changed, 60 insertions(+), 36 deletions(-) diff --git a/migrate.py b/migrate.py index ec773067..df2ca2a0 100644 --- a/migrate.py +++ b/migrate.py @@ -21,7 +21,7 @@ if __name__ == '__main__': import sys users_data = json.loads(open('migration/data/users.json').read()) - users_dict = { x['_id']: x for x in users_data } # by id + # users_dict = { x['_id']: x for x in users_data } # by id print(str(len(users_data)) + ' users loaded') users_by_oid = {} users_by_slug = {} @@ -49,7 +49,8 @@ if __name__ == '__main__': for old_comment in comments_data: cid = old_comment['contentItem'] comments_by_post[cid] = comments_by_post.get(cid, []) - comments_by_post[cid].append(old_comment) + if 'deletedAt' not in old_comment: + comments_by_post[cid].append(old_comment) print(str(len(comments_by_post.keys())) + ' articles with comments') export_articles = {} # slug: shout @@ -77,7 +78,7 @@ if __name__ == '__main__': return article - def users(): + def users(users_by_oid, users_by_slug, users_data): ''' migrating users first ''' # limiting limit = len(users_data) @@ -102,7 +103,7 @@ if __name__ == '__main__': print(str(len(users_by_slug.items())) + ' users migrated') - def topics(): + def topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data): ''' topics from categories and tags ''' # limiting limit = len(cats_data) + len(tags_data) @@ -133,7 +134,7 @@ if __name__ == '__main__': sort_keys=True, ensure_ascii=False)) - def shouts(): + def shouts(content_data, shouts_by_slug, shouts_by_oid): ''' migrating content items one by one ''' # limiting limit = len(content_data) @@ -168,7 +169,7 @@ if __name__ == '__main__': print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated') print(str(discours_author) + ' authored by @discours') - def export_shouts(shouts_by_slug, export_articles, export_authors): + def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict): # update what was just migrated or load json again if len(export_authors.keys()) == 0: export_authors = json.loads(open('../src/data/authors.json').read()) @@ -190,33 +191,33 @@ if __name__ == '__main__': for (slug, article) in export_list: if article['layout'] == 'article': - export_slug(slug, export_articles, export_authors) + export_slug(slug, export_articles, export_authors, content_dict) - def export_body(article): + def export_body(article, content_dict): article = extract_images(article) metadata = get_metadata(article) content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata)) open('../content/discours.io/'+slug+'.md', 'w').write(content) open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body']) - def export_slug(slug, export_articles, export_authors): - if exported_authors == {}: - exported_authors = json.loads(open('../src/data/authors.json').read()) - print(str(len(exported_authors.items())) + ' exported authors loaded') - if exported_articles == {}: - exported_articles = json.loads(open('../src/data/articles.json').read()) - print(str(len(exported_articles.items())) + ' exported articles loaded') + def export_slug(slug, export_articles, export_authors, content_dict): + print('exporting %s ' % slug) + if export_authors == {}: + export_authors = json.loads(open('../src/data/authors.json').read()) + print(str(len(export_authors.items())) + ' exported authors loaded') + if export_articles == {}: + export_articles = json.loads(open('../src/data/articles.json').read()) + print(str(len(export_articles.items())) + ' exported articles loaded') shout = shouts_by_slug.get(slug, False) assert shout, 'no data error' author = users_by_slug.get(shout['authors'][0]['slug'], None) - exported_authors.update({shout['authors'][0]['slug']: author}) - exported_articles.update({shout['slug']: shout}) - export_body(shout) + export_authors.update({shout['authors'][0]['slug']: author}) + export_articles.update({shout['slug']: shout}) + export_body(shout, content_dict) comments([slug, ]) - - def comments(sluglist = []): + def comments(sluglist, export_comments, export_articles, shouts_by_slug, content_dict): ''' migrating comments on content items one ''' if len(sluglist) == 0: export_articles = json.loads(open('../src/data/articles.json').read()) @@ -224,7 +225,8 @@ if __name__ == '__main__': if len(sluglist) == 0: sluglist = list(export_articles.keys()) if len(sluglist) > 0: - print('exporting comments for exact articles...') + print('exporting comments for: ') + print(' '.join(sluglist)) for slug in sluglist: shout = shouts_by_slug[slug] old_id = shout['old_id'] @@ -282,9 +284,9 @@ if __name__ == '__main__': if len(sys.argv) > 1: cmd = sys.argv[1] if cmd == "users": - users(users_by_oid, users_by_slug, users_data, users_dict) + users(users_by_oid, users_by_slug, users_data) elif cmd == "topics": - topics(topics_by_cat, topics_by_tag, topics_by_slug) + topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data) elif cmd == "shouts": try: Community.create(**{ @@ -298,19 +300,23 @@ if __name__ == '__main__': pass shouts(shouts_by_slug, shouts_by_oid) # NOTE: listens limit elif cmd == "comments": - comments() + cl = sys.argv[2] if len(sys.argv) > 2 else 10 + topCommented = sorted([ c[0] for c in comments_by_post.items()], reverse=True, key=lambda i: len(i[1]))[-cl:] + comments(topCommented, export_comments, export_articles, shouts_by_slug, content_dict) elif cmd == "export_shouts": - export_shouts(shouts_by_slug, export_articles, export_authors) + export_shouts(shouts_by_slug, export_articles, export_authors, content_dict) elif cmd == "all": - users() - topics() - shouts() - comments() + users(users_by_oid, users_by_slug, users_data) + topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data) + shouts(content_data, shouts_by_slug, shouts_by_oid) + cl = sys.argv[2] if len(sys.argv) > 2 else 10 + topCommented = sorted([ c[0] for c in comments_by_post.items()], reverse=True, key=lambda i: len(i[1]))[-cl:] + comments(topCommented, export_comments, export_articles, shouts_by_slug, content_dict) elif cmd == "bson": from migration import bson2json bson2json.json_tables() elif cmd == 'slug': - export_slug(sys.argv[2], export_articles, export_authors) + export_slug(sys.argv[2], export_articles, export_authors, content_dict) export_finish(export_articles, export_authors, export_topics, export_comments) else: print(''' diff --git a/migration/html2text/__init__.py b/migration/html2text/__init__.py index 7e1a279b..85c664c8 100644 --- a/migration/html2text/__init__.py +++ b/migration/html2text/__init__.py @@ -86,6 +86,9 @@ class HTML2Text(html.parser.HTMLParser): self.tag_callback = None self.open_quote = config.OPEN_QUOTE # covered in cli self.close_quote = config.CLOSE_QUOTE # covered in cli + self.header_id = None + self.span_hightlight = False + self.span_lead = False if out is None: self.out = self.outtextf @@ -347,18 +350,34 @@ class HTML2Text(html.parser.HTMLParser): self.space = False self.o(hn(tag) * "#" + " ") self.o("[") - else: - self.p_p = 0 # don't break up link name - self.inheader = False - return # prevent redundant emphasis marks on headers + self.header_id = attrs.get('id') else: self.p() if start: self.inheader = True self.o(hn(tag) * "#" + " ") + if self.header_id: + self.o(' {#' + self.header_id + '}') + self.header_id = None else: self.inheader = False return # prevent redundant emphasis marks on headers + + if tag == 'span': + if start and 'class' in attrs: + if attrs['class'] == 'highlight': + self.o('`') # NOTE: same as + self.span_hightlight = True + elif attrs['class'] == 'lead': + self.o('==') # NOTE: but CriticMarkup uses {== ==} + self.span_lead = True + else: + if self.span_hightlight: + self.o('`') + self.span_hightlight = False + elif self.span_lead: + self.o('==') + self.span_lead = False if tag in ["p", "div"]: if self.google_doc: diff --git a/migration/html2text/config.py b/migration/html2text/config.py index 88d3f912..9c10445a 100644 --- a/migration/html2text/config.py +++ b/migration/html2text/config.py @@ -17,7 +17,7 @@ BODY_WIDTH = 78 # Don't show internal links (href="#local-anchor") -- corresponding link # targets won't be visible in the plain text file anyway. -SKIP_INTERNAL_LINKS = True +SKIP_INTERNAL_LINKS = False # Use inline, rather than reference, formatting for images and links INLINE_LINKS = True @@ -25,7 +25,6 @@ INLINE_LINKS = True # Protect links from line breaks surrounding them with angle brackets (in # addition to their square brackets) PROTECT_LINKS = False -# WRAP_LINKS = True WRAP_LINKS = True # Wrap list items.