From 2a6baa7404e7548a845718337d658c35ffc32670 Mon Sep 17 00:00:00 2001
From: Untone <anton.rewin@gmail.com>
Date: Sat, 16 Oct 2021 10:19:39 +0300
Subject: [PATCH] discours content decode

---
 migrate.py                      | 66 ++++++++++++++++++---------------
 migration/html2text/__init__.py | 27 ++++++++++++--
 migration/html2text/config.py   |  3 +-
 3 files changed, 60 insertions(+), 36 deletions(-)

diff --git a/migrate.py b/migrate.py
index ec773067..df2ca2a0 100644
--- a/migrate.py
+++ b/migrate.py
@@ -21,7 +21,7 @@ if __name__ == '__main__':
     import sys
 
     users_data = json.loads(open('migration/data/users.json').read())
-    users_dict = { x['_id']: x for x in users_data } # by id
+    # users_dict = { x['_id']: x for x in users_data } # by id
     print(str(len(users_data)) + ' users loaded')
     users_by_oid = {}
     users_by_slug = {}
@@ -49,7 +49,8 @@ if __name__ == '__main__':
     for old_comment in comments_data:
         cid = old_comment['contentItem']
         comments_by_post[cid] = comments_by_post.get(cid, [])
-        comments_by_post[cid].append(old_comment)
+        if 'deletedAt' not in old_comment:
+            comments_by_post[cid].append(old_comment)
     print(str(len(comments_by_post.keys())) + ' articles with comments')
 
     export_articles = {} # slug: shout
@@ -77,7 +78,7 @@ if __name__ == '__main__':
         return article
 
 
-    def users():
+    def users(users_by_oid, users_by_slug, users_data):
         ''' migrating users first '''
         # limiting
         limit = len(users_data)
@@ -102,7 +103,7 @@ if __name__ == '__main__':
         print(str(len(users_by_slug.items())) + ' users migrated')
 
 
-    def topics():
+    def topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data):
         ''' topics from categories and tags '''
         # limiting
         limit = len(cats_data) + len(tags_data)
@@ -133,7 +134,7 @@ if __name__ == '__main__':
                                                             sort_keys=True,
                                                             ensure_ascii=False))
 
-    def shouts():
+    def shouts(content_data, shouts_by_slug, shouts_by_oid):
         ''' migrating content items one by one '''
         # limiting
         limit = len(content_data)
@@ -168,7 +169,7 @@ if __name__ == '__main__':
         print(str(counter) + '/' + str(len(content_data)) + ' content items were migrated')
         print(str(discours_author) + ' authored by @discours')
         
-    def export_shouts(shouts_by_slug, export_articles, export_authors):
+    def export_shouts(shouts_by_slug, export_articles, export_authors, content_dict):
         # update what was just migrated or load json again
         if len(export_authors.keys()) == 0:
             export_authors = json.loads(open('../src/data/authors.json').read())
@@ -190,33 +191,33 @@ if __name__ == '__main__':
         
         for (slug, article) in export_list:
             if article['layout'] == 'article':
-                export_slug(slug, export_articles, export_authors)
+                export_slug(slug, export_articles, export_authors, content_dict)
         
-    def export_body(article):
+    def export_body(article, content_dict):
         article = extract_images(article)
         metadata = get_metadata(article)
         content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
         open('../content/discours.io/'+slug+'.md', 'w').write(content)
         open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
 
-    def export_slug(slug, export_articles, export_authors):
-        if exported_authors == {}: 
-            exported_authors = json.loads(open('../src/data/authors.json').read())
-            print(str(len(exported_authors.items())) + ' exported authors loaded')
-        if exported_articles == {}:
-            exported_articles = json.loads(open('../src/data/articles.json').read())
-            print(str(len(exported_articles.items())) + ' exported articles loaded')
+    def export_slug(slug, export_articles, export_authors, content_dict):
+        print('exporting %s ' % slug)
+        if export_authors == {}: 
+            export_authors = json.loads(open('../src/data/authors.json').read())
+            print(str(len(export_authors.items())) + ' exported authors loaded')
+        if export_articles == {}:
+            export_articles = json.loads(open('../src/data/articles.json').read())
+            print(str(len(export_articles.items())) + ' exported articles loaded')
             
         shout = shouts_by_slug.get(slug, False)
         assert shout, 'no data error'
         author = users_by_slug.get(shout['authors'][0]['slug'], None)
-        exported_authors.update({shout['authors'][0]['slug']: author})
-        exported_articles.update({shout['slug']: shout})
-        export_body(shout)
+        export_authors.update({shout['authors'][0]['slug']: author})
+        export_articles.update({shout['slug']: shout})
+        export_body(shout, content_dict)
         comments([slug, ])
-        
 
-    def comments(sluglist = []):
+    def comments(sluglist, export_comments, export_articles, shouts_by_slug, content_dict):
         ''' migrating comments on content items one '''
         if len(sluglist) == 0:
             export_articles = json.loads(open('../src/data/articles.json').read())
@@ -224,7 +225,8 @@ if __name__ == '__main__':
             if len(sluglist) == 0: sluglist = list(export_articles.keys())
 
         if len(sluglist) > 0:
-            print('exporting comments for exact articles...')
+            print('exporting comments for: ')
+            print(' '.join(sluglist))
             for slug in sluglist:
                 shout = shouts_by_slug[slug]
                 old_id = shout['old_id']
@@ -282,9 +284,9 @@ if __name__ == '__main__':
     if len(sys.argv) > 1:
         cmd = sys.argv[1]
         if cmd == "users":
-            users(users_by_oid, users_by_slug, users_data, users_dict)
+            users(users_by_oid, users_by_slug, users_data)
         elif cmd == "topics":
-            topics(topics_by_cat, topics_by_tag, topics_by_slug)
+            topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
         elif cmd == "shouts":
             try:
                 Community.create(**{
@@ -298,19 +300,23 @@ if __name__ == '__main__':
                 pass
             shouts(shouts_by_slug, shouts_by_oid) # NOTE: listens limit
         elif cmd == "comments":
-            comments()
+            cl = int(sys.argv[2]) if len(sys.argv) > 2 else 10  # argv values are str; the slice below needs an int
+            topCommented = [cid for cid, _ in sorted(comments_by_post.items(), key=lambda i: len(i[1]), reverse=True)[:cl]]  # most-commented first
+            comments(topCommented, export_comments, export_articles, shouts_by_slug, content_dict)  # NOTE(review): keys are contentItem ids, but comments() indexes shouts_by_slug - confirm
         elif cmd == "export_shouts":
-            export_shouts(shouts_by_slug, export_articles, export_authors)
+            export_shouts(shouts_by_slug, export_articles, export_authors, content_dict)
         elif cmd == "all":
-            users()
-            topics()
-            shouts()
-            comments()
+            users(users_by_oid, users_by_slug, users_data)
+            topics(export_topics, topics_by_slug, topics_by_cat, topics_by_tag, cats_data, tags_data)
+            shouts(content_data, shouts_by_slug, shouts_by_oid)
+            cl = int(sys.argv[2]) if len(sys.argv) > 2 else 10  # argv values are str; the slice below needs an int
+            topCommented = [cid for cid, _ in sorted(comments_by_post.items(), key=lambda i: len(i[1]), reverse=True)[:cl]]  # most-commented first
+            comments(topCommented, export_comments, export_articles, shouts_by_slug, content_dict)  # NOTE(review): keys are contentItem ids, but comments() indexes shouts_by_slug - confirm
         elif cmd == "bson":
             from migration import bson2json
             bson2json.json_tables()
         elif cmd == 'slug':
-            export_slug(sys.argv[2], export_articles, export_authors)
+            export_slug(sys.argv[2], export_articles, export_authors, content_dict)
         export_finish(export_articles, export_authors, export_topics, export_comments)
     else:
         print('''
diff --git a/migration/html2text/__init__.py b/migration/html2text/__init__.py
index 7e1a279b..85c664c8 100644
--- a/migration/html2text/__init__.py
+++ b/migration/html2text/__init__.py
@@ -86,6 +86,9 @@ class HTML2Text(html.parser.HTMLParser):
         self.tag_callback = None
         self.open_quote = config.OPEN_QUOTE  # covered in cli
         self.close_quote = config.CLOSE_QUOTE  # covered in cli
+        self.header_id = None
+        self.span_hightlight = False
+        self.span_lead = False
 
         if out is None:
             self.out = self.outtextf
@@ -347,18 +350,34 @@ class HTML2Text(html.parser.HTMLParser):
                         self.space = False
                         self.o(hn(tag) * "#" + " ")
                         self.o("[")
-                else:
-                    self.p_p = 0  # don't break up link name
-                    self.inheader = False
-                    return  # prevent redundant emphasis marks on headers
+                        self.header_id = attrs.get('id')
             else:
                 self.p()
                 if start:
                     self.inheader = True
                     self.o(hn(tag) * "#" + " ")
+                    if self.header_id: 
+                        self.o(' {#' + self.header_id + '}')
+                        self.header_id = None
                 else:
                     self.inheader = False
                     return  # prevent redundant emphasis marks on headers
+                
+        if tag == 'span':
+            if start and 'class' in attrs:
+                if attrs['class'] == 'highlight':
+                    self.o('`') # NOTE: same as <code>
+                    self.span_hightlight = True
+                elif attrs['class'] == 'lead':
+                    self.o('==') # NOTE: but CriticMarkup uses {== ==}
+                    self.span_lead = True
+            elif not start:  # only a closing </span> may emit the closing marker
+                if self.span_hightlight:
+                    self.o('`')
+                    self.span_hightlight = False
+                elif self.span_lead:
+                    self.o('==')
+                    self.span_lead = False
 
         if tag in ["p", "div"]:
             if self.google_doc:
diff --git a/migration/html2text/config.py b/migration/html2text/config.py
index 88d3f912..9c10445a 100644
--- a/migration/html2text/config.py
+++ b/migration/html2text/config.py
@@ -17,7 +17,7 @@ BODY_WIDTH = 78
 
 # Don't show internal links (href="#local-anchor") -- corresponding link
 # targets won't be visible in the plain text file anyway.
-SKIP_INTERNAL_LINKS = True
+SKIP_INTERNAL_LINKS = False
 
 # Use inline, rather than reference, formatting for images and links
 INLINE_LINKS = True
@@ -25,7 +25,6 @@ INLINE_LINKS = True
 # Protect links from line breaks surrounding them with angle brackets (in
 # addition to their square brackets)
 PROTECT_LINKS = False
-# WRAP_LINKS = True
 WRAP_LINKS = True
 
 # Wrap list items.