diff --git a/migrate.py b/migrate.py index 9e6c4145..006f5532 100644 --- a/migrate.py +++ b/migrate.py @@ -114,6 +114,7 @@ def shouts(): counter = 0 discours_author = 0 content_data = json.loads(open('migration/data/content_items.json').read()) + content_dict = { x['_id']:x for x in content_data } newdata = {} print(str(len(content_data)) + ' entries loaded. now migrating...') errored = [] @@ -125,7 +126,7 @@ def shouts(): line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author) print(line) counter += 1 - if author == 'discours.io': + if author == 'discours': discours_author += 1 open('./shouts.id.log', 'a').write(line + '\n') except Exception: @@ -136,25 +137,35 @@ def shouts(): limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data) except ValueError: limit = len(content_data) - export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']] - export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)[:limit] - export_clean = {} - for (slug, a) in export_list: - export_clean[a['slug']] = extract_images(a) - metadata = get_metadata(a) - content = frontmatter.dumps(frontmatter.Post(a['body'], **metadata)) - open('../content/discours.io/'+a['slug']+'.md', 'w').write(content) open('migration/data/shouts.dict.json', 'w').write(json.dumps(newdata, cls=DateTimeEncoder)) + print(str(counter) + '/' + str(len(content_data)) + + ' content items were migrated') + print(str(discours_author) + ' from them by @discours') + +def export_shouts(limit): + print('reading json...') + newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read()) + print(str(len(newdata.keys())) + ' loaded') + export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']] + export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True) + print(str(len(export_list)) + ' filtered') + export_list = export_list[:limit or len(export_list)] + export_clean = {} + for (slug, article) in export_list: + if article['layout'] == 'article': + export_clean[article['slug']] = extract_images(article) + metadata = get_metadata(article) + content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata)) + open('../content/discours.io/'+slug+'.md', 'w').write(content) + # print(slug) + # open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body']) open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean), cls=DateTimeEncoder, indent=4, sort_keys=True, ensure_ascii=False)) - print(str(counter) + '/' + str(len(content_data)) + - ' content items were migrated') - print(str(len(export_list)) + ' shouts were exported') - print(str(discours_author) + ' from them by @discours.io') + print(str(len(export_clean.items())) + ' exported') if __name__ == '__main__': @@ -176,6 +187,9 @@ if __name__ == '__main__': except Exception: pass shouts() + elif sys.argv[1] == "export_shouts": + limit = int(sys.argv[2]) if len(sys.argv) > 2 else None + export_shouts(limit) elif sys.argv[1] == "all": users() topics() diff --git a/migration/html2text.py b/migration/html2text.py index 88253d93..4882c691 100644 --- a/migration/html2text.py +++ b/migration/html2text.py @@ -463,7 +463,7 @@ class HTML2Text(HTMLParser.HTMLParser): if start: if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')): self.astack.append(attrs) - self.maybe_automatic_link = attrs['href'] + self.maybe_automatic_link = attrs['href'][:2000] else: self.astack.append(None) else: @@ -903,4 +903,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py index c9d8cb36..8a165ba1 100644 --- a/migration/tables/content_items.py +++ b/migration/tables/content_items.py @@ -15,7 +15,7 @@ users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read()) topics_dict = json.loads(open(abspath('migration/data/topics.dict.json')).read()) # old_id keyed users_dict['0'] = { 'id': 9999999, - 'slug': 'discours.io', + 'slug': 'discours', 'name': 'Дискурс', 'userpic': 'https://discours.io/images/logo-mini.svg', 'createdAt': '2016-03-05 22:22:00.350000' @@ -109,7 +109,7 @@ def migrate(entry): else: body_html = str(BeautifulSoup( body_orig, features="html.parser")) - r['body'] = html2text(body_html).replace('****', '**') + r['body'] = body_html # html2text(body_html).replace('****', '**') r['old_id'] = entry.get('_id') else: print(r['slug'] + ': literature has no media') @@ -131,7 +131,7 @@ def migrate(entry): if r.get('body') is None: body_orig = entry.get('body', '') body_html = str(BeautifulSoup(body_orig, features="html.parser")) - r['body'] = html2text(body_html).replace('****', '**') + r['body'] = body_html # html2text(body_html).replace('****', '**') r['old_id'] = entry.get('_id') body = r.get('body') user = None @@ -167,7 +167,7 @@ def migrate(entry): userpic = user.userpic else: # no application, no author! - slug = 'discours.io' + slug = 'discours' name = 'Дискурс' userpic = 'https://discours.io/images/logo-mini.svg' with local_session() as session: