commit 1714a60e99
parent fe28c3918c
Author: Untone
Date:   2021-10-09 11:36:14 +03:00

    export separated

3 changed files with 33 additions and 19 deletions
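Overview: this commit separates the export step from the migration pass. shouts() now only migrates content items and writes migration/data/shouts.dict.json, while a new export_shouts(limit) function reads that dump back and writes the per-article markdown files and ../src/data/articles.json. The commit also renames the fallback system author slug from 'discours.io' to 'discours', caps automatic link targets in the vendored html2text, and keeps article bodies as HTML instead of converting them to markdown.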

File 1 (migration entry-point script):

@@ -114,6 +114,7 @@ def shouts():
     counter = 0
     discours_author = 0
     content_data = json.loads(open('migration/data/content_items.json').read())
+    content_dict = { x['_id']:x for x in content_data }
     newdata = {}
     print(str(len(content_data)) + ' entries loaded. now migrating...')
     errored = []
@@ -125,7 +126,7 @@ def shouts():
             line = str(counter+1) + ': ' + shout['slug'] + " @" + str(author)
             print(line)
             counter += 1
-            if author == 'discours.io':
+            if author == 'discours':
                 discours_author += 1
             open('./shouts.id.log', 'a').write(line + '\n')
         except Exception:
@@ -136,25 +137,35 @@ def shouts():
         limit = int(sys.argv[2]) if len(sys.argv) > 2 else len(content_data)
     except ValueError:
         limit = len(content_data)
-    export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
-    export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)[:limit]
-    export_clean = {}
-    for (slug, a) in export_list:
-        export_clean[a['slug']] = extract_images(a)
-        metadata = get_metadata(a)
-        content = frontmatter.dumps(frontmatter.Post(a['body'], **metadata))
-        open('../content/discours.io/'+a['slug']+'.md', 'w').write(content)
     open('migration/data/shouts.dict.json',
          'w').write(json.dumps(newdata, cls=DateTimeEncoder))
+    print(str(counter) + '/' + str(len(content_data)) +
+          ' content items were migrated')
+    print(str(discours_author) + ' from them by @discours')
+
+
+def export_shouts(limit):
+    print('reading json...')
+    newdata = json.loads(open('migration/data/shouts.dict.json', 'r').read())
+    print(str(len(newdata.keys())) + ' loaded')
+    export_list = [i for i in newdata.items() if i[1]['layout'] == 'article' and i[1]['published']]
+    export_list = sorted(export_list, key=lambda item: item[1]['createdAt'] or OLD_DATE, reverse=True)
+    print(str(len(export_list)) + ' filtered')
+    export_list = export_list[:limit or len(export_list)]
+    export_clean = {}
+    for (slug, article) in export_list:
+        if article['layout'] == 'article':
+            export_clean[article['slug']] = extract_images(article)
+            metadata = get_metadata(article)
+            content = frontmatter.dumps(frontmatter.Post(article['body'], **metadata))
+            open('../content/discours.io/'+slug+'.md', 'w').write(content)
+            # print(slug)
+            # open('../content/discours.io/'+slug+'.html', 'w').write(content_dict[article['old_id']]['body'])
     open('../src/data/articles.json', 'w').write(json.dumps(dict(export_clean),
                                                             cls=DateTimeEncoder,
                                                             indent=4,
                                                             sort_keys=True,
                                                             ensure_ascii=False))
-    print(str(counter) + '/' + str(len(content_data)) +
-          ' content items were migrated')
-    print(str(len(export_list)) + ' shouts were exported')
-    print(str(discours_author) + ' from them by @discours.io')
+    print(str(len(export_clean.items())) + ' exported')

 if __name__ == '__main__':
@@ -176,6 +187,9 @@ if __name__ == '__main__':
         except Exception:
             pass
         shouts()
+    elif sys.argv[1] == "export_shouts":
+        limit = int(sys.argv[2]) if len(sys.argv) > 2 else None
+        export_shouts(limit)
     elif sys.argv[1] == "all":
         users()
         topics()
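With the export split out, the script gains an "export_shouts" subcommand (the script path itself is not shown in this view). The sketch below illustrates the selection logic the new function applies: keep published articles, sort newest-first by createdAt, then cap at an optional limit. The sample records and the standalone OLD_DATE value are illustrative stand-ins, not repo data.

```python
# Sketch of the selection logic inside export_shouts.
# Sample data below is hypothetical; OLD_DATE mirrors the fallback date
# used elsewhere in the migration.
OLD_DATE = '2016-03-05 22:22:00.350000'

newdata = {
    'fresh':    {'slug': 'fresh',    'layout': 'article', 'published': True,  'createdAt': '2021-10-01'},
    'draft':    {'slug': 'draft',    'layout': 'article', 'published': False, 'createdAt': '2021-10-02'},
    'dateless': {'slug': 'dateless', 'layout': 'article', 'published': True,  'createdAt': None},
}

limit = None  # the CLI passes int(sys.argv[2]) if given, else None

# keep only published articles
export_list = [i for i in newdata.items()
               if i[1]['layout'] == 'article' and i[1]['published']]
# `or OLD_DATE` makes entries with a missing createdAt sort as oldest
export_list = sorted(export_list,
                     key=lambda item: item[1]['createdAt'] or OLD_DATE,
                     reverse=True)
# `limit or len(export_list)` means limit=None keeps everything
export_list = export_list[:limit or len(export_list)]

print([slug for slug, _ in export_list])  # ['fresh', 'dateless']
```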

File 2 (vendored html2text module):

@@ -463,7 +463,7 @@ class HTML2Text(HTMLParser.HTMLParser):
             if start:
                 if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
                     self.astack.append(attrs)
-                    self.maybe_automatic_link = attrs['href']
+                    self.maybe_automatic_link = attrs['href'][:2000]
                 else:
                     self.astack.append(None)
             else:
@@ -903,4 +903,4 @@ def main():
 if __name__ == "__main__":
     main()
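The one behavioural change in this file caps automatic link targets at 2000 characters. The commit message does not state the motivation; presumably it guards against pathologically long href values, such as inline data: URIs, ending up in the generated text. A small illustration of why the slice is harmless for ordinary links:

```python
# Python slicing never raises on short input, so ordinary links pass
# through unchanged and only oversized values are truncated.
short_href = 'https://discours.io/images/logo-mini.svg'
huge_href = 'data:image/png;base64,' + 'A' * 100_000  # hypothetical oversized href

print(short_href[:2000] == short_href)  # True
print(len(huge_href[:2000]))            # 2000
```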

File 3 (content-items migration):

@@ -15,7 +15,7 @@ users_dict = json.loads(open(abspath('migration/data/users.dict.json')).read())
 topics_dict = json.loads(open(abspath('migration/data/topics.dict.json')).read()) # old_id keyed
 users_dict['0'] = {
     'id': 9999999,
-    'slug': 'discours.io',
+    'slug': 'discours',
     'name': 'Дискурс',
     'userpic': 'https://discours.io/images/logo-mini.svg',
     'createdAt': '2016-03-05 22:22:00.350000'
@@ -109,7 +109,7 @@ def migrate(entry):
         else:
             body_html = str(BeautifulSoup(
                 body_orig, features="html.parser"))
-            r['body'] = html2text(body_html).replace('****', '**')
+            r['body'] = body_html # html2text(body_html).replace('****', '**')
         r['old_id'] = entry.get('_id')
     else:
         print(r['slug'] + ': literature has no media')
@@ -131,7 +131,7 @@ def migrate(entry):
     if r.get('body') is None:
         body_orig = entry.get('body', '')
         body_html = str(BeautifulSoup(body_orig, features="html.parser"))
-        r['body'] = html2text(body_html).replace('****', '**')
+        r['body'] = body_html # html2text(body_html).replace('****', '**')
         r['old_id'] = entry.get('_id')
     body = r.get('body')
     user = None
@@ -167,7 +167,7 @@ def migrate(entry):
         userpic = user.userpic
     else:
         # no application, no author!
-        slug = 'discours.io'
+        slug = 'discours'
         name = 'Дискурс'
         userpic = 'https://discours.io/images/logo-mini.svg'
     with local_session() as session:
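Both body-building branches in migrate() now store the BeautifulSoup-normalized HTML directly; the html2text conversion is commented out rather than deleted, which reads like a temporary switch while the HTML export path (the commented .html writer in export_shouts above) is tried out. A minimal sketch of the two paths, assuming beautifulsoup4 is installed; the sample markup is illustrative:

```python
from bs4 import BeautifulSoup  # pip install beautifulsoup4

body_orig = '<p>Пример <b>текста</b>'  # note the unclosed <p>
# html.parser normalizes the markup (here it closes the <p>)
body_html = str(BeautifulSoup(body_orig, features="html.parser"))

r = {}
# previous behaviour, now commented out in the diff:
# r['body'] = html2text(body_html).replace('****', '**')
# new behaviour: keep the normalized HTML as-is
r['body'] = body_html
print(r['body'])  # <p>Пример <b>текста</b></p>
```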