migration, auth, refactoring, formatting

2022-09-17 21:12:14 +03:00
parent 6b4c00d9e7
commit 3136eecd7e
68 changed files with 968 additions and 930 deletions

@@ -1,33 +1,31 @@
""" cmd managed migration """
import csv
import asyncio
from datetime import datetime
import json
import os
import subprocess
import sys
import os
from datetime import datetime
import bs4
import numpy as np
from migration.tables.comments import migrate as migrateComment
from migration.tables.comments import migrate_2stage as migrateComment_2stage
from migration.tables.content_items import get_shout_slug, migrate as migrateShout
from migration.tables.topics import migrate as migrateTopic
from migration.tables.users import migrate as migrateUser
from migration.tables.users import migrate_2stage as migrateUser_2stage
from orm.reaction import Reaction
from settings import DB_URL
# from export import export_email_subscriptions
from .export import export_mdx, export_slug
from orm.reaction import Reaction
from .tables.users import migrate as migrateUser
from .tables.users import migrate_2stage as migrateUser_2stage
from .tables.content_items import get_shout_slug, migrate as migrateShout
from .tables.topics import migrate as migrateTopic
from .tables.comments import migrate as migrateComment
from .tables.comments import migrate_2stage as migrateComment_2stage
from settings import DB_URL
TODAY = datetime.strftime(datetime.now(), "%Y%m%d")
OLD_DATE = "2016-03-05 22:22:00.350000"
def users_handle(storage):
async def users_handle(storage):
"""migrating users first"""
counter = 0
id_map = {}
@@ -47,10 +45,9 @@ def users_handle(storage):
ce = 0
for entry in storage["users"]["data"]:
ce += migrateUser_2stage(entry, id_map)
return storage
def topics_handle(storage):
async def topics_handle(storage):
"""topics from categories and tags"""
counter = 0
for t in storage["topics"]["tags"] + storage["topics"]["cats"]:
@@ -78,8 +75,6 @@ def topics_handle(storage):
+ str(len(storage["topics"]["by_slug"].values()))
+ " topics by slug"
)
# raise Exception
return storage
async def shouts_handle(storage, args):
@@ -105,9 +100,9 @@ async def shouts_handle(storage, args):
if not shout["topics"]:
print("[migration] no topics!")
# wuth author
author = shout["authors"][0].slug
if author == "discours":
# with author
author: str = shout["authors"][0].dict()
if author["slug"] == "discours":
discours_author += 1
# print('[migration] ' + shout['slug'] + ' with author ' + author)
@@ -118,21 +113,21 @@ async def shouts_handle(storage, args):
# print main counter
counter += 1
line = str(counter + 1) + ": " + shout["slug"] + " @" + author
line = str(counter + 1) + ": " + shout["slug"] + " @" + author["slug"]
print(line)
b = bs4.BeautifulSoup(shout["body"], "html.parser")
texts = []
texts.append(shout["title"].lower().replace(r"[^а-яА-Яa-zA-Z]", ""))
texts = b.findAll(text=True)
texts = [shout["title"].lower().replace(r"[^а-яА-Яa-zA-Z]", "")]
texts = texts + b.findAll(text=True)
topics_dataset_bodies.append(" ".join([x.strip().lower() for x in texts]))
topics_dataset_tlist.append(shout["topics"])
# np.savetxt('topics_dataset.csv', (topics_dataset_bodies, topics_dataset_tlist), delimiter=',', fmt='%s')
# np.savetxt('topics_dataset.csv', (topics_dataset_bodies, topics_dataset_tlist), delimiter=',
# ', fmt='%s')
print("[migration] " + str(counter) + " content items were migrated")
print("[migration] " + str(pub_counter) + " have been published")
print("[migration] " + str(discours_author) + " authored by @discours")
return storage
async def comments_handle(storage):
@@ -146,9 +141,9 @@ async def comments_handle(storage):
missed_shouts[reaction] = oldcomment
elif type(reaction) == Reaction:
reaction = reaction.dict()
id = reaction["id"]
rid = reaction["id"]
oid = reaction["oid"]
id_map[oid] = id
id_map[oid] = rid
else:
ignored_counter += 1
@@ -161,7 +156,6 @@ async def comments_handle(storage):
for missed in missed_shouts.values():
missed_counter += len(missed)
print("[migration] " + str(missed_counter) + " comments dropped")
return storage
def bson_handle():
@@ -180,8 +174,8 @@ def export_one(slug, storage, args=None):
async def all_handle(storage, args):
print("[migration] handle everything")
users_handle(storage)
topics_handle(storage)
await users_handle(storage)
await topics_handle(storage)
await shouts_handle(storage, args)
await comments_handle(storage)
# export_email_subscriptions()
@@ -205,11 +199,6 @@ def data_load():
"users": {"by_oid": {}, "by_slug": {}, "data": []},
"replacements": json.loads(open("migration/tables/replacements.json").read()),
}
users_data = []
tags_data = []
cats_data = []
comments_data = []
content_data = []
try:
users_data = json.loads(open("migration/data/users.json").read())
print("[migration.load] " + str(len(users_data)) + " users ")
@@ -265,13 +254,13 @@ def data_load():
+ str(len(storage["reactions"]["by_content"].keys()))
+ " with comments"
)
storage["users"]["data"] = users_data
storage["topics"]["tags"] = tags_data
storage["topics"]["cats"] = cats_data
storage["shouts"]["data"] = content_data
storage["reactions"]["data"] = comments_data
except Exception as e:
raise e
storage["users"]["data"] = users_data
storage["topics"]["tags"] = tags_data
storage["topics"]["cats"] = cats_data
storage["shouts"]["data"] = content_data
storage["reactions"]["data"] = comments_data
return storage
@@ -301,7 +290,7 @@ def create_pgdump():
async def handle_auto():
print("[migration] no command given, auto mode")
print("[migration] no option given, auto mode")
url = os.getenv("MONGODB_URL")
if url:
mongo_download(url)
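
Note on the hunks above: users_handle and topics_handle become coroutines so that all_handle can await every per-table handler the same way it already awaits shouts_handle and comments_handle. A minimal, self-contained sketch of that calling pattern; the asyncio.run entry point and the stripped-down storage dict are assumptions for illustration, not code from this commit:

import asyncio

async def users_handle(storage):
    # migrate users first, then pass the shared storage dict along
    return storage

async def all_handle(storage, args):
    # without await, users_handle(storage) would only create a coroutine
    # object and the migration step would silently never run
    await users_handle(storage)
    return storage

if __name__ == "__main__":
    asyncio.run(all_handle({"users": {"data": []}}, []))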

@@ -1,6 +1,7 @@
import os
import bson
import json
import os
import bson
from .utils import DateTimeEncoder

@@ -1,7 +1,9 @@
from datetime import datetime
import json
import os
from datetime import datetime
import frontmatter
from .extract import extract_html, prepare_html_body
from .utils import DateTimeEncoder
@@ -67,22 +69,40 @@ def export_slug(slug, storage):
def export_email_subscriptions():
email_subscriptions_data = json.loads(open("migration/data/email_subscriptions.json").read())
email_subscriptions_data = json.loads(
open("migration/data/email_subscriptions.json").read()
)
for data in email_subscriptions_data:
# TODO: migrate to mailgun list manually
# migrate_email_subscription(data)
pass
print("[migration] " + str(len(email_subscriptions_data)) + " email subscriptions exported")
print(
"[migration] "
+ str(len(email_subscriptions_data))
+ " email subscriptions exported"
)
def export_shouts(storage):
# update what was just migrated or load json again
if len(storage["users"]["by_slugs"].keys()) == 0:
storage["users"]["by_slugs"] = json.loads(open(EXPORT_DEST + "authors.json").read())
print("[migration] " + str(len(storage["users"]["by_slugs"].keys())) + " exported authors ")
storage["users"]["by_slugs"] = json.loads(
open(EXPORT_DEST + "authors.json").read()
)
print(
"[migration] "
+ str(len(storage["users"]["by_slugs"].keys()))
+ " exported authors "
)
if len(storage["shouts"]["by_slugs"].keys()) == 0:
storage["shouts"]["by_slugs"] = json.loads(open(EXPORT_DEST + "articles.json").read())
print("[migration] " + str(len(storage["shouts"]["by_slugs"].keys())) + " exported articles ")
storage["shouts"]["by_slugs"] = json.loads(
open(EXPORT_DEST + "articles.json").read()
)
print(
"[migration] "
+ str(len(storage["shouts"]["by_slugs"].keys()))
+ " exported articles "
)
for slug in storage["shouts"]["by_slugs"].keys():
export_slug(slug, storage)
@@ -130,4 +150,8 @@ def export_json(
ensure_ascii=False,
)
)
print("[migration] " + str(len(export_comments.items())) + " exported articles with comments")
print(
"[migration] "
+ str(len(export_comments.items()))
+ " exported articles with comments"
)

@@ -1,6 +1,7 @@
import base64
import os
import re
import base64
from .html2text import html2text
TOOLTIP_REGEX = r"(\/\/\/(.+)\/\/\/)"

@@ -379,16 +379,16 @@ class HTML2Text(html.parser.HTMLParser):
if start:
if (
self.current_class == "highlight"
and self.inheader == False
and self.span_lead == False
and self.astack == False
and not self.inheader
and not self.span_lead
and not self.astack
):
self.o("`") # NOTE: same as <code>
self.span_highlight = True
elif (
self.current_class == "lead"
and self.inheader == False
and self.span_highlight == False
and not self.inheader
and not self.span_highlight
):
# self.o("==") # NOTE: CriticMarkup {==
self.span_lead = True
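
Note on the hunk above: comparisons such as self.inheader == False are replaced with not self.inheader. The two forms agree for plain booleans but differ for other falsy values; a small standalone check, unrelated to the html2text internals, shows the distinction:

# not x tests truthiness; x == False only matches values that compare
# equal to False, namely False, 0 and 0.0
for x in (False, 0, 0.0, [], "", None):
    print(repr(x), x == False, not x)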

@@ -4,6 +4,7 @@ import sys
from . import HTML2Text, __version__, config
# noinspection DuplicatedCode
def main() -> None:
baseurl = ""

@@ -68,13 +68,11 @@ def element_style(
:rtype: dict
"""
style = parent_style.copy()
if "class" in attrs:
assert attrs["class"] is not None
if attrs.get("class"):
for css_class in attrs["class"].split():
css_style = style_def.get("." + css_class, {})
style.update(css_style)
if "style" in attrs:
assert attrs["style"] is not None
if attrs.get("style"):
immediate_style = dumb_property_dict(attrs["style"])
style.update(immediate_style)
@@ -149,8 +147,7 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
:rtype: int or None
"""
if "start" in attrs:
assert attrs["start"] is not None
if attrs.get("start"):
try:
return int(attrs["start"]) - 1
except ValueError:
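
Note on the hunk above: the membership test followed by an assert is replaced with attrs.get(...), which also treats None and empty strings as "not set". A standalone illustration; the attrs dict below is invented for the example:

attrs = {"class": None, "style": ""}

# old guard: "class" in attrs is True, so the branch is entered and the
# assert then fails on the None value
print("class" in attrs)           # True

# new guard: falsy values and missing keys simply skip the branch
print(bool(attrs.get("class")))   # False
print(bool(attrs.get("style")))   # False
print(bool(attrs.get("start")))   # False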

@@ -1,8 +1,10 @@
from datetime import datetime
from dateutil.parser import parse as date_parse
from orm import Reaction, User
from base.orm import local_session
from migration.html2text import html2text
from orm import Reaction, User
from orm.reaction import ReactionKind
from services.stat.reacted import ReactedStorage
@@ -46,16 +48,13 @@ async def migrate(entry, storage):
old_thread: String
}
"""
reaction_dict = {}
reaction_dict["createdAt"] = (
ts if not entry.get("createdAt") else date_parse(entry.get("createdAt"))
)
print("[migration] reaction original date %r" % entry.get("createdAt"))
# print('[migration] comment date %r ' % comment_dict['createdAt'])
reaction_dict["body"] = html2text(entry.get("body", ""))
reaction_dict["oid"] = entry["_id"]
if entry.get("createdAt"):
reaction_dict["createdAt"] = date_parse(entry.get("createdAt"))
reaction_dict = {
"createdAt": (
ts if not entry.get("createdAt") else date_parse(entry.get("createdAt"))
),
"body": html2text(entry.get("body", "")),
"oid": entry["_id"],
}
shout_oid = entry.get("contentItem")
if shout_oid not in storage["shouts"]["by_oid"]:
if len(storage["shouts"]["by_oid"]) > 0:
@@ -126,7 +125,7 @@ def migrate_2stage(rr, old_new_id):
with local_session() as session:
comment = session.query(Reaction).filter(Reaction.id == new_id).first()
comment.replyTo = old_new_id.get(reply_oid)
comment.save()
session.add(comment)
session.commit()
if not rr["body"]:
raise Exception(rr)
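
Note on the hunk above: migrate_2stage now persists the replyTo fix with session.add plus session.commit instead of the removed comment.save() helper. A self-contained sketch of that query, mutate, add, commit pattern against an in-memory SQLite database; the Comment model below is a stand-in invented for the example, not the project's Reaction schema:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Comment(Base):
    __tablename__ = "comment"
    id = Column(Integer, primary_key=True)
    replyTo = Column(String, nullable=True)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
Session = sessionmaker(engine)

with Session() as session:
    session.add(Comment(id=1))
    session.commit()

with Session() as session:
    # load the row, mutate it, then flush the change explicitly
    comment = session.query(Comment).filter(Comment.id == 1).first()
    comment.replyTo = "old-comment-oid"
    session.add(comment)
    session.commit()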

@@ -1,14 +1,18 @@
from dateutil.parser import parse as date_parse
import sqlalchemy
from orm.shout import Shout, ShoutTopic, User
from services.stat.reacted import ReactedStorage
from services.stat.viewed import ViewedByDay
from transliterate import translit
from datetime import datetime
from dateutil.parser import parse as date_parse
from sqlalchemy.exc import IntegrityError
from transliterate import translit
from base.orm import local_session
from migration.extract import prepare_html_body
from orm.community import Community
from orm.reaction import Reaction, ReactionKind
from orm.shout import Shout, ShoutTopic, User
from orm.topic import TopicFollower
from services.stat.reacted import ReactedStorage
from services.stat.viewed import ViewedByDay
from services.zine.topics import TopicStorage
OLD_DATE = "2016-03-05 22:22:00.350000"
ts = datetime.now()
@@ -72,7 +76,10 @@ async def migrate(entry, storage):
}
else:
userdata = User.default_user.dict()
assert userdata, "no user found for %s from %d" % [oid, len(users_by_oid.keys())]
if not userdata:
raise Exception(
"no user found for %s from %d" % [oid, len(users_by_oid.keys())]
)
r["authors"] = [
userdata,
]
@@ -139,32 +146,40 @@ async def migrate(entry, storage):
# del shout_dict['rating'] # NOTE: TypeError: 'rating' is an invalid keyword argument for Shout
# del shout_dict['ratings']
email = userdata.get("email")
slug = userdata.get("slug")
if not slug:
userslug = userdata.get("slug")
if not userslug:
raise Exception
with local_session() as session:
# c = session.query(Community).all().pop()
if email:
user = session.query(User).filter(User.email == email).first()
if not user and slug:
user = session.query(User).filter(User.slug == slug).first()
if not user and userslug:
user = session.query(User).filter(User.slug == userslug).first()
if not user and userdata:
try:
userdata["slug"] = userdata["slug"].lower().strip().replace(" ", "-")
user = User.create(**userdata)
except sqlalchemy.exc.IntegrityError:
except IntegrityError:
print("[migration] user error: " + userdata)
userdata["id"] = user.id
userdata["createdAt"] = user.createdAt
storage["users"]["by_slug"][userdata["slug"]] = userdata
storage["users"]["by_oid"][entry["_id"]] = userdata
assert user, "could not get a user"
shout_dict["authors"] = [user, ]
if not user:
raise Exception("could not get a user")
shout_dict["authors"] = [
user,
]
# TODO: subscribe shout user on shout topics
try:
s = Shout.create(**shout_dict)
except sqlalchemy.exc.IntegrityError as e:
with local_session() as session:
topics = session.query(ShoutTopic).where(ShoutTopic.shout == s.slug).all()
for tpc in topics:
TopicFollower.create(topic=tpc.slug, follower=userslug)
await TopicStorage.update_topic(tpc.slug)
except IntegrityError as e:
with local_session() as session:
s = session.query(Shout).filter(Shout.slug == shout_dict["slug"]).first()
bump = False
@@ -267,9 +282,9 @@ async def migrate(entry, storage):
)
reaction.update(reaction_dict)
else:
reaction_dict["day"] = (
reaction_dict.get("createdAt") or ts
).replace(hour=0, minute=0, second=0, microsecond=0)
# day = (
# reaction_dict.get("createdAt") or ts
# ).replace(hour=0, minute=0, second=0, microsecond=0)
rea = Reaction.create(**reaction_dict)
await ReactedStorage.react(rea)
# shout_dict['ratings'].append(reaction_dict)
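
One detail in the no-user branch above: the error message is still built with the % operator applied to a list, and with two placeholders that by itself raises TypeError ("not enough arguments for format string"); the operator expects a tuple. A two-line standalone check:

oid, total = "some-oid", 3
# "no user found for %s from %d" % [oid, total] would raise TypeError here
print("no user found for %s from %d" % (oid, total))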

@@ -764,5 +764,37 @@
"blocked-in-russia": "blocked-in-russia",
"kavarga": "kavarga",
"galereya-anna-nova": "gallery-anna-nova",
"derrida": "derrida"
}
"derrida": "derrida",
"dinozavry": "dinosaurs",
"beecake": "beecake",
"literaturnyykaver": "literature-cover",
"dialog": "dialogue",
"dozhd": "rain",
"pomosch": "help",
"igra": "game",
"reportazh-1": "reportage",
"armiya-1": "army",
"ukraina-2": "ukraine",
"nasilie-1": "violence",
"smert-1": "death",
"dnevnik-1": "dairy",
"voyna-na-ukraine": "war-in-ukraine",
"zabota": "care",
"ango": "ango",
"hayku": "haiku",
"utrata": "loss",
"pokoy": "peace",
"kladbische": "cemetery",
"lomonosov": "lomonosov",
"istoriya-nauki": "history-of-sceince",
"sud": "court",
"russkaya-toska": "russian-toska",
"duh": "spirit",
"devyanostye": "90s",
"seksualnoe-nasilie": "sexual-violence",
"gruziya-2": "georgia",
"dokumentalnaya-poeziya": "documentary-poetry",
"kriptovalyuty": "cryptocurrencies",
"magiya": "magic",
"yazychestvo": "paganism"
}

@@ -1,5 +1,5 @@
from migration.extract import extract_md, html2text
from base.orm import local_session
from migration.extract import extract_md, html2text
from orm import Topic, Community

@@ -1,8 +1,9 @@
import sqlalchemy
from dateutil.parser import parse
from sqlalchemy.exc import IntegrityError
from base.orm import local_session
from migration.html2text import html2text
from orm import User, UserRating
from dateutil.parser import parse
from base.orm import local_session
def migrate(entry):
@@ -21,9 +22,6 @@ def migrate(entry):
"muted": False, # amnesty
"bio": entry["profile"].get("bio", ""),
"notifications": [],
"createdAt": parse(entry["createdAt"]),
"roles": [], # entry['roles'] # roles by community
"ratings": [], # entry['ratings']
"links": [],
"name": "anonymous",
}
@@ -86,7 +84,7 @@ def migrate(entry):
user_dict["slug"] = user_dict["slug"].lower().strip().replace(" ", "-")
try:
user = User.create(**user_dict.copy())
except sqlalchemy.exc.IntegrityError:
except IntegrityError:
print("[migration] cannot create user " + user_dict["slug"])
with local_session() as session:
old_user = (
@@ -120,28 +118,10 @@ def migrate_2stage(entry, id_map):
with local_session() as session:
try:
user_rating = UserRating.create(**user_rating_dict)
except sqlalchemy.exc.IntegrityError:
old_rating = (
session.query(UserRating)
.filter(UserRating.rater == rater_slug)
.first()
)
print(
"[migration] cannot create "
+ author_slug
+ "`s rate from "
+ rater_slug
)
print(
"[migration] concat rating value %d+%d=%d"
% (
old_rating.value,
rating_entry["value"],
old_rating.value + rating_entry["value"],
)
)
old_rating.update({"value": old_rating.value + rating_entry["value"]})
session.add(user_rating)
session.commit()
except IntegrityError:
print("[migration] cannot rate " + author_slug + "`s by " + rater_slug)
except Exception as e:
print(e)
return ce