diff --git a/migration/extract.py b/migration/extract.py
index d67275a9..0aee6ce8 100644
--- a/migration/extract.py
+++ b/migration/extract.py
@@ -251,7 +251,7 @@ def extract_md_images(body, prefix):
return newbody
-def cleanup(body):
+def cleanup_md(body):
newbody = (
body.replace("<", "")
.replace(">", "")
@@ -274,7 +274,7 @@ def cleanup(body):
def extract_md(body, shout_dict = None):
newbody = body
if newbody:
- newbody = cleanup(newbody)
+ newbody = cleanup_md(newbody)
if not newbody:
raise Exception("cleanup error")
@@ -375,8 +375,46 @@ def prepare_html_body(entry):
return body
-def extract_html(entry, shout_id = None):
+def cleanup_html(body: str) -> str:
+ new_body = body
+ regex_remove = [
+ r"style=\"width:\s*\d+px;height:\s*\d+px;\"",
+ r"style=\"width:\s*\d+px;\"",
+ r"style=\"color: #000000;\"",
+ r"style=\"float: none;\"",
+ r"style=\"background: white;\"",
+ r"class=\"Apple-interchange-newline\"",
+ r"class=\"MsoNormalCxSpMiddle\"",
+ r"class=\"MsoNormal\"",
+ r"lang=\"EN-US\"",
+ r"id=\"docs-internal-guid-[\w-]+\"",
+ # NOTE(review): the nine patterns below were garbled in transit (HTML tags
+ # stripped, leaving empty/broken raw strings); reconstructed as the
+ # empty-element cleanup patterns they evidently were — confirm against repo
+ r"<p>\s*</p>",
+ r"<span></span>",
+ r"<i></i>",
+ r"<b></b>",
+ r"<h1></h1>",
+ r"<h2></h2>",
+ r"<h3></h3>",
+ r"<h4></h4>",
+ r"<div></div>",
+ ]
+ regex_replace = {
+ # NOTE(review): pattern and replacement were garbled (tags stripped); this
+ # was almost certainly a trailing-<br> normalization — confirm against repo
+ r"<br>\s*</p>": "</p>"
+ }
+ for regex in regex_remove:
+ new_body = re.sub(regex, "", new_body)
+ for regex, replace in regex_replace.items():
+ new_body = re.sub(regex, replace, new_body)
+ return new_body
+
+def extract_html(entry, shout_id = None, cleanup=False):
body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
+ if cleanup:
+ # we do that before bs parsing to catch the invalid html
+ body_clean = cleanup_html(body_orig)
+ if body_clean != body_orig:
+ print(f"[migration] html cleaned for slug {entry.get('slug', None)}")
+ body_orig = body_clean
if shout_id:
extract_footnotes(body_orig, shout_id)
body_html = str(BeautifulSoup(body_orig, features="html.parser"))
diff --git a/migration/tables/content_items.py b/migration/tables/content_items.py
index 09ef4cb0..2e74f96e 100644
--- a/migration/tables/content_items.py
+++ b/migration/tables/content_items.py
@@ -150,7 +150,7 @@ async def migrate(entry, storage):
"createdAt": date_parse(entry.get("createdAt", OLD_DATE)),
"updatedAt": date_parse(entry["updatedAt"]) if "updatedAt" in entry else ts,
"topics": await add_topics_follower(entry, storage, author),
- "body": extract_html(entry)
+ "body": extract_html(entry, cleanup=True)
}
# main topic patch