Merge pull request #72 from alexeyqu/feature/migration-cleanup-1
add regexes to remove or replace dummy tags during migration
Commit 6d72a0dcec
```diff
@@ -251,7 +251,7 @@ def extract_md_images(body, prefix):
     return newbody


-def cleanup(body):
+def cleanup_md(body):
     newbody = (
         body.replace("<", "")
         .replace(">", "")
```
```diff
@@ -274,7 +274,7 @@ def cleanup(body):
 def extract_md(body, shout_dict = None):
     newbody = body
     if newbody:
-        newbody = cleanup(newbody)
+        newbody = cleanup_md(newbody)
         if not newbody:
             raise Exception("cleanup error")

```
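The rename is mechanical: `cleanup()` becomes `cleanup_md()` so the markdown pass can coexist with the new `cleanup_html()` introduced below. A minimal sketch of the helper for reference; the hunk cuts off the rest of the `.replace()` chain, so the early close and return here are assumptions:

```python
def cleanup_md(body):
    # Strip raw angle brackets from markdown text so stray HTML
    # fragments don't survive extraction. The original chains more
    # .replace() calls than the hunk shows; this sketch closes early.
    newbody = (
        body.replace("<", "")
        .replace(">", "")
    )
    return newbody

print(cleanup_md("a <b>bold</b> claim"))  # -> a bbold/b claim
```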
```diff
@@ -375,8 +375,46 @@ def prepare_html_body(entry):
     return body


-def extract_html(entry, shout_id = None):
+def cleanup_html(body: str) -> str:
+    new_body = body
+    regex_remove = [
+        r"style=\"width:\s*\d+px;height:\s*\d+px;\"",
+        r"style=\"width:\s*\d+px;\"",
+        r"style=\"color: #000000;\"",
+        r"style=\"float: none;\"",
+        r"style=\"background: white;\"",
+        r"class=\"Apple-interchange-newline\"",
+        r"class=\"MsoNormalCxSpMiddle\"",
+        r"class=\"MsoNormal\"",
+        r"lang=\"EN-US\"",
+        r"id=\"docs-internal-guid-[\w-]+\"",
+        r"<p></p>",
+        r"<span></span>",
+        r"<i></i>",
+        r"<b></b>",
+        r"<h1></h1>",
+        r"<h2></h2>",
+        r"<h3></h3>",
+        r"<h4></h4>",
+        r"<div></div>",
+    ]
+    regex_replace = {
+        r"<br></p>": "</p>"
+    }
+    for regex in regex_remove:
+        new_body = re.sub(regex, "", new_body)
+    for regex, replace in regex_replace.items():
+        new_body = re.sub(regex, replace, new_body)
+    return new_body
+
+def extract_html(entry, shout_id = None, cleanup=False):
     body_orig = (entry.get("body") or "").replace('\(', '(').replace('\)', ')')
+    if cleanup:
+        # we do that before bs parsing to catch the invalid html
+        body_clean = cleanup_html(body_orig)
+        if body_clean != body_orig:
+            print(f"[migration] html cleaned for slug {entry.get('slug', None)}")
+        body_orig = body_clean
     if shout_id:
         extract_footnotes(body_orig, shout_id)
     body_html = str(BeautifulSoup(body_orig, features="html.parser"))
```
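To see what the new pass does end to end, here is a self-contained check against an invented pasted-from-Word fragment; the regex tables are trimmed to three entries (the full lists are in the hunk above):

```python
import re

# Trimmed copies of the tables from cleanup_html() above;
# the real regex_remove list has 19 entries.
regex_remove = [
    r"style=\"color: #000000;\"",
    r"class=\"MsoNormal\"",
    r"<p></p>",
]
regex_replace = {
    r"<br></p>": "</p>",
}

def cleanup_html(body: str) -> str:
    new_body = body
    for regex in regex_remove:
        new_body = re.sub(regex, "", new_body)
    for regex, replace in regex_replace.items():
        new_body = re.sub(regex, replace, new_body)
    return new_body

sample = '<p class="MsoNormal">Hello<br></p><p></p>'
print(cleanup_html(sample))  # -> <p >Hello</p>
```

Note that removing an attribute leaves a stray space inside the tag (`<p >`); the BeautifulSoup reparse in `extract_html()` normalizes that afterwards.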
```diff
@@ -150,7 +150,7 @@ async def migrate(entry, storage):
         "createdAt": date_parse(entry.get("createdAt", OLD_DATE)),
         "updatedAt": date_parse(entry["updatedAt"]) if "updatedAt" in entry else ts,
         "topics": await add_topics_follower(entry, storage, author),
-        "body": extract_html(entry)
+        "body": extract_html(entry, cleanup=True)
     }

     # main topic patch
```
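The pass is opt-in at the call site, so only the migration path pays for it; existing callers of `extract_html()` keep the old behavior since `cleanup` defaults to `False`:

```python
# Default: no regex cleanup, same output as before this PR.
body = extract_html(entry)

# Migration path: run cleanup_html() before the BeautifulSoup parse.
body = extract_html(entry, cleanup=True)
```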