# Origin: webhooks/htmlutils.py, added by commit
# 14db8014eda05aadd4e4faa42120e859719e651a ("add lib", Kuoi, 2022-05-12).
'''Helpers for HTML text extraction, entity / JS-escape decoding and
lxml-based document parsing.'''

from __future__ import annotations

import re
import copy
from html.entities import entitydefs, html5

try:
    from lxml import html  # type: ignore
except ImportError:
    # Only the DOM / parsing helpers need lxml; the string helpers below are
    # pure stdlib, so the failure is deferred until an lxml-backed helper is
    # actually called (see _require_lxml).
    html = None  # type: ignore


# A %uD800-%uDBFF escape directly followed by a %uDC00-%uDFFF escape is a
# UTF-16 surrogate pair (one non-BMP character); it must be tried before the
# generic %uXXXX alternative.
_JS_ESCAPE_RE = re.compile(
    r'%u([dD][89abAB][0-9a-fA-F]{2})%u([dD][c-fC-F][0-9a-fA-F]{2})'
    r'|%u([0-9a-fA-F]{4})'
    r'|%([0-9a-fA-F]{2})'
)

# One pattern for numeric and named references so that the text is scanned
# exactly once: the output of a replacement is never decoded a second time.
_ENTITY_BODY = r'&(?:#(?P<num>[xX][0-9a-fA-F]+|[0-9]+)|(?P<name>\w+))'
_ENTITY_STRICT_RE = re.compile(_ENTITY_BODY + r';')
# NOTE(review): the original loose patterns used the character class `[;;]`,
# i.e. the same ASCII semicolon twice. That looks like a second, non-ASCII
# terminator was lost somewhere; only ';' is accepted here, matching the
# visible source -- confirm against upstream.
_ENTITY_LOOSE_RE = re.compile(_ENTITY_BODY + r';?')


def _require_lxml():
    '''Raise ImportError if lxml could not be imported at module load.'''
    if html is None:
        raise ImportError('lxml is required for this function')


def _br2span_inplace(el):
    '''Replace every direct ``<br>`` child of ``el`` with ``<span>\\n</span>``.

    The tail text of each ``<br>`` is carried over to its replacement.
    ``el`` is modified in place; nested ``<br>`` elements are not touched.
    '''
    _require_lxml()
    # Iterate over a snapshot: the loop body swaps nodes out of the very
    # child list that iterchildren() walks.
    for br in list(el.iterchildren(tag='br')):
        sp = html.Element('span')
        sp.text = '\n'
        sp.tail = br.tail
        el.replace(br, sp)


def extractText(el):
    '''Return the text content of ``el`` with direct ``<br>`` children
    rendered as newlines.

    Works on a copy, so the caller's tree is left unchanged.
    '''
    # Explicit deep copy: the children are rewritten below.
    el = copy.deepcopy(el)
    _br2span_inplace(el)
    return el.text_content()


def iter_text_and_br(el):
    '''Yield the text fragments directly inside ``el``.

    Yields ``el.text`` (if any), then for each direct child a ``'\\n'`` when
    the child is a ``<br>``, followed by the child's tail text (if any).
    Text nested inside child elements is not yielded.
    '''
    if el.text:
        yield el.text
    for child in el.iterchildren():
        if child.tag == 'br':
            yield '\n'
        if child.tail:
            yield child.tail


def _js_unescape_repl(m):
    '''Replacement callback for ``_JS_ESCAPE_RE``.'''
    high, low, bmp, byte = m.groups()
    if high is not None:
        # Combine the UTF-16 surrogate pair into a single code point.
        code = 0x10000 + ((int(high, 16) - 0xD800) << 10) + (int(low, 16) - 0xDC00)
        return chr(code)
    return chr(int(bmp or byte, 16))


def un_jsescape(s: str) -> str:
    '''%xx & %uxxxx -> char, opposite of Javascript's escape()

    JavaScript strings are UTF-16, so ``escape()`` emits a character outside
    the BMP as two ``%uXXXX`` surrogate escapes; such a pair is decoded to
    the single character it stands for. An unpaired surrogate escape is
    decoded to the lone surrogate code point. Anything that is not a
    well-formed escape is left as is.
    '''
    return _JS_ESCAPE_RE.sub(_js_unescape_repl, s)


def _sharp2uni(m):
    '''&#...; ==> unicode

    ``m`` is a match whose whole text is a numeric character reference
    (``&#65;``, ``&#x41;`` or ``&#X41;``, trailing ``;`` optional).
    Raises ValueError (or OverflowError) if the number is malformed or is
    not a valid code point.
    '''
    s = m.group(0)[2:].rstrip(';')
    if s[:1] in ('x', 'X'):
        # int() accepts the '0x' / '0X' prefix when base 16 is given.
        return chr(int('0' + s, 16))
    return chr(int(s))


def _entity_repl(m):
    '''Replacement callback for the ``_ENTITY_*_RE`` patterns.

    Returns the decoded text, or the matched text unchanged when the
    reference cannot be decoded (unknown name, invalid code point).
    '''
    if m.group('num') is not None:
        try:
            return _sharp2uni(m)
        except (ValueError, OverflowError):
            return m.group(0)
    name = m.group('name')
    if name in entitydefs:
        return entitydefs[name]
    # entitydefs only knows the HTML 4 names; fall back to the HTML5 table
    # (whose keys carry the trailing ';') for names such as 'apos'.
    return html5.get(name + ';', m.group(0))


def entityunescape(string: str) -> str:
    '''HTML entity decode

    Decodes ``&name;``, ``&#NN;`` and ``&#xHH;`` references; the terminating
    ``;`` is required. The input is scanned once, so ``&#38;lt;`` becomes
    ``&lt;`` and is not decoded again. References that cannot be decoded are
    left untouched.
    '''
    return _ENTITY_STRICT_RE.sub(_entity_repl, string)


def entityunescape_loose(string: str) -> str:
    '''HTML entity decode. loose version.

    Same as ``entityunescape`` except that the terminating ``;`` is
    optional. A bare ``&word`` that is not a known entity name (as in
    ``AT&T``) is left untouched.
    '''
    return _ENTITY_LOOSE_RE.sub(_entity_repl, string)


def parse_document_from_requests(response, session=None, *, encoding=None):
    '''
    Parse an HTTP response into an lxml HTML document with absolute links.

    ``response``: requests ``Response`` object, or URL
    ``session``: object whose ``get(url)`` is used to fetch ``response``
                 when it is a URL; required in that case
    ``encoding``: override detected encoding

    When ``encoding`` is given it is also assigned to the response's
    ``encoding`` attribute. Raises ValueError if a URL is given without a
    session.
    '''
    _require_lxml()
    if isinstance(response, str):
        if session is None:
            raise ValueError('URL given but no session')
        r = session.get(response)
    else:
        r = response
    if encoding:
        r.encoding = encoding

    # fromstring handles bytes well
    # https://stackoverflow.com/a/15305248/296473
    parser = html.HTMLParser(encoding=encoding or r.encoding)
    doc = html.fromstring(r.content, base_url=r.url, parser=parser)
    doc.make_links_absolute()

    return doc


def parse_html_with_encoding(data, encoding='utf-8'):
    '''Parse ``data`` as HTML using a parser created with ``encoding``.'''
    _require_lxml()
    parser = html.HTMLParser(encoding=encoding)
    return html.fromstring(data, parser=parser)