From 14db8014eda05aadd4e4faa42120e859719e651a Mon Sep 17 00:00:00 2001
From: Kuoi <starsareintherose@outlook.com>
Date: Thu, 12 May 2022 02:33:55 +0100
Subject: [PATCH] add lib

---
 webhooks/htmlutils.py | 82 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 webhooks/htmlutils.py

diff --git a/webhooks/htmlutils.py b/webhooks/htmlutils.py
new file mode 100644
index 0000000..47cc3e3
--- /dev/null
+++ b/webhooks/htmlutils.py
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+import re
+import copy
+from html.entities import entitydefs
+
+from lxml import html # type: ignore
+
+def _br2span_inplace(el):
+  for br in el.iterchildren(tag='br'):
+    sp = html.Element('span')
+    sp.text = '\n'
+    sp.tail = br.tail
+    el.replace(br, sp)
+
+def extractText(el):
+  el = copy.copy(el)
+  _br2span_inplace(el)
+  return el.text_content()
+
+def iter_text_and_br(el):
+  if el.text:
+    yield el.text
+  for i in el.iterchildren():
+    if i.tag == 'br':
+      yield '\n'
+    if i.tail:
+      yield i.tail
+
+def un_jsescape(s):
+    '''%xx & %uxxxx -> char, opposite of Javascript's escape()'''
+    return re.sub(
+        r'%u([0-9a-fA-F]{4})|%([0-9a-fA-F]{2})',
+        lambda m: chr(int(m.group(1) or m.group(2), 16)),
+        s
+    )
+
+def entityunescape(string):
+  '''HTML entity decode'''
+  string = re.sub(r'&#[^;]+;', _sharp2uni, string)
+  string = re.sub(r'&[^;]+;', lambda m: entitydefs[m.group(0)[1:-1]], string)
+  return string
+
+def entityunescape_loose(string):
+  '''HTML entity decode. losse version.'''
+  string = re.sub(r'&#[0-9a-fA-F]+[;；]?', _sharp2uni, string)
+  string = re.sub(r'&\w+[;；]?', lambda m: entitydefs[m.group(0)[1:].rstrip(';；')], string)
+  return string
+
+def _sharp2uni(m):
+  '''&#...; ==> unicode'''
+  s = m.group(0)[2:].rstrip(';；')
+  if s.startswith('x'):
+    return chr(int('0'+s, 16))
+  else:
+    return chr(int(s))
+
+def parse_document_from_requests(response, session=None, *, encoding=None):
+  '''
+  ``response``: requests ``Response`` object, or URL
+  ``encoding``: override detected encoding
+  '''
+  if isinstance(response, str):
+    if session is None:
+      raise ValueError('URL given but no session')
+    r = session.get(response)
+  else:
+    r = response
+  if encoding:
+    r.encoding = encoding
+
+  # fromstring handles bytes well
+  # https://stackoverflow.com/a/15305248/296473
+  parser = html.HTMLParser(encoding=encoding or r.encoding)
+  doc = html.fromstring(r.content, base_url=r.url, parser=parser)
+  doc.make_links_absolute()
+
+  return doc
+
+def parse_html_with_encoding(data, encoding='utf-8'):
+  parser = html.HTMLParser(encoding=encoding)
+  return html.fromstring(data, parser=parser)