add lib

2025-03-09 22:53:31 +00:00 · 2022-05-12 02:33:55 +01:00 · 2022-05-12 02:33:55 +01:00 · 14db8014ed
commit 14db8014ed
parent a1c51e36d9
1 changed files with 82 additions and 0 deletions
--- a/webhooks/htmlutils.py
+++ b/webhooks/htmlutils.py
@@ -0,0 +1,82 @@
 from __future__ import annotations
 import re
 import copy
 from html.entities import entitydefs
 from lxml import html # type: ignore
 def _br2span_inplace(el):
   '''Replace every direct ``<br>`` child of ``el`` with a newline ``<span>``.

   Each replacement span has ``'\\n'`` as its text and inherits the tail
   text of the ``<br>`` it replaces. ``el`` is modified in place; only
   direct children are visited (``iterchildren``), not deeper descendants.
   '''
   for line_break in el.iterchildren(tag='br'):
     newline_span = html.Element('span')
     newline_span.text, newline_span.tail = '\n', line_break.tail
     el.replace(line_break, newline_span)
 def extractText(el):
   '''Return the text content of ``el`` with direct ``<br>`` children as newlines.

   The conversion is done on ``copy.copy(el)`` rather than on ``el`` itself.
   NOTE(review): this relies on ``copy.copy`` of the element producing an
   independent subtree, so the caller's tree is untouched -- confirm for the
   element type in use.
   '''
   working_copy = copy.copy(el)
   _br2span_inplace(working_copy)
   return working_copy.text_content()
 def iter_text_and_br(el):
   '''Yield the top-level text pieces of ``el``, with ``'\\n'`` for each ``<br>``.

   Yields ``el.text`` (if non-empty), then for every direct child: a newline
   when the child is a ``br`` element, followed by the child's tail text (if
   non-empty). Text *inside* child elements is not yielded.
   '''
   leading = el.text
   if leading:
     yield leading
   for child in el.iterchildren():
     if child.tag == 'br':
       yield '\n'
     trailing = child.tail
     if trailing:
       yield trailing
 def un_jsescape(s):
     '''%xx & %uxxxx -> char, opposite of Javascript's escape()

     JavaScript strings are UTF-16, so escape() emits a character outside the
     BMP as a surrogate pair (``%uD83D%uDE00``). Such a pair is combined into
     the single code point it represents; an unpaired surrogate escape is
     still decoded to the lone surrogate character. Text that is not a valid
     escape sequence (e.g. a bare ``%``) is left unchanged.
     '''
     def _decode(m):
         high, low, unit, byte = m.groups()
         if high:
             # Combine a UTF-16 high/low surrogate pair into one code point.
             return chr(
                 0x10000
                 + ((int(high, 16) - 0xD800) << 10)
                 + (int(low, 16) - 0xDC00)
             )
         return chr(int(unit or byte, 16))
     return re.sub(
         # The surrogate-pair alternative must come first so that it wins
         # over the generic %uXXXX alternative.
         r'%u([dD][89abAB][0-9a-fA-F]{2})%u([dD][c-fC-F][0-9a-fA-F]{2})'
         r'|%u([0-9a-fA-F]{4})|%([0-9a-fA-F]{2})',
         _decode,
         s
     )
 # Strict form: a decimal/hex numeric reference or a named reference, always
 # terminated by an ASCII ';'.
 _ENTITY_STRICT_RE = re.compile(
   r'&(?:#(?:[xX][0-9a-fA-F]+|[0-9]+)|[A-Za-z][A-Za-z0-9]*);')
 # Loose form: same references, but the terminator is optional and may also be
 # the full-width semicolon.
 _ENTITY_LOOSE_RE = re.compile(
   r'&(?:#(?:[xX][0-9a-fA-F]+|[0-9]+)|[A-Za-z][A-Za-z0-9]*)[;；]?')
 def _entity2uni(m):
   '''Decode one matched reference; return it unchanged if it is unknown or invalid.'''
   ref = m.group(0)
   if ref.startswith('&#'):
     try:
       return _sharp2uni(m)
     except (ValueError, OverflowError):
       # Code point outside the range chr() accepts: keep the original text.
       return ref
   return entitydefs.get(ref[1:].rstrip(';；'), ref)
 def entityunescape(string):
   '''HTML entity decode.

   Decodes ``&#NNN;``, ``&#xHHH;`` and named references found in
   ``html.entities.entitydefs``. All references are decoded in a single pass,
   so text produced by one reference is never decoded a second time
   (``&#38;lt;`` becomes ``&lt;``, not ``<``). Unknown names, out-of-range
   numbers and stray ``&`` characters are left untouched.
   '''
   return _ENTITY_STRICT_RE.sub(_entity2uni, string)
 def entityunescape_loose(string):
   '''HTML entity decode, loose version.

   Like ``entityunescape`` but the trailing semicolon is optional and may be
   the full-width ``；``. Anything that does not decode to a known entity
   (for example the ``&T`` in ``AT&T``) is left untouched.
   '''
   return _ENTITY_LOOSE_RE.sub(_entity2uni, string)
 def _sharp2uni(m):
   '''&#...; ==> unicode

   ``m`` is a regex match whose whole text is a numeric character reference
   (``&#`` + digits, or ``&#x``/``&#X`` + hex digits, optionally followed by
   ``;`` or ``；``). Raises ``ValueError`` if the digits are not a valid
   number or code point.
   '''
   s = m.group(0)[2:].rstrip(';；')
   if s[:1] in ('x', 'X'):
     return chr(int(s[1:], 16))
   return chr(int(s))
 def parse_document_from_requests(response, session=None, *, encoding=None):
   '''
   Parse a fetched page into an lxml HTML document with absolute links.

   ``response``: requests ``Response`` object, or URL (fetched via
   ``session.get``)
   ``session``: required only when ``response`` is a URL; otherwise unused
   ``encoding``: override detected encoding (also written back to the
   response object's ``encoding`` attribute)

   Raises ``ValueError`` when a URL is given without a session.
   '''
   resp = response
   if isinstance(resp, str):
     if session is None:
       raise ValueError('URL given but no session')
     resp = session.get(resp)
   if encoding:
     resp.encoding = encoding
   # The raw bytes are handed to fromstring together with an explicit parser
   # encoding; fromstring handles bytes well:
   # https://stackoverflow.com/a/15305248/296473
   html_parser = html.HTMLParser(encoding=encoding or resp.encoding)
   document = html.fromstring(
     resp.content, base_url=resp.url, parser=html_parser)
   document.make_links_absolute()
   return document
 def parse_html_with_encoding(data, encoding='utf-8'):
   '''Parse ``data`` as HTML using a parser fixed to ``encoding`` (default UTF-8).'''
   return html.fromstring(data, parser=html.HTMLParser(encoding=encoding))