mirror of
https://github.com/BioArchLinux/bioarchlinux-tools.git
synced 2025-03-09 22:53:31 +00:00
add lib
This commit is contained in:
parent
a1c51e36d9
commit
14db8014ed
1 changed files with 82 additions and 0 deletions
82
webhooks/htmlutils.py
Normal file
82
webhooks/htmlutils.py
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import copy
|
||||||
|
from html.entities import entitydefs
|
||||||
|
|
||||||
|
from lxml import html # type: ignore
|
||||||
|
|
||||||
|
def _br2span_inplace(el):
    '''Swap every direct ``<br>`` child of ``el`` for a ``<span>`` holding ``'\\n'``.

    ``el`` is modified in place and nothing is returned. Only immediate
    children are visited; the text that followed each ``<br>`` (its tail) is
    carried over to the replacement ``<span>``.
    '''
    for line_break in el.iterchildren(tag='br'):
        newline_span = html.Element('span')
        # keep whatever text came right after the <br>
        newline_span.tail = line_break.tail
        newline_span.text = '\n'
        el.replace(line_break, newline_span)
|
||||||
|
|
||||||
|
def extractText(el):
    '''Return the text content of ``el`` with its ``<br>`` children turned into newlines.

    The ``<br>`` replacement is applied to a ``copy.copy`` of ``el`` rather
    than to ``el`` itself.
    '''
    clone = copy.copy(el)
    _br2span_inplace(clone)
    text = clone.text_content()
    return text
|
||||||
|
|
||||||
|
def iter_text_and_br(el):
    '''Yield the text pieces found directly inside ``el``, with ``'\\n'`` for each ``<br>``.

    Yields, in document order: the leading text of ``el`` (if any), then for
    every direct child a ``'\\n'`` when the child is a ``<br>``, followed by
    the child's tail text (if any). Text *inside* child elements is not
    yielded.
    '''
    leading = el.text
    if leading:
        yield leading

    for child in el.iterchildren():
        if child.tag == 'br':
            yield '\n'
        trailing = child.tail
        if trailing:
            yield trailing
|
||||||
|
|
||||||
|
def un_jsescape(s: str) -> str:
    '''%xx & %uxxxx -> char, opposite of Javascript's escape()

    JavaScript's ``escape()`` works on UTF-16 code units, so a character
    outside the Basic Multilingual Plane is emitted as two consecutive
    ``%uXXXX`` escapes (a high surrogate followed by a low surrogate). Such a
    pair is recombined into the single character it stands for; decoding the
    halves separately would leave two lone surrogates that cannot be encoded
    as UTF-8.

    An unpaired surrogate escape is still decoded to the corresponding lone
    surrogate code point, and text that is not a valid escape is returned
    unchanged.
    '''
    def _decode(m):
        high, low, unit, byte = m.groups()
        if high is not None:
            # UTF-16 surrogate pair -> the real code point
            return chr(
                0x10000
                + ((int(high, 16) - 0xD800) << 10)
                + (int(low, 16) - 0xDC00)
            )
        return chr(int(unit or byte, 16))

    return re.sub(
        # the surrogate-pair alternative must come first so it wins over the
        # plain %uXXXX alternative
        r'%u([dD][89abAB][0-9a-fA-F]{2})%u([dD][c-fC-F][0-9a-fA-F]{2})'
        r'|%u([0-9a-fA-F]{4})'
        r'|%([0-9a-fA-F]{2})',
        _decode,
        s,
    )
|
||||||
|
|
||||||
|
def entityunescape(string: str) -> str:
    '''HTML entity decode

    Strict version: only references terminated by ``;`` are decoded.
    Handles decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``)
    numeric references as well as the named entities in
    ``html.entities.entitydefs``.

    Decoding is done in a single pass, so the output of one replacement is
    never decoded again (``&#38;lt;`` becomes ``&lt;``, not ``<``).
    Anything that is not a valid reference -- an unknown name, a stray ``&``,
    a number outside the Unicode range -- is left in the text unchanged
    instead of raising.
    '''
    def _decode(m):
        hex_digits, dec_digits, name = m.groups()
        if name is not None:
            return entitydefs.get(name, m.group(0))
        try:
            if hex_digits is not None:
                return chr(int(hex_digits, 16))
            return chr(int(dec_digits))
        except (ValueError, OverflowError):
            # not a valid code point: keep the original text
            return m.group(0)

    return re.sub(
        r'&#[xX]([0-9a-fA-F]+);|&#([0-9]+);|&([A-Za-z0-9]+);',
        _decode,
        string,
    )
|
||||||
|
|
||||||
|
def entityunescape_loose(string):
|
||||||
|
'''HTML entity decode. losse version.'''
|
||||||
|
string = re.sub(r'&#[0-9a-fA-F]+[;;]?', _sharp2uni, string)
|
||||||
|
string = re.sub(r'&\w+[;;]?', lambda m: entitydefs[m.group(0)[1:].rstrip(';;')], string)
|
||||||
|
return string
|
||||||
|
|
||||||
|
def _sharp2uni(m):
    '''&#...; ==> unicode

    ``m`` is a regex match whose whole text is a numeric character reference
    such as ``&#65;`` or ``&#x41;``; the trailing semicolon (ASCII or
    fullwidth) is optional. Returns the character for that code point.

    The hexadecimal marker is accepted in either case (``x`` or ``X``).
    Raises ``ValueError`` if the digits cannot be parsed or the number is not
    a valid code point.
    '''
    s = m.group(0)[2:].rstrip(';;')
    if s.startswith(('x', 'X')):
        # '0' + s gives '0x41' / '0X41', both accepted by int(..., 16)
        return chr(int('0' + s, 16))
    return chr(int(s))
|
||||||
|
|
||||||
|
def parse_document_from_requests(response, session=None, *, encoding=None):
    '''
    Parse an HTTP response into an lxml HTML document with absolute links.

    ``response``: requests ``Response`` object, or URL
    ``session``: used to fetch ``response`` when it is a URL; required in
    that case, otherwise a ``ValueError`` is raised
    ``encoding``: override detected encoding
    '''
    if not isinstance(response, str):
        resp = response
    elif session is None:
        raise ValueError('URL given but no session')
    else:
        resp = session.get(response)

    if encoding:
        resp.encoding = encoding

    # fromstring handles bytes well
    # https://stackoverflow.com/a/15305248/296473
    charset = encoding or resp.encoding
    parser = html.HTMLParser(encoding=charset)
    tree = html.fromstring(resp.content, base_url=resp.url, parser=parser)
    tree.make_links_absolute()
    return tree
|
||||||
|
|
||||||
|
def parse_html_with_encoding(data, encoding='utf-8'):
    '''Parse ``data`` as HTML using a parser configured for ``encoding``.

    ``encoding`` defaults to ``'utf-8'``. Returns whatever
    ``lxml.html.fromstring`` produces for ``data``.
    '''
    forced_parser = html.HTMLParser(encoding=encoding)
    document = html.fromstring(data, parser=forced_parser)
    return document
|
Loading…
Add table
Reference in a new issue