bioarchlinux-tools/htmlutils.py
2022-05-12 01:48:42 +00:00

82 lines
2.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
import re
import copy
from html.entities import entitydefs
from lxml import html # type: ignore
def _br2span_inplace(el):
    '''Swap each direct ``<br>`` child of ``el`` for a ``<span>`` holding a newline.

    The replacement span takes over the tail text of the ``<br>`` it
    replaces. ``el`` is modified in place; only direct children are
    visited (``iterchildren``), nested ``<br>`` elements are not touched.
    '''
    for line_break in el.iterchildren(tag='br'):
        newline_span = html.Element('span')
        newline_span.text, newline_span.tail = '\n', line_break.tail
        el.replace(line_break, newline_span)
def extractText(el):
    '''Return the text content of ``el`` with its direct ``<br>`` children as newlines.

    The ``<br>`` replacement is applied to a ``copy.copy`` of ``el`` and the
    result of ``text_content()`` on that copy is returned.
    '''
    # NOTE(review): relies on copy.copy() of the element also copying its
    # children, so that the rewrite below does not touch the caller's
    # tree -- confirm for the lxml version in use.
    clone = copy.copy(el)
    _br2span_inplace(clone)
    return clone.text_content()
def iter_text_and_br(el):
    '''Yield the text pieces directly inside ``el``, with ``'\\n'`` for each ``<br>``.

    Yields ``el.text`` (if non-empty), then for every direct child a
    ``'\\n'`` when the child is a ``<br>`` followed by the child's tail
    text (if non-empty). Text inside the children themselves is not
    yielded.
    '''
    leading = el.text
    if leading:
        yield leading
    for child in el.iterchildren():
        fragments = ['\n'] if child.tag == 'br' else []
        if child.tail:
            fragments.append(child.tail)
        yield from fragments
def un_jsescape(s):
    '''%xx & %uxxxx -> char, opposite of Javascript's escape()

    Javascript's escape() works on UTF-16 code units, so a character
    outside the Basic Multilingual Plane is written as two consecutive
    ``%uXXXX`` escapes (a high surrogate followed by a low surrogate).
    Such a pair is decoded to the single character it represents rather
    than to two lone surrogates. Any other ``%uXXXX`` or ``%XX`` escape
    is decoded to the character with that code; text that is not a valid
    escape is returned unchanged.
    '''
    def _decode(m):
        high, low, unit, byte = m.groups()
        if high is not None:
            # Combine a UTF-16 surrogate pair into one code point.
            code = (
                0x10000
                + ((int(high, 16) - 0xD800) << 10)
                + (int(low, 16) - 0xDC00)
            )
            return chr(code)
        return chr(int(unit or byte, 16))

    return re.sub(
        # The surrogate-pair alternative comes first so that it wins over
        # the plain %uXXXX alternative.
        r'%u([dD][89abAB][0-9a-fA-F]{2})%u([dD][c-fC-F][0-9a-fA-F]{2})'
        r'|%u([0-9a-fA-F]{4})|%([0-9a-fA-F]{2})',
        _decode,
        s,
    )
def entityunescape(string):
    '''HTML entity decode

    Replaces ``&#NNN;``, ``&#xHH;`` and ``&name;`` references (the
    trailing ``;`` is required) by the character they stand for. Names
    are looked up in ``html.entities.entitydefs``.

    The string is decoded in a single pass, so text produced by one
    reference is never decoded a second time (``&#38;lt;`` gives
    ``&lt;``, not ``<``). A reference that cannot be decoded (unknown
    name, malformed or out-of-range number) is left in the result
    unchanged instead of raising.
    '''
    # '&' is excluded from the reference body so that a stray ampersand
    # cannot swallow a real reference that follows it.
    return re.sub(r'&[^;&]+;', _entity2uni, string)


def entityunescape_loose(string):
    '''HTML entity decode. loose version.

    Like ``entityunescape`` but the trailing ``;`` is optional. Numeric
    references are decimal (``&#65``) or hexadecimal with an ``x``/``X``
    prefix (``&#x41``). References that cannot be decoded are left
    unchanged.
    '''
    return re.sub(
        r'&(?:#(?:[xX][0-9a-fA-F]+|[0-9]+)|\w+);?',
        _entity2uni,
        string,
    )


def _entity2uni(m):
    '''&...; ==> unicode, for numeric and named references alike

    ``m`` is a match whose whole text is one reference, with or without
    the trailing ``;``. Returns the decoded character, or the matched
    text itself when the reference is unknown or invalid.
    '''
    ref = m.group(0)
    if ref.startswith('&#'):
        return _sharp2uni(m)
    return entitydefs.get(ref[1:].rstrip(';'), ref)


def _sharp2uni(m):
    '''&#...; ==> unicode

    ``m`` is a match whose whole text is a numeric character reference.
    A leading ``x`` or ``X`` selects hexadecimal, otherwise the number is
    decimal. Returns the matched text unchanged when the number cannot
    be parsed or is not a valid code point.
    '''
    ref = m.group(0)
    s = ref[2:].rstrip(';')
    try:
        if s.startswith(('x', 'X')):
            return chr(int('0x' + s[1:], 16))
        return chr(int(s))
    except (ValueError, OverflowError):
        # chr() raises ValueError past U+10FFFF and OverflowError for
        # integers too large for a C int; int() raises ValueError on junk.
        return ref
def parse_document_from_requests(response, session=None, *, encoding=None):
    '''
    Parse the body of an HTTP response into an lxml HTML document whose
    links have been made absolute against the response URL.

    ``response``: requests ``Response`` object, or URL (a ``str``), in
    which case it is fetched with ``session.get``
    ``session``: object used to fetch ``response`` when it is a URL
    ``encoding``: override detected encoding; when given it is also
    assigned to the response's ``encoding`` attribute

    Raises ``ValueError`` when a URL is given without a session.
    '''
    r = response
    if isinstance(response, str):
        if session is None:
            raise ValueError('URL given but no session')
        r = session.get(response)

    if encoding:
        r.encoding = encoding

    # fromstring handles bytes well
    # https://stackoverflow.com/a/15305248/296473
    effective_encoding = encoding or r.encoding
    parser = html.HTMLParser(encoding=effective_encoding)
    doc = html.fromstring(r.content, base_url=r.url, parser=parser)
    doc.make_links_absolute()
    return doc
def parse_html_with_encoding(data, encoding='utf-8'):
    '''Parse ``data`` as HTML with an lxml parser fixed to ``encoding``.

    Returns whatever ``lxml.html.fromstring`` produces for ``data``.
    '''
    return html.fromstring(data, parser=html.HTMLParser(encoding=encoding))