Couple of bug fixes
Some checks are pending
/ build (macos-latest, 3.8) (push) Waiting to run
/ build (ubuntu-latest, pypy-2.7) (push) Waiting to run
/ build (windows-latest, 3.14) (push) Waiting to run

This commit is contained in:
Miroslav Štampar 2026-06-14 17:05:32 +02:00
parent bc4ce6e44a
commit e24678fc31
11 changed files with 66 additions and 34 deletions

View file

@ -284,6 +284,8 @@ def decodePage(page, contentEncoding, contentType, percentDecode=True):
'\\t'
>>> getText(decodePage(b"J", None, "text/html; charset=utf-8"))
'J'
>>> decodePage(b"&#x2122;", None, "text/html; charset=utf-8") == u"\u2122"
True
"""
if not page or (conf.nullConnection and len(page) < 2):
@ -379,6 +381,16 @@ def decodePage(page, contentEncoding, contentType, percentDecode=True):
return retVal
page = re.sub(r"&#(\d+);", _, page)
# e.g. &#x2019;&#x2026;&#x2122; (hex numeric refs >= U+0100; smaller ones already handled at byte-level)
def _(match):
    # Replacement callback for re.sub(): group 1 holds the hex digits of
    # a numeric character reference; they are parsed base-16 and handed
    # to _unichr.
    try:
        return _unichr(int(match.group(1), 16))
    except (ValueError, OverflowError):
        # Conversion failed: leave the reference in the page exactly
        # as it was matched.
        return match.group(0)
page = re.sub(r"(?i)&#x([0-9a-f]+);", _, page)
# e.g. &zeta;
page = re.sub(r"&([^;]+);", lambda _: _unichr(HTML_ENTITIES[_.group(1)]) if HTML_ENTITIES.get(_.group(1), 0) > 255 else _.group(0), page)
else: