Implementing extractStructuralTokens as a helper for the detection engine
Some checks are pending
/ build (macos-latest, 3.8) (push) Waiting to run
/ build (ubuntu-latest, pypy-2.7) (push) Waiting to run
/ build (windows-latest, 3.14) (push) Waiting to run

This commit is contained in:
Miroslav Štampar 2026-06-30 23:09:06 +02:00
parent f932a3f30f
commit 74f90df8ae
6 changed files with 87 additions and 7 deletions

View file

@ -10,6 +10,7 @@ from __future__ import division
import re
from lib.core.common import extractRegexResult
from lib.core.common import extractStructuralTokens
from lib.core.common import getFilteredPageContent
from lib.core.common import jsonMinimize
from lib.core.common import listToStrValue
@ -177,6 +178,15 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
seq1 = jsonMinimize(kb.pageTemplate)
seq2 = jsonMinimize(rawPage)
# Structure-aware comparison for a structurally stable (but byte-unstable) HTML page:
# compare the value-free tag/class/id skeleton, so that dynamic text does not perturb the ratio
# while a structural change (e.g. a results table appearing or disappearing) still does.
if seq1 is None and kb.pageStructurallyStable and not (conf.titles or conf.textOnly or kb.nullConnection):
_ = "\n".join(sorted(extractStructuralTokens(kb.pageTemplate)))
if _: # only engage when the page actually exposes structure (HTML tags); tagless content falls back to text
seq1 = _
seq2 = "\n".join(sorted(extractStructuralTokens(rawPage)))
if seq1 is None or seq2 is None:
if conf.titles:
seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)