sqlmap/tests/test_inference_engine.py

#!/usr/bin/env python

"""
Copyright (c) 2006-2026 sqlmap developers (https://sqlmap.org)
See the file 'LICENSE' for copying permission

The blind-SQLi extraction engine (lib/techniques/blind/inference.py bisection).

This is the actual algorithm that pulls data out one character at a time over a
boolean/blind oracle - the heart of sqlmap. It is normally network-coupled, so
here we drive the REAL bisection() against a mock oracle: Request.queryPage is
replaced with a function that decodes the forged payload (we control the payload
template, so it is trivially parseable) and answers the comparison against a
known secret. If bisection's binary search, charset narrowing, or value assembly
regress, these go red - without a live target.

Also asserts the search is logarithmic (binary search), not a linear scan of the
character space.
"""

import os
import re
import sys
import unittest

# Put this file's own directory first on sys.path so the sibling _testutils helper module
# resolves no matter which working directory the tests are launched from.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _testutils import bootstrap, set_dbms
# NOTE(review): bootstrap() is deliberately called BEFORE the lib.* imports below - presumably
# it prepares the sqlmap import path and/or global conf/kb state those modules rely on;
# confirm in _testutils before reordering anything here.
bootstrap()

from lib.core.data import conf, kb
from lib.core.common import getCurrentThreadData
from lib.request.connect import Connect
# Imported as a module (not "from ... import bisection") because the tests both call
# inf.bisection() and patch inf.Request.queryPage.
import lib.techniques.blind.inference as inf

# Payload template handed to bisection, which fills it in via
# safeStringFormat(payload, (expression, idx, posValue)). The '>' is the greater-char marker;
# bisection swaps it for '=' when it performs the final equality check. The layout is chosen
# purely so the mock oracle can read back (idx, operator, threshold) with the regex below.
TEMPLATE = "EXPR=%s IDX=%d CMP>%d"
_PARSE = re.compile(r"IDX=(\d+) CMP(.)(\d+)")

# conf options read by bisection on the plain path: single thread, no output prediction
_CONF = {
    "predictOutput": False,
    "threads": 1,
    "api": False,
    "verbose": 0,
    "hexConvert": False,
    "charset": None,
    "firstChar": None,
    "lastChar": None,
    "timeSec": 5,
}

# kb flags read by bisection on that same path
_KB = {
    "partRun": None,
    "safeCharEncode": False,
    "bruteMode": False,
    "fileReadMode": False,
    "disableShiftTable": False,
    "originalTimeDelay": 5,
    "prependFlag": False,
}


class _EngineCase(unittest.TestCase):
    """Base fixture that runs the real inf.bisection() against an in-process mock oracle.

    setUp() pins the conf/kb knobs named in _CONF/_KB (plus kb.data.processChar and the
    back-end DBMS); tearDown() restores the conf/kb values, kb.data.processChar and the
    queryPage callable that were in place beforehand. Subclasses call _extract().
    """

    def setUp(self):
        # Snapshot BEFORE overriding so tearDown() can put the ambient values back.
        # (.get() means a key that was absent is recorded - and later restored - as None.)
        self._saved_conf = {k: conf.get(k) for k in _CONF}
        self._saved_kb = {k: kb.get(k) for k in _KB}
        self._saved_qp = Connect.queryPage
        self._saved_processChar = kb.data.get("processChar")
        for k, v in _CONF.items():
            conf[k] = v
        for k, v in _KB.items():
            kb[k] = v
        kb.data.processChar = None
        set_dbms("MySQL")

    def tearDown(self):
        for k, v in self._saved_conf.items():
            conf[k] = v
        for k, v in self._saved_kb.items():
            kb[k] = v
        kb.data.processChar = self._saved_processChar
        # self._saved_qp was read through the class, so it is the plain callable, not the
        # descriptor that lived in the class body. Re-wrap it exactly the way _extract()
        # installs the mock: a bare function stored on a class becomes an unbound method on
        # Python 2 (class-level calls would then demand an instance) and binds `self` on
        # instance access on Python 3.
        original = staticmethod(self._saved_qp)
        Connect.queryPage = original
        inf.Request.queryPage = original

    def _extract(self, secret, charsetType=None):
        """Extract `secret` through inf.bisection() using a mock boolean oracle.

        secret      -- the string the oracle "knows"; its length is passed to bisection
        charsetType -- forwarded unchanged to inf.bisection()

        Returns (value, count): the value bisection assembled, and the number of oracle
        queries issued during THIS call.
        """
        # One-element list used as a mutable cell (no `nonlocal`, so this also runs on Python 2).
        calls = [0]

        def oracle(payload=None, *args, **kwargs):
            calls[0] += 1
            m = _PARSE.search(payload or "")
            if m is None:
                # fail loudly with the offending payload instead of an opaque AttributeError
                raise AssertionError("mock oracle could not parse payload: %r" % (payload,))
            idx, op, threshold = int(m.group(1)), m.group(2), int(m.group(3))
            # positions are 1-based; anything outside the secret answers as code point 0
            ch = ord(secret[idx - 1]) if 0 <= idx - 1 < len(secret) else 0
            return (ch > threshold) if op == ">" else (ch == threshold)

        Connect.queryPage = staticmethod(oracle)
        inf.Request.queryPage = staticmethod(oracle)
        # start every extraction from clean per-thread shared state
        td = getCurrentThreadData()
        td.shared.value = ""
        td.shared.index = [0]
        td.shared.start = 0
        td.shared.count = 0
        # NOTE(review): the counter bisection() itself returns appears to be sqlmap's
        # per-technique query counter, which is not reset here and so may carry over from
        # earlier extractions in the same process - confirm. Counting at the oracle is exact
        # for this call either way, so that is what gets reported.
        _, value = inf.bisection(TEMPLATE, "SELECT secret", length=len(secret), charsetType=charsetType)
        return value, calls[0]


class TestBisectionExtraction(_EngineCase):
    """Round-trip known secrets through bisection and expect each one back verbatim."""

    # NOTE: the letter / digit / mixed secrets below are deliberately all kept. getChar has
    # per-class "first character" position heuristics (separate branches for a-z, A-Z and
    # 0-9, around inference.py lines 331-336), so every character class takes its own path.

    def _roundtrip(self, secret):
        # extract `secret` and require the recovered text to equal it exactly
        recovered, _ = self._extract(secret)
        self.assertEqual(recovered, secret)

    def test_single_char(self):
        self._roundtrip("X")

    def test_alpha(self):
        # covers the a-z / A-Z heuristic branch
        self._roundtrip("AdminUser")

    def test_alphanumeric(self):
        self._roundtrip("admin123")

    def test_with_spaces_and_symbols(self):
        self._roundtrip("p@ss W0rd!")

    def test_numeric_string(self):
        # covers the 0-9 heuristic branch
        self._roundtrip("4815162342")

    def test_longer_value(self):
        self._roundtrip("The quick brown fox 0123456789")


class TestUnicodeExpansion(_EngineCase):
    """With charsetType=None bisection begins from a 0..127 table and widens it step by step
    (shiftTable) to reach higher code points. Only the FIRST widening step (code points
    128..1023) is covered here, using Latin-1 characters, because that is where the per-byte
    oracle model is exact.

    NOTE: kb.disableShiftTable is an INTENTIONAL session-level safety latch (sqlmap author's
    design). When expansion runs all the way to the top - which takes either a code point above
    0xFFFFF or a misbehaving always-TRUE oracle - the latch switches further expansion off to
    prevent runaway / erroneous extraction. Since that is deliberate, nothing here asserts that
    expansion keeps working after such an event.

    (In real runs code points >= 256 are retrieved/assembled byte-wise - decodeIntToUnicode
    splits them into a byte sequence - so a simple ord()-based mock oracle models only the
    single-byte range; anything beyond it is out of scope for this test.)"""

    def test_extracts_latin1_via_first_expansion(self):
        samples = (u"caf\xe9", u"\xfcber", u"ni\xf1o", u"\xe9\xe8\xea\xeb")
        for sample in samples:
            recovered = self._extract(sample)[0]
            failure = "expansion extraction failed for %r" % sample
            self.assertEqual(recovered, sample, msg=failure)


class TestSearchIsLogarithmic(_EngineCase):
    """Guards against bisection degrading from a binary search into a linear scan."""

    def test_query_count_is_sublinear_in_charset(self):
        # What this must catch: binary search regressing to a linear / per-codepoint scan.
        # Measured cost is roughly 6-22 queries per character; it moves around because the
        # first-char heuristic's payoff depends on ambient kb/conf state, so a tight bound
        # would flake. Scanning the 128-char ASCII space linearly would cost ~128 per
        # character (~3840 for 30 characters). A ceiling of 40 per character therefore keeps
        # "logarithmic" passing and "linearized" failing, with no flakiness.
        secret = "x" * 30
        length = len(secret)
        budget = length * 40
        count = self._extract(secret)[1]
        per_char = count / float(length)
        self.assertLess(
            count,
            budget,
            msg="bisection used %d queries for %d chars (~%.1f/char) - search regressed toward linear?"
                % (count, length, per_char),
        )


# Allow running this file directly as a script; verbosity=2 makes unittest report each test
# individually instead of printing a single dot per test.
if __name__ == "__main__":
    unittest.main(verbosity=2)