From 7e77a196e62abf41b90c8cda44bb162bbb741ca3 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 24 Jan 2024 14:11:22 +0530 Subject: [PATCH] Build only the SIMD code with SIMD compiler flags --- glfw/glfw.py | 30 ++++++++++-- kitty/arches.h | 29 +++++++++++ kitty/simd-string-128.c | 9 ++++ kitty/simd-string-256.c | 9 ++++ kitty/simd-string-impl.h | 29 +++++------ kitty/simd-string.c | 6 --- kitty/simd-string.h | 12 +++-- setup.py | 103 +++++++++++++++++++++++++++++---------- 8 files changed, 172 insertions(+), 55 deletions(-) create mode 100644 kitty/arches.h create mode 100644 kitty/simd-string-128.c create mode 100644 kitty/simd-string-256.c diff --git a/glfw/glfw.py b/glfw/glfw.py index f308657b9..191d8386a 100755 --- a/glfw/glfw.py +++ b/glfw/glfw.py @@ -6,7 +6,8 @@ import json import os import re import sys -from typing import Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple +from enum import Enum +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple _plat = sys.platform.lower() is_linux = 'linux' in _plat @@ -32,6 +33,19 @@ class Command(NamedTuple): keyfile: Optional[str] = None +class ISA(Enum): + X86 = 0x03 + AMD64 = 0x3e + ARM64 = 0xb7 + Other = 0x0 + + +class BinaryArch(NamedTuple): + bits: int = 64 + isa: ISA = ISA.AMD64 + + + class Env: cc: List[str] = [] @@ -42,6 +56,9 @@ class Env: ldpaths: List[str] = [] ccver: Tuple[int, int] vcs_rev: str = '' + build_universal_binary: bool = False + binary_arch: BinaryArch = BinaryArch() + native_optimizations: bool = False # glfw stuff all_headers: List[str] = [] @@ -54,12 +71,16 @@ class Env: def __init__( self, cc: List[str] = [], cppflags: List[str] = [], cflags: List[str] = [], ldflags: List[str] = [], library_paths: Dict[str, List[str]] = {}, ldpaths: Optional[List[str]] = None, ccver: Tuple[int, int] = (0, 0), - vcs_rev: str = '' + vcs_rev: str = '', build_universal_binary: bool = False, binary_arch: BinaryArch = BinaryArch(), + native_optimizations: bool = 
False, ): self.cc, self.cppflags, self.cflags, self.ldflags, self.library_paths = cc, cppflags, cflags, ldflags, library_paths self.ldpaths = ldpaths or [] self.ccver = ccver self.vcs_rev = vcs_rev + self.build_universal_binary = build_universal_binary + self.binary_arch = binary_arch + self.native_optimizations = native_optimizations def copy(self) -> 'Env': ans = Env(self.cc, list(self.cppflags), list(self.cflags), list(self.ldflags), dict(self.library_paths), list(self.ldpaths), self.ccver) @@ -70,6 +91,9 @@ class Env: ans.wayland_scanner_code = self.wayland_scanner_code ans.wayland_protocols = self.wayland_protocols ans.vcs_rev = self.vcs_rev + ans.build_universal_binary = self.build_universal_binary + ans.binary_arch = self.binary_arch + ans.native_optimizations = self.native_optimizations return ans @@ -83,7 +107,7 @@ def init_env( pkg_config: Callable[..., List[str]], pkg_version: Callable[[str], Tuple[int, int]], at_least_version: Callable[..., None], - test_compile: Callable[..., bool], + test_compile: Callable[..., Any], module: str = 'x11' ) -> Env: ans = env.copy() diff --git a/kitty/arches.h b/kitty/arches.h new file mode 100644 index 000000000..7d1221817 --- /dev/null +++ b/kitty/arches.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2024 Kovid Goyal + * + * Distributed under terms of the GPL3 license. 
+ */ + +#pragma once + + +#ifdef __aarch64__ +#define KITTY_TARGET_CPU_IS_ARM64 +#define KITTY_128BIT_ALLOWED +#define KITTY_256BIT_ALLOWED +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +#define KITTY_TARGET_CPU_IS_X86 +#define KITTY_128BIT_ALLOWED +#elif defined(__amd64__) +#define KITTY_TARGET_CPU_IS_AMD64 +#define KITTY_128BIT_ALLOWED +#define KITTY_256BIT_ALLOWED +#endif + +#if defined(__clang__) && defined(KITTY_128BIT_ALLOWED) +#define KITTY_START_128BIT_CODE +#elif defined(KITTY_128BIT_ALLOWED) +#define KITTY_START_128BIT_CODE +#else +#define KITTY_START_128BIT_CODE +#endif diff --git a/kitty/simd-string-128.c b/kitty/simd-string-128.c new file mode 100644 index 000000000..c60de78aa --- /dev/null +++ b/kitty/simd-string-128.c @@ -0,0 +1,9 @@ +/* + * simd-string-128.c + * Copyright (C) 2024 Kovid Goyal + * + * Distributed under terms of the GPL3 license. + */ + +#define KITTY_SIMD_LEVEL 128 +#include "simd-string-impl.h" diff --git a/kitty/simd-string-256.c b/kitty/simd-string-256.c new file mode 100644 index 000000000..2607b7efc --- /dev/null +++ b/kitty/simd-string-256.c @@ -0,0 +1,9 @@ +/* + * simd-string-256.c + * Copyright (C) 2024 Kovid Goyal + * + * Distributed under terms of the GPL3 license. + */ + +#define KITTY_SIMD_LEVEL 256 +#include "simd-string-impl.h" diff --git a/kitty/simd-string-impl.h b/kitty/simd-string-impl.h index 7b3a9162b..9a217ad55 100644 --- a/kitty/simd-string-impl.h +++ b/kitty/simd-string-impl.h @@ -4,8 +4,8 @@ * Distributed under terms of the GPL3 license. 
*/ -#ifndef BITS -#define BITS 128 +#ifndef KITTY_SIMD_LEVEL +#define KITTY_SIMD_LEVEL 128 #endif #include "simd-string.h" @@ -26,13 +26,13 @@ _Pragma("clang diagnostic pop") #endif #define CONCAT(A, B) A##B #define CONCAT_EXPAND(A, B) CONCAT(A,B) -#define FUNC(name) CONCAT_EXPAND(name##_, BITS) -#define integer_t CONCAT_EXPAND(CONCAT_EXPAND(simde__m, BITS), i) +#define FUNC(name) CONCAT_EXPAND(name##_, KITTY_SIMD_LEVEL) +#define integer_t CONCAT_EXPAND(CONCAT_EXPAND(simde__m, KITTY_SIMD_LEVEL), i) #define shift_right_by_bytes128 simde_mm_srli_si128 #define zero_last_n_bytes FUNC(zero_last_n_bytes) #define is_zero FUNC(is_zero) -#if BITS == 128 +#if KITTY_SIMD_LEVEL == 128 #define set1_epi8(x) simde_mm_set1_epi8((char)(x)) #define set_epi8 simde_mm_set_epi8 #define add_epi8 simde_mm_add_epi8 @@ -199,7 +199,7 @@ FUNC(zero_last_n_bytes)(integer_t vec, char n) { return andnot_si(mask, vec); } -static inline const uint8_t* +const uint8_t* FUNC(find_either_of_two_bytes)(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b) { const integer_t a_vec = set1_epi8(a), b_vec = set1_epi8(b); for (const uint8_t* limit = haystack + sz; haystack < limit; haystack += sizeof(integer_t)) { @@ -220,7 +220,7 @@ FUNC(find_either_of_two_bytes)(const uint8_t *haystack, const size_t sz, const u static inline void FUNC(output_plain_ascii)(UTF8Decoder *d, integer_t vec, size_t src_sz) { -#if BITS == 128 +#if KITTY_SIMD_LEVEL == 128 for (const uint32_t *limit = d->output + src_sz, *p = d->output; p < limit; p += output_increment) { const integer_t unpacked = extract_lower_quarter_as_chars(vec); store_aligned((integer_t*)p, unpacked); @@ -252,7 +252,7 @@ FUNC(output_plain_ascii)(UTF8Decoder *d, integer_t vec, size_t src_sz) { static inline void FUNC(output_unicode)(UTF8Decoder *d, integer_t output1, integer_t output2, integer_t output3, const size_t num_codepoints) { -#if BITS == 128 +#if KITTY_SIMD_LEVEL == 128 for (const uint32_t *limit = d->output + num_codepoints, *p 
= d->output; p < limit; p += output_increment) { const integer_t unpacked1 = extract_lower_quarter_as_chars(output1); const integer_t unpacked2 = shift_right_by_one_byte(extract_lower_quarter_as_chars(output2)); @@ -294,7 +294,6 @@ FUNC(output_unicode)(UTF8Decoder *d, integer_t output1, integer_t output2, integ } #undef output_increment -#ifndef SIMD_STRING_IMPL_INCLUDED_ONCE static inline unsigned sum_bytes_128(simde__m128i v) { // Use _mm_sad_epu8 to perform a sum of absolute differences against zero @@ -345,9 +344,8 @@ scalar_decode_all(UTF8Decoder *d, const uint8_t *src, size_t src_sz) { return pos; } #undef do_one_byte -#endif -static inline bool +bool FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src, size_t src_sz) { // Based on the algorithm described in: https://woboq.com/blog/utf-8-processing-using-simd.html @@ -489,7 +487,7 @@ start_classification: shifts = add_epi8(shifts, shift_right_by_two_bytes(shifts)); shifts = add_epi8(shifts, shift_right_by_four_bytes(shifts)); shifts = add_epi8(shifts, shift_right_by_eight_bytes(shifts)); -#if BITS == 256 +#if KITTY_SIMD_LEVEL == 256 shifts = add_epi8(shifts, shift_right_by_sixteen_bytes(shifts)); #endif // zero the shifts for discarded continuation bytes @@ -505,7 +503,7 @@ start_classification: shifts = move(shifts, two_bytes, 2); shifts = move(shifts, four_bytes, 3); shifts = move(shifts, eight_bytes, 4); -#if BITS == 256 +#if KITTY_SIMD_LEVEL == 256 shifts = move(shifts, sixteen_bytes, 5); #endif #undef move @@ -547,7 +545,7 @@ invalid_utf8: #undef movemask_epi8 #undef CONCAT #undef CONCAT_EXPAND -#undef BITS +#undef KITTY_SIMD_LEVEL #undef shift_right_by_one_byte #undef shift_right_by_two_bytes #undef shift_right_by_four_bytes @@ -575,6 +573,3 @@ invalid_utf8: #undef sum_bytes #undef is_zero #undef print_register_as_bytes -#ifndef SIMD_STRING_IMPL_INCLUDED_ONCE -#define SIMD_STRING_IMPL_INCLUDED_ONCE -#endif diff --git a/kitty/simd-string.c b/kitty/simd-string.c index f1a77fa5d..8addb65dd 100644 
--- a/kitty/simd-string.c +++ b/kitty/simd-string.c @@ -8,12 +8,6 @@ #include "data-types.h" #include "charsets.h" #include "simd-string.h" -#undef BITS -#define BITS 128 -#include "simd-string-impl.h" -#define BITS 256 -#include "simd-string-impl.h" -#undef BITS static bool has_sse4_2 = false, has_avx2 = false; // find_either_of_two_bytes {{{ diff --git a/kitty/simd-string.h b/kitty/simd-string.h index de1568cbc..d6cef5ec8 100644 --- a/kitty/simd-string.h +++ b/kitty/simd-string.h @@ -6,11 +6,9 @@ #pragma once -#include -#include -#include -#include #include "data-types.h" +#include +#include typedef void (*control_byte_callback)(void *data, uint8_t ch); typedef void (*output_chars_callback)(void *data, const uint32_t *chars, unsigned count); @@ -36,3 +34,9 @@ const uint8_t* find_either_of_two_bytes(const uint8_t *haystack, const size_t sz // first position in haystack that contains a char that is not in [a, b]. // a must be <= b const uint8_t* find_byte_not_in_range(const uint8_t *haystack, const size_t sz, const uint8_t a1, const uint8_t b); + +// SIMD implementations, internal use +bool utf8_decode_to_esc_128(UTF8Decoder *d, const uint8_t *src, size_t src_sz); +bool utf8_decode_to_esc_256(UTF8Decoder *d, const uint8_t *src, size_t src_sz); +const uint8_t* find_either_of_two_bytes_128(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b); +const uint8_t* find_either_of_two_bytes_256(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b); diff --git a/setup.py b/setup.py index 6aaa97858..18cff3843 100755 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ import re import runpy import shlex import shutil +import struct import subprocess import sys import sysconfig @@ -22,7 +23,7 @@ from pathlib import Path from typing import Callable, Dict, FrozenSet, Iterable, Iterator, List, Optional, Sequence, Set, Tuple, Union, cast from glfw import glfw -from glfw.glfw import Command, CompileKey +from glfw.glfw import ISA, BinaryArch, 
Command, CompileKey if sys.version_info[:2] < (3, 8): raise SystemExit('kitty requires python >= 3.8') @@ -362,6 +363,24 @@ def get_sanitize_args(cc: List[str], ccver: Tuple[int, int]) -> List[str]: return sanitize_args +def get_binary_arch(path: str) -> BinaryArch: + with open(path, 'rb') as f: + sig = f.read(64) + if sig.startswith(b'\x7fELF'): # ELF + bits = {1: 32, 2: 64}[sig[4]] + endian = {1: '<', 2: '>'}[sig[5]] + machine, = struct.unpack_from(endian + 'H', sig, 0x12) + isa = {i.value:i for i in ISA}.get(machine, ISA.Other) + elif sig[:4] in (b'\xcf\xfa\xed\xfe', b'\xce\xfa\xed\xfe'): # Mach-O + s, cpu_type, = struct.unpack_from(' bool: + get_output_arch: bool = False, +) -> Union[bool, BinaryArch]: src = src or 'int main(void) { return 0; }' with tempfile.TemporaryDirectory(prefix='kitty-test-compile-') as tdir: with open(os.path.join(tdir, f'source.{source_ext}'), 'w', encoding='utf-8') as srcf: print(src, file=srcf) - return subprocess.Popen( + output = os.path.join(tdir, 'source.output') + ret = subprocess.Popen( cc + ['-Werror=implicit-function-declaration'] + list(cflags) + ([] if link_also else ['-c']) + - ['-o', os.path.join(tdir, 'source.output'), srcf.name] + + ['-o', output, srcf.name] + [f'-l{x}' for x in libraries] + list(ldflags), stdout=subprocess.DEVNULL, stdin=subprocess.DEVNULL, stderr=None if show_stderr else subprocess.DEVNULL - ).wait() == 0 + ).wait() + if get_output_arch: + if ret != 0: + raise SystemExit(f'Failed to determine target architecture compiling test program failed with exit code: {ret}') + return get_binary_arch(output) + return ret == 0 def first_successful_compile(cc: List[str], *cflags: str, src: str = '', source_ext: str = 'c') -> str: @@ -432,6 +458,7 @@ def init_env( vcs_rev: str = '', ) -> Env: native_optimizations = native_optimizations and not sanitize + build_universal_binary = build_universal_binary and is_macos cc, ccver = cc_version() if verbose: print('CC:', cc, ccver) @@ -463,7 +490,7 @@ def init_env( # in 
https://github.com/kovidgoyal/kitty/issues/6845#issuecomment-1835886938 arm_control_flow_protection = '-mbranch-protection=standard' if is_macos else '' # Universal build fails with -fcf-protection clang is not smart enough to filter it out for the ARM part - intel_control_flow_protection = '-fcf-protection=full' if ccver >= (9, 0) and not build_universal_binary else '' + intel_control_flow_protection = '-fcf-protection=full' if ccver >= (9, 0) else '' control_flow_protection = arm_control_flow_protection if is_arm else intel_control_flow_protection env_cflags = shlex.split(os.environ.get('CFLAGS', '')) env_cppflags = shlex.split(os.environ.get('CPPFLAGS', '')) @@ -471,11 +498,10 @@ def init_env( if control_flow_protection and not test_compile(cc, control_flow_protection, *env_cppflags, *env_cflags, ldflags=env_ldflags): control_flow_protection = '' march = '' - if not (is_macos and is_arm) and not build_universal_binary: + if native_optimizations and not build_universal_binary and not (is_macos and is_arm): # see https://github.com/kovidgoyal/kitty/issues/3126 # -march=native is not supported when targeting Apple Silicon - if native_optimizations: - march = '-march=native -mtune=native' + march = '-march=native -mtune=native' cflags_ = os.environ.get( 'OVERRIDE_CFLAGS', ( f'-Wextra {float_conversion} -Wno-missing-field-initializers -Wall -Wstrict-prototypes {std}' @@ -508,13 +534,6 @@ def init_env( cflags.append('-g3') ldflags.append('-lprofiler') - # SIMD instructions - if is_arm: - if not is_macos: - cflags.append('-mfpu=neon') - else: - cflags.append('-msse4.2') - cflags.append('-mavx2') library_paths: Dict[str, List[str]] = {} def add_lpath(which: str, name: str, val: Optional[str]) -> None: @@ -539,11 +558,13 @@ def init_env( cflags.insert(0, f'-I{os.environ["DEVELOP_ROOT"]}/include') ldpaths.insert(0, f'-L{os.environ["DEVELOP_ROOT"]}/lib') - if build_universal_binary: - set_arches(cflags) - set_arches(ldflags) + ba = test_compile(cc, *(cppflags + cflags), 
ldflags=ldflags, get_output_arch=True) + assert isinstance(ba, BinaryArch) - return Env(cc, cppflags, cflags, ldflags, library_paths, ccver=ccver, ldpaths=ldpaths, vcs_rev=vcs_rev) + return Env( + cc, cppflags, cflags, ldflags, library_paths, binary_arch=ba, native_optimizations=native_optimizations, + ccver=ccver, ldpaths=ldpaths, vcs_rev=vcs_rev, build_universal_binary=build_universal_binary + ) def kitty_env(args: Options) -> Env: @@ -638,7 +659,7 @@ def get_vcs_rev() -> str: @lru_cache -def base64_defines() -> List[str]: +def base64_defines(isa: ISA) -> List[str]: defs = { 'HAVE_AVX512': 0, 'HAVE_AVX2': 0, @@ -649,14 +670,18 @@ def base64_defines() -> List[str]: 'HAVE_SSE42': 0, 'HAVE_AVX': 0, } - if is_arm: + if isa == ISA.ARM64: defs['HAVE_NEON64'] = 1 - else: + elif isa == ISA.AMD64: defs['HAVE_AVX2'] = 1 defs['HAVE_AVX'] = 1 defs['HAVE_SSE42'] = 1 defs['HAVE_SSE41'] = 1 defs['HAVE_SSE3'] = 1 + elif isa == ISA.X86: + defs['HAVE_SSE42'] = 1 + defs['HAVE_SSE41'] = 1 + defs['HAVE_SSE3'] = 1 return [f'{k}={v}' for k, v in defs.items()] @@ -668,13 +693,40 @@ def get_source_specific_defines(env: Env, src: str) -> Tuple[str, List[str], Opt env.vcs_rev = get_vcs_rev() return src, [], [f'KITTY_VCS_REV="{env.vcs_rev}"', f'WRAPPED_KITTENS="{wrapped_kittens()}"'] if src.startswith('3rdparty/base64/'): - return src, ['3rdparty/base64',], base64_defines() + return src, ['3rdparty/base64',], base64_defines(env.binary_arch.isa) try: return src, [], env.library_paths[src] except KeyError: return src, [], None +def get_source_specific_cflags(env: Env, src: str) -> List[str]: + ans = list(env.cflags) + # SIMD specific flags, ignored for native optimizations as they give slightly better performance + if src == 'kitty/simd-string-128.c': + if env.binary_arch.isa in (ISA.AMD64, ISA.X86): + if not env.native_optimizations: + ans.append('-msse4.2') + elif src == 'kitty/simd-string-256.c': + if env.binary_arch.isa in (ISA.AMD64, ISA.X86): + if not env.native_optimizations: + 
ans.append('-mavx2') + elif src.startswith('3rdparty/base64/lib/arch/'): + if not env.native_optimizations: + q = src.split(os.path.sep) + if 'sse3' in q: + ans.append('-msse3') + elif 'sse41' in q: + ans.append('-msse4.1') + elif 'sse42' in q: + ans.append('-msse4.2') + elif 'avx' in q: + ans.append('-mavx') + elif 'avx2' in q: + ans.append('-mavx2') + return ans + + def newer(dest: str, *sources: str) -> bool: try: dtime = os.path.getmtime(dest) @@ -782,7 +834,8 @@ def compile_c_extension( src, include_paths, defines = get_source_specific_defines(kenv, src) if defines is not None: cppflags.extend(map(define, defines)) - cmd = kenv.cc + ['-MMD'] + cppflags + [f'-I{x}' for x in include_paths] + kenv.cflags + cflags = get_source_specific_cflags(kenv, src) + cmd = kenv.cc + ['-MMD'] + cppflags + [f'-I{x}' for x in include_paths] + cflags cmd += ['-c', src] + ['-o', dest] key = CompileKey(original_src, os.path.basename(dest)) desc = f'Compiling {emphasis(desc_prefix + src)} ...'