Build only the SIMD code with SIMD compiler flags

This commit is contained in:
Kovid Goyal 2024-01-24 14:11:22 +05:30
parent 465616223c
commit 7e77a196e6
No known key found for this signature in database
GPG key ID: 06BC317B515ACE7C
8 changed files with 172 additions and 55 deletions

View file

@ -6,7 +6,8 @@ import json
import os
import re
import sys
from typing import Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple
from enum import Enum
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple
_plat = sys.platform.lower()
is_linux = 'linux' in _plat
@ -32,6 +33,19 @@ class Command(NamedTuple):
keyfile: Optional[str] = None
class ISA(Enum):
    """Instruction set architectures the build distinguishes between.

    The numeric values of the named architectures are the codes that
    setup.py's get_binary_arch() looks up after reading the 16-bit machine
    field of an ELF header; any code without a member maps to ``Other``.
    """

    X86 = 0x03
    AMD64 = 0x3e
    ARM64 = 0xb7
    # Fallback for every architecture not listed above
    Other = 0x0
class BinaryArch(NamedTuple):
    """Architecture of a compiled binary: its word size and instruction set.

    The defaults describe a 64-bit AMD64 binary.
    """

    # Word size of the binary in bits
    bits: int = 64
    # Instruction set the binary was compiled for
    isa: ISA = ISA.AMD64
class Env:
cc: List[str] = []
@ -42,6 +56,9 @@ class Env:
ldpaths: List[str] = []
ccver: Tuple[int, int]
vcs_rev: str = ''
build_universal_binary: bool = False
binary_arch: BinaryArch = BinaryArch()
native_optimizations: bool = False
# glfw stuff
all_headers: List[str] = []
@ -54,12 +71,16 @@ class Env:
def __init__(
self, cc: List[str] = [], cppflags: List[str] = [], cflags: List[str] = [], ldflags: List[str] = [],
library_paths: Dict[str, List[str]] = {}, ldpaths: Optional[List[str]] = None, ccver: Tuple[int, int] = (0, 0),
vcs_rev: str = ''
vcs_rev: str = '', build_universal_binary: bool = False, binary_arch: BinaryArch = BinaryArch(),
native_optimizations: bool = False,
):
self.cc, self.cppflags, self.cflags, self.ldflags, self.library_paths = cc, cppflags, cflags, ldflags, library_paths
self.ldpaths = ldpaths or []
self.ccver = ccver
self.vcs_rev = vcs_rev
self.build_universal_binary = build_universal_binary
self.binary_arch = binary_arch
self.native_optimizations = native_optimizations
def copy(self) -> 'Env':
ans = Env(self.cc, list(self.cppflags), list(self.cflags), list(self.ldflags), dict(self.library_paths), list(self.ldpaths), self.ccver)
@ -70,6 +91,9 @@ class Env:
ans.wayland_scanner_code = self.wayland_scanner_code
ans.wayland_protocols = self.wayland_protocols
ans.vcs_rev = self.vcs_rev
ans.build_universal_binary = self.build_universal_binary
ans.binary_arch = self.binary_arch
ans.native_optimizations = self.native_optimizations
return ans
@ -83,7 +107,7 @@ def init_env(
pkg_config: Callable[..., List[str]],
pkg_version: Callable[[str], Tuple[int, int]],
at_least_version: Callable[..., None],
test_compile: Callable[..., bool],
test_compile: Callable[..., Any],
module: str = 'x11'
) -> Env:
ans = env.copy()

29
kitty/arches.h Normal file
View file

@ -0,0 +1,29 @@
/*
* Copyright (C) 2024 Kovid Goyal <kovid at kovidgoyal.net>
*
* Distributed under terms of the GPL3 license.
*/
#pragma once
/*
 * Detect the target CPU from the compiler's predefined macros. Exactly one
 * KITTY_TARGET_CPU_IS_* macro is defined for a recognised target, together
 * with KITTY_128BIT_ALLOWED and/or KITTY_256BIT_ALLOWED. For any other
 * target none of these macros is defined.
 */
#ifdef __aarch64__
/* 64-bit ARM: both the 128-bit and 256-bit macros are defined */
#define KITTY_TARGET_CPU_IS_ARM64
#define KITTY_128BIT_ALLOWED
#define KITTY_256BIT_ALLOWED
#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
/* 32-bit x86: only the 128-bit macro is defined */
#define KITTY_TARGET_CPU_IS_X86
#define KITTY_128BIT_ALLOWED
#elif defined(__amd64__)
/* 64-bit x86: both the 128-bit and 256-bit macros are defined */
#define KITTY_TARGET_CPU_IS_AMD64
#define KITTY_128BIT_ALLOWED
#define KITTY_256BIT_ALLOWED
#endif
/*
 * KITTY_START_128BIT_CODE expands to nothing in every branch below.
 * NOTE(review): presumably a placeholder for compiler specific markers
 * around 128-bit code -- confirm the intent before relying on it.
 */
#if defined(__clang__) && defined(KITTY_128BIT_ALLOWED)
#define KITTY_START_128BIT_CODE
#elif defined(KITTY_128BIT_ALLOWED)
#define KITTY_START_128BIT_CODE
#else
#define KITTY_START_128BIT_CODE
#endif

9
kitty/simd-string-128.c Normal file
View file

@ -0,0 +1,9 @@
/*
* simd-string-128.c
* Copyright (C) 2024 Kovid Goyal <kovid at kovidgoyal.net>
*
* Distributed under terms of the GPL3 license.
*/
#define KITTY_SIMD_LEVEL 128
#include "simd-string-impl.h"

9
kitty/simd-string-256.c Normal file
View file

@ -0,0 +1,9 @@
/*
* simd-string-256.c
* Copyright (C) 2024 Kovid Goyal <kovid at kovidgoyal.net>
*
* Distributed under terms of the GPL3 license.
*/
#define KITTY_SIMD_LEVEL 256
#include "simd-string-impl.h"

View file

@ -4,8 +4,8 @@
* Distributed under terms of the GPL3 license.
*/
#ifndef BITS
#define BITS 128
#ifndef KITTY_SIMD_LEVEL
#define KITTY_SIMD_LEVEL 128
#endif
#include "simd-string.h"
@ -26,13 +26,13 @@ _Pragma("clang diagnostic pop")
#endif
#define CONCAT(A, B) A##B
#define CONCAT_EXPAND(A, B) CONCAT(A,B)
#define FUNC(name) CONCAT_EXPAND(name##_, BITS)
#define integer_t CONCAT_EXPAND(CONCAT_EXPAND(simde__m, BITS), i)
#define FUNC(name) CONCAT_EXPAND(name##_, KITTY_SIMD_LEVEL)
#define integer_t CONCAT_EXPAND(CONCAT_EXPAND(simde__m, KITTY_SIMD_LEVEL), i)
#define shift_right_by_bytes128 simde_mm_srli_si128
#define zero_last_n_bytes FUNC(zero_last_n_bytes)
#define is_zero FUNC(is_zero)
#if BITS == 128
#if KITTY_SIMD_LEVEL == 128
#define set1_epi8(x) simde_mm_set1_epi8((char)(x))
#define set_epi8 simde_mm_set_epi8
#define add_epi8 simde_mm_add_epi8
@ -199,7 +199,7 @@ FUNC(zero_last_n_bytes)(integer_t vec, char n) {
return andnot_si(mask, vec);
}
static inline const uint8_t*
const uint8_t*
FUNC(find_either_of_two_bytes)(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b) {
const integer_t a_vec = set1_epi8(a), b_vec = set1_epi8(b);
for (const uint8_t* limit = haystack + sz; haystack < limit; haystack += sizeof(integer_t)) {
@ -220,7 +220,7 @@ FUNC(find_either_of_two_bytes)(const uint8_t *haystack, const size_t sz, const u
static inline void
FUNC(output_plain_ascii)(UTF8Decoder *d, integer_t vec, size_t src_sz) {
#if BITS == 128
#if KITTY_SIMD_LEVEL == 128
for (const uint32_t *limit = d->output + src_sz, *p = d->output; p < limit; p += output_increment) {
const integer_t unpacked = extract_lower_quarter_as_chars(vec);
store_aligned((integer_t*)p, unpacked);
@ -252,7 +252,7 @@ FUNC(output_plain_ascii)(UTF8Decoder *d, integer_t vec, size_t src_sz) {
static inline void
FUNC(output_unicode)(UTF8Decoder *d, integer_t output1, integer_t output2, integer_t output3, const size_t num_codepoints) {
#if BITS == 128
#if KITTY_SIMD_LEVEL == 128
for (const uint32_t *limit = d->output + num_codepoints, *p = d->output; p < limit; p += output_increment) {
const integer_t unpacked1 = extract_lower_quarter_as_chars(output1);
const integer_t unpacked2 = shift_right_by_one_byte(extract_lower_quarter_as_chars(output2));
@ -294,7 +294,6 @@ FUNC(output_unicode)(UTF8Decoder *d, integer_t output1, integer_t output2, integ
}
#undef output_increment
#ifndef SIMD_STRING_IMPL_INCLUDED_ONCE
static inline unsigned
sum_bytes_128(simde__m128i v) {
// Use _mm_sad_epu8 to perform a sum of absolute differences against zero
@ -345,9 +344,8 @@ scalar_decode_all(UTF8Decoder *d, const uint8_t *src, size_t src_sz) {
return pos;
}
#undef do_one_byte
#endif
static inline bool
bool
FUNC(utf8_decode_to_esc)(UTF8Decoder *d, const uint8_t *src, size_t src_sz) {
// Based on the algorithm described in: https://woboq.com/blog/utf-8-processing-using-simd.html
@ -489,7 +487,7 @@ start_classification:
shifts = add_epi8(shifts, shift_right_by_two_bytes(shifts));
shifts = add_epi8(shifts, shift_right_by_four_bytes(shifts));
shifts = add_epi8(shifts, shift_right_by_eight_bytes(shifts));
#if BITS == 256
#if KITTY_SIMD_LEVEL == 256
shifts = add_epi8(shifts, shift_right_by_sixteen_bytes(shifts));
#endif
// zero the shifts for discarded continuation bytes
@ -505,7 +503,7 @@ start_classification:
shifts = move(shifts, two_bytes, 2);
shifts = move(shifts, four_bytes, 3);
shifts = move(shifts, eight_bytes, 4);
#if BITS == 256
#if KITTY_SIMD_LEVEL == 256
shifts = move(shifts, sixteen_bytes, 5);
#endif
#undef move
@ -547,7 +545,7 @@ invalid_utf8:
#undef movemask_epi8
#undef CONCAT
#undef CONCAT_EXPAND
#undef BITS
#undef KITTY_SIMD_LEVEL
#undef shift_right_by_one_byte
#undef shift_right_by_two_bytes
#undef shift_right_by_four_bytes
@ -575,6 +573,3 @@ invalid_utf8:
#undef sum_bytes
#undef is_zero
#undef print_register_as_bytes
#ifndef SIMD_STRING_IMPL_INCLUDED_ONCE
#define SIMD_STRING_IMPL_INCLUDED_ONCE
#endif

View file

@ -8,12 +8,6 @@
#include "data-types.h"
#include "charsets.h"
#include "simd-string.h"
#undef BITS
#define BITS 128
#include "simd-string-impl.h"
#define BITS 256
#include "simd-string-impl.h"
#undef BITS
static bool has_sse4_2 = false, has_avx2 = false;
// find_either_of_two_bytes {{{

View file

@ -6,11 +6,9 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdalign.h>
#include "data-types.h"
#include <stddef.h>
#include <stdalign.h>
typedef void (*control_byte_callback)(void *data, uint8_t ch);
typedef void (*output_chars_callback)(void *data, const uint32_t *chars, unsigned count);
@ -36,3 +34,9 @@ const uint8_t* find_either_of_two_bytes(const uint8_t *haystack, const size_t sz
// first position in haystack that contains a char that is not in [a, b].
// a must be <= b
const uint8_t* find_byte_not_in_range(const uint8_t *haystack, const size_t sz, const uint8_t a1, const uint8_t b);
// SIMD implementations, internal use
bool utf8_decode_to_esc_128(UTF8Decoder *d, const uint8_t *src, size_t src_sz);
bool utf8_decode_to_esc_256(UTF8Decoder *d, const uint8_t *src, size_t src_sz);
const uint8_t* find_either_of_two_bytes_128(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b);
const uint8_t* find_either_of_two_bytes_256(const uint8_t *haystack, const size_t sz, const uint8_t a, const uint8_t b);

103
setup.py
View file

@ -10,6 +10,7 @@ import re
import runpy
import shlex
import shutil
import struct
import subprocess
import sys
import sysconfig
@ -22,7 +23,7 @@ from pathlib import Path
from typing import Callable, Dict, FrozenSet, Iterable, Iterator, List, Optional, Sequence, Set, Tuple, Union, cast
from glfw import glfw
from glfw.glfw import Command, CompileKey
from glfw.glfw import ISA, BinaryArch, Command, CompileKey
if sys.version_info[:2] < (3, 8):
raise SystemExit('kitty requires python >= 3.8')
@ -362,6 +363,24 @@ def get_sanitize_args(cc: List[str], ccver: Tuple[int, int]) -> List[str]:
return sanitize_args
def get_binary_arch(path: str) -> BinaryArch:
    """Return the word size and instruction set of the binary at *path*.

    Only the first 64 bytes of the file are inspected. ELF files (either
    byte order) and little-endian Mach-O files are recognised. An
    architecture that has no ISA member is reported as ISA.Other.

    Raises SystemExit if the file is neither ELF nor Mach-O, or if its
    header is truncated or otherwise malformed.
    """
    with open(path, 'rb') as f:
        sig = f.read(64)
    if sig.startswith(b'\x7fELF'):  # ELF
        try:
            bits = {1: 32, 2: 64}[sig[4]]  # EI_CLASS
            endian = {1: '<', 2: '>'}[sig[5]]  # EI_DATA
            machine, = struct.unpack_from(endian + 'H', sig, 0x12)  # e_machine
        except (IndexError, KeyError, struct.error):
            raise SystemExit(f'Malformed ELF header in binary: {path}') from None
        isa = {i.value: i for i in ISA}.get(machine, ISA.Other)
    elif sig[:4] in (b'\xcf\xfa\xed\xfe', b'\xce\xfa\xed\xfe'):  # Mach-O
        try:
            magic, cpu_type = struct.unpack_from('<II', sig, 0)
        except struct.error:
            raise SystemExit(f'Malformed Mach-O header in binary: {path}') from None
        bits = {0xfeedface: 32, 0xfeedfacf: 64}[magic]
        # The low byte is the CPU family; the 64-bit variants of a family are
        # distinguished by the CPU_ARCH_ABI64 flag in the top byte, so the
        # flag must be looked at before it is masked away.
        is_abi64 = bool(cpu_type & 0x01000000)
        family = cpu_type & 0xff
        if family == 0x7:
            isa = ISA.AMD64 if is_abi64 else ISA.X86
        elif family == 0xc and is_abi64:
            isa = ISA.ARM64
        else:
            # Same fallback the ELF branch uses for unknown machines
            isa = ISA.Other
    else:
        raise SystemExit(f'Unknown binary format with signature: {sig[:4]!r}')
    return BinaryArch(bits=bits, isa=isa)
def test_compile(
cc: List[str], *cflags: str,
src: str = '',
@ -370,18 +389,25 @@ def test_compile(
show_stderr: bool = False,
libraries: Iterable[str] = (),
ldflags: Iterable[str] = (),
) -> bool:
get_output_arch: bool = False,
) -> Union[bool, BinaryArch]:
src = src or 'int main(void) { return 0; }'
with tempfile.TemporaryDirectory(prefix='kitty-test-compile-') as tdir:
with open(os.path.join(tdir, f'source.{source_ext}'), 'w', encoding='utf-8') as srcf:
print(src, file=srcf)
return subprocess.Popen(
output = os.path.join(tdir, 'source.output')
ret = subprocess.Popen(
cc + ['-Werror=implicit-function-declaration'] + list(cflags) + ([] if link_also else ['-c']) +
['-o', os.path.join(tdir, 'source.output'), srcf.name] +
['-o', output, srcf.name] +
[f'-l{x}' for x in libraries] + list(ldflags),
stdout=subprocess.DEVNULL, stdin=subprocess.DEVNULL,
stderr=None if show_stderr else subprocess.DEVNULL
).wait() == 0
).wait()
if get_output_arch:
if ret != 0:
raise SystemExit(f'Failed to determine target architecture compiling test program failed with exit code: {ret}')
return get_binary_arch(output)
return ret == 0
def first_successful_compile(cc: List[str], *cflags: str, src: str = '', source_ext: str = 'c') -> str:
@ -432,6 +458,7 @@ def init_env(
vcs_rev: str = '',
) -> Env:
native_optimizations = native_optimizations and not sanitize
build_universal_binary = build_universal_binary and is_macos
cc, ccver = cc_version()
if verbose:
print('CC:', cc, ccver)
@ -463,7 +490,7 @@ def init_env(
# in https://github.com/kovidgoyal/kitty/issues/6845#issuecomment-1835886938
arm_control_flow_protection = '-mbranch-protection=standard' if is_macos else ''
# Universal build fails with -fcf-protection clang is not smart enough to filter it out for the ARM part
intel_control_flow_protection = '-fcf-protection=full' if ccver >= (9, 0) and not build_universal_binary else ''
intel_control_flow_protection = '-fcf-protection=full' if ccver >= (9, 0) else ''
control_flow_protection = arm_control_flow_protection if is_arm else intel_control_flow_protection
env_cflags = shlex.split(os.environ.get('CFLAGS', ''))
env_cppflags = shlex.split(os.environ.get('CPPFLAGS', ''))
@ -471,11 +498,10 @@ def init_env(
if control_flow_protection and not test_compile(cc, control_flow_protection, *env_cppflags, *env_cflags, ldflags=env_ldflags):
control_flow_protection = ''
march = ''
if not (is_macos and is_arm) and not build_universal_binary:
if native_optimizations and not build_universal_binary and not (is_macos and is_arm):
# see https://github.com/kovidgoyal/kitty/issues/3126
# -march=native is not supported when targeting Apple Silicon
if native_optimizations:
march = '-march=native -mtune=native'
march = '-march=native -mtune=native'
cflags_ = os.environ.get(
'OVERRIDE_CFLAGS', (
f'-Wextra {float_conversion} -Wno-missing-field-initializers -Wall -Wstrict-prototypes {std}'
@ -508,13 +534,6 @@ def init_env(
cflags.append('-g3')
ldflags.append('-lprofiler')
# SIMD instructions
if is_arm:
if not is_macos:
cflags.append('-mfpu=neon')
else:
cflags.append('-msse4.2')
cflags.append('-mavx2')
library_paths: Dict[str, List[str]] = {}
def add_lpath(which: str, name: str, val: Optional[str]) -> None:
@ -539,11 +558,13 @@ def init_env(
cflags.insert(0, f'-I{os.environ["DEVELOP_ROOT"]}/include')
ldpaths.insert(0, f'-L{os.environ["DEVELOP_ROOT"]}/lib')
if build_universal_binary:
set_arches(cflags)
set_arches(ldflags)
ba = test_compile(cc, *(cppflags + cflags), ldflags=ldflags, get_output_arch=True)
assert isinstance(ba, BinaryArch)
return Env(cc, cppflags, cflags, ldflags, library_paths, ccver=ccver, ldpaths=ldpaths, vcs_rev=vcs_rev)
return Env(
cc, cppflags, cflags, ldflags, library_paths, binary_arch=ba, native_optimizations=native_optimizations,
ccver=ccver, ldpaths=ldpaths, vcs_rev=vcs_rev, build_universal_binary=build_universal_binary
)
def kitty_env(args: Options) -> Env:
@ -638,7 +659,7 @@ def get_vcs_rev() -> str:
@lru_cache
def base64_defines() -> List[str]:
def base64_defines(isa: ISA) -> List[str]:
defs = {
'HAVE_AVX512': 0,
'HAVE_AVX2': 0,
@ -649,14 +670,18 @@ def base64_defines() -> List[str]:
'HAVE_SSE42': 0,
'HAVE_AVX': 0,
}
if is_arm:
if isa == ISA.ARM64:
defs['HAVE_NEON64'] = 1
else:
elif isa == ISA.AMD64:
defs['HAVE_AVX2'] = 1
defs['HAVE_AVX'] = 1
defs['HAVE_SSE42'] = 1
defs['HAVE_SSE41'] = 1
defs['HAVE_SSE3'] = 1
elif isa == ISA.X86:
defs['HAVE_SSE42'] = 1
defs['HAVE_SSE41'] = 1
defs['HAVE_SSE3'] = 1
return [f'{k}={v}' for k, v in defs.items()]
@ -668,13 +693,40 @@ def get_source_specific_defines(env: Env, src: str) -> Tuple[str, List[str], Opt
env.vcs_rev = get_vcs_rev()
return src, [], [f'KITTY_VCS_REV="{env.vcs_rev}"', f'WRAPPED_KITTENS="{wrapped_kittens()}"']
if src.startswith('3rdparty/base64/'):
return src, ['3rdparty/base64',], base64_defines()
return src, ['3rdparty/base64',], base64_defines(env.binary_arch.isa)
try:
return src, [], env.library_paths[src]
except KeyError:
return src, [], None
def get_source_specific_cflags(env: Env, src: str) -> List[str]:
    """Return the compiler flags to use when compiling the source file *src*.

    The result is always a fresh copy of ``env.cflags``. For the SIMD
    sources an extra x86 instruction set flag is appended, but only when the
    target ISA is x86/AMD64 and native optimizations are disabled.
    """
    ans = list(env.cflags)
    # SIMD specific flags, ignored for native optimizations as they give slightly better performance
    if env.native_optimizations:
        return ans
    # Every flag below is x86 specific, compilers for other targets reject them
    if env.binary_arch.isa not in (ISA.AMD64, ISA.X86):
        return ans
    if src == 'kitty/simd-string-128.c':
        ans.append('-msse4.2')
    elif src == 'kitty/simd-string-256.c':
        ans.append('-mavx2')
    elif src.startswith('3rdparty/base64/lib/arch/'):
        # Path component -> flag, checked in this order, first match wins.
        # NOTE(review): upstream base64 appears to name its directory 'ssse3',
        # confirm that 'sse3' matches the vendored source tree.
        flag_for_dir = {
            'sse3': '-msse3',
            'sse41': '-msse4.1',
            'sse42': '-msse4.2',
            'avx': '-mavx',
            'avx2': '-mavx2',
        }
        # src is written with forward slashes, as in the prefix test above
        components = src.split('/')
        for dirname, flag in flag_for_dir.items():
            if dirname in components:
                ans.append(flag)
                break
    return ans
def newer(dest: str, *sources: str) -> bool:
try:
dtime = os.path.getmtime(dest)
@ -782,7 +834,8 @@ def compile_c_extension(
src, include_paths, defines = get_source_specific_defines(kenv, src)
if defines is not None:
cppflags.extend(map(define, defines))
cmd = kenv.cc + ['-MMD'] + cppflags + [f'-I{x}' for x in include_paths] + kenv.cflags
cflags = get_source_specific_cflags(kenv, src)
cmd = kenv.cc + ['-MMD'] + cppflags + [f'-I{x}' for x in include_paths] + cflags
cmd += ['-c', src] + ['-o', dest]
key = CompileKey(original_src, os.path.basename(dest))
desc = f'Compiling {emphasis(desc_prefix + src)} ...'