This commit is contained in:
Kovid Goyal 2025-03-25 13:45:56 +05:30
parent 61ae12e0a9
commit d429f732e1
No known key found for this signature in database
GPG key ID: 06BC317B515ACE7C
3 changed files with 78 additions and 2 deletions

View file

@ -12,6 +12,7 @@ from contextlib import contextmanager
from functools import lru_cache, partial
from html.entities import html5
from itertools import groupby
from math import ceil, log
from operator import itemgetter
from typing import (
Callable,
@ -598,15 +599,21 @@ def gen_multistage_table(
width_shift = 4
def bitsize(maxval: int) -> int:
    """Return the bit-field width needed to distinguish ``maxval`` values.

    This is ``ceil(log2(maxval))`` computed exactly: the number of bits that
    can represent every integer in ``range(maxval)``. It is *not* the width
    needed to hold ``maxval`` itself when ``maxval`` is a power of two
    (``bitsize(4) == 2``), so callers should pass a count of distinct values.

    Integer arithmetic is used instead of ``ceil(log(maxval, 2))`` because
    the two-argument ``log`` is a quotient of floating-point logarithms and
    can come out a hair above an exact integer for powers of two, which
    ``ceil`` would then round up to one bit too many.

    Args:
        maxval: Number of distinct values to encode; must be at least 1.

    Returns:
        The minimal number of bits (0 when ``maxval`` is 1).

    Raises:
        ValueError: If ``maxval`` is less than 1, matching the domain error
            the logarithm-based implementation raised.
    """
    if maxval < 1:
        raise ValueError('math domain error')
    # The largest value to encode is maxval - 1; its bit length is the width.
    return (maxval - 1).bit_length()
class CharProps(NamedTuple):
width: int = 3
grapheme_break: str = '4'
indic_conjunct_break: str = '2'
grapheme_break: str = '' # set at runtime
indic_conjunct_break: str = '' # set at runtime
is_extended_pictographic: bool = True
is_emoji: bool = True
is_emoji_presentation_base: bool = True
# derived properties for fast lookup
is_invalid: bool = True
is_non_rendered: bool = True
is_symbol: bool = True
@ -693,6 +700,8 @@ def generate_enum(p: Callable[..., None], gp: Callable[..., None], name: str, *i
def gen_char_props() -> None:
CharProps._field_defaults['grapheme_break'] = str(bitsize(len(grapheme_segmentation_maps) + 2))
CharProps._field_defaults['indic_conjunct_break'] = str(bitsize(len(incb_map) + 1))
invalid = class_maps['Cc'] | class_maps['Cs']
non_printing = invalid | class_maps['Cf']
width_map: dict[int, int] = {}
@ -732,6 +741,7 @@ def gen_char_props() -> None:
gp('package wcswidth')
generate_enum(c, gp, 'GraphemeBreakProperty', 'AtStart', 'None', *grapheme_segmentation_maps, prefix='GBP_')
generate_enum(c, gp, 'IndicConjunctBreak', 'None', *incb_map, prefix='ICB_')
generate_enum(c, gp, 'UnicodeCategory', *class_maps, prefix='UC_')
bf = make_bitfield('tools/wcswidth', 'CharProps', *CharProps().go_fields, add_package=False)[1]
gp(bf)
gp(f'''

View file

@ -28,6 +28,38 @@ typedef enum IndicConjunctBreak {
ICB_Extend,
} IndicConjunctBreak;
// Category tag for a Unicode code point.
// Enumerators carry no explicit values, so they are numbered in declaration
// order starting at 0 (UC_Cc == 0); inserting or reordering an entry
// renumbers every entry after it.
// NOTE(review): the suffixes look like Unicode General_Category
// abbreviations (Cc, Zs, Lu, ...) and the list appears to be emitted by the
// Python generator's generate_enum(..., 'UnicodeCategory', prefix='UC_')
// call -- confirm, and if so change the generator rather than this list.
typedef enum UnicodeCategory {
UC_Cc,
UC_Zs,
UC_Po,
UC_Sc,
UC_Ps,
UC_Pe,
UC_Sm,
UC_Pd,
UC_Nd,
UC_Lu,
UC_Sk,
UC_Pc,
UC_Ll,
UC_So,
UC_Lo,
UC_Pi,
UC_Cf,
UC_No,
UC_Pf,
UC_Lt,
UC_Lm,
UC_Mn,
UC_Me,
UC_Mc,
UC_Nl,
UC_Zl,
UC_Zp,
UC_Cs,
UC_Co,
} UnicodeCategory;
static const char_type CharProps_mask = 255u;
static const char_type CharProps_shift = 8u;
static const uint8_t CharProps_t1[4352] = {

View file

@ -29,6 +29,40 @@ const (
ICB_Extend
)
// UnicodeCategory is a category tag for a Unicode code point, stored in a
// single byte.
// NOTE(review): the suffixes look like Unicode General_Category
// abbreviations (Cc, Zs, Lu, ...) and the list appears to be emitted by the
// Python generator's generate_enum call -- confirm, and if so change the
// generator rather than this list.
type UnicodeCategory uint8
// The constants are numbered by iota in declaration order, starting with
// UC_Cc == 0; inserting or reordering an entry renumbers every later one.
const (
UC_Cc UnicodeCategory = iota
UC_Zs
UC_Po
UC_Sc
UC_Ps
UC_Pe
UC_Sm
UC_Pd
UC_Nd
UC_Lu
UC_Sk
UC_Pc
UC_Ll
UC_So
UC_Lo
UC_Pi
UC_Cf
UC_No
UC_Pf
UC_Lt
UC_Lm
UC_Mn
UC_Me
UC_Mc
UC_Nl
UC_Zl
UC_Zp
UC_Cs
UC_Co
)
// Total number of bits used: 16
type CharProps uint16