How do you put bytes into rodata?

I have not figured out how to put bytes into rodata.
The current solution calls malloc on every function call and then copies the elements from static memory into the allocated array. I think it is also leaking that memory.

I don’t think the stdlib has figured it out either.

alias UNICODE_ID_START = Span(list=map_list(_UNICODE_ID_START__gen1)).get_immutable()


fn map_list(a: List[(Int, Int)]) -> List[(UInt32, UInt32)]:
    return [(UInt32(start), UInt32(end)) for start, end in a]

alias _UNICODE_ID_START__gen1 = [
    (0x0041, 0x005A),
    (0x0061, 0x007A),
    (0x00AA, 0x00AA),
    (0x00B5, 0x00B5),
    (0x00BA, 0x00BA),
    (0x00C0, 0x00D6),
    ...
]
00000000000062c0 <is_unicode_id_start>:
    62c0: 41 57                        	push	r15
    62c2: 41 56                        	push	r14
    62c4: 53                           	push	rbx
    62c5: 49 89 fe                     	mov	r14, rdi
    62c8: bf 04 00 00 00               	mov	edi, 0x4
    62cd: be 00 20 00 00               	mov	esi, 0x2000
    62d2: e8 f9 0c 00 00               	call	0x6fd0 <KGEN_CompilerRT_AlignedAlloc@plt>
    62d7: 48 89 c3                     	mov	rbx, rax
    62da: c5 fc 28 05 7e b7 ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x4882] # 0x1a60 <write+0x1a60>
    62e2: c5 fc 11 00                  	vmovups	ymmword ptr [rax], ymm0
    62e6: c5 fc 28 05 12 ad ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x52ee] # 0x1000 <write+0x1000>
    62ee: c5 fc 11 40 20               	vmovups	ymmword ptr [rax + 0x20], ymm0
    62f3: c5 fc 28 05 45 b6 ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x49bb] # 0x1940 <write+0x1940>
    62fb: c5 fc 11 40 40               	vmovups	ymmword ptr [rax + 0x40], ymm0
    6300: c5 fc 28 05 78 ad ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x5288] # 0x1080 <write+0x1080>
    6308: c5 fc 11 40 60               	vmovups	ymmword ptr [rax + 0x60], ymm0
    630d: c5 fc 28 05 2b b3 ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x4cd5] # 0x1640 <write+0x1640>
    6315: c5 fc 11 80 80 00 00 00      	vmovups	ymmword ptr [rax + 0x80], ymm0
    631d: c5 fc 28 05 bb ae ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x5145] # 0x11e0 <write+0x11e0>
    6325: c5 fc 11 80 a0 00 00 00      	vmovups	ymmword ptr [rax + 0xa0], ymm0
    632d: c5 fc 28 05 2b b4 ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x4bd5] # 0x1760 <write+0x1760>
    6335: c5 fc 11 80 c0 00 00 00      	vmovups	ymmword ptr [rax + 0xc0], ymm0
    633d: c5 fc 28 05 9b b7 ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x4865] # 0x1ae0 <write+0x1ae0>
    6345: c5 fc 11 80 e0 00 00 00      	vmovups	ymmword ptr [rax + 0xe0], ymm0
    634d: c5 fc 28 05 ab b1 ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x4e55] # 0x1500 <write+0x1500>
    6355: c5 fc 11 80 00 01 00 00      	vmovups	ymmword ptr [rax + 0x100], ymm0
    ...
    6d7d: 48 8d b8 60 15 00 00         	lea	rdi, [rax + 0x1560]
    6d84: 45 31 ff                     	xor	r15d, r15d
    6d87: ba a0 0a 00 00               	mov	edx, 0xaa0
    6d8c: 31 f6                        	xor	esi, esi
    6d8e: c5 f8 77                     	vzeroupper
    6d91: e8 4a 02 00 00               	call	0x6fe0 <memset@plt>
    6d96: 41 8b 06                     	mov	eax, dword ptr [r14]
    6d99: eb 11                        	jmp	0x6dac <is_unicode_id_start+0xaec>
    6d9b: 0f 1f 44 00 00               	nop	dword ptr [rax + rax]
    6da0: 49 ff c7                     	inc	r15
    6da3: 49 81 ff ac 02 00 00         	cmp	r15, 0x2ac
    6daa: 74 15                        	je	0x6dc1 <is_unicode_id_start+0xb01>
    6dac: 42 39 04 fb                  	cmp	dword ptr [rbx + 8*r15], eax
    6db0: 77 ee                        	ja	0x6da0 <is_unicode_id_start+0xae0>
    6db2: 42 3b 44 fb 04               	cmp	eax, dword ptr [rbx + 8*r15 + 0x4]
    6db7: 77 e7                        	ja	0x6da0 <is_unicode_id_start+0xae0>
    6db9: b0 01                        	mov	al, 0x1
    6dbb: 5b                           	pop	rbx
    6dbc: 41 5e                        	pop	r14
    6dbe: 41 5f                        	pop	r15
    6dc0: c3                           	ret
    6dc1: 31 c0                        	xor	eax, eax
    6dc3: 5b                           	pop	rbx
    6dc4: 41 5e                        	pop	r14
    6dc6: 41 5f                        	pop	r15
    6dc8: c3                           	ret
    6dc9: 0f 1f 80 00 00 00 00         	nop	dword ptr [rax]

That looks like a mistake in the stdlib; the API should be on Span, not on List. I opened issue #5372.

The problem is not that binary_search allocates, but that you cannot create a Span without allocating on either the stack (InlineArray) or the heap (List). In fact, I am not using the binary_search function at all.
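
To make that concrete, here is a minimal sketch (using the same Span constructors as in the code above) of the only two routes available today; both materialize the data into stack or heap memory before a Span can point at it:

fn span_from_stack() -> Bool:
    var arr = InlineArray[UInt32, 3](1, 2, 3)  # stack allocation
    return len(Span(arr)) == 3

fn span_from_heap() -> Bool:
    var lst: List[UInt32] = [1, 2, 3]  # heap allocation
    return len(Span(list=lst)) == 3

What I actually use instead of binary search is a plain linear scan: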

fn is_unicode_id_start(c: Codepoint) -> Bool:
    for start, end in UNICODE_ID_START:
        if start <= c.to_u32() <= end:
            return True
    return False

I think it’s ultimately caused by this: [Feature Request] static alias or equivalent for creating LLVM `internal constant`s · Issue #4565 · modular/modular · GitHub. It’s just that, after the introduction of materialize, the “copying from static to allocated memory” problem has become more apparent.

I see: what used to be a performance problem is now also an ergonomics problem.
I guess for now you have to use String, ignore UTF-8, and then figure out how to deal with byte order and alignment (rough sketch below).
I wonder how much this would speed up string casing.
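
For what it’s worth, the String trick would look something like this. This is a hypothetical sketch, not tested code: _TABLE_BYTES and read_u32 are my names, and it assumes string literals keep their bytes in rodata and that StaticString.unsafe_ptr() is available:

# hypothetical: embed the table as raw bytes in a string literal and
# reassemble each UInt32 by hand, which sidesteps both alignment and
# byte-order assumptions
alias _TABLE_BYTES: StaticString = "\x41\x00\x00\x00\x5A\x00\x00\x00"  # 0x0041, 0x005A

fn read_u32(idx: Int) -> UInt32:
    var p = _TABLE_BYTES.unsafe_ptr()
    var v: UInt32 = 0
    for i in range(4):  # little-endian reassembly, byte by byte
        v |= UInt32(Int(p[idx * 4 + i])) << UInt32(8 * i)
    return v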

I’m sorry for not seeing this earlier. We don’t have a “pretty” way to do this yet, but this hack will work for many simple cases:

fn global_cst[T: AnyType, //, value: T]() -> ref [StaticConstantOrigin] T:
    var ptr = __mlir_op.`pop.global_constant`[
        value=value, _type = UnsafePointer[T]._mlir_type
    ]()
    return UnsafePointer[T](ptr)[]


fn complicated(a: Int) -> Int:
    return a * 2 + 1


fn main():
    alias x = 4 * 4
    ref global_ptr = global_cst[InlineArray[Int, 4](1, 2, complicated(3), x)]()

    print(global_ptr[0], global_ptr[1], global_ptr[2], global_ptr[3], sep=", ")

I don’t understand what the problem is. This works, but of course you need to allocate, since you are casting from a list of Int (32- or 64-bit depending on the platform) to 32 bits:

alias _UNICODE_ID_START__gen1 = [
    (0x0041, 0x005A),
    (0x0061, 0x007A),
    (0x00AA, 0x00AA),
    (0x00B5, 0x00B5),
    (0x00BA, 0x00BA),
    (0x00C0, 0x00D6),
]


fn map_list[a: List[(Int, Int)]](out res: List[(UInt32, UInt32)]):
    alias a_len = len(a)
    res = {capacity = a_len}

    @parameter
    for i in range(a_len):
        alias start = a[i][0]
        alias end = a[i][1]
        res.append({start, end})


alias UNICODE_ID_START = Span(
    list=map_list[_UNICODE_ID_START__gen1]()
).get_immutable()


fn is_unicode_id_start(c: Codepoint) -> Bool:
    for start, end in UNICODE_ID_START:
        if start <= c.to_u32() <= end:
            return True
    return False


fn main():
    print(is_unicode_id_start(Codepoint(0x0041)))

As a side note: since you are doing things with UTF-32, maybe just store the constants as UInt32?

I wonder how much this would speed up string casing.

Yeah, I need to check that code; I didn’t expect there were cases like that where binary search allocates.

I do this because it does not need explicit casts to UInt32, which makes the table look nicer. I am fine with the compile-time tradeoff when it comes with extra readability.

In fact, this was just a draft. I would never choose a wasteful encoding like UTF-32; I have instead opted for UTF-20P.

fn bit_pack_list(a: List[(Int, Int, Int)]) -> List[UInt32]:
    MAX_CHUNK_SIZE = (1 << 11) - 1
    b: List[UInt32] = []
    for var start, end, id_type in a:
        difference = end - start + 1
        if difference <= MAX_CHUNK_SIZE:
            bits = bit_pack(start, id_type, difference)
            b.append(bits)
            continue
        while start <= end:
            chunk_diff = min(MAX_CHUNK_SIZE, end - start + 1)
            bits = bit_pack(start, id_type, chunk_diff)
            b.append(bits)
            start += chunk_diff
    return b


alias UNICODE_ID_START_LENGTH_BIT_POSITIONS = (21, 31)
alias UNICODE_ID_START_ID_TYPE_BIT_POSITIONS = (20, 20)
alias UNICODE_ID_START_START_BIT_POSITIONS = (0, 19)


fn bit_pack(start: Int, id_type: Int, length: Int) -> UInt32:
    id_flag = 1 if id_type == ID_START | ID_CONTINUE else 0
    # length id_type start
    # 31:21  20      19:0
    a: UInt32 = 0
    a |= UInt32(start)
    a |= UInt32(id_flag) << 20
    a |= UInt32(length) << 21
    return a


alias ID_START = 1
alias ID_CONTINUE = 2

alias _UNICODE_ID_START__gen1 = [
    (0x0030, 0x0039, ID_CONTINUE),  # [10]
    (0x0041, 0x005A, ID_START | ID_CONTINUE),  # [26]
    (0x005F, 0x005F, ID_CONTINUE),  # [1]
    (0x0061, 0x007A, ID_START | ID_CONTINUE),  # [26]
    (0x00AA, 0x00AA, ID_START | ID_CONTINUE),  # [1]
    ...
]

@always_inline("nodebug")
fn get_bits[start: UInt32, end: UInt32](bits: UInt32) -> UInt32:
    alias length = end - start + 1
    alias mask = ((1 << length) - 1)
    return (bits >> start) & mask
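
As a quick sanity check, here is a hypothetical round trip through the pack/unpack pair above (_check_packing is my name, not part of the real table code):

fn _check_packing():
    # round-trip the ASCII uppercase range [0x41, 0x5A] through the layout
    var bits = bit_pack(0x41, ID_START | ID_CONTINUE, 0x5A - 0x41 + 1)
    debug_assert(get_bits[0, 19](bits) == 0x41)  # start
    debug_assert(get_bits[20, 20](bits) == 1)  # id flag
    debug_assert(get_bits[21, 31](bits) == 26)  # length
    # a range wider than the 11-bit length field gets split into chunks:
    # (0x20000, 0x2A6DF) covers 42720 codepoints, ceil(42720 / 2047) = 21 entries
    var packed = bit_pack_list([(0x20000, 0x2A6DF, ID_START | ID_CONTINUE)])
    debug_assert(len(packed) == 21)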

fn lookup_unicode_id[unicode_id_type: UInt8](c: Codepoint) -> Bool:
    alias start_bit_pos_start = unicode_lookup_table.UNICODE_ID_START_START_BIT_POSITIONS[
        0
    ]
    alias start_bit_pos_end = unicode_lookup_table.UNICODE_ID_START_START_BIT_POSITIONS[
        1
    ]
    alias length_bit_pos_start = unicode_lookup_table.UNICODE_ID_START_LENGTH_BIT_POSITIONS[
        0
    ]
    alias length_bit_pos_end = unicode_lookup_table.UNICODE_ID_START_LENGTH_BIT_POSITIONS[
        1
    ]
    alias id_type_bit_pos_start = unicode_lookup_table.UNICODE_ID_START_ID_TYPE_BIT_POSITIONS[
        0
    ]
    alias id_type_bit_pos_end = unicode_lookup_table.UNICODE_ID_START_ID_TYPE_BIT_POSITIONS[
        1
    ]
    d = c.to_u32()

    for bits in UNICODE_ID_START:
        start = get_bits[start_bit_pos_start, start_bit_pos_end](bits)
        length = get_bits[length_bit_pos_start, length_bit_pos_end](bits)
        end = start + length
        if start <= d < end:
            return True
    return False

I still don’t understand the original issue that led you to open this post. Was it because the map_list(a: List[(Int, Int)]) function materializes the a list in order to operate on it? If so, the solution is to lift it into parameter space like I showed above.

I do this because it does not need explicit casts to UInt32, which makes the table look nicer. I am fine with the compile-time tradeoff when it comes with extra readability.

FYI, there are workarounds using aliases:

alias T = Tuple[UInt32, UInt32]

alias _UNICODE_ID_START__gen1: List[T] = [
    T(0x0041, 0x005A),
    T(0x0061, 0x007A),
    T(0x00AA, 0x00AA),
    T(0x00B5, 0x00B5),
    T(0x00BA, 0x00BA),
    T(0x00C0, 0x00D6),
]

Though yeah, ideally one would just write : List[Tuple[UInt32, UInt32]] = [...] and it should just be inferred.

The compiler has gotten too smart with the latest nightly. Now an alias bound to an InlineArray will just work.
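
For instance, a plain table alias like this (a hypothetical minimal case, not the real table) now folds into rodata:

alias SMALL_TABLE = InlineArray[UInt32, 4](0x41, 0x5A, 0x61, 0x7A)

fn in_small_table(x: UInt32) -> Bool:
    # on the latest nightly this reads straight from the global constant
    # instead of copying the array on every call
    for i in range(4):
        if SMALL_TABLE[i] == x:
            return True
    return False

And the real is_unicode_id_start now compiles down to a scan over <global_constant>: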

0000000000003040 <is_unicode_id_start>:
    3040: 8b 07                        	mov	eax, dword ptr [rdi]
    3042: 31 f6                        	xor	esi, esi
    3044: 48 8d 0d 69 da ff ff         	lea	rcx, [rip - 0x2597]     # 0xab4 <global_constant>
    304b: 0f 1f 44 00 00               	nop	dword ptr [rax + rax]
    3050: 48 89 f2                     	mov	rdx, rsi
    3053: 48 81 fe 88 11 00 00         	cmp	rsi, 0x1188
    305a: 74 20                        	je	0x307c <is_unicode_id_start+0x3c>
    305c: 44 8b 04 0a                  	mov	r8d, dword ptr [rdx + rcx]
    3060: 44 89 c7                     	mov	edi, r8d
    3063: 81 e7 ff ff 0f 00            	and	edi, 0xfffff
    3069: 48 8d 72 04                  	lea	rsi, [rdx + 0x4]
    306d: 39 c7                        	cmp	edi, eax
    306f: 77 df                        	ja	0x3050 <is_unicode_id_start+0x10>
    3071: 41 c1 e8 15                  	shr	r8d, 0x15
    3075: 44 01 c7                     	add	edi, r8d
    3078: 39 f8                        	cmp	eax, edi
    307a: 73 d4                        	jae	0x3050 <is_unicode_id_start+0x10>
    307c: 48 81 fa 88 11 00 00         	cmp	rdx, 0x1188
    3083: 0f 95 c0                     	setne	al
    3086: c3                           	ret
    3087: 66 0f 1f 84 00 00 00 00 00   	nop	word ptr [rax + rax]

I guess global_cst is not going away for cases like these. The global static optimization does not apply to function-level aliases.

fn is_syntax3(c: Byte) -> Bool:
    return in_ascii_set[r"^$\.*+?()[]{}|", opt_size=False](c)

@always_inline
fn in_ascii_set[s: StaticString, *, opt_size: Bool = True](c: Byte) -> Bool:
    alias set = ascii_set(s, opt_size=opt_size)
    alias lookup_or_set = set[2]

    from sys.intrinsics import assume

    assume(c <= 127)

    @parameter
    if lookup_or_set:
        alias mask = set[1]
        return UInt128(1) << UInt128(c) & mask != 0
    else:
        alias lookup = Span(set[0])
        return lookup[c] == c


fn ascii_set(
    s: StaticString, /, *, opt_size: Bool = True
) -> Tuple[InlineArray[UInt8, 128], UInt128, Bool]:
    max_byte: Byte = 0
    lookup = InlineArray[UInt8, 128](fill=255)
    mask: UInt128 = 0
    for c in s.as_bytes():
        max_byte = max(c, max_byte)

    if max_byte < 64 or opt_size:
        mask = ascii_mask(s)
        return (lookup, mask, True)
    lookup = ascii_lookup(s)
    return (lookup, mask, False)


fn ascii_lookup(s: StaticString) -> InlineArray[UInt8, 128]:
    lookup = InlineArray[UInt8, 128](fill=255)
    for c in s.as_bytes():
        lookup[c] = c
    return lookup


fn ascii_mask(s: StaticString) -> UInt128:
    var mask: UInt128 = 0
    for c in s.codepoints():
        mask |= UInt128(1) << UInt128(c.to_u32())
    return mask

0000000000002f90 <is_syntax3>:
    2f90: c5 fd 76 c0                  	vpcmpeqd	ymm0, ymm0, ymm0
    2f94: c5 fe 7f 44 24 80            	vmovdqu	ymmword ptr [rsp - 0x80], ymm0
    2f9a: c5 fc 28 05 3e db ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x24c2] # 0xae0 <write+0xae0>
    2fa2: c5 fc 11 44 24 a0            	vmovups	ymmword ptr [rsp - 0x60], ymm0
    2fa8: c5 fc 28 05 f0 da ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x2510] # 0xaa0 <write+0xaa0>
    2fb0: c5 fc 11 44 24 c0            	vmovups	ymmword ptr [rsp - 0x40], ymm0
    2fb6: c5 fc 28 05 a2 da ff ff      	vmovaps	ymm0, ymmword ptr [rip - 0x255e] # 0xa60 <write+0xa60>
    2fbe: c5 fc 11 44 24 e0            	vmovups	ymmword ptr [rsp - 0x20], ymm0
    2fc4: 40 0f b6 c7                  	movzx	eax, dil
    2fc8: 38 44 04 80                  	cmp	byte ptr [rsp + rax - 0x80], al
    2fcc: 0f 94 c0                     	sete	al
    2fcf: c5 f8 77                     	vzeroupper
    2fd2: c3                           	ret
    2fd3: 66 66 66 66 2e 0f 1f 84 00 00 00 00 00       	nop	word ptr cs:[rax + rax]

But for function-level aliases, global_cst works:

alias lookup_array = set[0]
ref lookup = global_cst[lookup_array]()
return lookup[c] == c
0000000000003000 <is_syntax3>:
    3000: 40 0f b6 c7                  	movzx	eax, dil
    3004: 48 8d 0d 45 da ff ff         	lea	rcx, [rip - 0x25bb]     # 0xa50 <global_constant>
    300b: 38 04 08                     	cmp	byte ptr [rax + rcx], al
    300e: 0f 94 c0                     	sete	al
    3011: c3                           	ret
    3012: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00    	nop	word ptr cs:[rax + rax]


Your workaround is great, but I like metaprogramming. Compile times will suffer, but it will still be faster than Rust.


I had to do this in #5381 because the global-constant trick does not fold at compile time, so the function needs to return a value and not a reference:

fn _global_cst[T: AnyType, //, value: T]() -> ref [StaticConstantOrigin] T:
    var ptr = __mlir_op.`pop.global_constant`[
        value=value, _type = UnsafePointer[T]._mlir_type
    ]()
    return UnsafePointer[T](ptr)[]


@always_inline
fn _maybe_get_index[
    lookup: InlineArray[UInt32, **_]
](rune: Codepoint) -> Optional[UInt]:
    if is_compile_time():
        return Span(materialize[lookup]())._binary_search_index(rune.to_u32())
    return Span(_global_cst[lookup]())._binary_search_index(rune.to_u32())


@always_inline
fn _get_mapping[
    inner_size: Int,
    outer_size: Int, //,
    mapping: InlineArray[SIMD[DType.uint32, inner_size], outer_size],
](index: UInt) -> SIMD[DType.uint32, inner_size]:
    if is_compile_time():
        return materialize[mapping]().unsafe_get(index)
    return _global_cst[mapping]().unsafe_get(index)

FYI, I found a better way to get Mojo to infer the types just fine:

alias _UNICODE_ID_START__gen1: List[Tuple[UInt32, UInt32]] = [
    {0x0041, 0x005A},
    {0x0061, 0x007A},
    {0x00AA, 0x00AA},
    {0x00B5, 0x00B5},
    {0x00BA, 0x00BA},
    {0x00C0, 0x00D6},
]