M
Modular•9mo ago
toasty

Iterating over Strings containing Unicode characters?

Has anyone figured out how to do this? Maxim's code works perfectly for getting the count of printable characters in a String.
alias simd_width_u8 = simdwidthof[DType.uint8]()


fn rune_count_in_string(s: String) -> Int:
    """Count the UTF-8 code points (runes) stored in `s`.

    A byte is counted when its top two bits are not `0b10`. UTF-8
    continuation bytes have the form `10xxxxxx`, so every counted byte is the
    first byte of a code point.

    Args:
        s: The string to measure.

    Returns:
        The number of non-continuation bytes in `s`.
    """
    var data = s._as_ptr().bitcast[DType.uint8]()
    var byte_count = len(s)
    var runes = 0

    @parameter
    fn tally_lead_bytes[simd_width: Int](offset: Int):
        # Load one SIMD chunk and flag the lanes that are not continuation bytes.
        var chunk = data.load[width=simd_width](offset)
        var is_lead = (chunk >> 6) != 0b10
        # Each flagged lane becomes 1, so the horizontal sum is the number of
        # runes that start inside this chunk.
        runes += is_lead.cast[DType.uint8]().reduce_add().to_int()

    vectorize[tally_lead_bytes, simd_width_u8](byte_count)
    return runes
alias simd_width_u8 = simdwidthof[DType.uint8]()


fn rune_count_in_string(s: String) -> Int:
    """Count the UTF-8 code points (runes) stored in `s`.

    A byte is counted when its top two bits are not `0b10`. UTF-8
    continuation bytes have the form `10xxxxxx`, so every counted byte is the
    first byte of a code point.

    Args:
        s: The string to measure.

    Returns:
        The number of non-continuation bytes in `s`.
    """
    var data = s._as_ptr().bitcast[DType.uint8]()
    var byte_count = len(s)
    var runes = 0

    @parameter
    fn tally_lead_bytes[simd_width: Int](offset: Int):
        # Load one SIMD chunk and flag the lanes that are not continuation bytes.
        var chunk = data.load[width=simd_width](offset)
        var is_lead = (chunk >> 6) != 0b10
        # Each flagged lane becomes 1, so the horizontal sum is the number of
        # runes that start inside this chunk.
        runes += is_lead.cast[DType.uint8]().reduce_add().to_int()

    vectorize[tally_lead_bytes, simd_width_u8](byte_count)
    return runes
And the ord and chr changes from the nightly branch help with handling conversion of a single character. But I've been unable to iterate through the code points of a string, as iterating through the bytes doesn't work for this use case. It's all fairly new to me, so I'm curious if anyone has already solved this problem 🙂
4 Replies
Maxim
Maxim•9mo ago
I am looking into proper Unicode support for String, but it might take a while and will probably start as a proposal first. You can take a peek at the ord implementation I submitted to the nightly branch; it should give you an idea of how to write a function that iterates over the runes.
toasty
toastyOP•9mo ago
Thanks for responding! I've been taking a look through those changes; I'll post here if I do figure out how to iterate over runes.
Maxim
Maxim•9mo ago
Hey, here is an implementation for a char iterator:
from math.bit import ctlz

fn string_iterator(s: String, it: fn (String) -> None):
    """Call `it` once for every UTF-8 code point (rune) in `s`, in order.

    Each rune is copied into its own null-terminated buffer and handed to
    `it` as a separate `String`. Grapheme clusters are not merged: a base
    emoji followed by a modifier is delivered as two runes.

    Args:
        s: The string to walk.
        it: Callback invoked with each rune.
    """
    var bytes = len(s)
    var p = s._as_ptr().bitcast[DType.uint8]()
    while bytes > 0:
        var lead = p.load()
        # ASCII bytes (top bit clear) are one byte long; for any other lead
        # byte the number of leading one bits is the sequence length.
        var char_length = ((lead >> 7 == 0).cast[DType.uint8]() * 1 + ctlz(~lead)).to_int()
        # A malformed or truncated sequence can claim more bytes than remain
        # (0xFF claims 8); clamp so memcpy never reads past the end of `s`.
        if char_length > bytes:
            char_length = bytes
        var sp = DTypePointer[DType.int8].alloc(char_length + 1)
        memcpy(sp, p.bitcast[DType.int8](), char_length)
        sp[char_length] = 0
        # NOTE(review): String(ptr, len) is presumed to take ownership of sp,
        # which is why it is not freed here — confirm.
        it(String(sp, char_length + 1))
        bytes -= char_length
        p += char_length

fn main():
    """Print every rune of two sample strings, one rune per `print` call."""
    fn print_str(s: String):
        print(s)

    string_iterator("Hello World", print_str)
    # The second sample holds a single-code-point emoji and an emoji followed
    # by a skin-tone modifier (two code points).
    string_iterator("Hello 🔥, How is it 🦵🏼 today?", print_str)
from math.bit import ctlz

fn string_iterator(s: String, it: fn (String) -> None):
    """Call `it` once for every UTF-8 code point (rune) in `s`, in order.

    Each rune is copied into its own null-terminated buffer and handed to
    `it` as a separate `String`. Grapheme clusters are not merged: a base
    emoji followed by a modifier is delivered as two runes.

    Args:
        s: The string to walk.
        it: Callback invoked with each rune.
    """
    var bytes = len(s)
    var p = s._as_ptr().bitcast[DType.uint8]()
    while bytes > 0:
        var lead = p.load()
        # ASCII bytes (top bit clear) are one byte long; for any other lead
        # byte the number of leading one bits is the sequence length.
        var char_length = ((lead >> 7 == 0).cast[DType.uint8]() * 1 + ctlz(~lead)).to_int()
        # A malformed or truncated sequence can claim more bytes than remain
        # (0xFF claims 8); clamp so memcpy never reads past the end of `s`.
        if char_length > bytes:
            char_length = bytes
        var sp = DTypePointer[DType.int8].alloc(char_length + 1)
        memcpy(sp, p.bitcast[DType.int8](), char_length)
        sp[char_length] = 0
        # NOTE(review): String(ptr, len) is presumed to take ownership of sp,
        # which is why it is not freed here — confirm.
        it(String(sp, char_length + 1))
        bytes -= char_length
        p += char_length

fn main():
    """Print every rune of two sample strings, one rune per `print` call."""
    fn print_str(s: String):
        print(s)

    string_iterator("Hello World", print_str)
    # The second sample holds a single-code-point emoji and an emoji followed
    # by a skin-tone modifier (two code points).
    string_iterator("Hello 🔥, How is it 🦵🏼 today?", print_str)
As you can see in the second example, it works for runes but does not work for grapheme clusters: 🦵🏼 is iterated over as 🦵 + the skin tone modifier. However, string_iterator("'米くを舵4物委らご氏松ハナテフ月関ソ時平ふいの博情れじフ牟万い元56園フメヤオ試図ロツヤ未備王こと傷喫羅踊んゆし。", print_str) works as you would expect, as there are no grapheme clusters.
toasty
toastyOP•9mo ago
That's awesome, thank you! I'll give it a go in a bit. Works well 👍 I was trying to iterate over Strings with Unicode characters and ANSI escape sequences. I was able to drop in the logic as a replacement for a for loop over the range of len(src).
# write truncates content at the given printable cell width, leaving any
# ansi sequences intact.
fn write(inout self, src: List[Int8]) -> Result[Int]:
    """Forward `src` rune by rune until the printable width budget is used up.

    If the tail is wider than `self.width`, only the tail is written.
    Otherwise `self.width` is reduced by the tail's width and `src` is walked
    one UTF-8 rune at a time. A rune equal to `Marker` starts an ANSI
    sequence, and a terminator rune ends it; runes inside a sequence add no
    width. Every other rune adds its printable width. Once the running width
    exceeds `self.width`, the tail is written, any open ANSI sequence is
    reset, and the result of the tail write is returned.

    Args:
        src: The bytes to write, walked as UTF-8.

    Returns:
        The result of writing the tail, on the paths visible in this excerpt.
    """
    # TODO: Normally rune length
    var tw = printable_rune_width(self.tail)

    if self.width < UInt8(tw):
        return self.ansi_writer.forward.write_string(self.tail)

    self.width -= UInt8(tw)
    var cur_width: UInt8 = 0

    # Rune iterator
    var bytes = len(src)
    var p = DTypePointer[DType.int8](src.data.value).bitcast[DType.uint8]()
    while bytes > 0:
        var lead = p.load()
        # ASCII bytes (top bit clear) are one byte long; for any other lead
        # byte the number of leading one bits is the sequence length.
        var char_length = ((lead >> 7 == 0).cast[DType.uint8]() * 1 + ctlz(~lead)).to_int()
        # A malformed or truncated sequence can claim more bytes than remain;
        # clamp so memcpy never reads past the end of `src`.
        if char_length > bytes:
            char_length = bytes
        var sp = DTypePointer[DType.int8].alloc(char_length + 1)
        memcpy(sp, p.bitcast[DType.int8](), char_length)
        sp[char_length] = 0

        # Functional logic
        var char = String(sp, char_length + 1)
        if char == Marker:
            # ANSI escape sequence
            self.ansi = True
        elif self.ansi:
            if is_terminator(ord(char)):
                # ANSI sequence terminated
                self.ansi = False
        else:
            cur_width += UInt8(printable_rune_width(char))

        if cur_width > self.width:
            # Budget exceeded: emit the tail instead of this rune and stop.
            var n = self.ansi_writer.forward.write_string(self.tail)
            if self.ansi_writer.last_sequence() != "":
                self.ansi_writer.reset_ansi()
            return n

        _ = self.ansi_writer.write(char.as_bytes())

        bytes -= char_length
        p += char_length
    # NOTE(review): the excerpt ends here; no return for the fall-through path
    # is visible — confirm against the full source.
# write truncates content at the given printable cell width, leaving any
# ansi sequences intact.
fn write(inout self, src: List[Int8]) -> Result[Int]:
    """Forward `src` rune by rune until the printable width budget is used up.

    If the tail is wider than `self.width`, only the tail is written.
    Otherwise `self.width` is reduced by the tail's width and `src` is walked
    one UTF-8 rune at a time. A rune equal to `Marker` starts an ANSI
    sequence, and a terminator rune ends it; runes inside a sequence add no
    width. Every other rune adds its printable width. Once the running width
    exceeds `self.width`, the tail is written, any open ANSI sequence is
    reset, and the result of the tail write is returned.

    Args:
        src: The bytes to write, walked as UTF-8.

    Returns:
        The result of writing the tail, on the paths visible in this excerpt.
    """
    # TODO: Normally rune length
    var tw = printable_rune_width(self.tail)

    if self.width < UInt8(tw):
        return self.ansi_writer.forward.write_string(self.tail)

    self.width -= UInt8(tw)
    var cur_width: UInt8 = 0

    # Rune iterator
    var bytes = len(src)
    var p = DTypePointer[DType.int8](src.data.value).bitcast[DType.uint8]()
    while bytes > 0:
        var lead = p.load()
        # ASCII bytes (top bit clear) are one byte long; for any other lead
        # byte the number of leading one bits is the sequence length.
        var char_length = ((lead >> 7 == 0).cast[DType.uint8]() * 1 + ctlz(~lead)).to_int()
        # A malformed or truncated sequence can claim more bytes than remain;
        # clamp so memcpy never reads past the end of `src`.
        if char_length > bytes:
            char_length = bytes
        var sp = DTypePointer[DType.int8].alloc(char_length + 1)
        memcpy(sp, p.bitcast[DType.int8](), char_length)
        sp[char_length] = 0

        # Functional logic
        var char = String(sp, char_length + 1)
        if char == Marker:
            # ANSI escape sequence
            self.ansi = True
        elif self.ansi:
            if is_terminator(ord(char)):
                # ANSI sequence terminated
                self.ansi = False
        else:
            cur_width += UInt8(printable_rune_width(char))

        if cur_width > self.width:
            # Budget exceeded: emit the tail instead of this rune and stop.
            var n = self.ansi_writer.forward.write_string(self.tail)
            if self.ansi_writer.last_sequence() != "":
                self.ansi_writer.reset_ansi()
            return n

        _ = self.ansi_writer.write(char.as_bytes())

        bytes -= char_length
        p += char_length
    # NOTE(review): the excerpt ends here; no return for the fall-through path
    # is visible — confirm against the full source.
Want results from more Discord servers?
Add your server