feat(stdlib): forEachCodePoint and forEachCodePointi (#766)

* feat(stdlib): forEachCodePoint and forEachCodePointi * Added missing docs for forEachCodePointi * Apply suggestions from code review * Applied suggested change in the doc comment. Co-authored-by: Philip E Blair <[email protected]> * Applied suggested change for parameter names Co-authored-by: Blaine Bublitz <[email protected]> * Fixed references to new parameter names Co-authored-by: Oscar Spencer <[email protected]> Co-authored-by: Philip E Blair <[email protected]> Co-authored-by: Blaine Bublitz <[email protected]>
grain-lang · Jul 17, 2021 · b95cfb7 · b95cfb7
1 parent bc4146b
commit b95cfb7
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 0 deletions.
diff --git a/compiler/test/stdlib/string.test.gr b/compiler/test/stdlib/string.test.gr
@@ -234,3 +234,21 @@ assert String.decodeRange(String.encodeAtWithBom(emojis, String.UTF32_LE, Bytes.
 // BOM stripping
 assert String.decode(String.encode(emojis, String.UTF32_LE), String.UTF32_LE) == emojis
 assert String.decode(String.encode(emojis, String.UTF32_BE), String.UTF32_BE) == emojis
+
+// codepoint iteration tests
+// conveniently reusing data from `explode` tests
+{
+  // Code points are prepended, so the list is reversed before comparing.
+  let mut seen = []
+  String.forEachCodePoint((cp) => { seen = [cp, ...seen] }, emojis)
+  let inOrder = Array.reverse(Array.fromList(seen))
+  assert inOrder == codes
+}
+
+{
+  // Collect (code point, index) pairs in reverse, then flip them back.
+  let mut seen = []
+  String.forEachCodePointi((cp, i) => { seen = [(cp, i), ...seen] }, emojis)
+  let inOrder = Array.reverse(Array.fromList(seen))
+  assert inOrder == Array.mapi((code, pos) => (code, pos), codes)
+}
diff --git a/stdlib/string.gr b/stdlib/string.gr
@@ -1354,3 +1354,107 @@ export let decode = (bytes: Bytes, encoding: Encoding) => {
 export let decodeKeepBom = (bytes: Bytes, encoding: Encoding) => {
   decodeHelp(bytes, encoding, false)
 }
+
+/**
+ * Iterates over Unicode code points in a string, calling the iterator
+ * function once for each code point, from the start of the string to the end.
+ *
+ * @param fn: The iterator function, called with each code point
+ * @param str: The string to iterate
+ */
+@disableGC
+export let forEachCodePoint = (fn: (Number) -> Void, str: String) => {
+  // Only the WasmI32 operators this function actually uses are rebound.
+  let (&) = WasmI32.and
+  let (<) = WasmI32.ltU
+  let (==) = WasmI32.eq
+  let (+) = WasmI32.add
+
+  let strPtr = WasmI32.fromGrain(str)
+
+  // The byte length is read at offset 4; the string data begins at offset 8.
+  let byteSize = WasmI32.load(strPtr, 4n)
+
+  let mut ptr = strPtr + 8n
+  let end = ptr + byteSize
+
+  while (ptr < end) {
+    // The high bits of the leading byte select the encoded length (1-4 bytes).
+    let byte = WasmI32.load8U(ptr, 0n)
+    let codePointByteCount = if ((byte & 0x80n) == 0x00n) {
+      1n
+    } else if ((byte & 0xF0n) == 0xF0n) {
+      4n
+    } else if ((byte & 0xE0n) == 0xE0n) {
+      3n
+    } else {
+      2n
+    }
+
+    // Note that even if up to 4 bytes are needed to represent Unicode
+    // codepoints, this doesn't mean 32 bits. The highest allowed code point is
+    // 0x10FFFF and it should not change in future versions of Unicode. This
+    // means no more than 21 bits are necessary to represent a code point and
+    // thus we can use Grain's "simple" numbers that hold up to 31 bits and
+    // avoid heap allocations. `getCodePoint` will throw a
+    // MalformedUnicode exception for values exceeding this limit.
+    let codePoint = getCodePoint(ptr)
+    fn(tagSimpleNumber(codePoint))
+
+    // Step over the bytes of the code point that was just visited.
+    ptr += codePointByteCount
+  }
+}
+
+/**
+ * Iterates over Unicode code points in a string. This is the same as
+ * `forEachCodePoint`, but provides the code point's index in the string
+ * as the second argument to the iterator function. The index starts at 0
+ * and counts code points, not bytes.
+ *
+ * @param fn: The iterator function
+ * @param str: The string to iterate
+ */
+@disableGC
+export let forEachCodePointi = (fn: (Number, Number) -> Void, str: String) => {
+  let (&) = WasmI32.and
+  let (<) = WasmI32.ltU
+  let (==) = WasmI32.eq
+  let (+) = WasmI32.add
+
+  let strPtr = WasmI32.fromGrain(str)
+
+  // The byte length is read at offset 4; the string data begins at offset 8.
+  let byteSize = WasmI32.load(strPtr, 4n)
+
+  let mut ptr = strPtr + 8n
+  let end = ptr + byteSize
+
+  let mut idx = 0n
+  while (ptr < end) {
+    // The high bits of the leading byte select the encoded length (1-4 bytes).
+    let byte = WasmI32.load8U(ptr, 0n)
+    let codePointByteCount = if ((byte & 0x80n) == 0x00n) {
+      1n
+    } else if ((byte & 0xF0n) == 0xF0n) {
+      4n
+    } else if ((byte & 0xE0n) == 0xE0n) {
+      3n
+    } else {
+      2n
+    }
+
+    // Note that even if up to 4 bytes are needed to represent Unicode
+    // codepoints, this doesn't mean 32 bits. The highest allowed code point is
+    // 0x10FFFF and it should not change in future versions of Unicode. This
+    // means no more than 21 bits are necessary to represent a code point and
+    // thus we can use Grain's "simple" numbers that hold up to 31 bits and
+    // avoid heap allocations. `getCodePoint` will throw a
+    // MalformedUnicode exception for values exceeding this limit.
+    let codePoint = getCodePoint(ptr)
+    fn(tagSimpleNumber(codePoint), tagSimpleNumber(idx))
+
+    ptr += codePointByteCount
+    idx += 1n
+  }
+}