Faster NSString to Lisp string conversion

author Mattias Engdegård <mattiase@acm.org>

Wed, 9 Aug 2023 10:34:06 +0000 (12:34 +0200)

committer Mattias Engdegård <mattiase@acm.org>

Wed, 9 Aug 2023 11:02:00 +0000 (13:02 +0200)
author Mattias Engdegård <mattiase@acm.org>
Wed, 9 Aug 2023 10:34:06 +0000 (12:34 +0200)
committer Mattias Engdegård <mattiase@acm.org>
Wed, 9 Aug 2023 11:02:00 +0000 (13:02 +0200)
diff --git a/src/nsfns.m b/src/nsfns.m

index f962de02cb99d2f23a94ec02a47f4038a46e56f0..508568d90c304df05f1bca0ad1382295b9d2eb35 100644 (file)
--- a/src/nsfns.m
+++ b/src/nsfns.m
@@ -3796,6 +3796,27 @@ all_nonzero_ascii (unsigned char *str, ptrdiff_t n)
    return true;
  }
  
+/* Count the number of characters in STR, NBYTES long.
+   The string is valid UTF-8 except that it may contain unpaired surrogates.  */
+static ptrdiff_t
+count_utf8_chars (const char *str, ptrdiff_t nbytes)
+{
+  /* This is faster than parse_str_as_multibyte, and much faster than
+     [NSString lengthOfBytesUsingEncoding: NSUTF32StringEncoding].  */
+  const char *end = str + nbytes;
+  ptrdiff_t nc = 0;
+  while (str < end)
+    {
+      nc++;
+      unsigned char c = *str;
+      str += (  c <= 0x7f ? 1    // 0xxxxxxx
+              : c <= 0xdf ? 2    // 110xxxxx 10xxxxxx
+              : c <= 0xef ? 3    // 1110xxxx 10xxxxxx 10xxxxxx
+              :             4);  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    }
+  return nc;
+}
+
  @implementation NSString (EmacsString)
  /* Make an NSString from a Lisp string.  STRING must not be in an
     encoded form (e.g. UTF-8).  */
@@ -3840,9 +3861,9 @@ all_nonzero_ascii (unsigned char *str, ptrdiff_t n)
  /* Make a Lisp string from an NSString.  */
  - (Lisp_Object)lispString
  {
-  // make_string behaves predictably and correctly with UTF-8 input.
-  return make_string ([self UTF8String],
-                      [self lengthOfBytesUsingEncoding: NSUTF8StringEncoding]);
+  const char *utf8 = [self UTF8String];
+  ptrdiff_t bytes = [self lengthOfBytesUsingEncoding: NSUTF8StringEncoding];
+  return make_multibyte_string (utf8, count_utf8_chars (utf8, bytes), bytes);
  }
  @end
author	Mattias Engdegård <mattiase@acm.org>
	Wed, 9 Aug 2023 10:34:06 +0000 (12:34 +0200)
committer	Mattias Engdegård <mattiase@acm.org>
	Wed, 9 Aug 2023 11:02:00 +0000 (13:02 +0200)