From: Mattias Engdegård <mattiase@acm.org>
Date: Fri, 10 Mar 2023 16:10:30 +0000 (+0100)
Subject: Remove recursion from character escape handling in reader
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=b8e7061232f9a5b06af70031dcc4b48c6575a364;p=emacs.git

Remove recursion from character escape handling in reader

This cures a C stack overflow when reading certain long (crafted)
strings (bug#62039) and improves performance of reading escaped
characters in character and string literals.
Reported by Bruno Haible.

* src/lread.c (invalid_escape_syntax_error): New.
(read_escape): Rename to...
(read_char_escape): ...this.  Remove recursion.  Pass read-ahead char
as argument.  Improve code performance and clarity.
(read_char_literal, read_string_literal): Update calls.
* test/src/lread-tests.el (lread-char-modifiers)
(lread-many-modifiers): Add test cases.
---

diff --git a/src/lread.c b/src/lread.c
index d0dc85f51c8..273120315df 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2639,154 +2639,137 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
    Unicode 9.0.0 the maximum is 83, so this should be safe.  */
 enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
 
-/* Read a \-escape sequence, assuming we already read the `\'.
-   If the escape sequence forces unibyte, return eight-bit char.  */
+static AVOID
+invalid_escape_syntax_error (void)
+{
+  error ("Invalid escape character syntax");
+}
 
+/* Read a character escape sequence, assuming we just read a backslash
+   and one more character (next_char).  */
 static int
-read_escape (Lisp_Object readcharfun)
+read_char_escape (Lisp_Object readcharfun, int next_char)
 {
-  int c = READCHAR;
-  /* \u allows up to four hex digits, \U up to eight.  Default to the
-     behavior for \u, and change this value in the case that \U is seen.  */
-  int unicode_hex_count = 4;
+  int modifiers = 0;
+  ptrdiff_t ncontrol = 0;
+  int chr;
+
+ again: ;
+  int c = next_char;
+  int unicode_hex_count;
+  int mod;
 
   switch (c)
     {
     case -1:
       end_of_file_error ();
 
-    case 'a':
-      return '\007';
-    case 'b':
-      return '\b';
-    case 'd':
-      return 0177;
-    case 'e':
-      return 033;
-    case 'f':
-      return '\f';
-    case 'n':
-      return '\n';
-    case 'r':
-      return '\r';
-    case 't':
-      return '\t';
-    case 'v':
-      return '\v';
+    case 'a': chr = '\a'; break;
+    case 'b': chr = '\b'; break;
+    case 'd': chr =  127; break;
+    case 'e': chr =   27; break;
+    case 'f': chr = '\f'; break;
+    case 'n': chr = '\n'; break;
+    case 'r': chr = '\r'; break;
+    case 't': chr = '\t'; break;
+    case 'v': chr = '\v'; break;
 
     case '\n':
       /* ?\LF is an error; it's probably a user mistake.  */
       error ("Invalid escape character syntax");
 
-    case 'M':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | meta_modifier;
-
-    case 'S':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | shift_modifier;
-
-    case 'H':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | hyper_modifier;
+    /* \M-x etc: set modifier bit and parse the char to which it applies,
+       allowing for chains such as \M-\S-\A-\H-\s-\C-q.  */
+    case 'M': mod = meta_modifier;  goto mod_key;
+    case 'S': mod = shift_modifier; goto mod_key;
+    case 'H': mod = hyper_modifier; goto mod_key;
+    case 'A': mod = alt_modifier;   goto mod_key;
+    case 's': mod = super_modifier; goto mod_key;
 
-    case 'A':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | alt_modifier;
-
-    case 's':
-      c = READCHAR;
-      if (c != '-')
-	{
-	  UNREAD (c);
-	  return ' ';
-	}
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | super_modifier;
+    mod_key:
+      {
+	int c1 = READCHAR;
+	if (c1 != '-')
+	  {
+	    if (c == 's')
+	      {
+		/* \s not followed by a hyphen is SPC.  */
+		UNREAD (c1);
+		chr = ' ';
+		break;
+	      }
+	    else
+	      /* \M, \S, \H, \A not followed by a hyphen is an error.  */
+	      invalid_escape_syntax_error ();
+	  }
+	modifiers |= mod;
+	c1 = READCHAR;
+	if (c1 == '\\')
+	  {
+	    next_char = READCHAR;
+	    goto again;
+	  }
+	chr = c1;
+	break;
+      }
 
+    /* Control modifiers (\C-x or \^x) are messy and not actually idempotent.
+       For example, ?\C-\C-a = ?\C-\001 = 0x4000001.
+       Keep a count of them and apply them separately.  */
     case 'C':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
+      {
+	int c1 = READCHAR;
+	if (c1 != '-')
+	  invalid_escape_syntax_error ();
+      }
       FALLTHROUGH;
+    /* The prefixes \C- and \^ are equivalent.  */
     case '^':
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      if ((c & ~CHAR_MODIFIER_MASK) == '?')
-	return 0177 | (c & CHAR_MODIFIER_MASK);
-      else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
-	return c | ctrl_modifier;
-      /* ASCII control chars are made from letters (both cases),
-	 as well as the non-letters within 0100...0137.  */
-      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
-	return (c & (037 | ~0177));
-      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
-	return (c & (037 | ~0177));
-      else
-	return c | ctrl_modifier;
-
-    case '0':
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-      /* An octal escape, as in ANSI C.  */
       {
-	register int i = c - '0';
-	register int count = 0;
-	while (++count < 3)
+	ncontrol++;
+	int c1 = READCHAR;
+	if (c1 == '\\')
 	  {
-	    if ((c = READCHAR) >= '0' && c <= '7')
-	      {
-		i *= 8;
-		i += c - '0';
-	      }
-	    else
+	    next_char = READCHAR;
+	    goto again;
+	  }
+	chr = c1;
+	break;
+      }
+
+    /* 1-3 octal digits.  Values in 0x80..0xff are encoded as raw bytes.  */
+    case '0': case '1': case '2': case '3':
+    case '4': case '5': case '6': case '7':
+      {
+	int i = c - '0';
+	int count = 0;
+	while (count < 2)
+	  {
+	    int c = READCHAR;
+	    if (c < '0' || c > '7')
 	      {
 		UNREAD (c);
 		break;
 	      }
+	    i = (i << 3) + (c - '0');
+	    count++;
 	  }
 
 	if (i >= 0x80 && i < 0x100)
 	  i = BYTE8_TO_CHAR (i);
-	return i;
+	chr = i;
+	break;
       }
 
+    /* 1 or more hex digits.  Values may encode modifiers.
+       Values in 0x80..0xff using 2 hex digits are encoded as raw bytes.  */
     case 'x':
-      /* A hex escape, as in ANSI C.  */
       {
 	unsigned int i = 0;
 	int count = 0;
 	while (1)
 	  {
-	    c = READCHAR;
+	    int c = READCHAR;
 	    int digit = char_hexdigit (c);
 	    if (digit < 0)
 	      {
@@ -2796,40 +2779,37 @@ read_escape (Lisp_Object readcharfun)
 	    i = (i << 4) + digit;
 	    /* Allow hex escapes as large as ?\xfffffff, because some
 	       packages use them to denote characters with modifiers.  */
-	    if ((CHAR_META | (CHAR_META - 1)) < i)
+	    if (i > (CHAR_META | (CHAR_META - 1)))
 	      error ("Hex character out of range: \\x%x...", i);
 	    count += count < 3;
 	  }
 
+	if (count == 0)
+	  invalid_escape_syntax_error ();
 	if (count < 3 && i >= 0x80)
-	  return BYTE8_TO_CHAR (i);
-	return i;
+	  i = BYTE8_TO_CHAR (i);
+	modifiers |= i & CHAR_MODIFIER_MASK;
+	chr = i & ~CHAR_MODIFIER_MASK;
+	break;
       }
 
+    /* 8-digit Unicode hex escape: \UHHHHHHHH */
     case 'U':
-      /* Post-Unicode-2.0: Up to eight hex chars.  */
       unicode_hex_count = 8;
-      FALLTHROUGH;
-    case 'u':
+      goto unicode_hex;
 
-      /* A Unicode escape.  We only permit them in strings and characters,
-	 not arbitrarily in the source code, as in some other languages.  */
+    /* 4-digit Unicode hex escape: \uHHHH */
+    case 'u':
+      unicode_hex_count = 4;
+    unicode_hex:
       {
 	unsigned int i = 0;
-	int count = 0;
-
-	while (++count <= unicode_hex_count)
+	for (int count = 0; count < unicode_hex_count; count++)
 	  {
-	    c = READCHAR;
+	    int c = READCHAR;
 	    if (c < 0)
-	      {
-		if (unicode_hex_count > 4)
-		  error ("Malformed Unicode escape: \\U%x", i);
-		else
-		  error ("Malformed Unicode escape: \\u%x", i);
-	      }
-	    /* `isdigit' and `isalpha' may be locale-specific, which we don't
-	       want.  */
+	      error ("Malformed Unicode escape: \\%c%x",
+		     unicode_hex_count == 4 ? 'u' : 'U', i);
 	    int digit = char_hexdigit (c);
 	    if (digit < 0)
 	      error ("Non-hex character used for Unicode escape: %c (%d)",
@@ -2838,13 +2818,14 @@ read_escape (Lisp_Object readcharfun)
 	  }
 	if (i > 0x10FFFF)
 	  error ("Non-Unicode character: 0x%x", i);
-	return i;
+	chr = i;
+	break;
       }
 
+    /* Named character: \N{name} */
     case 'N':
-      /* Named character.  */
       {
-        c = READCHAR;
+        int c = READCHAR;
         if (c != '{')
           invalid_syntax ("Expected opening brace after \\N", readcharfun);
         char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
@@ -2852,12 +2833,12 @@ read_escape (Lisp_Object readcharfun)
         ptrdiff_t length = 0;
         while (true)
           {
-            c = READCHAR;
+            int c = READCHAR;
             if (c < 0)
               end_of_file_error ();
             if (c == '}')
               break;
-            if (! (0 < c && c < 0x80))
+            if (c >= 0x80)
               {
                 AUTO_STRING (format,
                              "Invalid character U+%04X in character name");
@@ -2886,13 +2867,41 @@ read_escape (Lisp_Object readcharfun)
 	name[length] = '\0';
 
 	/* character_name_to_code can invoke read0, recursively.
-	   This is why read0's buffer is not static.  */
-	return character_name_to_code (name, length, readcharfun);
+	   This is why read0 needs to be re-entrant.  */
+	chr = character_name_to_code (name, length, readcharfun);
+	break;
       }
 
     default:
-      return c;
+      chr = c;
+      break;
     }
+  eassert (chr >= 0 && chr < (1 << CHARACTERBITS));
+
+  /* Apply Control modifiers, using the rules:
+     \C-X = ascii_ctrl(nomod(X)) | mods(X)  if nomod(X) is one of:
+                                                A-Z a-z ? @ [ \ ] ^ _
+
+            X | ctrl_modifier               otherwise
+
+     where
+         nomod(c) = c without modifiers
+	 mods(c)  = the modifiers of c
+         ascii_ctrl(c) = 127       if c = '?'
+                         c & 0x1f  otherwise
+  */
+  while (ncontrol > 0)
+    {
+      if ((chr >= '@' && chr <= '_') || (chr >= 'a' && chr <= 'z'))
+	chr &= 0x1f;
+      else if (chr == '?')
+	chr = 127;
+      else
+	modifiers |= ctrl_modifier;
+      ncontrol--;
+    }
+
+  return chr | modifiers;
 }
 
 /* Return the digit that CHARACTER stands for in the given BASE.
@@ -3014,7 +3023,7 @@ read_char_literal (Lisp_Object readcharfun)
     }
 
   if (ch == '\\')
-    ch = read_escape (readcharfun);
+    ch = read_char_escape (readcharfun, READCHAR);
 
   int modifiers = ch & CHAR_MODIFIER_MASK;
   ch &= ~CHAR_MODIFIER_MASK;
@@ -3080,8 +3089,7 @@ read_string_literal (Lisp_Object readcharfun)
 	      /* `\SPC' and `\LF' generate no characters at all.  */
 	      continue;
 	    default:
-	      UNREAD (ch);
-	      ch = read_escape (readcharfun);
+	      ch = read_char_escape (readcharfun, ch);
 	      break;
 	    }
 
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index c0ea37d2c55..fc00204ce7b 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -116,8 +116,27 @@
   (should-error (read "#") :type 'invalid-read-syntax))
 
 (ert-deftest lread-char-modifiers ()
-  (should (eq ?\C-\M-Ã© (+ (- ?\M-a ?a) ?\C-Ã©)))
-  (should (eq (- ?\C-Å ?Å) (- ?\C-Ã© ?Ã©))))
+  (should (equal ?\C-\M-Ã© (+ (- ?\M-a ?a) ?\C-Ã©)))
+  (should (equal (- ?\C-Å ?Å) (- ?\C-Ã© ?Ã©)))
+  (should (equal ?\C-\C-c #x4000003))
+  (should (equal ?\C-\M-\C-c #xc000003))
+  (should (equal ?\M-\C-\C-c #xc000003))
+  (should (equal ?\C-\C-\M-c #xc000003))
+  (should (equal ?\M-\S-\H-\A-\C-\s-x #xbc00018))
+
+  (should (equal "\s-x" " -x"))
+  (should (equal "\C-x" "\x18"))
+  (should (equal "\^x" "\x18"))
+  (should (equal "\M-x" "\xf8")))
+
+(ert-deftest lread-many-modifiers ()
+  ;; The string literal "\M-\M-...\M-a" should be equivalent to "\M-a",
+  ;; and we should not run out of stack space parsing it.
+  (let* ((n 500000)
+         (s (concat "\""
+                    (apply #'concat (make-list n "\\M-"))
+                    "a\"")))
+    (should (equal (read-from-string s) (cons "\M-a" (+ (* n 3) 3))))))
 
 (ert-deftest lread-record-1 ()
   (should (equal '(#s(foo) #s(foo))