From: Mattias Engdegård <mattiase@acm.org>
Date: Wed, 1 Jun 2022 20:49:34 +0000 (+0200)
Subject: Revert "Fix reader char escape bugs (bug#55738)"
X-Git-Tag: emacs-29.0.90~1910^2~295^2~2
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=f71cfd3084103460d7cdad96bf7f8b88e1d19596;p=emacs.git

Revert "Fix reader char escape bugs (bug#55738)"

This reverts commit c50718dcfa54293b695f8a3fa5cd4d77848ee084.

It may have caused bootstrap problems. Sorry about that.
---

diff --git a/src/lread.c b/src/lread.c
index 670413efc02..a1045184d9b 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -2631,88 +2631,93 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
 enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
 
 /* Read a \-escape sequence, assuming we already read the `\'.
-   When there is a difference between string and character literal \-sequences,
-   the latter is assumed.
    If the escape sequence forces unibyte, return eight-bit char.  */
 
 static int
-read_escape (Lisp_Object readcharfun)
+read_escape (Lisp_Object readcharfun, bool stringp)
 {
-  int modifiers = 0;
- again: ;
   int c = READCHAR;
-  int unicode_hex_count;
+  /* \u allows up to four hex digits, \U up to eight.  Default to the
+     behavior for \u, and change this value in the case that \U is seen.  */
+  int unicode_hex_count = 4;
 
   switch (c)
     {
     case -1:
       end_of_file_error ();
 
-    case 'a': c = '\a'; break;
-    case 'b': c = '\b'; break;
-    case 'd': c = 127; break;
-    case 'e': c = 27; break;
-    case 'f': c = '\f'; break;
-    case 'n': c = '\n'; break;
-    case 'r': c = '\r'; break;
-    case 't': c = '\t'; break;
-    case 'v': c = '\v'; break;
+    case 'a':
+      return '\007';
+    case 'b':
+      return '\b';
+    case 'd':
+      return 0177;
+    case 'e':
+      return 033;
+    case 'f':
+      return '\f';
+    case 'n':
+      return '\n';
+    case 'r':
+      return '\r';
+    case 't':
+      return '\t';
+    case 'v':
+      return '\v';
+    case '\n':
+      return -1;
+    case ' ':
+      if (stringp)
+	return -1;
+      return ' ';
 
     case 'M':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
-      modifiers |= meta_modifier;
       c = READCHAR;
       if (c == '\\')
-	goto again;
-      break;
+	c = read_escape (readcharfun, 0);
+      return c | meta_modifier;
 
     case 'S':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
-      modifiers |= shift_modifier;
       c = READCHAR;
       if (c == '\\')
-	goto again;
-      break;
+	c = read_escape (readcharfun, 0);
+      return c | shift_modifier;
 
     case 'H':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
-      modifiers |= hyper_modifier;
       c = READCHAR;
       if (c == '\\')
-	goto again;
-      break;
+	c = read_escape (readcharfun, 0);
+      return c | hyper_modifier;
 
     case 'A':
       c = READCHAR;
       if (c != '-')
 	error ("Invalid escape character syntax");
-      modifiers |= alt_modifier;
       c = READCHAR;
       if (c == '\\')
-	goto again;
-      break;
+	c = read_escape (readcharfun, 0);
+      return c | alt_modifier;
 
     case 's':
       c = READCHAR;
-      if (c == '-')
-	{
-	  modifiers |= super_modifier;
-	  c = READCHAR;
-	  if (c == '\\')
-	    goto again;
-	}
-      else
+      if (stringp || c != '-')
 	{
 	  UNREAD (c);
-	  c = ' ';
+	  return ' ';
 	}
-      break;
+      c = READCHAR;
+      if (c == '\\')
+	c = read_escape (readcharfun, 0);
+      return c | super_modifier;
 
     case 'C':
       c = READCHAR;
@@ -2720,11 +2725,21 @@ read_escape (Lisp_Object readcharfun)
 	error ("Invalid escape character syntax");
       FALLTHROUGH;
     case '^':
-      modifiers |= ctrl_modifier;
       c = READCHAR;
       if (c == '\\')
-	goto again;
-      break;
+	c = read_escape (readcharfun, 0);
+      if ((c & ~CHAR_MODIFIER_MASK) == '?')
+	return 0177 | (c & CHAR_MODIFIER_MASK);
+      else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
+	return c | ctrl_modifier;
+      /* ASCII control chars are made from letters (both cases),
+	 as well as the non-letters within 0100...0137.  */
+      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
+	return (c & (037 | ~0177));
+      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
+	return (c & (037 | ~0177));
+      else
+	return c | ctrl_modifier;
 
     case '0':
     case '1':
@@ -2734,30 +2749,31 @@ read_escape (Lisp_Object readcharfun)
     case '5':
     case '6':
     case '7':
-      /* 1-3 octal digits.  */
+      /* An octal escape, as in ANSI C.  */
       {
-	int i = c - '0';
-	int count = 0;
+	register int i = c - '0';
+	register int count = 0;
 	while (++count < 3)
 	  {
-	    c = READCHAR;
-	    if (c < '0' || c > '7')
+	    if ((c = READCHAR) >= '0' && c <= '7')
+	      {
+		i *= 8;
+		i += c - '0';
+	      }
+	    else
 	      {
 		UNREAD (c);
 		break;
 	      }
-	    i *= 8;
-	    i += c - '0';
 	  }
 
 	if (i >= 0x80 && i < 0x100)
 	  i = BYTE8_TO_CHAR (i);
-	c = i;
-	break;
+	return i;
       }
 
     case 'x':
-      /* One or more hex digits.  */
+      /* A hex escape, as in ANSI C.  */
       {
 	unsigned int i = 0;
 	int count = 0;
@@ -2779,18 +2795,16 @@ read_escape (Lisp_Object readcharfun)
 	  }
 
 	if (count < 3 && i >= 0x80)
-	  i = BYTE8_TO_CHAR (i);
-	c = i;
-	break;
+	  return BYTE8_TO_CHAR (i);
+	return i;
       }
 
-    case 'U':			/* Eight hex digits.  */
+    case 'U':
+      /* Post-Unicode-2.0: Up to eight hex chars.  */
       unicode_hex_count = 8;
-      goto unicode;
+      FALLTHROUGH;
+    case 'u':
 
-    case 'u':			/* Four hex digits.  */
-      unicode_hex_count = 4;
-    unicode:
       /* A Unicode escape.  We only permit them in strings and characters,
 	 not arbitrarily in the source code, as in some other languages.  */
       {
@@ -2801,8 +2815,12 @@ read_escape (Lisp_Object readcharfun)
 	  {
 	    c = READCHAR;
 	    if (c < 0)
-	      error ("Malformed Unicode escape: \\%c%x",
-		     unicode_hex_count == 4 ? 'u' : 'U', i);
+	      {
+		if (unicode_hex_count > 4)
+		  error ("Malformed Unicode escape: \\U%x", i);
+		else
+		  error ("Malformed Unicode escape: \\u%x", i);
+	      }
 	    /* `isdigit' and `isalpha' may be locale-specific, which we don't
 	       want.  */
 	    int digit = char_hexdigit (c);
@@ -2813,8 +2831,7 @@ read_escape (Lisp_Object readcharfun)
 	  }
 	if (i > 0x10FFFF)
 	  error ("Non-Unicode character: 0x%x", i);
-	c = i;
-	break;
+	return i;
       }
 
     case 'N':
@@ -2863,31 +2880,12 @@ read_escape (Lisp_Object readcharfun)
 
 	/* character_name_to_code can invoke read0, recursively.
 	   This is why read0's buffer is not static.  */
-	c = character_name_to_code (name, length, readcharfun);
-	break;
+	return character_name_to_code (name, length, readcharfun);
       }
-    }
 
-  c |= modifiers;
-  if (c & ctrl_modifier)
-    {
-      int b = c & ~CHAR_MODIFIER_MASK;
-      /* If the base char is in the 0x3f..0x5f range or a lower case
-	 letter, drop the ctrl_modifier bit and generate a C0 control
-	 character instead.  */
-      if ((b >= 0x3f && b <= 0x5f) || (b >= 'a' && b <= 'z'))
-	{
-	  c &= ~ctrl_modifier;
-	  if (b == '?')
-	    /* Special case: ^? is DEL.  */
-	    b = 127;
-	  else
-	    /* Make a C0 control in 0..31 by clearing bits 5 and 6.  */
-	    b &= 0x1f;
-	}
-      c = b | (c & CHAR_MODIFIER_MASK);
+    default:
+      return c;
     }
-  return c;
 }
 
 /* Return the digit that CHARACTER stands for in the given BASE.
@@ -3014,7 +3012,7 @@ read_char_literal (Lisp_Object readcharfun)
     }
 
   if (ch == '\\')
-    ch = read_escape (readcharfun);
+    ch = read_escape (readcharfun, 0);
 
   int modifiers = ch & CHAR_MODIFIER_MASK;
   ch &= ~CHAR_MODIFIER_MASK;
@@ -3068,21 +3066,14 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
 
       if (ch == '\\')
 	{
-	  ch = READCHAR;
-	  switch (ch)
+	  ch = read_escape (readcharfun, 1);
+
+	  /* CH is -1 if \ newline or \ space has just been seen.  */
+	  if (ch == -1)
 	    {
-	    case 's':
-	      ch = ' ';
-	      break;
-	    case ' ':
-	    case '\n':
 	      if (p == read_buffer)
 		cancel = true;
 	      continue;
-	    default:
-	      UNREAD (ch);
-	      ch = read_escape (readcharfun);
-	      break;
 	    }
 
 	  int modifiers = ch & CHAR_MODIFIER_MASK;
@@ -3094,13 +3085,19 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
 	    force_multibyte = true;
 	  else		/* I.e. ASCII_CHAR_P (ch).  */
 	    {
-	      /* Allow `\C-SPC' and `\^SPC'.  This is done here because
-		 the literals ?\C-SPC and ?\^SPC (rather inconsistently)
-		 yield (' ' | CHAR_CTL); see bug#55738.  */
-	      if (modifiers == CHAR_CTL && ch == ' ')
+	      /* Allow `\C- ' and `\C-?'.  */
+	      if (modifiers == CHAR_CTL)
 		{
-		  ch = 0;
-		  modifiers = 0;
+		  if (ch == ' ')
+		    {
+		      ch = 0;
+		      modifiers = 0;
+		    }
+		  else if (ch == '?')
+		    {
+		      ch = 127;
+		      modifiers = 0;
+		    }
 		}
 	      if (modifiers & CHAR_SHIFT)
 		{
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index 59d5ca076f1..47351c1d116 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -317,14 +317,4 @@ literals (Bug#20852)."
   (should (equal (read-from-string "#_")
                  '(## . 2))))
 
-(ert-deftest lread-misc-2 ()
-  ;; ?\LF should produce LF (only inside string literals do we ignore \LF).
-  (should (equal (read-from-string "?\\\n") '(?\n . 3)))
-  (should (equal (read-from-string "\"a\\\nb\"") '("ab" . 6)))
-  ;; The Control modifier constructs should be idempotent.
-  (should (equal ?\C-\C-x ?\C-x))
-  (should (equal ?\^\^x ?\C-x))
-  (should (equal ?\C-\^x ?\C-x))
-  (should (equal ?\^\C-x ?\C-x)))
-
 ;;; lread-tests.el ends here