Revert "Fix reader char escape bugs (bug#55738)"

author Mattias Engdegård <mattiase@acm.org>

Wed, 1 Jun 2022 20:49:34 +0000 (22:49 +0200)

committer Mattias Engdegård <mattiase@acm.org>

Wed, 1 Jun 2022 20:49:34 +0000 (22:49 +0200)
author Mattias Engdegård <mattiase@acm.org>
Wed, 1 Jun 2022 20:49:34 +0000 (22:49 +0200)
committer Mattias Engdegård <mattiase@acm.org>
Wed, 1 Jun 2022 20:49:34 +0000 (22:49 +0200)
diff --git a/src/lread.c b/src/lread.c

index 670413efc02458024d3b89bc3c1d73821ad4d930..a1045184d9bb000d9ffa565a8d1753b52a5e9b46 100644 (file)
--- a/src/lread.c
+++ b/src/lread.c
@@ -2631,88 +2631,93 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
  enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
  
  /* Read a \-escape sequence, assuming we already read the `\'.
-   When there is a difference between string and character literal \-sequences,
-   the latter is assumed.
     If the escape sequence forces unibyte, return eight-bit char.  */
  
  static int
-read_escape (Lisp_Object readcharfun)
+read_escape (Lisp_Object readcharfun, bool stringp)
  {
-  int modifiers = 0;
- again: ;
    int c = READCHAR;
-  int unicode_hex_count;
+  /* \u allows up to four hex digits, \U up to eight.  Default to the
+     behavior for \u, and change this value in the case that \U is seen.  */
+  int unicode_hex_count = 4;
  
    switch (c)
      {
      case -1:
        end_of_file_error ();
  
-    case 'a': c = '\a'; break;
-    case 'b': c = '\b'; break;
-    case 'd': c = 127; break;
-    case 'e': c = 27; break;
-    case 'f': c = '\f'; break;
-    case 'n': c = '\n'; break;
-    case 'r': c = '\r'; break;
-    case 't': c = '\t'; break;
-    case 'v': c = '\v'; break;
+    case 'a':
+      return '\007';
+    case 'b':
+      return '\b';
+    case 'd':
+      return 0177;
+    case 'e':
+      return 033;
+    case 'f':
+      return '\f';
+    case 'n':
+      return '\n';
+    case 'r':
+      return '\r';
+    case 't':
+      return '\t';
+    case 'v':
+      return '\v';
+    case '\n':
+      return -1;
+    case ' ':
+      if (stringp)
+       return -1;
+      return ' ';
  
      case 'M':
        c = READCHAR;
        if (c != '-')
         error ("Invalid escape character syntax");
-      modifiers |= meta_modifier;
        c = READCHAR;
        if (c == '\\')
-       goto again;
-      break;
+       c = read_escape (readcharfun, 0);
+      return c | meta_modifier;
  
      case 'S':
        c = READCHAR;
        if (c != '-')
         error ("Invalid escape character syntax");
-      modifiers |= shift_modifier;
        c = READCHAR;
        if (c == '\\')
-       goto again;
-      break;
+       c = read_escape (readcharfun, 0);
+      return c | shift_modifier;
  
      case 'H':
        c = READCHAR;
        if (c != '-')
         error ("Invalid escape character syntax");
-      modifiers |= hyper_modifier;
        c = READCHAR;
        if (c == '\\')
-       goto again;
-      break;
+       c = read_escape (readcharfun, 0);
+      return c | hyper_modifier;
  
      case 'A':
        c = READCHAR;
        if (c != '-')
         error ("Invalid escape character syntax");
-      modifiers |= alt_modifier;
        c = READCHAR;
        if (c == '\\')
-       goto again;
-      break;
+       c = read_escape (readcharfun, 0);
+      return c | alt_modifier;
  
      case 's':
        c = READCHAR;
-      if (c == '-')
-       {
-         modifiers |= super_modifier;
-         c = READCHAR;
-         if (c == '\\')
-           goto again;
-       }
-      else
+      if (stringp || c != '-')
         {
           UNREAD (c);
-         c = ' ';
+         return ' ';
         }
-      break;
+      c = READCHAR;
+      if (c == '\\')
+       c = read_escape (readcharfun, 0);
+      return c | super_modifier;
  
      case 'C':
        c = READCHAR;
@@ -2720,11 +2725,21 @@ read_escape (Lisp_Object readcharfun)
         error ("Invalid escape character syntax");
        FALLTHROUGH;
      case '^':
-      modifiers |= ctrl_modifier;
        c = READCHAR;
        if (c == '\\')
-       goto again;
-      break;
+       c = read_escape (readcharfun, 0);
+      if ((c & ~CHAR_MODIFIER_MASK) == '?')
+       return 0177 | (c & CHAR_MODIFIER_MASK);
+      else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
+       return c | ctrl_modifier;
+      /* ASCII control chars are made from letters (both cases),
+        as well as the non-letters within 0100...0137.  */
+      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
+       return (c & (037 | ~0177));
+      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
+       return (c & (037 | ~0177));
+      else
+       return c | ctrl_modifier;
  
      case '0':
      case '1':
@@ -2734,30 +2749,31 @@ read_escape (Lisp_Object readcharfun)
      case '5':
      case '6':
      case '7':
-      /* 1-3 octal digits.  */
+      /* An octal escape, as in ANSI C.  */
        {
-       int i = c - '0';
-       int count = 0;
+       register int i = c - '0';
+       register int count = 0;
         while (++count < 3)
           {
-           c = READCHAR;
-           if (c < '0' || c > '7')
+           if ((c = READCHAR) >= '0' && c <= '7')
+             {
+               i *= 8;
+               i += c - '0';
+             }
+           else
               {
                 UNREAD (c);
                 break;
               }
-           i *= 8;
-           i += c - '0';
           }
  
         if (i >= 0x80 && i < 0x100)
           i = BYTE8_TO_CHAR (i);
-       c = i;
-       break;
+       return i;
        }
  
      case 'x':
-      /* One or more hex digits.  */
+      /* A hex escape, as in ANSI C.  */
        {
         unsigned int i = 0;
         int count = 0;
@@ -2779,18 +2795,16 @@ read_escape (Lisp_Object readcharfun)
           }
  
         if (count < 3 && i >= 0x80)
-         i = BYTE8_TO_CHAR (i);
-       c = i;
-       break;
+         return BYTE8_TO_CHAR (i);
+       return i;
        }
  
-    case 'U':                  /* Eight hex digits.  */
+    case 'U':
+      /* Post-Unicode-2.0: Up to eight hex chars.  */
        unicode_hex_count = 8;
-      goto unicode;
+      FALLTHROUGH;
+    case 'u':
  
-    case 'u':                  /* Four hex digits.  */
-      unicode_hex_count = 4;
-    unicode:
        /* A Unicode escape.  We only permit them in strings and characters,
          not arbitrarily in the source code, as in some other languages.  */
        {
@@ -2801,8 +2815,12 @@ read_escape (Lisp_Object readcharfun)
           {
             c = READCHAR;
             if (c < 0)
-             error ("Malformed Unicode escape: \\%c%x",
-                    unicode_hex_count == 4 ? 'u' : 'U', i);
+             {
+               if (unicode_hex_count > 4)
+                 error ("Malformed Unicode escape: \\U%x", i);
+               else
+                 error ("Malformed Unicode escape: \\u%x", i);
+             }
             /* `isdigit' and `isalpha' may be locale-specific, which we don't
                want.  */
             int digit = char_hexdigit (c);
@@ -2813,8 +2831,7 @@ read_escape (Lisp_Object readcharfun)
           }
         if (i > 0x10FFFF)
           error ("Non-Unicode character: 0x%x", i);
-       c = i;
-       break;
+       return i;
        }
  
      case 'N':
@@ -2863,31 +2880,12 @@ read_escape (Lisp_Object readcharfun)
  
         /* character_name_to_code can invoke read0, recursively.
            This is why read0's buffer is not static.  */
-       c = character_name_to_code (name, length, readcharfun);
-       break;
+       return character_name_to_code (name, length, readcharfun);
        }
-    }
  
-  c |= modifiers;
-  if (c & ctrl_modifier)
-    {
-      int b = c & ~CHAR_MODIFIER_MASK;
-      /* If the base char is in the 0x3f..0x5f range or a lower case
-        letter, drop the ctrl_modifier bit and generate a C0 control
-        character instead.  */
-      if ((b >= 0x3f && b <= 0x5f) || (b >= 'a' && b <= 'z'))
-       {
-         c &= ~ctrl_modifier;
-         if (b == '?')
-           /* Special case: ^? is DEL.  */
-           b = 127;
-         else
-           /* Make a C0 control in 0..31 by clearing bits 5 and 6.  */
-           b &= 0x1f;
-       }
-      c = b | (c & CHAR_MODIFIER_MASK);
+    default:
+      return c;
      }
-  return c;
  }
  
  /* Return the digit that CHARACTER stands for in the given BASE.
@@ -3014,7 +3012,7 @@ read_char_literal (Lisp_Object readcharfun)
      }
  
    if (ch == '\\')
-    ch = read_escape (readcharfun);
+    ch = read_escape (readcharfun, 0);
  
    int modifiers = ch & CHAR_MODIFIER_MASK;
    ch &= ~CHAR_MODIFIER_MASK;
@@ -3068,21 +3066,14 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
  
        if (ch == '\\')
         {
-         ch = READCHAR;
-         switch (ch)
+         ch = read_escape (readcharfun, 1);
+
+         /* CH is -1 if \ newline or \ space has just been seen.  */
+         if (ch == -1)
             {
-           case 's':
-             ch = ' ';
-             break;
-           case ' ':
-           case '\n':
               if (p == read_buffer)
                 cancel = true;
               continue;
-           default:
-             UNREAD (ch);
-             ch = read_escape (readcharfun);
-             break;
             }
  
           int modifiers = ch & CHAR_MODIFIER_MASK;
@@ -3094,13 +3085,19 @@ read_string_literal (char stackbuf[VLA_ELEMS (stackbufsize)],
             force_multibyte = true;
           else          /* I.e. ASCII_CHAR_P (ch).  */
             {
-             /* Allow `\C-SPC' and `\^SPC'.  This is done here because
-                the literals ?\C-SPC and ?\^SPC (rather inconsistently)
-                yield (' ' | CHAR_CTL); see bug#55738.  */
-             if (modifiers == CHAR_CTL && ch == ' ')
+             /* Allow `\C- ' and `\C-?'.  */
+             if (modifiers == CHAR_CTL)
                 {
-                 ch = 0;
-                 modifiers = 0;
+                 if (ch == ' ')
+                   {
+                     ch = 0;
+                     modifiers = 0;
+                   }
+                 else if (ch == '?')
+                   {
+                     ch = 127;
+                     modifiers = 0;
+                   }
                 }
               if (modifiers & CHAR_SHIFT)
                 {
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el

index 59d5ca076f159a296ed1c30741606faedb11040c..47351c1d116fa43adb0af2f6d3dd63daff64dad3 100644 (file)
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -317,14 +317,4 @@ literals (Bug#20852)."
    (should (equal (read-from-string "#_")
                   '(## . 2))))
  
-(ert-deftest lread-misc-2 ()
-  ;; ?\LF should produce LF (only inside string literals do we ignore \LF).
-  (should (equal (read-from-string "?\\\n") '(?\n . 3)))
-  (should (equal (read-from-string "\"a\\\nb\"") '("ab" . 6)))
-  ;; The Control modifier constructs should be idempotent.
-  (should (equal ?\C-\C-x ?\C-x))
-  (should (equal ?\^\^x ?\C-x))
-  (should (equal ?\C-\^x ?\C-x))
-  (should (equal ?\^\C-x ?\C-x)))
-
  ;;; lread-tests.el ends here
author	Mattias Engdegård <mattiase@acm.org>
	Wed, 1 Jun 2022 20:49:34 +0000 (22:49 +0200)
committer	Mattias Engdegård <mattiase@acm.org>
	Wed, 1 Jun 2022 20:49:34 +0000 (22:49 +0200)
src/lread.c		patch \| blob \| history
test/src/lread-tests.el		patch \| blob \| history