From de7d5f36e0f3261a7300fa3a3d87ae3b758b8a73 Mon Sep 17 00:00:00 2001 From: Philipp Stephani Date: Thu, 21 Apr 2016 14:45:22 -0700 Subject: [PATCH] Implement named character escapes, similar to Perl * lread.c (init_character_names): New function. (read_escape): Read Perl-style named character escape sequences. (syms_of_lread): Initialize new variable 'character_names'. * test/src/lread-tests.el (lread-char-empty-name): Add test file for src/lread.c. --- src/lread.c | 96 +++++++++++++++++++++++++++++++++++++++++ test/src/lread-tests.el | 54 +++++++++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 test/src/lread-tests.el diff --git a/src/lread.c b/src/lread.c index fedfcb807c8..9fa46a875be 100644 --- a/src/lread.c +++ b/src/lread.c @@ -43,6 +43,7 @@ along with GNU Emacs. If not, see . */ #include "systime.h" #include "termhooks.h" #include "blockinput.h" +#include #ifdef MSDOS #include "msdos.h" @@ -2149,6 +2150,36 @@ grow_read_buffer (void) MAX_MULTIBYTE_LENGTH, -1, 1); } +/* Hash table that maps Unicode character names to code points. */ +static Lisp_Object character_names; + +/* Length of the longest Unicode character name, in bytes. */ +static ptrdiff_t max_character_name_length; + +/* Initializes `character_names' and `max_character_name_length'. + Called by `read_escape'. */ +void init_character_names () +{ + character_names = CALLN (Fmake_hash_table, + QCtest, Qequal, + /* Currently around 100,000 Unicode + characters are defined. */ + QCsize, make_natnum (100000)); + const Lisp_Object get_property = + Fsymbol_function (intern_c_string ("get-char-code-property")); + ptrdiff_t length = 0; + for (int i = 0; i <= MAX_UNICODE_CHAR; ++i) + { + const Lisp_Object code = make_natnum (i); + const Lisp_Object name = call2 (get_property, code, Qname); + if (NILP (name)) continue; + CHECK_STRING (name); + length = max (length, SBYTES (name)); + Fputhash (name, code, character_names); + } + max_character_name_length = length; +} + /* Read a \-escape sequence, assuming we already read the `\'. If the escape sequence forces unibyte, return eight-bit char. */ @@ -2356,6 +2387,68 @@ read_escape (Lisp_Object readcharfun, bool stringp) return i; } + case 'N': + /* Named character. */ + { + c = READCHAR; + if (c != '{') + invalid_syntax ("Expected opening brace after \\N"); + if (NILP (character_names)) + init_character_names (); + USE_SAFE_ALLOCA; + char *name = SAFE_ALLOCA (max_character_name_length + 1); + bool whitespace = false; + ptrdiff_t length = 0; + while (true) + { + c = READCHAR; + if (c < 0) + end_of_file_error (); + if (c == '}') + break; + if (! c_isascii (c)) + xsignal1 (Qinvalid_read_syntax, + CALLN (Fformat, + build_pure_c_string ("Non-ASCII character U+%04X" + " in character name"), + make_natnum (c))); + /* We treat multiple adjacent whitespace characters as a + single space character. This makes it easier to use + character names in e.g. multi-line strings. */ + if (c_isspace (c)) + { + if (! whitespace) + { + whitespace = true; + name[length++] = ' '; + } + } + else + { + whitespace = false; + name[length++] = c; + } + if (length >= max_character_name_length) + invalid_syntax ("Character name too long"); + } + if (length == 0) + invalid_syntax ("Empty character name"); + name[length] = 0; + const Lisp_Object lisp_name = make_unibyte_string (name, length); + const Lisp_Object code = + (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ? + /* Code point as U+N, where N is between 1 and 8 hexadecimal + digits. */ + string_to_number (name + 2, 16, false) : + Fgethash (lisp_name, character_names, Qnil); + SAFE_FREE (); + if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR)) + xsignal1 (Qinvalid_read_syntax, + CALLN (Fformat, + build_pure_c_string ("\\N{%s}"), lisp_name)); + return XINT (code); + } + default: return c; } @@ -4744,4 +4837,7 @@ that are loaded before your customizations are read! */); DEFSYM (Qweakness, "weakness"); DEFSYM (Qrehash_size, "rehash-size"); DEFSYM (Qrehash_threshold, "rehash-threshold"); + + character_names = Qnil; + staticpro (&character_names); } diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el new file mode 100644 index 00000000000..1f873340c56 --- /dev/null +++ b/test/src/lread-tests.el @@ -0,0 +1,54 @@ +;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*- + +;; Copyright (C) 2016 Google Inc. + +;; Author: Philipp Stephani + +;; This file is part of GNU Emacs. + +;; This program is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with this program. If not, see . + +;;; Commentary: + +;; Unit tests for code in src/lread.c. + +;;; Code: + +(ert-deftest lread-char-number () + (should (equal ?\N{U+A817} #xA817))) + +(ert-deftest lread-char-name () + (should (equal ?\N{SYLOTI NAGRI LETTER + DHO} + #xA817))) + +(ert-deftest lread-char-invalid-number () + (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax)) + +(ert-deftest lread-char-invalid-name () + (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax) + +(ert-deftest lread-char-non-ascii-name () + (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax) + +(ert-deftest lread-char-empty-name () + (should-error (read "?\\N{}")) 'invalid-read-syntax) + +(ert-deftest lread-string-char-number () + (should (equal "a\N{U+A817}b" "a\uA817b"))) + +(ert-deftest lread-string-char-name () + (should (equal "a\N{SYLOTI NAGRI LETTER DHO}b" "a\uA817b"))) + +;;; lread-tests.el ends here -- 2.39.2