From 6bc3800000c6d4ed87330df5eee0958e29aa6521 Mon Sep 17 00:00:00 2001 From: Robert Pluim Date: Mon, 18 Sep 2023 10:41:01 +0200 Subject: [PATCH] Ensure ucs-names is consistent with Unicode names * lisp/international/mule-cmds.el (ucs-names): Skip adding an old-name if it conflicts with the offical name of a codepoint. Adjust the ranges iterated over to account for new Unicode codepoints. * test/lisp/international/mule-tests.el (mule-cmds-tests--ucs-names-old-name-override, mule-cmds-tests--ucs-names-missing-names): New tests for checking 'ucs-names' consistency. Bug#65997 --- lisp/international/mule-cmds.el | 26 ++++++++++++++++++-------- test/lisp/international/mule-tests.el | 24 ++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el index 3d6d66970d3..a906c032b9a 100644 --- a/lisp/international/mule-cmds.el +++ b/lisp/international/mule-cmds.el @@ -3094,6 +3094,10 @@ on encoding." (defun ucs-names () "Return table of CHAR-NAME keys and CHAR-CODE values cached in `ucs-names'." (or ucs-names + ;; Sometimes these ranges will need adjusting as codepoints are + ;; added to unicode. The test case + ;; 'mule-cmds-tests--ucs-names-missing-names' will tell you + ;; which are missing (Bug#65997). (let ((ranges '((#x0000 . #x33FF) ;; (#x3400 . #x4DBF) CJK Ideographs Extension A @@ -3106,14 +3110,16 @@ on encoding." (#x14400 . #x14646) ;; (#x14647 . #x167FF) unused (#x16800 . #x16F9F) - (#x16FE0 . #x16FE3) + (#x16FE0 . #x16FF1) ;; (#x17000 . #x187FF) Tangut Ideographs ;; (#x18800 . #x18AFF) Tangut Components ;; (#x18B00 . #x18CFF) Khitan Small Script ;; (#x18D00 . #x18D0F) Tangut Ideograph Supplement ;; (#x18D10 . #x1AFEF) unused - (#x1AFF0 . #x1B12F) - ;; (#x1B130 . #x1B14F) unused + (#x1AFF0 . #x1B122) + ;; (#x1B123 . #x1B131) unused + (#x1B132 . #x1B132) + ;; (#x1B133 . #x1B14F) unused (#x1B150 . #x1B16F) (#x1B170 . #x1B2FF) ;; (#x1B300 . #x1BBFF) unused @@ -3130,12 +3136,16 @@ on encoding." (while (<= c end) (let ((new-name (get-char-code-property c 'name)) (old-name (get-char-code-property c 'old-name))) - ;; In theory this code could end up pushing an "old-name" that - ;; shadows a "new-name" but in practice every time an - ;; `old-name' conflicts with a `new-name', the newer one has a - ;; higher code, so it gets pushed later! + ;; This code used to push both old-name and new-name + ;; on the assumption that the new-name codepoint would + ;; always be higher, which was true for a long time. + ;; As of at latest 2023-09-15, this is no longer true, + ;; so we now skip the old-name if it conflicts with an + ;; existing new-name (Bug#65997). (if new-name (puthash new-name c names)) - (if old-name (puthash old-name c names)) + (when (and old-name + (not (gethash old-name names))) + (puthash old-name c names)) ;; Unicode uses the spelling "lamda" in character ;; names, instead of "lambda", due to "preferences ;; expressed by the Greek National Body" (Bug#30513). diff --git a/test/lisp/international/mule-tests.el b/test/lisp/international/mule-tests.el index 3e0c5bf9f4b..4dc099a18af 100644 --- a/test/lisp/international/mule-tests.el +++ b/test/lisp/international/mule-tests.el @@ -49,6 +49,30 @@ (kbd "C-x RET c u t f - 8 RET C-u C-u c a b RET") (read-string "prompt:")))))) +;;Bug#65997, ensure that old-names haven't overriden new names. +(ert-deftest mule-cmds-tests--ucs-names-old-name-override () + (let (code-points) + (dotimes (u (1+ (max-char 'ucs))) + (when-let* ((name (get-char-code-property u 'name)) + (c (char-from-name name))) + (when (and (not (<= #xD800 u #xDFFF)) + (not (= c u))) + (push (format "%X" u) code-points)))) + (setq code-points (nreverse code-points)) + (should (null code-points)))) + +;; Bug#65997, ensure that all codepoints with names are in '(ucs-names)'. +(ert-deftest mule-cmds-tests--ucs-names-missing-names () + (let (code-points) + (dotimes (u (1+ (max-char 'ucs))) + (when-let ((name (get-char-code-property u 'name))) + (when (and (not (<= #xD800 u #xDFFF)) + (not (<= #x18800 u #x18AFF)) + (not (char-from-name name))) + (push (format "%X" u) code-points)))) + (setq code-points (nreverse code-points)) + (should (null code-points)))) + (ert-deftest mule-utf-7 () ;; utf-7 and utf-7-imap are not ASCII-compatible. (should-not (coding-system-get 'utf-7 :ascii-compatible-p)) -- 2.39.2