From 6bc3800000c6d4ed87330df5eee0958e29aa6521 Mon Sep 17 00:00:00 2001
From: Robert Pluim <rpluim@gmail.com>
Date: Mon, 18 Sep 2023 10:41:01 +0200
Subject: [PATCH] Ensure ucs-names is consistent with Unicode names

* lisp/international/mule-cmds.el (ucs-names): Skip adding an old-name
if it conflicts with the offical name of a codepoint.  Adjust the
ranges iterated over to account for new Unicode codepoints.
* test/lisp/international/mule-tests.el
(mule-cmds-tests--ucs-names-old-name-override,
mule-cmds-tests--ucs-names-missing-names): New tests for checking
'ucs-names' consistency.

Bug#65997
---
 lisp/international/mule-cmds.el       | 26 ++++++++++++++++++--------
 test/lisp/international/mule-tests.el | 24 ++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el
index 3d6d66970d3..a906c032b9a 100644
--- a/lisp/international/mule-cmds.el
+++ b/lisp/international/mule-cmds.el
@@ -3094,6 +3094,10 @@ on encoding."
 (defun ucs-names ()
   "Return table of CHAR-NAME keys and CHAR-CODE values cached in `ucs-names'."
   (or ucs-names
+      ;; Sometimes these ranges will need adjusting as codepoints are
+      ;; added to unicode.  The test case
+      ;; 'mule-cmds-tests--ucs-names-missing-names' will tell you
+      ;; which are missing (Bug#65997).
       (let ((ranges
 	     '((#x0000 . #x33FF)
 	       ;; (#x3400 . #x4DBF) CJK Ideographs Extension A
@@ -3106,14 +3110,16 @@ on encoding."
                (#x14400 . #x14646)
 	       ;; (#x14647 . #x167FF) unused
 	       (#x16800 . #x16F9F)
-               (#x16FE0 . #x16FE3)
+               (#x16FE0 . #x16FF1)
                ;; (#x17000 . #x187FF) Tangut Ideographs
                ;; (#x18800 . #x18AFF) Tangut Components
                ;; (#x18B00 . #x18CFF) Khitan Small Script
                ;; (#x18D00 . #x18D0F) Tangut Ideograph Supplement
 	       ;; (#x18D10 . #x1AFEF) unused
-	       (#x1AFF0 . #x1B12F)
-               ;; (#x1B130 . #x1B14F) unused
+	       (#x1AFF0 . #x1B122)
+               ;; (#x1B123 . #x1B131) unused
+               (#x1B132 . #x1B132)
+               ;; (#x1B133 . #x1B14F) unused
                (#x1B150 . #x1B16F)
                (#x1B170 . #x1B2FF)
 	       ;; (#x1B300 . #x1BBFF) unused
@@ -3130,12 +3136,16 @@ on encoding."
 	    (while (<= c end)
 	      (let ((new-name (get-char-code-property c 'name))
 		    (old-name (get-char-code-property c 'old-name)))
-	        ;; In theory this code could end up pushing an "old-name" that
-	        ;; shadows a "new-name" but in practice every time an
-	        ;; `old-name' conflicts with a `new-name', the newer one has a
-	        ;; higher code, so it gets pushed later!
+                ;; This code used to push both old-name and new-name
+                ;; on the assumption that the new-name codepoint would
+                ;; always be higher, which was true for a long time.
+                ;; As of at latest 2023-09-15, this is no longer true,
+                ;; so we now skip the old-name if it conflicts with an
+                ;; existing new-name (Bug#65997).
 	        (if new-name (puthash new-name c names))
-	        (if old-name (puthash old-name c names))
+                (when (and old-name
+                           (not (gethash old-name names)))
+                  (puthash old-name c names))
                 ;; Unicode uses the spelling "lamda" in character
                 ;; names, instead of "lambda", due to "preferences
                 ;; expressed by the Greek National Body" (Bug#30513).
diff --git a/test/lisp/international/mule-tests.el b/test/lisp/international/mule-tests.el
index 3e0c5bf9f4b..4dc099a18af 100644
--- a/test/lisp/international/mule-tests.el
+++ b/test/lisp/international/mule-tests.el
@@ -49,6 +49,30 @@
                        (kbd "C-x RET c u t f - 8 RET C-u C-u c a b RET")
                      (read-string "prompt:"))))))
 
+;;Bug#65997, ensure that old-names haven't overriden new names.
+(ert-deftest mule-cmds-tests--ucs-names-old-name-override ()
+  (let (code-points)
+    (dotimes (u (1+ (max-char 'ucs)))
+      (when-let* ((name (get-char-code-property u 'name))
+                  (c (char-from-name name)))
+        (when (and (not (<= #xD800 u #xDFFF))
+                   (not (= c u)))
+          (push (format "%X" u) code-points))))
+    (setq code-points (nreverse code-points))
+    (should (null code-points))))
+
+;; Bug#65997, ensure that all codepoints with names are in '(ucs-names)'.
+(ert-deftest mule-cmds-tests--ucs-names-missing-names ()
+  (let (code-points)
+    (dotimes (u (1+ (max-char 'ucs)))
+      (when-let ((name (get-char-code-property u 'name)))
+        (when (and (not (<= #xD800 u #xDFFF))
+                   (not (<= #x18800 u #x18AFF))
+                   (not (char-from-name name)))
+          (push (format "%X" u) code-points))))
+    (setq code-points (nreverse code-points))
+    (should (null code-points))))
+
 (ert-deftest mule-utf-7 ()
   ;; utf-7 and utf-7-imap are not ASCII-compatible.
   (should-not (coding-system-get 'utf-7 :ascii-compatible-p))
-- 
2.39.5