From b72e4b149329797b8f2c947953251f92615ee73e Mon Sep 17 00:00:00 2001 From: Lars Ingebrigtsen Date: Sun, 3 Jul 2022 14:05:01 +0200 Subject: [PATCH] Make string-limit with encoding return complete glyphs * lisp/emacs-lisp/subr-x.el (string-limit): Return more correct results in the CODING-SYSTEM case for coding systems with BOM and charset designations (bug#48324). Also amend the algorithm to return complete glyphs, not just complete code points. --- lisp/emacs-lisp/subr-x.el | 83 +++++++++++++++++----------- test/lisp/emacs-lisp/subr-x-tests.el | 23 ++++++-- 2 files changed, 71 insertions(+), 35 deletions(-) diff --git a/lisp/emacs-lisp/subr-x.el b/lisp/emacs-lisp/subr-x.el index 390e505f009..56e8c2aa862 100644 --- a/lisp/emacs-lisp/subr-x.el +++ b/lisp/emacs-lisp/subr-x.el @@ -167,9 +167,9 @@ non-nil, return the last LENGTH characters instead. If CODING-SYSTEM is non-nil, STRING will be encoded before limiting, and LENGTH is interpreted as the number of bytes to limit the string to. The result will be a unibyte string that is -shorter than LENGTH, but will not contain \"partial\" characters, -even if CODING-SYSTEM encodes characters with several bytes per -character. +shorter than LENGTH, but will not contain \"partial\" +characters (or glyphs), even if CODING-SYSTEM encodes characters +with several bytes per character. When shortening strings for display purposes, `truncate-string-to-width' is almost always a better alternative @@ -177,34 +177,55 @@ than this function." (unless (natnump length) (signal 'wrong-type-argument (list 'natnump length))) (if coding-system - (let ((result nil) - (result-length 0) - (index (if end (1- (length string)) 0))) - ;; FIXME: This implementation, which uses encode-coding-char - ;; to encode the string one character at a time, is in general - ;; incorrect: coding-systems that produce prefix or suffix - ;; bytes, such as ISO-2022-based or UTF-8/16 with BOM, will - ;; produce those bytes for each character, instead of just - ;; once for the entire string. encode-coding-char attempts to - ;; remove those extra bytes at least in some situations, but - ;; it cannot do that in all cases. And in any case, producing - ;; what is supposed to be a UTF-16 or ISO-2022-CN encoded - ;; string which lacks the BOM bytes at the beginning and the - ;; charset designation sequences at the head and tail of the - ;; result will definitely surprise the callers in some cases. - (while (let ((encoded (encode-coding-char - (aref string index) coding-system))) - (and (<= (+ (length encoded) result-length) length) - (progn - (push encoded result) - (cl-incf result-length (length encoded)) - (setq index (if end (1- index) - (1+ index)))) - (if end (> index -1) - (< index (length string))))) - ;; No body. - ) - (apply #'concat (if end result (nreverse result)))) + ;; The previous implementation here tried to encode char by + ;; char, and then adding up the length of the encoded octets, + ;; but that's not reliably in the presence of BOM marks and + ;; ISO-2022-CN which may add charset designations at the + ;; start/end of each encoded char (which we don't want). So + ;; iterate (with a binary search) instead to find the desired + ;; length. + (let* ((glyphs (string-glyph-split string)) + (nglyphs (length glyphs)) + (too-long (1+ nglyphs)) + (stop (max (/ nglyphs 2) 1)) + (gap stop) + candidate encoded found candidate-stop) + ;; We're returning the end of the string. + (when end + (setq glyphs (nreverse glyphs))) + (while (and (not found) + (< stop too-long)) + (setq encoded + (encode-coding-string (string-join (seq-take glyphs stop)) + coding-system)) + (cond + ((= (length encoded) length) + (setq found encoded + candidate-stop stop)) + ;; Too long; try shortening. + ((> (length encoded) length) + (setq too-long stop + stop (max (- stop gap) 1))) + ;; Too short; try lengthening. + (t + (setq candidate encoded + candidate-stop stop) + (setq stop + (if (>= stop nglyphs) + too-long + (min (+ stop gap) nglyphs))))) + (setq gap (max (/ gap 2) 1))) + (cond + ((not (or found candidate)) + "") + ;; We're returning the end, so redo the encoding. + (end + (encode-coding-string + (string-join (nreverse (seq-take glyphs candidate-stop))) + coding-system)) + (t + (or found candidate)))) + ;; Char-based version. (cond ((<= (length string) length) string) (end (substring string (- (length string) length))) diff --git a/test/lisp/emacs-lisp/subr-x-tests.el b/test/lisp/emacs-lisp/subr-x-tests.el index 0bec9db36e9..99c0e822155 100644 --- a/test/lisp/emacs-lisp/subr-x-tests.el +++ b/test/lisp/emacs-lisp/subr-x-tests.el @@ -607,21 +607,36 @@ (should (equal (string-limit "foó" 4 nil 'utf-8) "fo\303\263")) (should (equal (string-limit "foóa" 4 nil 'utf-8) "fo\303\263")) (should (equal (string-limit "foóá" 4 nil 'utf-8) "fo\303\263")) + (should (equal (string-limit "foóá" 2 nil 'utf-8-with-signature) + "")) (should (equal (string-limit "foóá" 4 nil 'utf-8-with-signature) - "fo\303\263")) + "\357\273\277f")) (should (equal (string-limit "foóa" 4 nil 'iso-8859-1) "fo\363a")) (should (equal (string-limit "foóá" 4 nil 'iso-8859-1) "fo\363\341")) - (should (equal (string-limit "foóá" 4 nil 'utf-16) "\000f\000o")) + (should (equal (string-limit "foóá" 3 nil 'utf-16) "")) + (should (equal (string-limit "foóá" 6 nil 'utf-16) "\376\377\000f\000o")) (should (equal (string-limit "foó" 10 t 'utf-8) "fo\303\263")) (should (equal (string-limit "foó" 3 t 'utf-8) "o\303\263")) (should (equal (string-limit "foó" 4 t 'utf-8) "fo\303\263")) (should (equal (string-limit "foóa" 4 t 'utf-8) "o\303\263a")) (should (equal (string-limit "foóá" 4 t 'utf-8) "\303\263\303\241")) - (should (equal (string-limit "foóá" 2 t 'utf-8-with-signature) "\303\241")) + (should (equal (string-limit "foóá" 2 t 'utf-8-with-signature) + "")) (should (equal (string-limit "foóa" 4 t 'iso-8859-1) "fo\363a")) (should (equal (string-limit "foóá" 4 t 'iso-8859-1) "fo\363\341")) - (should (equal (string-limit "foóá" 4 t 'utf-16) "\000\363\000\341"))) + (should (equal (string-limit "foóá" 6 t 'utf-16) "\376\377\000\363\000\341"))) + +(ert-deftest subr-string-limit-glyphs () + (should (equal (encode-coding-string "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 'utf-8) + "Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273")) + (should (= (length (encode-coding-string "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 'utf-8)) 41)) + (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 100 nil 'utf-8) + "Hello, \360\237\221\274\360\237\217\273\360\237\247\221\360\237\217\274\342\200\215\360\237\244\235\342\200\215\360\237\247\221\360\237\217\273")) + (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 15 nil 'utf-8) + "Hello, \360\237\221\274\360\237\217\273")) + (should (equal (string-limit "Hello, 👼🏻🧑🏼‍🤝‍🧑🏻" 10 nil 'utf-8) + "Hello, "))) (ert-deftest subr-string-lines () (should (equal (string-lines "foo") '("foo"))) -- 2.39.2