From f3f1947e5b5beeef9c004cfa2bf591dc0c0331b8 Mon Sep 17 00:00:00 2001 From: Eli Zaretskii Date: Tue, 11 May 2021 14:55:29 +0300 Subject: [PATCH] Fix Hexl handling of coding-systems with BOM * lisp/international/mule-cmds.el (encode-coding-char): If CODING-SYSTEM produces BOM, remove the BOM bytes from the produced byte sequence. (Bug#48324) * lisp/hexl.el (hexl-mode): Use bufferpos-to-filepos to convert point to offset into the original file. (hexl-mode-exit, hexl-maybe-dehexlify-buffer): Use filepos-to-bufferpos to restore point in the original buffer. (hexl-mode, hexl-insert-multibyte-char) (hexl-self-insert-command, hexl-insert-hex-char) (hexl-insert-decimal-char, hexl-insert-octal-char) (hexl-find-file): Enhance the doc strings, mainly explaining the complications of inserting multibyte characters. (hexl-insert-multibyte-char): Don't treat CH as unibyte if the coding-system isn't ASCII-compatible. Don't treat null bytes as multibyte. --- lisp/hexl.el | 136 +++++++++++++++++++++++--------- lisp/international/mule-cmds.el | 28 +++++-- 2 files changed, 120 insertions(+), 44 deletions(-) diff --git a/lisp/hexl.el b/lisp/hexl.el index 85c3a53413d..8bfc1fb89e4 100644 --- a/lisp/hexl.el +++ b/lisp/hexl.el @@ -303,22 +303,30 @@ also supported. There are several ways to change text in hexl mode: -ASCII characters (character between space (0x20) and tilde (0x7E)) are -bound to self-insert so you can simply type the character and it will -insert itself (actually overstrike) into the buffer. +Self-inserting characters are bound to `hexl-self-insert-command' so you +can simply type the character and it will insert itself (actually +overstrike) into the buffer. 
However, inserting non-ASCII characters +requires caution: the buffer's coding-system should correspond to +the encoding on disk, and multibyte characters should be inserted +with cursor on the first byte of a multibyte sequence whose length +is identical to the length of the multibyte sequence to be inserted, +otherwise this could produce invalid multibyte sequences. Non-ASCII +characters in ISO-2022 encodings should preferably be inserted byte by +byte, to avoid problems caused by the designation sequences before +the actual characters. \\[hexl-quoted-insert] followed by another keystroke allows you to insert the key even if it isn't bound to self-insert. An octal number can be supplied in place of another key to insert the octal number's ASCII representation. -\\[hexl-insert-hex-char] will insert a given hexadecimal value (if it is between 0 and 0xFF) -into the buffer at the current point. +\\[hexl-insert-hex-char] will insert a given hexadecimal value +into the buffer at the current address. -\\[hexl-insert-octal-char] will insert a given octal value (if it is between 0 and 0377) -into the buffer at the current point. +\\[hexl-insert-octal-char] will insert a given octal value +into the buffer at the current address. -\\[hexl-insert-decimal-char] will insert a given decimal value (if it is between 0 and 255) -into the buffer at the current point. +\\[hexl-insert-decimal-char] will insert a given decimal value +into the buffer at the current address. \\[hexl-mode-exit] will exit `hexl-mode'. @@ -332,26 +340,16 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode. (unless (eq major-mode 'hexl-mode) (let ((modified (buffer-modified-p)) (inhibit-read-only t) - (original-point (- (point) (point-min)))) - (and (eobp) (not (bobp)) - (setq original-point (1- original-point))) + (point-offset (bufferpos-to-filepos (point) 'exact))) ;; If `hexl-mode' is invoked with an argument the buffer is assumed to ;; be in hexl format. 
(when (memq arg '(1 nil)) - ;; If the buffer's EOL type is -dos, we need to account for - ;; extra CR characters added when hexlify-buffer writes the - ;; buffer to a file. - ;; FIXME: This doesn't take into account multibyte coding systems. - (when (eq (coding-system-eol-type buffer-file-coding-system) 1) - (setq original-point (+ (count-lines (point-min) (point)) - original-point)) - (or (bolp) (setq original-point (1- original-point)))) (hexlify-buffer) (restore-buffer-modified-p modified)) (setq hexl-max-address (+ (* (/ (1- (buffer-size)) (hexl-line-displen)) 16) 15)) (condition-case nil - (hexl-goto-address original-point) + (hexl-goto-address point-offset) (error nil))) (let ((max-address hexl-max-address)) @@ -440,7 +438,8 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode. (defun hexl-find-file (filename) "Edit file FILENAME as a binary file in hex dump format. Switch to a buffer visiting file FILENAME, creating one if none exists, -and edit the file in `hexl-mode'." +and edit the file in `hexl-mode'. The buffer's coding-system will be +no-conversion, unlike if you visit it normally and then invoke `hexl-mode'." (interactive (list (let ((completion-ignored-extensions nil)) @@ -478,17 +477,11 @@ With arg, don't unhexlify buffer." (if (or (eq arg 1) (not arg)) (let ((modified (buffer-modified-p)) (inhibit-read-only t) - (original-point (1+ (hexl-current-address)))) + (point-offset (hexl-current-address))) (dehexlify-buffer) (remove-hook 'write-contents-functions #'hexl-save-buffer t) (restore-buffer-modified-p modified) - (goto-char original-point) - ;; Maybe adjust point for the removed CR characters. 
- (when (eq (coding-system-eol-type buffer-file-coding-system) 1) - (setq original-point (- original-point - (count-lines (point-min) (point)))) - (or (bobp) (setq original-point (1+ original-point)))) - (goto-char original-point))) + (goto-char (filepos-to-bufferpos point-offset 'exact)))) (remove-hook 'change-major-mode-hook #'hexl-maybe-dehexlify-buffer t) (major-mode-restore)) @@ -499,11 +492,11 @@ Ask the user for confirmation." (if (y-or-n-p "Convert contents back to binary format? ") (let ((modified (buffer-modified-p)) (inhibit-read-only t) - (original-point (1+ (hexl-current-address)))) + (point-offset (hexl-current-address))) (dehexlify-buffer) (remove-hook 'write-contents-functions #'hexl-save-buffer t) (restore-buffer-modified-p modified) - (goto-char original-point)))) + (goto-char (filepos-to-bufferpos point-offset 'exact))))) (defun hexl-current-address (&optional validate) "Return current hexl-address." @@ -879,14 +872,27 @@ This discards the buffer's undo information." "Insert a possibly multibyte character CH NUM times. Non-ASCII characters are first encoded with `buffer-file-coding-system', -and their encoded form is inserted byte by byte." +and their encoded form is inserted byte by byte. Note that if the +hexl buffer was produced by `hexl-find-file', its coding-system +is no-conversion. + +Inserting non-ASCII characters requires caution: the buffer's +coding-system should correspond to the encoding on disk, and +multibyte characters should be inserted with cursor on the first +byte of a multibyte sequence whose length is identical to the +length of the multibyte sequence to be inserted, otherwise this +could produce invalid multibyte sequences. Non-ASCII characters +in ISO-2022 encodings should preferably be inserted byte by byte, to +avoid problems caused by the designation sequences before the +actual characters." (let ((charset (char-charset ch)) (coding (if (or (null buffer-file-coding-system) ;; coding-system-type equals t means undecided. 
(eq (coding-system-type buffer-file-coding-system) t)) (default-value 'buffer-file-coding-system) buffer-file-coding-system))) - (cond ((and (> ch 0) (< ch 256)) + (cond ((and (>= ch 0) (< ch 256) + (coding-system-get coding :ascii-compatible-p)) (hexl-insert-char ch num)) ((eq charset 'unknown) (error @@ -924,7 +930,19 @@ and their encoded form is inserted byte by byte." Interactively, with a numeric argument, insert this character that many times. Non-ASCII characters are first encoded with `buffer-file-coding-system', -and their encoded form is inserted byte by byte." +and their encoded form is inserted byte by byte. Note that if the +hexl buffer was produced by `hexl-find-file', its coding-system +is no-conversion. + +Inserting non-ASCII characters requires caution: the buffer's +coding-system should correspond to the encoding on disk, and +multibyte characters should be inserted with cursor on the first +byte of a multibyte sequence whose length is identical to the +length of the multibyte sequence to be inserted, otherwise this +could produce invalid multibyte sequences. Non-ASCII characters +in ISO-2022 encodings should preferably be inserted byte by byte, to +avoid problems caused by the designation sequences before the +actual characters." (interactive "p") (hexl-insert-multibyte-char last-command-event arg)) @@ -964,7 +982,21 @@ CH must be a unibyte character whose value is between 0 and 255." ;; hex conversion (defun hexl-insert-hex-char (arg) - "Insert a character given by its hexadecimal code ARG times at point." + "Insert a character given by its hexadecimal code ARG times at point. + +Values above 0xFF are treated as multibyte characters, and first encoded +using `buffer-file-coding-system'. Note that if the hexl buffer was +produced by `hexl-find-file', its coding-system is no-conversion. 
+ +Inserting non-ASCII characters requires caution: the buffer's +coding-system should correspond to the encoding on disk, and +multibyte characters should be inserted with cursor on the first +byte of a multibyte sequence whose length is identical to the +length of the multibyte sequence to be inserted, otherwise this +could produce invalid multibyte sequences. Non-ASCII characters +in ISO-2022 encodings should preferably be inserted byte by byte, to +avoid problems caused by the designation sequences before the +actual characters." (interactive "p") (let ((num (hexl-hex-string-to-integer (read-string "Hex number: ")))) (if (< num 0) @@ -997,7 +1029,21 @@ Embedded whitespace, dashes, and periods in the string are ignored." (setq arg (- arg 1))))) (defun hexl-insert-decimal-char (arg) - "Insert a character given by its decimal code ARG times at point." + "Insert a character given by its decimal code ARG times at point. + +Values above 255 are treated as multibyte characters, and first encoded +using `buffer-file-coding-system'. Note that if the hexl buffer was +produced by `hexl-find-file', its coding-system is no-conversion. + +Inserting non-ASCII characters requires caution: the buffer's +coding-system should correspond to the encoding on disk, and +multibyte characters should be inserted with cursor on the first +byte of a multibyte sequence whose length is identical to the +length of the multibyte sequence to be inserted, otherwise this +could produce invalid multibyte sequences. Non-ASCII characters +in ISO-2022 encodings should preferably be inserted byte by byte, to +avoid problems caused by the designation sequences before the +actual characters." (interactive "p") (let ((num (string-to-number (read-string "Decimal Number: ")))) (if (< num 0) @@ -1005,7 +1051,21 @@ Embedded whitespace, dashes, and periods in the string are ignored." 
(hexl-insert-multibyte-char num arg)))) (defun hexl-insert-octal-char (arg) - "Insert a character given by its octal code ARG times at point." + "Insert a character given by its octal code ARG times at point. + +Values above 0377 are treated as multibyte characters, and first encoded +using `buffer-file-coding-system'. Note that if the hexl buffer was +produced by `hexl-find-file', its coding-system is no-conversion. + +Inserting non-ASCII characters requires caution: the buffer's +coding-system should correspond to the encoding on disk, and +multibyte characters should be inserted with cursor on the first +byte of a multibyte sequence whose length is identical to the +length of the multibyte sequence to be inserted, otherwise this +could produce invalid multibyte sequences. Non-ASCII characters +in ISO-2022 encodings should preferably be inserted byte by byte, to +avoid problems caused by the designation sequences before the +actual characters." (interactive "p") (let ((num (hexl-octal-string-to-integer (read-string "Octal Number: ")))) (if (< num 0) diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el index b99db46e458..7f8d98b7ceb 100644 --- a/lisp/international/mule-cmds.el +++ b/lisp/international/mule-cmds.el @@ -2963,18 +2963,22 @@ STR should be a unibyte string." str " ")) (defun encode-coding-char (char coding-system &optional charset) - "Encode CHAR by CODING-SYSTEM and return the resulting string. + "Encode CHAR by CODING-SYSTEM and return the resulting string of bytes. If CODING-SYSTEM can't safely encode CHAR, return nil. The 3rd optional argument CHARSET, if non-nil, is a charset preferred on encoding." (let* ((str1 (string char)) (str2 (string char char)) (found (find-coding-systems-string str1)) - enc1 enc2 i1 i2) - (if (eq (car-safe found) 'undecided) ;Aka (not (multibyte-string-p str1)) - ;; `char' is ASCII. 
+ (bom-p (coding-system-get coding-system :bom)) + enc1 enc2 i0 i1 i2) + ;; If CHAR is ASCII and CODING-SYSTEM doesn't prepend a BOM, just + ;; encode CHAR. + (if (and (eq (car-safe found) 'undecided) + (null bom-p)) (encode-coding-string str1 coding-system) - (when (memq (coding-system-base coding-system) found) + (when (or (eq (car-safe found) 'undecided) + (memq (coding-system-base coding-system) found)) ;; We must find the encoded string of CHAR. But, just encoding ;; CHAR will put extra control sequences (usually to designate ;; ASCII charset) at the tail if type of CODING is ISO 2022. @@ -2995,7 +2999,19 @@ on encoding." ;; Now (substring enc1 i1) and (substring enc2 i2) are the same, ;; and they are the extra control sequences at the tail to ;; exclude. - (substring enc2 0 i2))))) + + ;; We also need to exclude the leading 2 or 3 bytes if they + ;; come from a BOM. + (setq i0 + (if bom-p + (cond + ((eq (coding-system-type coding-system) 'utf-8) + 3) + ((eq (coding-system-type coding-system) 'utf-16) + 2) + (t 0)) + 0)) + (substring enc2 i0 i2))))) ;; Backwards compatibility. These might be better with :init-value t, ;; but that breaks loadup. -- 2.39.5