From bb591f139f0602af292c772f974dcc14dabb1deb Mon Sep 17 00:00:00 2001
From: Alan Mackenzie <acm@muc.de>
Date: Sun, 20 May 2018 13:28:14 +0000
Subject: [PATCH] Enhance CC Mode's fontification, etc., of unterminated
 strings.

String delimiters, including escaped new lines, of correctly terminated
strings are left in font-lock-string-face.  All others get
font-lock-warning-face.  The latter get syntax-table text properties on the
opening string delim and the "terminating EOL".

Correct two miscellaneous bugs: the handling of text properties on Java Mode's
generic delimiters; the handling of c-just-done-before-change.

* lisp/progmodes/cc-defs.el (c-point): New position 'eoll "end of logical line".
(c-characterp): New macro.

* lisp/progmodes/cc-fonts.el (c-font-lock-invalid-string): Removed.
(c-basic-matchers-before): Use a simple matcher in place of the form around
c-font-lock-invalid-string.

* lisp/progmodes/cc-langs.el (c-get-state-before-change-functions): Add
c-before-change-check-unbalanced-strings to the value for all modes except AWK
Mode.  Also add c-before-change-check-<>-operators to Java Mode, correcting an
error in that mode's handling of generic delimiters.
(c-before-font-lock-functions): Add c-after-change-re-mark-unbalanced-strings
to the value for all modes except AWK Mode.
(c-single-quotes-quote-strings, c-string-delims): New lang variables for
future enhancements.
(c-string-innards-re-alist): New lang variable.

* lisp/progmodes/cc-mode.el (c-just-done-before-change): Do not set this
variable when a change is the alteration of text properties.
(c-basic-common-init): Set parse-sexp-lookup-properties (and the XEmacs
equivalent) also for Pike Mode.
(c-neutralize-CPP-line): No longer neutralize unbalanced quotes here.
(c-unescaped-nls-in-string-p, c-multiline-string-start-is-being-detached)
(c-pps-to-string-delim, c-before-change-check-unbalanced-strings)
(c-after-change-re-mark-unbalanced-strings): New functions.
(c-after-change): Fix a bug with the handling of c-just-done-before-change.
---
 lisp/progmodes/cc-defs.el  |  18 +++
 lisp/progmodes/cc-fonts.el |  35 +-----
 lisp/progmodes/cc-langs.el |  48 +++++++-
 lisp/progmodes/cc-mode.el  | 225 +++++++++++++++++++++++++++++++++++--
 4 files changed, 279 insertions(+), 47 deletions(-)

diff --git a/lisp/progmodes/cc-defs.el b/lisp/progmodes/cc-defs.el
index 7ec57e03b38..53d665477c1 100644
--- a/lisp/progmodes/cc-defs.el
+++ b/lisp/progmodes/cc-defs.el
@@ -219,6 +219,7 @@ one of the following symbols:
 
 `bol'   -- beginning of line
 `eol'   -- end of line
+`eoll'  -- end of logical line (i.e. without escaped NL)
 `bod'   -- beginning of defun
 `eod'   -- end of defun
 `boi'   -- beginning of indentation
@@ -254,6 +255,16 @@ to it is returned.  This function does not modify the point or the mark."
 	       (end-of-line)
 	       (point))))
 
+	 ((eq position 'eoll)
+	  `(save-excursion
+	     ,@(if point `((goto-char ,point)))
+	     (while (progn
+		      (end-of-line)
+		      (prog1 (eq (logand 1 (skip-chars-backward "\\\\")) 1)))
+	       (beginning-of-line 2))
+	     (end-of-line)
+	     (point)))
+
 	 ((eq position 'boi)
 	  `(save-excursion
 	     ,@(if point `((goto-char ,point)))
@@ -453,6 +464,13 @@ to it is returned.  This function does not modify the point or the mark."
       `(int-to-char ,integer)
     integer))
 
+(defmacro c-characterp (arg)
+  ;; Return t when ARG is a character (XEmacs) or integer (Emacs), otherwise
+  ;; return nil.
+  (if (integerp ?c)
+      `(integerp ,arg)
+    `(characterp ,arg)))
+
 (defmacro c-last-command-char ()
   ;; The last character just typed.  Note that `last-command-event' exists in
   ;; both Emacs and XEmacs, but with confusingly different meanings.
diff --git a/lisp/progmodes/cc-fonts.el b/lisp/progmodes/cc-fonts.el
index 7cac55e057f..9d2517f2524 100644
--- a/lisp/progmodes/cc-fonts.el
+++ b/lisp/progmodes/cc-fonts.el
@@ -682,33 +682,6 @@ stuff.  Used on level 1 and higher."
 		   ''c-nonbreakable-space-face)))
       ))
 
-(defun c-font-lock-invalid-string ()
-  ;; Assuming the point is after the opening character of a string,
-  ;; fontify that char with `font-lock-warning-face' if the string
-  ;; decidedly isn't terminated properly.
-  ;;
-  ;; This function does hidden buffer changes.
-  (let ((start (1- (point))))
-    (save-excursion
-      (and (eq (elt (parse-partial-sexp start (c-point 'eol)) 8) start)
-	   (if (if (eval-when-compile (integerp ?c))
-		   ;; Emacs
-		   (integerp c-multiline-string-start-char)
-		 ;; XEmacs
-		 (characterp c-multiline-string-start-char))
-	       ;; There's no multiline string start char before the
-	       ;; string, so newlines aren't allowed.
-	       (not (eq (char-before start) c-multiline-string-start-char))
-	     ;; Multiline strings are allowed anywhere if
-	     ;; c-multiline-string-start-char is t.
-	     (not c-multiline-string-start-char))
-	   (if c-string-escaped-newlines
-	       ;; There's no \ before the newline.
-	       (not (eq (char-before (point)) ?\\))
-	     ;; Escaped newlines aren't supported.
-	     t)
-	   (c-put-font-lock-face start (1+ start) 'font-lock-warning-face)))))
-
 (defun c-font-lock-invalid-single-quotes (limit)
   ;; This function will be called from font-lock for a region bounded by POINT
   ;; and LIMIT, as though it were to identify a keyword for
@@ -749,16 +722,12 @@ casts and declarations are fontified.  Used on level 2 and higher."
   ;; `c-recognize-<>-arglists' is set.
 
   t `(;; Put a warning face on the opener of unclosed strings that
-      ;; can't span lines.  Later font
+      ;; can't span lines and on the "terminating" newlines.  Later font
       ;; lock packages have a `font-lock-syntactic-face-function' for
       ;; this, but it doesn't give the control we want since any
       ;; fontification done inside the function will be
       ;; unconditionally overridden.
-      ,(c-make-font-lock-search-function
-	;; Match a char before the string starter to make
-	;; `c-skip-comments-and-strings' work correctly.
-	(concat ".\\(" c-string-limit-regexp "\\)")
-	'((c-font-lock-invalid-string)))
+      ("\\s|" 0 font-lock-warning-face t nil)
 
       ;; Invalid single quotes.
       c-font-lock-invalid-single-quotes
diff --git a/lisp/progmodes/cc-langs.el b/lisp/progmodes/cc-langs.el
index 4a7c79a6dfa..53cb7f7f725 100644
--- a/lisp/progmodes/cc-langs.el
+++ b/lisp/progmodes/cc-langs.el
@@ -472,21 +472,24 @@ so that all identifiers are recognized as words.")
 (c-lang-defconst c-get-state-before-change-functions
   ;; For documentation see the following c-lang-defvar of the same name.
   ;; The value here may be a list of functions or a single function.
-  t nil
+  t 'c-before-change-check-unbalanced-strings
   c++ '(c-extend-region-for-CPP
 	c-before-change-check-raw-strings
 	c-before-change-check-<>-operators
 	c-depropertize-CPP
 	c-invalidate-macro-cache
 	c-truncate-bs-cache
+	c-before-change-check-unbalanced-strings
 	c-parse-quotes-before-change)
   (c objc) '(c-extend-region-for-CPP
 	     c-depropertize-CPP
 	     c-invalidate-macro-cache
 	     c-truncate-bs-cache
+	     c-before-change-check-unbalanced-strings
 	     c-parse-quotes-before-change)
-  java 'c-parse-quotes-before-change
-       ;; 'c-before-change-check-<>-operators
+  java '(c-parse-quotes-before-change
+	 c-before-change-check-unbalanced-strings
+	 c-before-change-check-<>-operators)
   awk 'c-awk-record-region-clear-NL)
 (c-lang-defvar c-get-state-before-change-functions
 	       (let ((fs (c-lang-const c-get-state-before-change-functions)))
@@ -514,14 +517,17 @@ parameters \(point-min) and \(point-max).")
   ;; For documentation see the following c-lang-defvar of the same name.
   ;; The value here may be a list of functions or a single function.
   t '(c-depropertize-new-text
+      c-after-change-re-mark-unbalanced-strings
       c-change-expand-fl-region)
   (c objc) '(c-depropertize-new-text
 	     c-parse-quotes-after-change
+	     c-after-change-re-mark-unbalanced-strings
 	     c-extend-font-lock-region-for-macros
 	     c-neutralize-syntax-in-CPP
 	     c-change-expand-fl-region)
   c++ '(c-depropertize-new-text
 	c-parse-quotes-after-change
+	c-after-change-re-mark-unbalanced-strings
 	c-extend-font-lock-region-for-macros
 	c-after-change-re-mark-raw-strings
 	c-neutralize-syntax-in-CPP
@@ -529,6 +535,7 @@ parameters \(point-min) and \(point-max).")
 	c-change-expand-fl-region)
   java '(c-depropertize-new-text
 	 c-parse-quotes-after-change
+	 c-after-change-re-mark-unbalanced-strings
 	 c-restore-<>-properties
 	 c-change-expand-fl-region)
   awk '(c-depropertize-new-text
@@ -611,6 +618,19 @@ EOL terminated statements."
   (c c++ objc) t)
 (c-lang-defvar c-has-bitfields (c-lang-const c-has-bitfields))
 
+(c-lang-defconst c-single-quotes-quote-strings
+  "Whether the language uses single quotes for multi-char strings."
+  t nil)
+(c-lang-defvar c-single-quotes-quote-strings
+	       (c-lang-const c-single-quotes-quote-strings))
+
+(c-lang-defconst c-string-delims
+  "A list of characters which can delimit arbitrary length strings"
+  t (if (c-lang-const c-single-quotes-quote-strings)
+	'(?\" ?\')
+      '(?\")))
+(c-lang-defvar c-string-delims (c-lang-const c-string-delims))
+
 (c-lang-defconst c-has-quoted-numbers
   "Whether the language has numbers quoted like 4'294'967'295."
   t nil
@@ -856,6 +876,28 @@ literal are multiline."
 (c-lang-defvar c-multiline-string-start-char
   (c-lang-const c-multiline-string-start-char))
 
+(c-lang-defconst c-string-innards-re-alist
+  ;; An alist of regexps matching the innards of a string, the key being the
+  ;; string's delimiter.
+  ;;
+  ;; The regexps' matches extend up to, but not including, the closing string
+  ;; delimiter or an unescaped NL.  An EOL is part of the string only if it is
+  ;; escaped.
+  t (mapcar (lambda (delim)
+	      (cons
+	       delim
+	       (concat "\\(\\\\\\(.\\|\n\\|\r\\)\\|[^\\\n\r"
+		       (string delim)
+		       "]\\)*")))
+	    (and
+	     (or (null (c-lang-const c-multiline-string-start-char))
+		 (c-characterp (c-lang-const c-multiline-string-start-char)))
+	     (if (c-lang-const c-single-quotes-quote-strings)
+		 '(?\" ?\')
+	       '(?\")))))
+(c-lang-defvar c-string-innards-re-alist
+  (c-lang-const c-string-innards-re-alist))
+
 (c-lang-defconst c-opt-cpp-symbol
   "The symbol which starts preprocessor constructs when in the margin."
   t "#"
diff --git a/lisp/progmodes/cc-mode.el b/lisp/progmodes/cc-mode.el
index f42510932e5..2427191eae0 100644
--- a/lisp/progmodes/cc-mode.el
+++ b/lisp/progmodes/cc-mode.el
@@ -499,9 +499,10 @@ preferably use the `c-mode-menu' language constant directly."
 ;; `basic-save-buffer' does (insert ?\n) when `require-final-newline' is
 ;; non-nil; (ii) to detect when Emacs fails to invoke
 ;; `before-change-functions'.  This can happen when reverting a buffer - see
-;; bug #24094.  It seems these failures happen only in GNU Emacs; XEmacs
-;; seems to maintain the strict alternation of calls to
-;; `before-change-functions' and `after-change-functions'.
+;; bug #24094.  It seems these failures happen only in GNU Emacs; XEmacs seems
+;; to maintain the strict alternation of calls to `before-change-functions'
+;; and `after-change-functions'.  Note that this variable is not set when
+;; `c-before-change' is invoked by a change to text properties.
 
 (defun c-basic-common-init (mode default-style)
   "Do the necessary initialization for the syntax handling routines
@@ -563,7 +564,7 @@ that requires a literal mode spec at compile time."
 
   (when (or c-recognize-<>-arglists
 	    (c-major-mode-is 'awk-mode)
-	    (c-major-mode-is '(java-mode c-mode c++-mode objc-mode)))
+	    (c-major-mode-is '(java-mode c-mode c++-mode objc-mode pike-mode)))
     ;; We'll use the syntax-table text property to change the syntax
     ;; of some chars for this language, so do the necessary setup for
     ;; that.
@@ -996,9 +997,9 @@ Note that the style variables are always made local to the buffer."
   ;; characters, ones which would interact syntactically with stuff outside
   ;; this region.
   ;;
-  ;; These are unmatched string delimiters, or unmatched
-  ;; parens/brackets/braces.  An unclosed comment is regarded as valid, NOT
-  ;; obtrusive.
+  ;; These are unmatched parens/brackets/braces.  An unclosed comment is
+  ;; regarded as valid, NOT obtrusive.  Unbalanced strings are handled
+  ;; elsewhere.
   (save-excursion
     (let (s)
       (while
@@ -1008,9 +1009,11 @@ Note that the style variables are always made local to the buffer."
 	     ((< (nth 0 s) 0)		; found an unmated ),},]
 	      (c-put-char-property (1- (point)) 'syntax-table '(1))
 	      t)
-	     ((nth 3 s)			; In a string
-	      (c-put-char-property (nth 8 s) 'syntax-table '(1))
-	      t)
+	     ;; Unbalanced strings are now handled by
+	     ;; `c-before-change-check-unbalanced-strings', etc.
+	     ;; ((nth 3 s)			; In a string
+	     ;;  (c-put-char-property (nth 8 s) 'syntax-table '(1))
+	     ;;  t)
 	     ((> (nth 0 s) 0)		; In a (,{,[
 	      (c-put-char-property (nth 1 s) 'syntax-table '(1))
 	      t)
@@ -1070,6 +1073,205 @@ Note that the style variables are always made local to the buffer."
 	    (forward-line))	      ; no infinite loop with, e.g., "#//"
 	  )))))
 
+(defun c-unescaped-nls-in-string-p (&optional quote-pos)
+  ;; Return whether unescaped newlines can be inside strings.
+  ;;
+  ;; QUOTE-POS, if present, is the position of the opening quote of a string.
+  ;; Depending on the language, there might be a special character before it
+  ;; signifying the validity of such NLs.
+  (cond
+   ((null c-multiline-string-start-char) nil)
+   ((c-characterp c-multiline-string-start-char)
+    (and quote-pos
+	 (eq (char-before quote-pos) c-multiline-string-start-char)))
+   (t t)))
+
+(defun c-multiline-string-start-is-being-detached (end)
+  ;; If (e.g.), the # character in Pike is being detached from the string
+  ;; opener it applies to, return t.  Else return nil.  END is the argument
+  ;; supplied to every before-change function.
+  (and (memq (char-after end) c-string-delims)
+       (c-characterp c-multiline-string-start-char)
+       (eq (char-before end) c-multiline-string-start-char)))
+
+(defun c-pps-to-string-delim (end)
+  ;; parse-partial-sexp forward to the next string quote, which is deemed to
+  ;; be a closing quote.  Return nil.
+  ;;
+  ;; We remove string-fence syntax-table text properties from characters we
+  ;; pass over.
+  (let* ((start (point))
+	 (no-st-s `(0 nil nil ?\" nil nil 0 nil ,start nil nil))
+	 (st-s `(0 nil nil t nil nil 0 nil ,start nil nil))
+	 no-st-pos st-pos
+	 )
+    (parse-partial-sexp start end nil nil no-st-s 'syntax-table)
+    (setq no-st-pos (point))
+    (goto-char start)
+    (while (progn
+	     (parse-partial-sexp (point) end nil nil st-s 'syntax-table)
+	     (c-clear-char-property (1- (point)) 'syntax-table)
+	     (setq st-pos (point))
+	     (and (< (point) end)
+		  (not (eq (char-before) ?\")))))
+    (goto-char (min no-st-pos st-pos))
+    nil))
+
+(defun c-before-change-check-unbalanced-strings (beg end)
+  ;; If BEG or END is inside an unbalanced string, remove the syntax-table
+  ;; text property from respectively the start or end of the string.  Also
+  ;; extend the region (c-new-BEG c-new-END) as necessary to cope with the
+  ;; change being the insertion of an odd number of quotes.
+  ;;
+  ;; POINT is undefined both at entry to and exit from this function, the
+  ;; buffer will have been widened, and match data will have been saved.
+  ;;
+  ;; This function is called exclusively as a before-change function via
+  ;; `c-get-state-before-change-functions'.
+  (c-save-buffer-state
+      ((end-limits
+	(progn
+	  (goto-char (if (c-multiline-string-start-is-being-detached end)
+			 (1+ end)
+		       end))
+	  (c-literal-limits)))
+       (end-literal-type (and end-limits
+		       	      (c-literal-type end-limits)))
+       (beg-limits
+	(progn
+	  (goto-char beg)
+	  (c-literal-limits)))
+       (beg-literal-type (and beg-limits
+		       	      (c-literal-type beg-limits))))
+
+    (when (eq end-literal-type 'string)
+      (setq c-new-END (max c-new-END (cdr end-limits))))
+    ;; It is possible the buffer change will include inserting a string quote.
+    ;; This could have the effect of flipping the meaning of any following
+    ;; quotes up until the next unescaped EOL.  Also guard against the change
+    ;; being the insertion of \ before an EOL, escaping it.
+    (cond
+     ((c-characterp c-multiline-string-start-char)
+      ;; The text about to be inserted might contain a multiline string
+      ;; opener.  Set c-new-END after anything which might be affected.
+      ;; Go to the end of the putative multiline string.
+      (goto-char end)
+      (c-pps-to-string-delim (point-max))
+      (when (< (point) (point-max))
+	(while
+	    (and
+	     (progn
+	       (while
+		   (and
+		    (c-syntactic-re-search-forward
+		     "\"\\|\\s|" (point-max) t t)
+		    (progn
+		      (c-clear-char-property (1- (point)) 'syntax-table)
+		      (not (eq (char-before) ?\")))))
+	       (eq (char-before) ?\"))
+	     (if (eq (char-before (1- (point)))
+		     c-multiline-string-start-char)
+		 (progn
+		   (c-pps-to-string-delim (point-max))
+		   (< (point) (point-max)))
+	       (c-pps-to-string-delim (c-point 'eoll))
+	       (< (point) (c-point 'eoll))))))
+      (setq c-new-END (max (point) c-new-END)))
+
+     ((< c-new-END (point-max))
+      (goto-char (1+ c-new-END))	; might be a newline.
+      ;; In the following regexp, the initial \n caters for a newline getting
+      ;; joined to a preceding \ by the removal of what comes between.
+      (re-search-forward "\n?\\(\\\\\\(.\\|\n\\|\r\\)\\|[^\\\n\r]\\)*" nil t)
+      ;; We're at an EOLL or point-max.
+      (setq c-new-END (min (1+ (point)) (point-max)))
+      ;; FIXME!!!  Write a clever comment here.
+      (goto-char c-new-END)
+      (when (equal (c-get-char-property (1- (point)) 'syntax-table) '(15))
+	(backward-sexp)
+	(c-clear-char-property (1- c-new-END) 'syntax-table)
+	(c-clear-char-property (point) 'syntax-table)))
+
+     (t (if (memq (char-before c-new-END) c-string-delims)
+	    (c-clear-char-property (1- c-new-END) 'syntax-table))))
+
+    (when (eq end-literal-type 'string)
+      (c-clear-char-property (1- (cdr end-limits)) 'syntax-table))
+
+    (when (eq beg-literal-type 'string)
+      (setq c-new-BEG (min c-new-BEG (car beg-limits)))
+      (c-clear-char-property (car beg-limits) 'syntax-table))))
+
+(defun c-after-change-re-mark-unbalanced-strings (beg _end _old-len)
+  ;; Mark any unbalanced strings in the region (c-new-BEG c-new-END) with
+  ;; string fence syntax-table text properties.
+  ;;
+  ;; POINT is undefined both at entry to and exit from this function, the
+  ;; buffer will have been widened, and match data will have been saved.
+  ;;
+  ;; This function is called exclusively as an after-change function via
+  ;; `c-before-font-lock-functions'.
+  (c-save-buffer-state
+      ((cll (progn (goto-char c-new-BEG)
+		   (c-literal-limits)))
+       (beg-literal-type (and cll (c-literal-type cll)))
+       (beg-limits
+	(cond
+	 ((and (eq beg-literal-type 'string)
+	       (c-unescaped-nls-in-string-p (car cll)))
+	  (cons
+	   (car cll)
+	   (progn
+	     (goto-char (1+ (car cll)))
+	     (search-forward-regexp
+	      (cdr (assq (char-after (car cll)) c-string-innards-re-alist))
+	      nil t)
+	     (min (1+ (point)) (point-max)))))
+	 ((and (null beg-literal-type)
+	       (goto-char beg)
+	       (eq (char-before) c-multiline-string-start-char)
+	       (memq (char-after) c-string-delims))
+	  (cons (point)
+		(progn
+		  (forward-char)
+		  (search-forward-regexp
+		   (cdr (assq (char-before) c-string-innards-re-alist)) nil t)
+		  (1+ (point)))))
+	 (cll)))
+       s)
+    (goto-char
+     (cond ((null beg-literal-type)
+	    c-new-BEG)
+	   ((eq beg-literal-type 'string)
+	    (car beg-limits))
+	   (t				; comment
+	    (cdr beg-limits))))
+    (while
+	(and
+	 (< (point) c-new-END)
+	 (progn
+	   ;; Skip over any comments before the next string.
+	   (while (progn
+		    (setq s (parse-partial-sexp (point) c-new-END nil
+						nil s 'syntax-table))
+		    (and (not (nth 3 s))
+			 (< (point) c-new-END)
+			 (not (memq (char-before) c-string-delims)))))
+	   ;; We're at the start of a string.
+	   (memq (char-before) c-string-delims)))
+      (if (c-unescaped-nls-in-string-p (1- (point)))
+	  (looking-at "[^\"]*")
+	(looking-at (cdr (assq (char-before) c-string-innards-re-alist))))
+      (cond
+       ((memq (char-after (match-end 0)) '(?\n ?\r))
+	(c-put-char-property (1- (point)) 'syntax-table '(15))
+	(c-put-char-property (match-end 0) 'syntax-table '(15)))
+       ((or (eq (match-end 0) (point-max))
+	    (eq (char-after (match-end 0)) ?\\)) ; \ at EOB
+	(c-put-char-property (1- (point)) 'syntax-table '(15))))
+      (goto-char (min (1+ (match-end 0)) (point-max)))
+      (setq s nil))))
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Parsing of quotes.
 ;;
@@ -1418,7 +1620,8 @@ Note that this is a strict tail, so won't match, e.g. \"0x....\".")
   ;; without an intervening call to `before-change-functions' when reverting
   ;; the buffer (see bug #24094).  Whatever the cause, assume that the entire
   ;; buffer has changed.
-  (when (not c-just-done-before-change)
+  (when (and (not c-just-done-before-change)
+	     (not (c-called-from-text-property-change-p)))
     (save-restriction
       (widen)
       (c-before-change (point-min) (point-max))
-- 
2.39.5