From f9ff60e0d7288e30cdbd1e43225059f1374441f1 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Tue, 2 Apr 2019 15:00:59 -0700 Subject: [PATCH] Improve regexp advice again, and unchain ranges MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit * doc/lispref/searching.texi (Regexp Special): Mention char classes earlier, in a more-logical place. Advise sticking to ASCII letters and digits in ranges. Reword negative advice to make it clearer that it’s negative. * lisp/files.el (make-auto-save-file-name): * lisp/gnus/message.el (message-mailer-swallows-blank-line): * lisp/gnus/nndoc.el (nndoc-lanl-gov-announce-type-p) (nndoc-generate-lanl-gov-head): * lisp/org/org-eshell.el (org-eshell-open): * lisp/org/org.el (org-deadline-time-hour-regexp) (org-scheduled-time-hour-regexp): * lisp/progmodes/bat-mode.el (bat-font-lock-keywords): * lisp/progmodes/bug-reference.el (bug-reference-bug-regexp): * lisp/textmodes/less-css-mode.el (less-css-font-lock-keywords): * lisp/vc/vc-cvs.el (vc-cvs-valid-symbolic-tag-name-p): * lisp/vc/vc-svn.el (vc-svn-valid-symbolic-tag-name-p): Avoid attempts to chain ranges, as this can be confusing. For example, instead of [0-9-_.], use [0-9_.-]. --- doc/lispref/searching.texi | 52 ++++++++++++++++++++------------- lisp/files.el | 2 +- lisp/gnus/message.el | 2 +- lisp/gnus/nndoc.el | 4 +-- lisp/org/org-eshell.el | 2 +- lisp/org/org.el | 4 +-- lisp/progmodes/bat-mode.el | 2 +- lisp/progmodes/bug-reference.el | 2 +- lisp/textmodes/less-css-mode.el | 4 +-- lisp/vc/vc-cvs.el | 2 +- lisp/vc/vc-svn.el | 2 +- 11 files changed, 45 insertions(+), 33 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 72ee9233a3c..8775254dd07 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -395,9 +395,18 @@ or @samp{$}, @samp{%} or period. However, the ending character of one range should not be the starting point of another one; for example, @samp{[a-m-z]} should be avoided. 
+A character alternative can also specify named character classes +(@pxref{Char Classes}). This is a POSIX feature. For example, +@samp{[[:ascii:]]} matches any @acronym{ASCII} character. +Using a character class is equivalent to mentioning each of the +characters in that class; but the latter is not feasible in practice, +since some classes include thousands of different characters. +A character class should not appear as the lower or upper bound +of a range. + The usual regexp special characters are not special inside a character alternative. A completely different set of characters is -special inside character alternatives: @samp{]}, @samp{-} and @samp{^}. +special: @samp{]}, @samp{-} and @samp{^}. To include @samp{]} in a character alternative, put it at the beginning. To include @samp{^}, put it anywhere but at the beginning. To include @samp{-}, put it at the end. Thus, @samp{[]^-]} matches @@ -430,33 +439,36 @@ matches only @samp{/} rather than the likely-intended four characters. @end enumerate Some kinds of character alternatives are not the best style even -though they are standardized by POSIX and are portable. They include: +though they have a well-defined meaning in Emacs. They include: @enumerate @item -A character alternative can include duplicates. For example, -@samp{[XYa-yYb-zX]} is less clear than @samp{[XYa-z]}. +Although a range's bound can be almost any character, it is better +style to stay within natural sequences of @acronym{ASCII} letters and digits +because most people have not memorized character code tables. +For example, @samp{[.-9]} is less clear than @samp{[./0-9]}, +and @samp{[`-~]} is less clear than @samp{[`a-z@{|@}~]}. +Unicode character escapes can help here; for example, for most programmers +@samp{[ก-ฺ฿-๛]} is less clear than @samp{[\u0E01-\u0E3A\u0E3F-\u0E5B]}. @item -A range can denote just one, two, or three characters. 
For example, -@samp{[(-(]} is less clear than @samp{[(]}, @samp{[*-+]} is less clear -than @samp{[*+]}, and @samp{[*-,]} is less clear than @samp{[*+,]}. +Although a character alternative can include duplicates, it is better +style to avoid them. For example, @samp{[XYa-yYb-zX]} is less clear +than @samp{[XYa-z]}. @item -A @samp{-} also appear at the beginning of a character alternative, or -as the upper bound of a range. For example, although @samp{[-a-z]} is -valid, @samp{[a-z-]} is better style; and although @samp{[!--/]} is -valid, @samp{[!-,/-]} is clearer. -@end enumerate +Although a range can denote just one, two, or three characters, it +is simpler to list the characters. For example, +@samp{[a-a0]} is less clear than @samp{[a0]}, @samp{[i-j]} is less clear +than @samp{[ij]}, and @samp{[i-k]} is less clear than @samp{[ijk]}. -A character alternative can also specify named character classes -(@pxref{Char Classes}). This is a POSIX feature. For example, -@samp{[[:ascii:]]} matches any @acronym{ASCII} character. -Using a character class is equivalent to mentioning each of the -characters in that class; but the latter is not feasible in practice, -since some classes include thousands of different characters. -A character class should not appear as the lower or upper bound -of a range. +@item +Although a @samp{-} can appear at the beginning of a character +alternative or as the upper bound of a range, it is better style to +put @samp{-} by itself at the end of a character alternative. For +example, although @samp{[-a-z]} is valid, @samp{[a-z-]} is better +style; and although @samp{[*--]} is valid, @samp{[*+,-]} is clearer. +@end enumerate @item @samp{[^ @dots{} ]} @cindex @samp{^} in regexp diff --git a/lisp/files.el b/lisp/files.el index 77a194b085d..1dae57593a0 100644 --- a/lisp/files.el +++ b/lisp/files.el @@ -6316,7 +6316,7 @@ See also `auto-save-file-name-p'." 
;; We do this on all platforms, because even if we are not ;; running on DOS/Windows, the current directory may be on a ;; mounted VFAT filesystem, such as a USB memory stick. - (while (string-match "[^A-Za-z0-9-_.~#+]" buffer-name limit) + (while (string-match "[^A-Za-z0-9_.~#+-]" buffer-name limit) (let* ((character (aref buffer-name (match-beginning 0))) (replacement ;; For multibyte characters, this will produce more than diff --git a/lisp/gnus/message.el b/lisp/gnus/message.el index dae4b0dced6..c8b6f0ee685 100644 --- a/lisp/gnus/message.el +++ b/lisp/gnus/message.el @@ -1288,7 +1288,7 @@ called and its result is inserted." ;; According to RFC 822 and its successors, the field name must ;; consist of printable US-ASCII characters other than colon, ;; i.e., decimal 33-56 and 59-126. - '(looking-at "[ \t]\\|[][!\"#$%&'()*+,-./0-9;<=>?@A-Z\\^_`a-z{|}~]+:")) + '(looking-at "[ \t]\\|[][!\"#$%&'()*+,./0-9;<=>?@A-Z\\^_`a-z{|}~-]+:")) "Set this non-nil if the system's mailer runs the header and body together. \(This problem exists on Sunos 4 when sendmail is run in remote mode.) 
The value should be an expression to test whether the problem will diff --git a/lisp/gnus/nndoc.el b/lisp/gnus/nndoc.el index 8f1217b1275..532ba11fa09 100644 --- a/lisp/gnus/nndoc.el +++ b/lisp/gnus/nndoc.el @@ -701,7 +701,7 @@ from the document.") (defun nndoc-lanl-gov-announce-type-p () (when (let ((case-fold-search nil)) - (re-search-forward "^\\\\\\\\\n\\(Paper\\( (\\*cross-listing\\*)\\)?: [a-zA-Z-\\.]+/[0-9]+\\|arXiv:\\)" nil t)) + (re-search-forward "^\\\\\\\\\n\\(Paper\\( (\\*cross-listing\\*)\\)?: [a-zA-Z\\.-]+/[0-9]+\\|arXiv:\\)" nil t)) t)) (defun nndoc-transform-lanl-gov-announce (article) @@ -732,7 +732,7 @@ from the document.") (save-restriction (narrow-to-region (car entry) (nth 1 entry)) (goto-char (point-min)) - (when (looking-at "^\\(Paper.*: \\|arXiv:\\)\\([0-9a-zA-Z-\\./]+\\)") + (when (looking-at "^\\(Paper.*: \\|arXiv:\\)\\([0-9a-zA-Z\\./-]+\\)") (setq subject (concat " (" (match-string 2) ")")) (when (re-search-forward "^From: \\(.*\\)" nil t) (setq from (concat "<" diff --git a/lisp/org/org-eshell.el b/lisp/org/org-eshell.el index bb27d92e12d..2251a1b892f 100644 --- a/lisp/org/org-eshell.el +++ b/lisp/org/org-eshell.el @@ -37,7 +37,7 @@ eshell buffer) or a command line prefixed by a buffer name followed by a colon." 
(let* ((buffer-and-command - (if (string-match "\\([A-Za-z0-9-+*]+\\):\\(.*\\)" link) + (if (string-match "\\([A-Za-z0-9+*-]+\\):\\(.*\\)" link) (list (match-string 1 link) (match-string 2 link)) (list eshell-buffer-name link))) diff --git a/lisp/org/org.el b/lisp/org/org.el index bf7e305b7a0..ce6dd24a83b 100644 --- a/lisp/org/org.el +++ b/lisp/org/org.el @@ -430,7 +430,7 @@ Matched keyword is in group 1.") (defconst org-deadline-time-hour-regexp (concat "\\<" org-deadline-string - " *<\\([^>]+[0-9]\\{1,2\\}:[0-9]\\{2\\}[0-9-+:hdwmy \t.]*\\)>") + " *<\\([^>]+[0-9]\\{1,2\\}:[0-9]\\{2\\}[0-9+:hdwmy \t.-]*\\)>") "Matches the DEADLINE keyword together with a time-and-hour stamp.") (defconst org-deadline-line-regexp @@ -446,7 +446,7 @@ Matched keyword is in group 1.") (defconst org-scheduled-time-hour-regexp (concat "\\<" org-scheduled-string - " *<\\([^>]+[0-9]\\{1,2\\}:[0-9]\\{2\\}[0-9-+:hdwmy \t.]*\\)>") + " *<\\([^>]+[0-9]\\{1,2\\}:[0-9]\\{2\\}[0-9+:hdwmy \t.-]*\\)>") "Matches the SCHEDULED keyword together with a time-and-hour stamp.") (defconst org-closed-time-regexp diff --git a/lisp/progmodes/bat-mode.el b/lisp/progmodes/bat-mode.el index 6c85ff99053..a8b002be59b 100644 --- a/lisp/progmodes/bat-mode.el +++ b/lisp/progmodes/bat-mode.el @@ -78,7 +78,7 @@ "goto" "gtr" "if" "in" "leq" "lss" "neq" "not" "start")) (UNIX '("bash" "cat" "cp" "fgrep" "grep" "ls" "sed" "sh" "mv" "rm"))) - `(("\\_<\\(call\\|goto\\)\\_>[ \t]+%?\\([A-Za-z0-9-_\\:.]+\\)%?" + `(("\\_<\\(call\\|goto\\)\\_>[ \t]+%?\\([A-Za-z0-9_\\:.-]+\\)%?" (2 font-lock-constant-face t)) ("^:[^:].*" . 
'bat-label-face) diff --git a/lisp/progmodes/bug-reference.el b/lisp/progmodes/bug-reference.el index 8baf74854f6..759db1f5686 100644 --- a/lisp/progmodes/bug-reference.el +++ b/lisp/progmodes/bug-reference.el @@ -69,7 +69,7 @@ so that it is considered safe, see `enable-local-variables'.") (get s 'bug-reference-url-format))))) (defcustom bug-reference-bug-regexp - "\\([Bb]ug ?#?\\|[Pp]atch ?#\\|RFE ?#\\|PR [a-z-+]+/\\)\\([0-9]+\\(?:#[0-9]+\\)?\\)" + "\\([Bb]ug ?#?\\|[Pp]atch ?#\\|RFE ?#\\|PR [a-z+-]+/\\)\\([0-9]+\\(?:#[0-9]+\\)?\\)" "Regular expression matching bug references. The second subexpression should match the bug reference (usually a number)." :type 'string diff --git a/lisp/textmodes/less-css-mode.el b/lisp/textmodes/less-css-mode.el index b4c7f28985d..4077789eb12 100644 --- a/lisp/textmodes/less-css-mode.el +++ b/lisp/textmodes/less-css-mode.el @@ -194,10 +194,10 @@ directory by default." ;; - custom faces. (defconst less-css-font-lock-keywords '(;; Variables - ("@[a-z_-][a-z-_0-9]*" . font-lock-variable-name-face) + ("@[a-z_-][a-z_0-9-]*" . font-lock-variable-name-face) ("&" . font-lock-preprocessor-face) ;; Mixins - ("\\(?:[ \t{;]\\|^\\)\\(\\.[a-z_-][a-z-_0-9]*\\)[ \t]*;" . + ("\\(?:[ \t{;]\\|^\\)\\(\\.[a-z_-][a-z_0-9-]*\\)[ \t]*;" . (1 font-lock-keyword-face)))) (defvar less-css-mode-syntax-table diff --git a/lisp/vc/vc-cvs.el b/lisp/vc/vc-cvs.el index 3bbd0ed49b1..626e190c1e8 100644 --- a/lisp/vc/vc-cvs.el +++ b/lisp/vc/vc-cvs.el @@ -1087,7 +1087,7 @@ CVS/Entries should only be accessed through this function." ;; an uppercase or lowercase letter and can contain uppercase and ;; lowercase letters, digits, `-', and `_'. (and (string-match "^[a-zA-Z]" tag) - (not (string-match "[^a-z0-9A-Z-_]" tag)))) + (not (string-match "[^a-z0-9A-Z_-]" tag)))) (defun vc-cvs-valid-revision-number-p (tag) "Return non-nil if TAG is a valid revision number." 
diff --git a/lisp/vc/vc-svn.el b/lisp/vc/vc-svn.el index 618f03eedc5..3c50c8fff64 100644 --- a/lisp/vc/vc-svn.el +++ b/lisp/vc/vc-svn.el @@ -759,7 +759,7 @@ Set file properties accordingly. If FILENAME is non-nil, return its status." ;; an uppercase or lowercase letter and can contain uppercase and ;; lowercase letters, digits, `-', and `_'. (and (string-match "^[a-zA-Z]" tag) - (not (string-match "[^a-z0-9A-Z-_]" tag)))) + (not (string-match "[^a-z0-9A-Z_-]" tag)))) (defun vc-svn-valid-revision-number-p (tag) "Return non-nil if TAG is a valid revision number." -- 2.39.2