;;; Code:
(require 'ert)
+(require 'cl)
(ert-deftest regex-word-cc-fallback-test ()
"Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
(regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622")
(regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1")))
+
+(defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body)
+ "Reads a line of the test file TEST-FILE, skipping
+comments (defined by COMMENT-CHAR), and evaluates the tests in
+this line as defined in the BODY. Line numbers in the WHITELIST
+are known failures, and are skipped."
+
+ `(with-temp-buffer
+ (modify-syntax-entry ?_ "w;; ") ; tests expect _ to be a word
+ (insert-file-contents ,(concat (file-name-directory (buffer-file-name)) test-file))
+
+ (let ((case-fold-search nil)
+ (line-number 1)
+ (whitelist-idx 0))
+
+ (goto-char (point-min))
+
+ (while (not (eobp))
+ (let ((start (point)))
+ (end-of-line)
+ (narrow-to-region start (point))
+
+ (goto-char (point-min))
+
+ (when
+ (and
+ ;; ignore comments
+ (save-excursion
+ (re-search-forward ,(concat "^[^" (string comment-char) "]") nil t))
+
+ ;; skip lines in the whitelist
+ (let ((whitelist-next
+ (condition-case nil
+ (aref ,whitelist whitelist-idx) (args-out-of-range nil))))
+ (cond
+ ;; whitelist exhausted. do process this line
+ ((null whitelist-next) t)
+
+ ;; we're not yet at the next whitelist element. do
+ ;; process this line
+ ((< line-number whitelist-next) t)
+
+ ;; we're past the next whitelist element. This
+ ;; shouldn't happen
+ ((> line-number whitelist-next)
+ (error
+ (format
+ "We somehow skipped the next whitelist element: line %d" whitelist-next)))
+
+ ;; we're at the next whitelist element. Skip this
+ ;; line, and advance the whitelist index
+ (t
+ (setq whitelist-idx (1+ whitelist-idx)) nil))))
+ ,@body)
+
+ (widen)
+ (forward-line)
+ (beginning-of-line)
+ (setq line-number (1+ line-number)))))))
+
+(defun regex-tests-compare (string what-failed bounds-ref &optional substring-ref)
+ "I just ran a search, looking at STRING. WHAT-FAILED describes
+what failed, if anything; valid values are 'search-failed,
+'compilation-failed and nil. I compare the beginning/end of each
+group with their expected values. This is done with either
+BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil.
+BOUNDS-REF is a sequence \[start-ref0 end-ref0 start-ref1
+end-ref1 ....] while SUBSTRING-REF is the expected substring
+obtained by indexing the input string by start/end-ref.
+
+If the search was supposed to fail then start-ref0/substring-ref0
+is 'search-failed. If the search wasn't even supposed to compile
+successfully, then start-ref0/substring-ref0 is
+'compilation-failed. If I only care about a match succeeding,
+this can be set to t.
+
+This function returns a string that describes the failure, or nil
+on success"
+
+ (when (or
+ (and bounds-ref substring-ref)
+ (not (or bounds-ref substring-ref)))
+ (error "Exactly one of bounds-ref and bounds-ref should be non-nil"))
+
+ (let ((what-failed-ref (car (or bounds-ref substring-ref))))
+
+ (cond
+ ((eq what-failed 'search-failed)
+ (cond
+ ((eq what-failed-ref 'search-failed)
+ nil)
+ ((eq what-failed-ref 'compilation-failed)
+ "Expected pattern failure; but no match")
+ (t
+ "Expected match; but no match")))
+
+ ((eq what-failed 'compilation-failed)
+ (cond
+ ((eq what-failed-ref 'search-failed)
+ "Expected no match; but pattern failure")
+ ((eq what-failed-ref 'compilation-failed)
+ nil)
+ (t
+ "Expected match; but pattern failure")))
+
+ ;; The regex match succeeded
+ ((eq what-failed-ref 'search-failed)
+ "Expected no match; but match")
+ ((eq what-failed-ref 'compilation-failed)
+ "Expected pattern failure; but match")
+
+ ;; The regex match succeeded, as expected. I now check all the
+ ;; bounds
+ (t
+ (let ((idx 0)
+ msg
+ ref next-ref-function compare-ref-function mismatched-ref-function)
+
+ (if bounds-ref
+ (setq ref bounds-ref
+ next-ref-function (lambda (x) (cddr x))
+ compare-ref-function (lambda (ref start-pos end-pos)
+ (or (eq (car ref) t)
+ (and (eq start-pos (car ref))
+ (eq end-pos (cadr ref)))))
+ mismatched-ref-function (lambda (ref start-pos end-pos)
+ (format
+ "beginning/end positions: %d/%s and %d/%s"
+ start-pos (car ref) end-pos (cadr ref))))
+ (setq ref substring-ref
+ next-ref-function (lambda (x) (cdr x))
+ compare-ref-function (lambda (ref start-pos end-pos)
+ (or (eq (car ref) t)
+ (string= (substring string start-pos end-pos) (car ref))))
+ mismatched-ref-function (lambda (ref start-pos end-pos)
+ (format
+ "beginning/end positions: %d/%s and %d/%s"
+ start-pos (car ref) end-pos (cadr ref)))))
+
+ (while (not (or (null ref) msg))
+
+ (let ((start (match-beginning idx))
+ (end (match-end idx)))
+
+ (when (not (funcall compare-ref-function ref start end))
+ (setq msg
+ (format
+ "Have expected match, but mismatch in group %d: %s" idx (funcall mismatched-ref-function ref start end))))
+
+ (setq ref (funcall next-ref-function ref)
+ idx (1+ idx))))
+
+ (or msg
+ nil))))))
+
+
+
+(defun regex-tests-match (pattern string bounds-ref &optional substring-ref)
+ "I match the given STRING against PATTERN. I compare the
+beginning/end of each group with their expected values.
+BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1
+....].
+
+If the search was supposed to fail then start-ref0 is
+'search-failed. If the search wasn't even supposed to compile
+successfully, then start-ref0 is 'compilation-failed.
+
+This function returns a string that describes the failure, or nil
+on success"
+
+ (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern)
+ ;; Skipping test: [.x.] and [=x=] forms not supported by emacs
+ nil
+
+ (regex-tests-compare
+ string
+ (condition-case nil
+ (if (string-match pattern string) nil 'search-failed)
+ ('invalid-regexp 'compilation-failed))
+ bounds-ref substring-ref)))
+
+
+(defconst regex-tests-re-even-escapes
+ "\\(?:^\\|[^\\\\]\\)\\(?:\\\\\\\\\\)*"
+ "Regex that matches an even number of \\ characters")
+
+(defconst regex-tests-re-odd-escapes
+ (concat regex-tests-re-even-escapes "\\\\")
+ "Regex that matches an odd number of \\ characters")
+
+
+(defun regex-tests-unextend (pattern)
+ "Basic conversion from extended regexen to emacs ones. This is
+mostly a hack that adds \\ to () and | and {}, and removes it if
+it already exists. We also change \\S (and \\s) to \\S- (and
+\\s-) because extended regexen see the former as whitespace, but
+emacs requires an extra symbol character"
+
+ (with-temp-buffer
+ (insert pattern)
+ (goto-char (point-min))
+
+ (while (re-search-forward "[()|{}]" nil t)
+ ;; point is past special character. If it is escaped, unescape
+ ;; it
+
+ (if (save-excursion
+ (re-search-backward (concat regex-tests-re-odd-escapes ".\\=") nil t))
+
+ ;; This special character is preceded by an odd number of \,
+ ;; so I unescape it by removing the last one
+ (progn
+ (forward-char -2)
+ (delete-char 1)
+ (forward-char 1))
+
+ ;; This special character is preceded by an even (possibly 0)
+ ;; number of \. I add an escape
+ (forward-char -1)
+ (insert "\\")
+ (forward-char 1)))
+
+ ;; convert \s to \s-
+ (goto-char (point-min))
+ (while (re-search-forward (concat regex-tests-re-odd-escapes "[Ss]") nil t)
+ (insert "-"))
+
+ (buffer-string)))
+
+(defun regex-tests-BOOST-frob-escapes (s ispattern)
+ "Mangle \\ the way it is done in frob_escapes() in
+regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted;
+\\\\, \\^, \{, \\|, \} are unescaped for the string (not
+pattern)"
+
+ ;; this is all similar to (regex-tests-unextend)
+ (with-temp-buffer
+ (insert s)
+
+ (let ((interpret-list (list "t" "n" "r")))
+ (while interpret-list
+ (goto-char (point-min))
+ (while (re-search-forward
+ (concat "\\(" regex-tests-re-even-escapes "\\)"
+ "\\\\" (car interpret-list))
+ nil t)
+ (replace-match (concat "\\1" (car (read-from-string
+ (concat "\"\\" (car interpret-list) "\""))))))
+
+ (setq interpret-list (cdr interpret-list))))
+
+ (when (not ispattern)
+ ;; unescape \\, \^, \{, \|, \}
+ (let ((unescape-list (list "\\\\" "^" "{" "|" "}")))
+ (while unescape-list
+ (goto-char (point-min))
+ (while (re-search-forward
+ (concat "\\(" regex-tests-re-even-escapes "\\)"
+ "\\\\" (car unescape-list))
+ nil t)
+ (replace-match (concat "\\1" (car unescape-list))))
+
+ (setq unescape-list (cdr unescape-list))))
+ )
+ (buffer-string)))
+
+
+
+
+(defconst regex-tests-BOOST-whitelist
+ [
+ ;; emacs is more stringent with regexen involving unbalanced )
+ 63 65 69
+
+ ;; in emacs, regex . doesn't match \n
+ 91
+
+ ;; emacs is more forgiving with * and ? that don't apply to
+ ;; characters
+ 107 108 109 122 123 124 140 141 142
+
+ ;; emacs accepts regexen with {}
+ 161
+
+ ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5]
+ 222 223
+
+ ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of
+ ;; ababaaa match
+ 284 294
+
+ ;; ambiguous groupings are ambiguous
+ 443 444 445 446 448 449 450
+
+ ;; emacs doesn't know how to handle weird ranges such as [a-Z] and
+ ;; [[:alpha:]-a]
+ 539 580 581
+
+ ;; emacs matches non-greedy regex ab.*? non-greedily
+ 639 677 712
+ ]
+ "Line numbers in the boost test that should be skipped. These
+are false-positive test failures that represent known/benign
+differences in behavior.")
+
+;; - Format
+;; - Comments are lines starting with ;
+;; - Lines starting with - set options passed to regcomp() and regexec():
+;; - if no "REG_BASIC" is found, with have an extended regex
+;; - These set a flag:
+;; - REG_ICASE
+;; - REG_NEWLINE
+;; - REG_NOTBOL
+;; - REG_NOTEOL
+;;
+;; - Test lines are
+;; pattern string start0 end0 start1 end1 ...
+;;
+;; - pattern, string can have escapes
+;; - string can have whitespace if enclosed in ""
+;; - if string is "!", then the pattern is supposed to fail compilation
+;; - start/end are of group0, group1, etc. group 0 is the full match
+;; - start<0 indicates "no match"
+;; - start is the 0-based index of the first character
+;; - end is the 0-based index of the first character past the group
+(defun regex-tests-BOOST ()
+ (let (failures
+ basic icase newline notbol noteol)
+ (regex-tests-generic-line
+ ?; "regex-resources/BOOST.tests" regex-tests-BOOST-whitelist
+ (if (save-excursion (re-search-forward "^-" nil t))
+ (setq basic (save-excursion (re-search-forward "REG_BASIC" nil t))
+ icase (save-excursion (re-search-forward "REG_ICASE" nil t))
+ newline (save-excursion (re-search-forward "REG_NEWLINE" nil t))
+ notbol (save-excursion (re-search-forward "REG_NOTBOL" nil t))
+ noteol (save-excursion (re-search-forward "REG_NOTEOL" nil t)))
+
+ (save-excursion
+ (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t)
+ (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t)
+ (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t)))
+
+ (let* ((pattern-raw (match-string 1))
+ (string-raw (match-string 2))
+ (positions-raw (match-string 3))
+ (pattern (regex-tests-BOOST-frob-escapes pattern-raw t))
+ (string (regex-tests-BOOST-frob-escapes string-raw nil))
+ (positions
+ (if (string= string "!")
+ (list 'compilation-failed 0)
+ (mapcar
+ (lambda (x)
+ (let ((x (string-to-number x)))
+ (if (< x 0) nil x)))
+ (split-string positions-raw)))))
+
+ (when (null (car positions))
+ (setcar positions 'search-failed))
+
+ (when (not basic)
+ (setq pattern (regex-tests-unextend pattern)))
+
+ ;; great. I now have all the data parsed. Let's use it to do
+ ;; stuff
+ (let* ((case-fold-search icase)
+ (msg (regex-tests-match pattern string positions)))
+
+ (if (and
+ ;; Skipping test: notbol/noteol not supported
+ (not notbol) (not noteol)
+
+ msg)
+
+ ;; store failure
+ (setq failures
+ (cons (format "line number %d: Regex '%s': %s"
+ line-number pattern msg)
+ failures)))))))
+
+ failures))
+
+(defconst regex-tests-PCRE-whitelist
+ [
+ ;; ambiguous groupings are ambiguous
+ 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203
+ ]
+ "Line numbers in the PCRE test that should be skipped. These
+are false-positive test failures that represent known/benign
+differences in behavior.")
+
+;; - Format
+;;
+;; regex
+;; input_string
+;; group_num: group_match | "No match"
+;; input_string
+;; group_num: group_match | "No match"
+;; input_string
+;; group_num: group_match | "No match"
+;; input_string
+;; group_num: group_match | "No match"
+;; ...
+(defun regex-tests-PCRE ()
+ (let (failures
+ pattern icase string what-failed matches-observed)
+ (regex-tests-generic-line
+ ?# "regex-resources/PCRE.tests" regex-tests-PCRE-whitelist
+
+ (cond
+
+ ;; pattern
+ ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*i?\\)$" nil t))
+ (setq icase (string= "i" (match-string 2))
+ pattern (regex-tests-unextend (match-string 1))))
+
+ ;; string. read it in, match against pattern, and save all the results
+ ((save-excursion (re-search-forward "^ \\(.*\\)" nil t))
+ (let ((case-fold-search icase))
+ (setq string (match-string 1)
+
+ ;; the regex match under test
+ what-failed
+ (condition-case nil
+ (if (string-match pattern string) nil 'search-failed)
+ ('invalid-regexp 'compilation-failed))
+
+ matches-observed
+ (loop for x from 0 to 20
+ collect (and (not what-failed)
+ (or (match-string x string) "<unset>")))))
+ nil)
+
+ ;; verification line: failed match
+ ((save-excursion (re-search-forward "^No match" nil t))
+ (unless what-failed
+ (setq failures
+ (cons (format "line number %d: Regex '%s': Expected no match; but match"
+ line-number pattern)
+ failures))))
+
+ ;; verification line: succeeded match
+ ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t))
+ (let* ((match-ref (match-string 2))
+ (idx (string-to-number (match-string 1))))
+
+ (if what-failed
+ "Expected match; but no match"
+ (unless (string= match-ref (elt matches-observed idx))
+ (setq failures
+ (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'"
+ line-number pattern
+ idx match-ref (elt matches-observed idx))
+ failures))))))
+
+ ;; reset
+ (t (setq pattern nil) nil)))
+
+ failures))
+
+(defconst regex-tests-PTESTS-whitelist
+ [
+ ;; emacs doesn't barf on weird ranges such as [b-a], but simply
+ ;; fails to match
+ 138
+
+ ;; emacs doesn't see DEL (0x78) as a [:cntrl:] character
+ 168
+ ]
+ "Line numbers in the PTESTS test that should be skipped. These
+are false-positive test failures that represent known/benign
+differences in behavior.")
+
+;; - Format
+;; - fields separated by ¦ (note: this is not a |)
+;; - start¦end¦pattern¦string
+;; - start is the 1-based index of the first character
+;; - end is the 1-based index of the last character
+(defun regex-tests-PTESTS ()
+ (let (failures)
+ (regex-tests-generic-line
+ ?# "regex-resources/PTESTS" regex-tests-PTESTS-whitelist
+ (let* ((fields (split-string (buffer-string) "¦"))
+
+ ;; string has 1-based index of first char in the
+ ;; match. -1 means "no match". -2 means "invalid
+ ;; regex".
+ ;;
+ ;; start-ref is 0-based index of first char in the
+ ;; match
+ ;;
+ ;; string==0 is a special case, and I have to treat
+ ;; it as start-ref = 0
+ (start-ref (let ((raw (string-to-number (elt fields 0))))
+ (cond
+ ((= raw -2) 'compilation-failed)
+ ((= raw -1) 'search-failed)
+ ((= raw 0) 0)
+ (t (1- raw)))))
+
+ ;; string has 1-based index of last char in the
+ ;; match. end-ref is 0-based index of first char past
+ ;; the match
+ (end-ref (string-to-number (elt fields 1)))
+ (pattern (elt fields 2))
+ (string (elt fields 3)))
+
+ (let ((msg (regex-tests-match pattern string (list start-ref end-ref))))
+ (when msg
+ (setq failures
+ (cons (format "line number %d: Regex '%s': %s"
+ line-number pattern msg)
+ failures))))))
+ failures))
+
+(defconst regex-tests-TESTS-whitelist
+ [
+ ;; emacs doesn't barf on weird ranges such as [b-a], but simply
+ ;; fails to match
+ 42
+
+ ;; emacs is more forgiving with * and ? that don't apply to
+ ;; characters
+ 57 58 59 60
+
+ ;; emacs is more stringent with regexen involving unbalanced )
+ 67
+ ]
+ "Line numbers in the TESTS test that should be skipped. These
+are false-positive test failures that represent known/benign
+differences in behavior.")
+
+;; - Format
+;; - fields separated by :. Watch for [\[:xxx:]]
+;; - expected:pattern:string
+;;
+;; expected:
+;; | 0 | successful match |
+;; | 1 | failed match |
+;; | 2 | regcomp() should fail |
+(defun regex-tests-TESTS ()
+ (let (failures)
+ (regex-tests-generic-line
+ ?# "regex-resources/TESTS" regex-tests-TESTS-whitelist
+ (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t))
+ (let* ((what-failed
+ (let ((raw (string-to-number (match-string 1))))
+ (cond
+ ((= raw 2) 'compilation-failed)
+ ((= raw 1) 'search-failed)
+ (t t))))
+ (string (match-string 3))
+ (pattern (regex-tests-unextend (match-string 2))))
+
+ (let ((msg (regex-tests-match pattern string nil (list what-failed))))
+ (when msg
+ (setq failures
+ (cons (format "line number %d: Regex '%s': %s"
+ line-number pattern msg)
+ failures)))))
+
+ (error "Error parsing TESTS file line: '%s'" (buffer-string))))
+ failures))
+
+(ert-deftest regex-tests ()
+ "Tests of the regular expression engine. This evaluates the
+BOOST, PCRE, PTESTS and TESTS test cases from glibc."
+ (should-not (regex-tests-BOOST))
+ (should-not (regex-tests-PCRE))
+ (should-not (regex-tests-PTESTS))
+ (should-not (regex-tests-TESTS)))
+
;;; regex-tests.el ends here