From 3d49ad73e5a93625629c96b6c0b921bb019ea9da Mon Sep 17 00:00:00 2001 From: =?utf8?q?Harald=20J=C3=B6rg?= Date: Tue, 14 Sep 2021 17:53:52 +0200 Subject: [PATCH] cperl-mode.el: Allow non-ASCII Perl identifiers Replace all "A-Z" regexp literals with unicode-aware rx constructs wherever Perl allows non-ASCII identifiers. * lisp/progmodes/cperl-mode.el (cperl-after-sub-regexp) (cperl-after-label. cperl-sniff-for-indent) (cperl-find-pods-heres, cperl-indent-exp) (cperl-fix-line-spacing, cperl-imenu--create-perl-index) (cperl-init-faces, cperl-find-tags): Replace ASCII regex literals by unicode-aware rx constructs. (cperl-init-faces): Eliminate unused lexical `font-lock-anchored'. (cperl-have-help-regexp, cperl-word-at-point-hard): Allow non-ASCII word characters. * test/lisp/progmodes/cperl-mode-tests.el (cperl-test-fontify-special-variables): New test for $^T and $^{VARNAME}. (cperl-test-ws-rx cperl-test-ws+-rx), (cperl-test-version-regexp, cperl-test-package-regexp): Skip for perl-mode. (cperl-test-identifier-rx, cperl--test-unicode-setup) (cperl-test-unicode-labels, cperl-test-unicode-sub) (cperl-test-unicode-varname) (cperl-test-unicode-varname-list, cperl-test-unicode-arrays) (cperl-test-unicode-hashes, cperl-test-unicode-hashref) (cperl-test-unicode-proto, cperl-test-unicode-fhs) (cperl-test-unicode-hashkeys, cperl-test-word-at-point): New tests for unicode identifiers. (cperl-test-imenu-index): Add a unicode identifier to the test. * test/lisp/progmodes/cperl-mode-resources/grammar.pl: Add a function with non-ASCII name for imenu tests. --- lisp/progmodes/cperl-mode.el | 330 ++++++++++++------ .../progmodes/cperl-mode-resources/grammar.pl | 14 + test/lisp/progmodes/cperl-mode-tests.el | 301 +++++++++++++++- 3 files changed, 545 insertions(+), 100 deletions(-) diff --git a/lisp/progmodes/cperl-mode.el b/lisp/progmodes/cperl-mode.el index 76c82f8c73e..1147889969b 100644 --- a/lisp/progmodes/cperl-mode.el +++ b/lisp/progmodes/cperl-mode.el @@ -1407,7 +1407,7 @@ the last)." (concat ; Assume n groups before this... "\\(" ; n+1=name-group cperl-white-and-comment-rex ; n+2=pre-name - "\\(::[a-zA-Z_0-9:']+\\|[a-zA-Z_'][a-zA-Z_0-9:']*\\)" ; n+3=name + (rx-to-string `(group ,cperl--normal-identifier-rx)) "\\)" ; END n+1=name-group (if named "" "?") "\\(" ; n+4=proto-group @@ -2573,7 +2573,8 @@ Return the amount the indentation changed by." '(?w ?_)) (progn (backward-sexp) - (looking-at "[a-zA-Z_][a-zA-Z0-9_]*:[^:]")))) + (looking-at (rx (sequence (eval cperl--label-rx) + (not (in ":")))))))) (defun cperl-get-state (&optional parse-start start-state) "Return list (START STATE DEPTH PRESTART), @@ -2740,7 +2741,9 @@ Will not look before LIM." (progn (forward-sexp -1) (skip-chars-backward " \t") - (looking-at "[ \t]*[a-zA-Z_][a-zA-Z_0-9]*[ \t]*:"))) + (looking-at + (rx (sequence (0+ blank) + (eval cperl--label-rx)))))) (get-text-property (point) 'first-format-line))) ;; Look at previous line that's at column 0 @@ -3836,7 +3839,8 @@ recursive calls in starting lines of here-documents." "\\<" cperl-sub-regexp "\\>" ; sub with proto/attr "\\(" cperl-white-and-comment-rex - "\\(::[a-zA-Z_:'0-9]*\\|[a-zA-Z_'][a-zA-Z_:'0-9]*\\)\\)?" ; name + (rx (group (eval cperl--normal-identifier-rx))) + "\\)" "\\(" cperl-maybe-white-and-comment-rex "\\(([^()]*)\\|:[^:]\\)\\)" ; prototype or attribute start @@ -4111,10 +4115,12 @@ recursive calls in starting lines of here-documents." (t t)))) ;; or <$file> (and (eq c ?\<) - ;; Do not stringify , <$fh> : + ;; Stringify what looks like a glob, but + ;; do not stringify file handles , <$fh> : (save-match-data (looking-at - "\\$?\\([_a-zA-Z:][_a-zA-Z0-9:]*\\)?>")))) + (rx (sequence (opt "$") + (eval cperl--normal-identifier-rx))))))) tb (match-beginning 0)) (goto-char (match-beginning b1)) (cperl-backward-to-noncomment (point-min)) @@ -4184,7 +4190,16 @@ recursive calls in starting lines of here-documents." (error nil))) (if (or bb (looking-at ; $foo -> {s} - "[$@]\\$*\\([a-zA-Z0-9_:]+\\|[^{]\\)\\([ \t\n]*->\\)?[ \t\n]*{") + (rx + (sequence + (in "$@") (0+ "$") + (or + (eval cperl--normal-identifier-rx) + (not (in "{"))) + (opt (sequence (eval cperl--ws*-rx)) + "->") + (eval cperl--ws*-rx) + "{"))) (and ; $foo[12] -> {s} (memq (following-char) '(?\{ ?\[)) (progn @@ -4199,7 +4214,12 @@ recursive calls in starting lines of here-documents." (setq bb t)) ((and (eq (following-char) ?:) (eq b1 ?\{) ; Check for $ { s::bar } - (looking-at "::[a-zA-Z0-9_:]*[ \t\n\f]*}") + ;; (looking-at "::[a-zA-Z0-9_:]*[ \t\n\f]*}") + (looking-at + (rx (sequence "::" + (eval cperl--normal-identifier-rx) + (eval cperl--ws*-rx) + "}"))) (progn (goto-char (1- go)) (skip-chars-backward " \t\n\f") @@ -4364,7 +4384,7 @@ recursive calls in starting lines of here-documents." "\\(" ;; XXXX 1-char variables, exc. |()\s "[$@]" "\\(" - "[_a-zA-Z:][_a-zA-Z0-9:]*" + (rx (eval cperl--normal-identifier-rx)) "\\|" "{[^{}]*}" ; only one-level allowed "\\|" @@ -4820,6 +4840,7 @@ recursive calls in starting lines of here-documents." (progn (backward-sexp) ;; sub {BLK}, print {BLK} $data, but NOT `bless', `return', `tr', `constant' + ;; a-zA-Z is fine here, these are Perl keywords (or (and (looking-at "[a-zA-Z0-9_:]+[ \t\n\f]*[{#]") ; Method call syntax (not (looking-at "\\(bless\\|return\\|q[wqrx]?\\|tr\\|[smy]\\|constant\\)\\>"))) ;; sub bless::foo {} @@ -5028,7 +5049,11 @@ conditional/loop constructs." cperl-maybe-white-and-comment-rex "\\(state\\|my\\|local\\|our\\)\\)?" cperl-maybe-white-and-comment-rex - "\\$[_a-zA-Z0-9]+\\)?\\)\\>")) + (rx + (sequence + "$" + (eval cperl--basic-identifier-rx))) + "\\)?\\)\\>")) (progn (goto-char top) (forward-sexp 1) @@ -5122,7 +5147,14 @@ Returns some position at the last line." ;; Looking at: ;; foreach my $var ( (if (looking-at - "[ \t]*\\\\([ \t]*(\\|[ \t\n]*{\\)\\|[ \t]*{") + (rx (sequence + (0+ blank) + (opt (sequence "}" (0+ blank) )) + symbol-start + (or "else" "elsif" "continue" "if" "unless" "while" "until" + (sequence (or "for" "foreach") + (opt + (opt (sequence (1+ blank) + (or "state" "my" "local" "our"))) + (0+ blank) + "$" (eval cperl--basic-identifier-rx)))) + symbol-end + (group-n 1 + (or + (or (sequence (0+ blank) "(") + (sequence (eval cperl--ws*-rx) "{")) + (sequence (0+ blank) "{")))))) (progn - (setq ml (match-beginning 8)) ; "(" or "{" after control word + (setq ml (match-beginning 1)) ; "(" or "{" after control word (re-search-forward "[({]") (forward-char -1) (setq p (point)) @@ -5544,7 +5592,11 @@ comment, or POD." (setq lst index-sub-alist) (while lst (setq elt (car lst) lst (cdr lst)) - (cond ((string-match "\\(::\\|'\\)[_a-zA-Z0-9]+$" (car elt)) + (cond ((string-match + (rx (sequence (or "::" "'") + (eval cperl--basic-identifier-rx) + string-end)) + (car elt)) (setq pack (substring (car elt) 0 (match-beginning 0))) (if (setq group (assoc pack hier-list)) (if (listp (cdr group)) @@ -5646,8 +5698,7 @@ default function." (defun cperl-init-faces () (condition-case errs (progn - (let (t-font-lock-keywords t-font-lock-keywords-1 font-lock-anchored) - (setq font-lock-anchored t) + (let (t-font-lock-keywords t-font-lock-keywords-1) (setq t-font-lock-keywords (list @@ -5760,20 +5811,41 @@ default function." (if (eq (char-after (cperl-1- (match-end 0))) ?\{ ) 'font-lock-function-name-face 'font-lock-variable-name-face)))) - '("\\<\\(package\\|require\\|use\\|import\\|no\\|bootstrap\\)[ \t]+\\([a-zA-Z_][a-zA-Z_0-9:]*\\)[ \t;]" ; require A if B; - 2 font-lock-function-name-face) + `(,(rx (sequence symbol-start + (or "package" "require" "use" "import" + "no" "bootstrap") + (eval cperl--ws+-rx) + (group-n 1 (eval cperl--normal-identifier-rx)) + (any " \t;"))) ; require A if B; + 1 font-lock-function-name-face) '("^[ \t]*format[ \t]+\\([a-zA-Z_][a-zA-Z_0-9:]*\\)[ \t]*=[ \t]*$" 1 font-lock-function-name-face) - (cond (font-lock-anchored - '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}" - (2 font-lock-string-face t) - ("\\=[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}" - nil nil - (1 font-lock-string-face t)))) - (t '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}" - 2 font-lock-string-face t))) - '("[[ \t{,(]\\(-?[a-zA-Z0-9_:]+\\)[ \t]*=>" 1 - font-lock-string-face t) + ;; bareword hash key: $foo{bar} + `(,(rx (or (in "]}\\%@>*&") ; What Perl is this? + (sequence "$" (eval cperl--normal-identifier-rx))) + (0+ blank) "{" (0+ blank) + (group-n 1 (sequence (opt "-") + (eval cperl--basic-identifier-rx))) + (0+ blank) "}") +;; '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}" + (1 font-lock-string-face t) + ;; anchored bareword hash key: $foo{bar}{baz} + (,(rx point + (0+ blank) "{" (0+ blank) + (group-n 1 (sequence (opt "-") + (eval cperl--basic-identifier-rx))) + (0+ blank) "}") + ;; ("\\=[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}" + nil nil + (1 font-lock-string-face t))) + ;; hash element assignments with bareword key => value + `(,(rx (in "[ \t{,()") + (group-n 1 (sequence (opt "-") + (eval cperl--basic-identifier-rx))) + (0+ blank) "=>") + 1 font-lock-string-face t) +;; '("[[ \t{,(]\\(-?[a-zA-Z0-9_:]+\\)[ \t]*=>" 1 +;; font-lock-string-face t) ;; labels `(,(rx (sequence @@ -5797,83 +5869,130 @@ default function." ;;; '("[$*]{?\\(\\sw+\\)" 1 font-lock-variable-name-face) ;;; '("\\([@%]\\|\\$#\\)\\(\\sw+\\)" ;;; (2 (cons font-lock-variable-name-face '(underline)))) - (cond (font-lock-anchored ;; 1=my_etc, 2=white? 3=(+white? 4=white? 5=var - `(,(concat "\\<\\(state\\|my\\|local\\|our\\)" - cperl-maybe-white-and-comment-rex - "\\((" - cperl-maybe-white-and-comment-rex - "\\)?\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)") - (5 ,(if cperl-font-lock-multiline - 'font-lock-variable-name-face - '(progn (setq cperl-font-lock-multiline-start - (match-beginning 0)) - 'font-lock-variable-name-face))) - (,(concat "\\=" - cperl-maybe-white-and-comment-rex - "," - cperl-maybe-white-and-comment-rex - "\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)") - ;; Bug in font-lock: limit is used not only to limit - ;; searches, but to set the "extend window for - ;; facification" property. Thus we need to minimize. - ,(if cperl-font-lock-multiline - '(if (match-beginning 3) - (save-excursion - (goto-char (match-beginning 3)) - (condition-case nil - (forward-sexp 1) - (error - (condition-case nil - (forward-char 200) - (error nil)))) ; typeahead - (1- (point))) ; report limit - (forward-char -2)) ; disable continued expr - '(if (match-beginning 3) - (point-max) ; No limit for continuation - (forward-char -2))) ; disable continued expr - ,(if cperl-font-lock-multiline - nil - '(progn ; Do at end - ;; "my" may be already fontified (POD), - ;; so cperl-font-lock-multiline-start is nil - (if (or (not cperl-font-lock-multiline-start) - (> 2 (count-lines - cperl-font-lock-multiline-start - (point)))) - nil - (put-text-property - (1+ cperl-font-lock-multiline-start) (point) - 'syntax-type 'multiline)) - (setq cperl-font-lock-multiline-start nil))) - (3 font-lock-variable-name-face)))) - (t '("^[ \t{}]*\\(state\\|my\\|local\\|our\\)[ \t]*\\(([ \t]*\\)?\\([$@%*][a-zA-Z0-9_:]+\\)" - 3 font-lock-variable-name-face))) - '("\\ 2 (count-lines + cperl-font-lock-multiline-start + (point)))) + nil + (put-text-property + (1+ cperl-font-lock-multiline-start) (point) + 'syntax-type 'multiline)) + (setq cperl-font-lock-multiline-start nil))) + (1 font-lock-variable-name-face))) + ;; foreach my $foo ( + `(,(rx symbol-start "for" (opt "each") + (opt (sequence (1+ blank) + (or "state" "my" "local" "our"))) + (0+ blank) + (group-n 1 (sequence "$" + (eval cperl--basic-identifier-rx))) + (0+ blank) "(") +;; '("\\") (setq search @@ -6472,6 +6593,9 @@ Will not move the position at the start to the left." "Run etags with appropriate options for Perl files. If optional argument ALL is `recursive', will process Perl files in subdirectories too." + ;; Apparently etags doesn't support UTF-8 encoded sources, and usage + ;; of etags has been commented out in the menu since ... well, + ;; forever. So, let's just stick to ASCII here. -- haj, 2021-09-14 (interactive) (let ((cmd "etags") (args `("-l" "none" "-r" @@ -6611,6 +6735,9 @@ Does not move point." ;; Search for the function (progn ;;save-match-data (while (re-search-forward + ;; FIXME: Should XS code be unicode aware? Recent C + ;; compilers (Gcc 10+) are, but I guess this isn't used + ;; much. -- haj, 2021-09-14 "^\\([ \t]*MODULE\\>[^\n]*\\\\|\\([a-zA-Z_][a-zA-Z_0-9]*\\)(\\|[ \t]*BOOT:\\)" nil t) (cond @@ -6673,7 +6800,7 @@ Does not move point." (setq lst (mapcar (lambda (elt) - (cond ((string-match "^[_a-zA-Z]" (car elt)) + (cond ((string-match (rx line-start (or alpha "_")) (car elt)) (goto-char (cdr elt)) (beginning-of-line) ; pos should be of the start of the line (list (car elt) @@ -6703,9 +6830,14 @@ Does not move point." "," (number-to-string (1- (elt elt 1))) ; Char pos 0-based "\n") - (if (and (string-match "^[_a-zA-Z]+::" (car elt)) - (string-match (concat "^" cperl-sub-regexp "[ \t]+\\([_a-zA-Z]+\\)[^:_a-zA-Z]") - (elt elt 3))) + (if (and (string-match (rx line-start + (eval cperl--basic-identifier-rx) "++") + (car elt)) + (string-match (rx-to-string `(sequence line-start + (regexp ,cperl-sub-regexp) + (1+ (in " \t")) + ,cperl--normal-identifier-rx)) + (elt elt 3))) ;; Need to insert the name without package as well (setq lst (cons (cons (substring (elt elt 3) (match-beginning 1) @@ -7155,14 +7287,14 @@ Currently it is tuned to C and Perl syntax." ;;(concat "\\(" (mapconcat #'identity - '("[$@%*&][0-9a-zA-Z_:]+\\([ \t]*[[{]\\)?" ; Usual variable + '("[$@%*&][[:alnum:]_:]+\\([ \t]*[[{]\\)?" ; Usual variable "[$@]\\^[a-zA-Z]" ; Special variable "[$@][^ \n\t]" ; Special variable "-[a-zA-Z]" ; File test "\\\\[a-zA-Z0]" ; Special chars "^=[a-z][a-zA-Z0-9_]*" ; POD sections "[-!&*+,./<=>?\\^|~]+" ; Operator - "[a-zA-Z_0-9:]+" ; symbol or number + "[[:alnum:]_:]+" ; symbol or number "x=" "#!") ;;"\\)\\|\\(" @@ -7178,7 +7310,7 @@ Currently it is tuned to C and Perl syntax." ;; Does not save-excursion ;; Get to the something meaningful (or (eobp) (eolp) (forward-char 1)) - (re-search-backward "[-a-zA-Z0-9_:!&*+,./<=>?\\^|~$%@]" + (re-search-backward "[-[:alnum:]_:!&*+,./<=>?\\^|~$%@]" (point-at-bol) 'to-beg) ;; (cond @@ -7187,8 +7319,8 @@ Currently it is tuned to C and Perl syntax." ;; (or (bobp) (backward-char 1)))) ;; Try to backtrace (cond - ((looking-at "[a-zA-Z0-9_:]") ; symbol - (skip-chars-backward "a-zA-Z0-9_:") + ((looking-at "[[:alnum:]_:]") ; symbol + (skip-chars-backward "[:alnum:]_:") (cond ((and (eq (preceding-char) ?^) ; $^I (eq (char-after (- (point) 2)) ?\$)) @@ -7199,7 +7331,7 @@ Currently it is tuned to C and Perl syntax." (eq (current-column) 1)) (forward-char -1))) ; =head1 (if (and (eq (preceding-char) ?\<) - (looking-at "\\$?[a-zA-Z0-9_:]+>")) ; + (looking-at "\\$?[[:alnum:]_:]+>")) ; (forward-char -1))) ((and (looking-at "=") (eq (preceding-char) ?x)) ; x= (forward-char -1)) @@ -7212,15 +7344,15 @@ Currently it is tuned to C and Perl syntax." (not (eq (char-after (- (point) 2)) ?\$))) ; $- (forward-char -1)) ((and (eq (following-char) ?\>) - (string-match "[a-zA-Z0-9_]" (char-to-string (preceding-char))) + (string-match "[[:alnum:]_]" (char-to-string (preceding-char))) (save-excursion (forward-sexp -1) (and (eq (preceding-char) ?\<) - (looking-at "\\$?[a-zA-Z0-9_:]+>")))) ; + (looking-at "\\$?[[:alnum:]_:]+>")))) ; (search-backward "<")))) ((and (eq (following-char) ?\$) (eq (preceding-char) ?\<) - (looking-at "\\$?[a-zA-Z0-9_:]+>")) ; <$fh> + (looking-at "\\$?[[:alnum:]_:]+>")) ; <$fh> (forward-char -1))) (if (looking-at cperl-have-help-regexp) (buffer-substring (match-beginning 0) (match-end 0)))) diff --git a/test/lisp/progmodes/cperl-mode-resources/grammar.pl b/test/lisp/progmodes/cperl-mode-resources/grammar.pl index c05fd7efc2a..96a86993082 100644 --- a/test/lisp/progmodes/cperl-mode-resources/grammar.pl +++ b/test/lisp/progmodes/cperl-mode-resources/grammar.pl @@ -1,6 +1,7 @@ use 5.024; use strict; use warnings; +use utf8; sub outside { say "Line @{[__LINE__]}: package '@{[__PACKAGE__]}'"; @@ -155,4 +156,17 @@ package :: { Shoved::elsewhere(); +# Finally, try unicode identifiers. +package Erdős::Number; + +sub erdős_number { + my $name = shift; + if ($name eq "Erdős Pál") { + return 0; + } + else { + die "No access to the database. Sorry."; + } +} + 1; diff --git a/test/lisp/progmodes/cperl-mode-tests.el b/test/lisp/progmodes/cperl-mode-tests.el index 54012c3918e..29b9e3f6fb9 100644 --- a/test/lisp/progmodes/cperl-mode-tests.el +++ b/test/lisp/progmodes/cperl-mode-tests.el @@ -154,6 +154,22 @@ point in the distant past, and is still broken in perl-mode. " (should (equal (get-text-property (match-beginning 0) 'face) 'font-lock-keyword-face)))) +(ert-deftest cperl-test-fontify-special-variables () + "Test fontification of variables like $^T or ${^ENCODING}. +These can occur as \"local\" aliases." + (skip-unless (eq cperl-test-mode #'cperl-mode)) + (with-temp-buffer + (insert "local ($^I, ${^UNICODE});\n") + (goto-char (point-min)) + (funcall cperl-test-mode) + (font-lock-ensure) + (search-forward "$") + (should (equal (get-text-property (point) 'face) + 'font-lock-variable-name-face)) + (search-forward "$") + (should (equal (get-text-property (point) 'face) + 'font-lock-variable-name-face)))) + (ert-deftest cperl-test-identify-heredoc () "Test whether a construct containing \"<<\" followed by a bareword is properly identified for a here-document if @@ -297,6 +313,7 @@ the whole string." (ert-deftest cperl-test-ws-rx () "Tests capture of very simple regular expressions (yawn)." + (skip-unless (eq cperl-test-mode #'cperl-mode)) (let ((valid '(" " "\t" "\n")) (invalid @@ -306,6 +323,7 @@ the whole string." (ert-deftest cperl-test-ws+-rx () "Tests sequences of whitespace and comment lines." + (skip-unless (eq cperl-test-mode #'cperl-mode)) (let ((valid `(" " "\t#\n" "\n# \n" ,(concat "# comment\n" "# comment\n" "\n" "#comment\n"))) @@ -316,6 +334,7 @@ the whole string." (ert-deftest cperl-test-version-regexp () "Tests the regexp for recommended syntax of versions in Perl." + (skip-unless (eq cperl-test-mode #'cperl-mode)) (let ((valid '("1" "1.1" "1.1_1" "5.032001" "v120.100.103")) @@ -331,6 +350,7 @@ the whole string." (ert-deftest cperl-test-package-regexp () "Tests the regular expression of Perl package names with versions. Also includes valid cases with whitespace in strange places." + (skip-unless (eq cperl-test-mode #'cperl-mode)) (let ((valid '("package Foo" "package Foo::Bar" @@ -346,6 +366,284 @@ Also includes valid cases with whitespace in strange places." (cperl-test--validate-regexp (rx (eval cperl--package-rx)) valid invalid))) +(ert-deftest cperl-test-identifier-rx () + "Test valid and invalid identifiers (no sigils)." + (skip-unless (eq cperl-test-mode #'cperl-mode)) + (let ((valid + '("foo" "FOO" "f_oo" "a123" + "manĝis")) ; Unicode is allowed! + (invalid + '("$foo" ; no sigils allowed (yet) + "Foo::bar" ; no package qualifiers allowed + "lots_of_€"))) ; € is not alphabetic + (cperl-test--validate-regexp (rx (eval cperl--basic-identifier-rx)) + valid invalid))) + +;;; Test unicode identifier in various places + +(defun cperl--test-unicode-setup (code string) + "Insert CODE, prepare it for tests, and find STRING. +Invoke the appropriate major mode, ensure fontification, and set +point after the first occurrence of STRING (no regexp!)." + (insert code) + (funcall cperl-test-mode) + (font-lock-ensure) + (goto-char (point-min)) + (search-forward string)) + +(ert-deftest cperl-test-unicode-labels () + "Verify that non-ASCII labels are processed correctly." + (with-temp-buffer + (cperl--test-unicode-setup "LABEł: for ($manĝi) { say; }" "LAB") + (should (equal (get-text-property (point) 'face) + 'font-lock-constant-face)))) + +(ert-deftest cperl-test-unicode-sub () + (with-temp-buffer + (cperl--test-unicode-setup + (concat "use strict;\n" ; distinguish bob from b-o-f + "sub ℏ {\n" + " 6.62607015e-34\n" + "};") + "sub ") ; point is before "ℏ" + + ;; Testing fontification + ;; FIXME 2021-09-10: This tests succeeds because cperl-mode + ;; accepts almost anything as a sub name for fontification. For + ;; example, it fontifies "sub @ {...;}" which is a syntax error in + ;; Perl. I let this pass for the moment. + (should (equal (get-text-property (point) 'face) + 'font-lock-function-name-face)) + + ;; Testing `beginning-of-defun'. Not available in perl-mode, + ;; where it jumps to the beginning of the buffer. + (when (eq cperl-test-mode #'cperl-mode) + (goto-char (point-min)) + (search-forward "-34") + (beginning-of-defun) + (should (looking-at "sub"))))) + +(ert-deftest cperl-test-unicode-varname () + (with-temp-buffer + (cperl--test-unicode-setup + (concat "use strict;\n" + "my $π = 3.1415926535897932384626433832795028841971;\n" + "\n" + "my $manĝi = $π;\n" + "__END__\n") + "my $") ; perl-mode doesn't fontify the sigil, so include it here + + ;; Testing fontification + ;; FIXME 2021-09-10: This test succeeds in cperl-mode because the + ;; π character is "not ASCII alphabetic", so it treats $π as a + ;; punctuation variable. The following two `should' forms with a + ;; longer variable name were added for stronger verification. + (should (equal (get-text-property (point) 'face) + 'font-lock-variable-name-face)) + ;; Test both ends of a longer variable name + (search-forward "my $") ; again skip the sigil + (should (equal (get-text-property (point) 'face) + 'font-lock-variable-name-face)) + (search-forward "manĝi") + (should (equal (get-text-property (1- (match-end 0)) 'face) + 'font-lock-variable-name-face)))) + +(ert-deftest cperl-test-unicode-varname-list () + "Verify that all elements of a variable list are fontified." + + (let ((hash-face (if (eq cperl-test-mode #'perl-mode) + 'perl-non-scalar-variable + 'cperl-hash-face)) + (array-face (if (eq cperl-test-mode #'perl-mode) + 'perl-non-scalar-variable + 'cperl-array-face))) + (with-temp-buffer + (cperl--test-unicode-setup + "my (%äsh,@ärräy,$scâlâr);" "%") + (should (equal (get-text-property (point) 'face) + hash-face)) + (search-forward "@") + (should (equal (get-text-property (point) 'face) + array-face)) + (search-forward "scâlâr") + (should (equal (get-text-property (match-beginning 0) 'face) + 'font-lock-variable-name-face)) + (should (equal (get-text-property (1- (match-end 0)) 'face) + 'font-lock-variable-name-face))) + + ;; Now with package-qualified variables + (with-temp-buffer + (cperl--test-unicode-setup + "local (%Søme::äsh,@Søme::ärräy,$Søme::scâlâr);" "%") + (should (equal (get-text-property (point) 'face) + hash-face)) + (search-forward "Søme::") ; test basic identifier + (should (equal (get-text-property (point) 'face) + hash-face)) + (search-forward "@") ; test package name + (should (equal (get-text-property (point) 'face) + array-face)) + (search-forward "Søme::") ; test basic identifier + (should (equal (get-text-property (point) 'face) + array-face)) + (search-forward "Søme") ; test package name + (should (equal (get-text-property (match-beginning 0) 'face) + 'font-lock-variable-name-face)) + (should (equal (get-text-property (1- (match-end 0)) 'face) + 'font-lock-variable-name-face)) + (search-forward "scâlâr") ; test basic identifier + (should (equal (get-text-property (match-beginning 0) 'face) + 'font-lock-variable-name-face)) + (should (equal (get-text-property (1- (match-end 0)) 'face) + 'font-lock-variable-name-face))))) + +(ert-deftest cperl-test-unicode-arrays () + "Test fontification of array access." + ;; Perl mode just looks at the sigil, for element access + (skip-unless (eq cperl-test-mode #'cperl-mode)) + ;; simple array element + (with-temp-buffer + (cperl--test-unicode-setup + "$ärräy[1] = 7;" "$") + (should (equal (get-text-property (point) 'face) + 'cperl-array-face))) + ;; array slice + (with-temp-buffer + (cperl--test-unicode-setup + "@ärräy[(1..3)] = (4..6);" "@") + (should (equal (get-text-property (point) 'face) + 'cperl-array-face))) + ;; array max index + (with-temp-buffer + (cperl--test-unicode-setup + "$#ärräy = 1;" "$") + (should (equal (get-text-property (point) 'face) + 'cperl-array-face))) + ;; array dereference + (with-temp-buffer + (cperl--test-unicode-setup + "@$ärräy = (1,2,3)" "@") + (should (equal (get-text-property (1- (point)) 'face) + 'cperl-array-face)) + (should (equal (get-text-property (1+ (point)) 'face) + 'font-lock-variable-name-face)))) + +(ert-deftest cperl-test-unicode-hashes () + "Test fontification of hash access." + ;; Perl mode just looks at the sigil, for element access + (skip-unless (eq cperl-test-mode #'cperl-mode)) + ;; simple hash element + (with-temp-buffer + (cperl--test-unicode-setup + "$häsh{'a'} = 7;" "$") + (should (equal (get-text-property (point) 'face) + 'cperl-hash-face))) + ;; hash array slice + (with-temp-buffer + (cperl--test-unicode-setup + "@häsh{(1..3)} = (4..6);" "@") + (should (equal (get-text-property (point) 'face) + 'cperl-hash-face))) + ;; hash subset + (with-temp-buffer + (cperl--test-unicode-setup + "my %hash = %häsh{'a',2,3};" "= %") + (should (equal (get-text-property (point) 'face) + 'cperl-hash-face))) + ;; hash dereference + (with-temp-buffer + (cperl--test-unicode-setup + "%$äsh = (key => 'value');" "%") + (should (equal (get-text-property (1- (point)) 'face) + 'cperl-hash-face)) + (should (equal (get-text-property (1+ (point)) 'face) + 'font-lock-variable-name-face)))) + +(ert-deftest cperl-test-unicode-hashref () + "Verify that a hashref access disambiguates {s}. +CPerl mode takes the token \"s\" as a substitution unless +detected otherwise. Not for perl-mode: it doesn't stringify +bareword hash keys and doesn't recognize a substitution +\"s}foo}bar}\"" + (skip-unless (eq cperl-test-mode #'cperl-mode)) + (with-temp-buffer + (cperl--test-unicode-setup "$häshref->{s} # }}" "{") + (should (equal (get-text-property (point) 'face) + 'font-lock-string-face)) + (should (equal (get-text-property (1+ (point)) 'face) + nil)))) + +(ert-deftest cperl-test-unicode-proto () + ;; perl-mode doesn't fontify prototypes at all + (skip-unless (eq cperl-test-mode #'cperl-mode)) + (with-temp-buffer + (cperl--test-unicode-setup + (concat "sub prötötyped ($) {\n" + " ...;" + "}\n") + "prötötyped (") + + (should (equal (get-text-property (point) 'face) + 'font-lock-string-face)))) + +(ert-deftest cperl-test-unicode-fhs () + (with-temp-buffer + (cperl--test-unicode-setup + (concat "while () {\n" + " ...;)\n" + "}\n") + "while (<") ; point is before the first char of the handle + ;; Testing fontification + ;; FIXME 2021-09-10: perl-mode.el and cperl-mode.el handle these + ;; completely differently. perl-mode interprets barewords as + ;; constants, cperl-mode does not fontify them. Both treat + ;; non-barewords as globs, which are not fontified by perl-mode, + ;; but fontified as strings in cperl-mode. We keep (and test) + ;; that behavior "as is" because both bareword filehandles and + ;; syntax are no longer recommended. + (let ((bareword-face + (if (equal cperl-test-mode 'perl-mode) 'font-lock-constant-face + nil))) + (should (equal (get-text-property (point) 'face) + bareword-face))))) + +(ert-deftest cperl-test-unicode-hashkeys () + "Test stringification of bareword hash keys. Not in perl-mode. +perl-mode generally does not stringify bareword hash keys." + (skip-unless (eq cperl-test-mode #'cperl-mode)) + ;; Plain hash key + (with-temp-buffer + (cperl--test-unicode-setup + "$häsh { kéy }" "{ ") + (should (equal (get-text-property (point) 'face) + 'font-lock-string-face))) + ;; Nested hash key + (with-temp-buffer + (cperl--test-unicode-setup + "$häsh { kéy } { kèy }" "} { ") + (should (equal (get-text-property (point) 'face) + 'font-lock-string-face))) + ;; Key => value + (with-temp-buffer + (cperl--test-unicode-setup + "( kéy => 'value'," "( ") + (should (equal (get-text-property (point) 'face) + 'font-lock-string-face)))) + +(ert-deftest cperl-test-word-at-point () + "Test whether the function captures non-ASCII words." + (skip-unless (eq cperl-test-mode #'cperl-mode)) + (let ((words '("rôle" "café" "ångström" + "Data::Dump::dump" + "_underscore"))) + (dolist (word words) + (with-temp-buffer + (insert " + ") ; this will be the suffix + (beginning-of-line) + (insert ")") ; A non-word char + (insert word) + (should (string= word (cperl-word-at-point-hard))))))) + ;;; Function test: Building an index for imenu (ert-deftest cperl-test-imenu-index () @@ -369,7 +667,8 @@ created by CPerl mode, so skip it for Perl mode." "Versioned::Package::outer" "lexical" "Versioned::Block::signatured" - "Package::in_package_again"))) + "Package::in_package_again" + "Erdős::Number::erdős_number"))) (dolist (sub expected) (should (assoc-string sub index))))))) -- 2.39.5