]> git.eshelyaron.com Git - emacs.git/commitdiff
cperl-mode.el: Allow non-ASCII Perl identifiers
authorHarald Jörg <haj@posteo.de>
Tue, 14 Sep 2021 15:53:52 +0000 (17:53 +0200)
committerHarald Jörg <haj@posteo.de>
Tue, 14 Sep 2021 15:53:52 +0000 (17:53 +0200)
Replace all "A-Z" regexp literals with unicode-aware rx constructs
wherever Perl allows non-ASCII identifiers.
* lisp/progmodes/cperl-mode.el (cperl-after-sub-regexp)
(cperl-after-label. cperl-sniff-for-indent)
(cperl-find-pods-heres, cperl-indent-exp)
(cperl-fix-line-spacing, cperl-imenu--create-perl-index)
(cperl-init-faces, cperl-find-tags):
Replace ASCII regex literals by unicode-aware rx constructs.
(cperl-init-faces): Eliminate unused lexical `font-lock-anchored'.
(cperl-have-help-regexp, cperl-word-at-point-hard): Allow non-ASCII
word characters.

* test/lisp/progmodes/cperl-mode-tests.el
(cperl-test-fontify-special-variables): New test for $^T
and $^{VARNAME}.
(cperl-test-ws-rx cperl-test-ws+-rx),
(cperl-test-version-regexp, cperl-test-package-regexp): Skip
for perl-mode.
(cperl-test-identifier-rx, cperl--test-unicode-setup)
(cperl-test-unicode-labels, cperl-test-unicode-sub)
(cperl-test-unicode-varname)
(cperl-test-unicode-varname-list, cperl-test-unicode-arrays)
(cperl-test-unicode-hashes, cperl-test-unicode-hashref)
(cperl-test-unicode-proto, cperl-test-unicode-fhs)
(cperl-test-unicode-hashkeys, cperl-test-word-at-point):
New tests for unicode identifiers.
 (cperl-test-imenu-index): Add a unicode identifier to the test.

* test/lisp/progmodes/cperl-mode-resources/grammar.pl: Add a
function with non-ASCII name for imenu tests.

lisp/progmodes/cperl-mode.el
test/lisp/progmodes/cperl-mode-resources/grammar.pl
test/lisp/progmodes/cperl-mode-tests.el

index 76c82f8c73e8c7db24d879d946b5ba9b10d8bce9..1147889969bffa6dccad0aa3b93e933f425c220b 100644 (file)
@@ -1407,7 +1407,7 @@ the last)."
   (concat                              ; Assume n groups before this...
    "\\("                               ; n+1=name-group
      cperl-white-and-comment-rex       ; n+2=pre-name
-     "\\(::[a-zA-Z_0-9:']+\\|[a-zA-Z_'][a-zA-Z_0-9:']*\\)" ; n+3=name
+     (rx-to-string `(group ,cperl--normal-identifier-rx))
    "\\)"                               ; END n+1=name-group
    (if named "" "?")
    "\\("                               ; n+4=proto-group
@@ -2573,7 +2573,8 @@ Return the amount the indentation changed by."
             '(?w ?_))
        (progn
         (backward-sexp)
-        (looking-at "[a-zA-Z_][a-zA-Z0-9_]*:[^:]"))))
+         (looking-at (rx (sequence (eval cperl--label-rx)
+                                   (not (in ":"))))))))
 
 (defun cperl-get-state (&optional parse-start start-state)
   "Return list (START STATE DEPTH PRESTART),
@@ -2740,7 +2741,9 @@ Will not look before LIM."
                                  (progn
                                    (forward-sexp -1)
                                    (skip-chars-backward " \t")
-                                   (looking-at "[ \t]*[a-zA-Z_][a-zA-Z_0-9]*[ \t]*:")))
+                                   (looking-at
+                                     (rx (sequence (0+ blank)
+                                                   (eval cperl--label-rx))))))
                             (get-text-property (point) 'first-format-line)))
 
                   ;; Look at previous line that's at column 0
@@ -3836,7 +3839,8 @@ recursive calls in starting lines of here-documents."
                "\\<" cperl-sub-regexp "\\>" ;  sub with proto/attr
                "\\("
                   cperl-white-and-comment-rex
-                  "\\(::[a-zA-Z_:'0-9]*\\|[a-zA-Z_'][a-zA-Z_:'0-9]*\\)\\)?" ; name
+                   (rx (group (eval cperl--normal-identifier-rx)))
+                "\\)"
                "\\("
                   cperl-maybe-white-and-comment-rex
                   "\\(([^()]*)\\|:[^:]\\)\\)" ; prototype or attribute start
@@ -4111,10 +4115,12 @@ recursive calls in starting lines of here-documents."
                                         (t t))))
                           ;; <file> or <$file>
                           (and (eq c ?\<)
-                               ;; Do not stringify <FH>, <$fh> :
+                                ;; Stringify what looks like a glob, but
+                               ;; do not stringify file handles <FH>, <$fh> :
                                (save-match-data
                                  (looking-at
-                                  "\\$?\\([_a-zA-Z:][_a-zA-Z0-9:]*\\)?>"))))
+                                   (rx (sequence (opt "$")
+                                                 (eval cperl--normal-identifier-rx)))))))
                      tb (match-beginning 0))
                (goto-char (match-beginning b1))
                (cperl-backward-to-noncomment (point-min))
@@ -4184,7 +4190,16 @@ recursive calls in starting lines of here-documents."
                                 (error nil)))
                             (if (or bb
                                     (looking-at ; $foo -> {s}
-                                     "[$@]\\$*\\([a-zA-Z0-9_:]+\\|[^{]\\)\\([ \t\n]*->\\)?[ \t\n]*{")
+                                      (rx
+                                       (sequence
+                                        (in "$@") (0+ "$")
+                                        (or
+                                         (eval cperl--normal-identifier-rx)
+                                         (not (in "{")))
+                                        (opt (sequence (eval cperl--ws*-rx))
+                                             "->")
+                                        (eval cperl--ws*-rx)
+                                        "{")))
                                     (and ; $foo[12] -> {s}
                                      (memq (following-char) '(?\{ ?\[))
                                      (progn
@@ -4199,7 +4214,12 @@ recursive calls in starting lines of here-documents."
                             (setq bb t))
                            ((and (eq (following-char) ?:)
                                  (eq b1 ?\{) ; Check for $ { s::bar }
-                                 (looking-at "::[a-zA-Z0-9_:]*[ \t\n\f]*}")
+                                 ;;  (looking-at "::[a-zA-Z0-9_:]*[ \t\n\f]*}")
+                                  (looking-at
+                                   (rx (sequence "::"
+                                                 (eval cperl--normal-identifier-rx)
+                                                 (eval cperl--ws*-rx)
+                                                 "}")))
                                  (progn
                                    (goto-char (1- go))
                                    (skip-chars-backward " \t\n\f")
@@ -4364,7 +4384,7 @@ recursive calls in starting lines of here-documents."
                                    "\\(" ;; XXXX 1-char variables, exc. |()\s
                                       "[$@]"
                                       "\\("
-                                         "[_a-zA-Z:][_a-zA-Z0-9:]*"
+                                          (rx (eval cperl--normal-identifier-rx))
                                       "\\|"
                                          "{[^{}]*}" ; only one-level allowed
                                       "\\|"
@@ -4820,6 +4840,7 @@ recursive calls in starting lines of here-documents."
           (progn
             (backward-sexp)
             ;; sub {BLK}, print {BLK} $data, but NOT `bless', `return', `tr', `constant'
+             ;; a-zA-Z is fine here, these are Perl keywords
             (or (and (looking-at "[a-zA-Z0-9_:]+[ \t\n\f]*[{#]") ; Method call syntax
                      (not (looking-at "\\(bless\\|return\\|q[wqrx]?\\|tr\\|[smy]\\|constant\\)\\>")))
                 ;; sub bless::foo {}
@@ -5028,7 +5049,11 @@ conditional/loop constructs."
                                     cperl-maybe-white-and-comment-rex
                                     "\\(state\\|my\\|local\\|our\\)\\)?"
                                     cperl-maybe-white-and-comment-rex
-                                    "\\$[_a-zA-Z0-9]+\\)?\\)\\>"))
+                                     (rx
+                                      (sequence
+                                       "$"
+                                       (eval cperl--basic-identifier-rx)))
+                                    "\\)?\\)\\>"))
                            (progn
                              (goto-char top)
                              (forward-sexp 1)
@@ -5122,7 +5147,14 @@ Returns some position at the last line."
        ;; Looking at:
        ;; foreach my $var     (
        (if (looking-at
-            "[ \t]*\\<for\\(each\\)?[ \t]+\\(state\\|my\\|local\\|our\\)[ \t]*\\$[_a-zA-Z0-9]+\\(\t*\\|[ \t][ \t]+\\)[^ \t\n#]")
+             (rx (sequence (0+ blank) symbol-start
+                           "for" (opt "each")
+                           (1+ blank)
+                           (or "state" "my" "local" "our")
+                           (0+ blank)
+                           "$" (eval cperl--basic-identifier-rx)
+                           (1+ blank)
+                           (not (in " \t\n#")))))
            (progn
              (forward-sexp 3)
              (delete-horizontal-space)
@@ -5132,9 +5164,25 @@ Returns some position at the last line."
        ;; Looking at (with or without "}" at start, ending after "({"):
        ;; } foreach my $var ()         OR   {
        (if (looking-at
-            "[ \t]*\\(}[ \t]*\\)?\\<\\(els\\(e\\|if\\)\\|continue\\|if\\|unless\\|while\\|for\\(each\\)?\\(\\([ \t]+\\(state\\|my\\|local\\|our\\)\\)?[ \t]*\\$[_a-zA-Z0-9]+\\)?\\|until\\)\\>\\([ \t]*(\\|[ \t\n]*{\\)\\|[ \t]*{")
+             (rx (sequence
+                  (0+ blank)
+                  (opt (sequence "}" (0+ blank) ))
+                  symbol-start
+                  (or "else" "elsif" "continue" "if" "unless" "while" "until"
+                      (sequence (or "for" "foreach")
+                                (opt
+                                 (opt (sequence (1+ blank)
+                                                (or "state" "my" "local" "our")))
+                                 (0+ blank)
+                                 "$" (eval cperl--basic-identifier-rx))))
+                  symbol-end
+                  (group-n 1
+                           (or
+                            (or (sequence (0+ blank) "(")
+                                (sequence (eval cperl--ws*-rx) "{"))
+                            (sequence (0+ blank) "{"))))))
            (progn
-             (setq ml (match-beginning 8)) ; "(" or "{" after control word
+             (setq ml (match-beginning 1)) ; "(" or "{" after control word
              (re-search-forward "[({]")
              (forward-char -1)
              (setq p (point))
@@ -5544,7 +5592,11 @@ comment, or POD."
           (setq lst index-sub-alist)
           (while lst
             (setq elt (car lst) lst (cdr lst))
-            (cond ((string-match "\\(::\\|'\\)[_a-zA-Z0-9]+$" (car elt))
+            (cond ((string-match
+                     (rx (sequence (or "::" "'")
+                                   (eval cperl--basic-identifier-rx)
+                                   string-end))
+                     (car elt))
                    (setq pack (substring (car elt) 0 (match-beginning 0)))
                    (if (setq group (assoc pack hier-list))
                        (if (listp (cdr group))
@@ -5646,8 +5698,7 @@ default function."
 (defun cperl-init-faces ()
   (condition-case errs
       (progn
-       (let (t-font-lock-keywords t-font-lock-keywords-1 font-lock-anchored)
-         (setq font-lock-anchored t)
+       (let (t-font-lock-keywords t-font-lock-keywords-1)
          (setq
           t-font-lock-keywords
           (list
@@ -5760,20 +5811,41 @@ default function."
                         (if (eq (char-after (cperl-1- (match-end 0))) ?\{ )
                             'font-lock-function-name-face
                           'font-lock-variable-name-face))))
-           '("\\<\\(package\\|require\\|use\\|import\\|no\\|bootstrap\\)[ \t]+\\([a-zA-Z_][a-zA-Z_0-9:]*\\)[ \t;]" ; require A if B;
-             2 font-lock-function-name-face)
+            `(,(rx (sequence symbol-start
+                             (or "package" "require" "use" "import"
+                                 "no" "bootstrap")
+                             (eval cperl--ws+-rx)
+                             (group-n 1 (eval cperl--normal-identifier-rx))
+                             (any " \t;"))) ; require A if B;
+             1 font-lock-function-name-face)
            '("^[ \t]*format[ \t]+\\([a-zA-Z_][a-zA-Z_0-9:]*\\)[ \t]*=[ \t]*$"
              1 font-lock-function-name-face)
-           (cond (font-lock-anchored
-                  '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
-                    (2 font-lock-string-face t)
-                    ("\\=[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
-                     nil nil
-                     (1 font-lock-string-face t))))
-                 (t '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
-                      2 font-lock-string-face t)))
-           '("[[ \t{,(]\\(-?[a-zA-Z0-9_:]+\\)[ \t]*=>" 1
-             font-lock-string-face t)
+            ;; bareword hash key: $foo{bar}
+            `(,(rx (or (in "]}\\%@>*&") ; What Perl is this?
+                       (sequence "$" (eval cperl--normal-identifier-rx)))
+                   (0+ blank) "{" (0+ blank)
+                   (group-n 1 (sequence (opt "-")
+                                        (eval cperl--basic-identifier-rx)))
+                   (0+ blank) "}")
+;;         '("\\([]}\\%@>*&]\\|\\$[a-zA-Z0-9_:]*\\)[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
+             (1 font-lock-string-face t)
+              ;; anchored bareword hash key: $foo{bar}{baz}
+              (,(rx point
+                   (0+ blank) "{" (0+ blank)
+                   (group-n 1 (sequence (opt "-")
+                                        (eval cperl--basic-identifier-rx)))
+                   (0+ blank) "}")
+             ;; ("\\=[ \t]*{[ \t]*\\(-?[a-zA-Z0-9_:]+\\)[ \t]*}"
+              nil nil
+              (1 font-lock-string-face t)))
+              ;; hash element assignments with bareword key => value
+              `(,(rx (in "[ \t{,()")
+                     (group-n 1 (sequence (opt "-")
+                                          (eval cperl--basic-identifier-rx)))
+                     (0+ blank) "=>")
+                1 font-lock-string-face t)
+;;         '("[[ \t{,(]\\(-?[a-zA-Z0-9_:]+\\)[ \t]*=>" 1
+;;           font-lock-string-face t)
             ;; labels
             `(,(rx
                 (sequence
@@ -5797,83 +5869,130 @@ default function."
             ;;; '("[$*]{?\\(\\sw+\\)" 1 font-lock-variable-name-face)
             ;;; '("\\([@%]\\|\\$#\\)\\(\\sw+\\)"
             ;;;  (2 (cons font-lock-variable-name-face '(underline))))
-           (cond (font-lock-anchored
                   ;; 1=my_etc, 2=white? 3=(+white? 4=white? 5=var
-                  `(,(concat "\\<\\(state\\|my\\|local\\|our\\)"
-                                 cperl-maybe-white-and-comment-rex
-                                 "\\(("
-                                    cperl-maybe-white-and-comment-rex
-                                 "\\)?\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
-                      (5 ,(if cperl-font-lock-multiline
-                                'font-lock-variable-name-face
-                              '(progn  (setq cperl-font-lock-multiline-start
-                                             (match-beginning 0))
-                                       'font-lock-variable-name-face)))
-                      (,(concat "\\="
-                                  cperl-maybe-white-and-comment-rex
-                                  ","
-                                  cperl-maybe-white-and-comment-rex
-                                  "\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
-                       ;; Bug in font-lock: limit is used not only to limit
-                       ;; searches, but to set the "extend window for
-                       ;; facification" property.  Thus we need to minimize.
-                       ,(if cperl-font-lock-multiline
-                            '(if (match-beginning 3)
-                                 (save-excursion
-                                   (goto-char (match-beginning 3))
-                                   (condition-case nil
-                                       (forward-sexp 1)
-                                     (error
-                                      (condition-case nil
-                                          (forward-char 200)
-                                        (error nil)))) ; typeahead
-                                   (1- (point))) ; report limit
-                               (forward-char -2)) ; disable continued expr
-                            '(if (match-beginning 3)
-                                 (point-max) ; No limit for continuation
-                               (forward-char -2))) ; disable continued expr
-                       ,(if cperl-font-lock-multiline
-                              nil
-                            '(progn    ; Do at end
-                               ;; "my" may be already fontified (POD),
-                               ;; so cperl-font-lock-multiline-start is nil
-                               (if (or (not cperl-font-lock-multiline-start)
-                                       (> 2 (count-lines
-                                             cperl-font-lock-multiline-start
-                                             (point))))
-                                   nil
-                                 (put-text-property
-                                  (1+ cperl-font-lock-multiline-start) (point)
-                                  'syntax-type 'multiline))
-                               (setq cperl-font-lock-multiline-start nil)))
-                       (3 font-lock-variable-name-face))))
-                 (t '("^[ \t{}]*\\(state\\|my\\|local\\|our\\)[ \t]*\\(([ \t]*\\)?\\([$@%*][a-zA-Z0-9_:]+\\)"
-                      3 font-lock-variable-name-face)))
-           '("\\<for\\(each\\)?\\([ \t]+\\(state\\|my\\|local\\|our\\)\\)?[ \t]*\\(\\$[a-zA-Z_][a-zA-Z_0-9]*\\)[ \t]*("
+           `(,(rx (sequence (or "state" "my" "local" "our"))
+                   (eval cperl--ws*-rx)
+                   (opt (sequence "(" (eval cperl--ws*-rx)))
+                   (group
+                    (in "$@%*")
+                    (or
+                     (eval cperl--normal-identifier-rx)
+                     (eval cperl--special-identifier-rx))
+                    )
+                   )
+              ;; (concat "\\<\\(state\\|my\\|local\\|our\\)"
+             ;;          cperl-maybe-white-and-comment-rex
+             ;;          "\\(("
+             ;;          cperl-maybe-white-and-comment-rex
+             ;;          "\\)?\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
+             ;; (5 ,(if cperl-font-lock-multiline
+             (1 ,(if cperl-font-lock-multiline
+                     'font-lock-variable-name-face
+                   '(progn  (setq cperl-font-lock-multiline-start
+                                  (match-beginning 0))
+                            'font-lock-variable-name-face)))
+             (,(rx (sequence point
+                              (eval cperl--ws*-rx)
+                              ","
+                              (eval cperl--ws*-rx)
+                              (group
+                               (in "$@%*")
+                               (or
+                                (eval cperl--normal-identifier-rx)
+                                (eval cperl--special-identifier-rx))
+                               )
+                              )
+                    )
+               ;; ,(concat "\\="
+              ;;       cperl-maybe-white-and-comment-rex
+              ;;       ","
+              ;;       cperl-maybe-white-and-comment-rex
+              ;;       "\\([$@%*]\\([a-zA-Z0-9_:]+\\|[^a-zA-Z0-9_]\\)\\)")
+              ;; Bug in font-lock: limit is used not only to limit
+              ;; searches, but to set the "extend window for
+              ;; facification" property.  Thus we need to minimize.
+              ,(if cperl-font-lock-multiline
+                   '(if (match-beginning 1)
+                        (save-excursion
+                          (goto-char (match-beginning 1))
+                          (condition-case nil
+                              (forward-sexp 1)
+                            (error
+                             (condition-case nil
+                                 (forward-char 200)
+                               (error nil)))) ; typeahead
+                          (1- (point))) ; report limit
+                      (forward-char -2)) ; disable continued expr
+                 '(if (match-beginning 1)
+                      (point-max) ; No limit for continuation
+                    (forward-char -2))) ; disable continued expr
+              ,(if cperl-font-lock-multiline
+                   nil
+                 '(progn       ; Do at end
+                    ;; "my" may be already fontified (POD),
+                    ;; so cperl-font-lock-multiline-start is nil
+                    (if (or (not cperl-font-lock-multiline-start)
+                            (> 2 (count-lines
+                                  cperl-font-lock-multiline-start
+                                  (point))))
+                        nil
+                      (put-text-property
+                       (1+ cperl-font-lock-multiline-start) (point)
+                       'syntax-type 'multiline))
+                    (setq cperl-font-lock-multiline-start nil)))
+              (1 font-lock-variable-name-face)))
+            ;; foreach my $foo (
+            `(,(rx symbol-start "for" (opt "each")
+                   (opt (sequence (1+ blank)
+                                  (or "state" "my" "local" "our")))
+                   (0+ blank)
+                   (group-n 1 (sequence "$"
+                                        (eval cperl--basic-identifier-rx)))
+                   (0+ blank) "(")
+;;         '("\\<for\\(each\\)?\\([ \t]+\\(state\\|my\\|local\\|our\\)\\)?[ \t]*\\(\\$[a-zA-Z_][a-zA-Z_0-9]*\\)[ \t]*("
              4 font-lock-variable-name-face)
            ;; Avoid $!, and s!!, qq!! etc. when not fontifying syntactically
            '("\\(?:^\\|[^smywqrx$]\\)\\(!\\)" 1 font-lock-negation-char-face)
            '("\\[\\(\\^\\)" 1 font-lock-negation-char-face prepend)))
          (setq
           t-font-lock-keywords-1
-          '(
-            ("\\(\\([@%]\\|\\$#\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)" 1
+          `(
+             ;; arrays and hashes.  Access to elements is fixed below
+             (,(rx (group-n 1 (group-n 2 (or (in "@%") "$#"))
+                            (eval cperl--normal-identifier-rx)))
+              1
+;;          ("\\(\\([@%]\\|\\$#\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)" 1
              (if (eq (char-after (match-beginning 2)) ?%)
                  'cperl-hash-face
                'cperl-array-face)
              nil)                      ; arrays and hashes
-            ("\\(\\([$@%]+\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)[ \t]*\\([[{]\\)"
+             ;; access to array/hash elements
+             (,(rx (group-n 1 (group-n 2 (in "$@%"))
+                            (eval cperl--normal-identifier-rx))
+                   (0+ blank)
+                   (group-n 3 (in "[{")))
+;;          ("\\(\\([$@%]+\\)[a-zA-Z_:][a-zA-Z0-9_:]*\\)[ \t]*\\([[{]\\)"
              1
              (if (= (- (match-end 2) (match-beginning 2)) 1)
                  (if (eq (char-after (match-beginning 3)) ?{)
                      'cperl-hash-face
                    'cperl-array-face)             ; arrays and hashes
                font-lock-variable-name-face)      ; Just to put something
-             t)
-            ("\\(@\\|\\$#\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
+             t)                                   ; override previous
+             ;; @$ array dereferences, $#$ last array index
+             (,(rx (group-n 1 (or "@" "$#"))
+                   (group-n 2 (sequence "$"
+                                        (or (eval cperl--normal-identifier-rx)
+                                            (not (in " \t\n"))))))
+            ;; ("\\(@\\|\\$#\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
              (1 'cperl-array-face)
              (2 font-lock-variable-name-face))
-            ("\\(%\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
+             ;; %$ hash dereferences
+             (,(rx (group-n 1 "%")
+                   (group-n 2 (sequence "$"
+                                        (or (eval cperl--normal-identifier-rx)
+                                            (not (in " \t\n"))))))
+            ;; ("\\(%\\)\\(\\$+\\([a-zA-Z_:][a-zA-Z0-9_:]*\\|[^ \t\n]\\)\\)"
              (1 'cperl-hash-face)
              (2 font-lock-variable-name-face))
 ;;("\\([smy]\\|tr\\)\\([^a-z_A-Z0-9]\\)\\(\\([^\n\\]*||\\)\\)\\2")
@@ -6435,6 +6554,8 @@ Will not move the position at the start to the left."
       (indent-region beg end nil)
       (goto-char beg)
       (setq col (current-column))
+      ;; Assuming that lineup is done on Perl syntax, this regexp
+      ;; doesn't need to be unicode aware -- haj, 2021-09-10
       (if (looking-at "[a-zA-Z0-9_]")
          (if (looking-at "\\<[a-zA-Z0-9_]+\\>")
              (setq search
@@ -6472,6 +6593,9 @@ Will not move the position at the start to the left."
   "Run etags with appropriate options for Perl files.
 If optional argument ALL is `recursive', will process Perl files
 in subdirectories too."
+  ;; Apparently etags doesn't support UTF-8 encoded sources, and usage
+  ;; of etags has been commented out in the menu since ... well,
+  ;; forever.  So, let's just stick to ASCII here. -- haj, 2021-09-14
   (interactive)
   (let ((cmd "etags")
        (args `("-l" "none" "-r"
@@ -6611,6 +6735,9 @@ Does not move point."
     ;; Search for the function
     (progn ;;save-match-data
       (while (re-search-forward
+              ;; FIXME: Should XS code be unicode aware?  Recent C
+              ;; compilers (Gcc 10+) are, but I guess this isn't used
+              ;; much. -- haj, 2021-09-14
              "^\\([ \t]*MODULE\\>[^\n]*\\<PACKAGE[ \t]*=[ \t]*\\([a-zA-Z_][a-zA-Z_0-9:]*\\)\\>\\|\\([a-zA-Z_][a-zA-Z_0-9]*\\)(\\|[ \t]*BOOT:\\)"
              nil t)
        (cond
@@ -6673,7 +6800,7 @@ Does not move point."
        (setq lst
              (mapcar
                (lambda (elt)
-                 (cond ((string-match "^[_a-zA-Z]" (car elt))
+                 (cond ((string-match (rx line-start (or alpha "_")) (car elt))
                         (goto-char (cdr elt))
                         (beginning-of-line) ; pos should be of the start of the line
                         (list (car elt)
@@ -6703,9 +6830,14 @@ Does not move point."
                        ","
                        (number-to-string (1- (elt elt 1))) ; Char pos 0-based
                        "\n")
-               (if (and (string-match "^[_a-zA-Z]+::" (car elt))
-                        (string-match (concat "^" cperl-sub-regexp "[ \t]+\\([_a-zA-Z]+\\)[^:_a-zA-Z]")
-                                      (elt elt 3)))
+               (if (and (string-match (rx line-start
+                                           (eval cperl--basic-identifier-rx) "++")
+                                       (car elt))
+                         (string-match (rx-to-string `(sequence line-start
+                                                                (regexp ,cperl-sub-regexp)
+                                                                (1+ (in " \t"))
+                                                                ,cperl--normal-identifier-rx))
+                                       (elt elt 3)))
                    ;; Need to insert the name without package as well
                    (setq lst (cons (cons (substring (elt elt 3)
                                                     (match-beginning 1)
@@ -7155,14 +7287,14 @@ Currently it is tuned to C and Perl syntax."
   ;;(concat "\\("
   (mapconcat
    #'identity
-   '("[$@%*&][0-9a-zA-Z_:]+\\([ \t]*[[{]\\)?" ; Usual variable
+   '("[$@%*&][[:alnum:]_:]+\\([ \t]*[[{]\\)?" ; Usual variable
      "[$@]\\^[a-zA-Z]"                 ; Special variable
      "[$@][^ \n\t]"                    ; Special variable
      "-[a-zA-Z]"                       ; File test
      "\\\\[a-zA-Z0]"                   ; Special chars
      "^=[a-z][a-zA-Z0-9_]*"            ; POD sections
      "[-!&*+,./<=>?\\^|~]+"            ; Operator
-     "[a-zA-Z_0-9:]+"                  ; symbol or number
+     "[[:alnum:]_:]+"                  ; symbol or number
      "x="
      "#!")
    ;;"\\)\\|\\("
@@ -7178,7 +7310,7 @@ Currently it is tuned to C and Perl syntax."
   ;; Does not save-excursion
   ;; Get to the something meaningful
   (or (eobp) (eolp) (forward-char 1))
-  (re-search-backward "[-a-zA-Z0-9_:!&*+,./<=>?\\^|~$%@]"
+  (re-search-backward "[-[:alnum:]_:!&*+,./<=>?\\^|~$%@]"
                      (point-at-bol)
                      'to-beg)
   ;;  (cond
@@ -7187,8 +7319,8 @@ Currently it is tuned to C and Perl syntax."
   ;;    (or (bobp) (backward-char 1))))
   ;; Try to backtrace
   (cond
-   ((looking-at "[a-zA-Z0-9_:]")       ; symbol
-    (skip-chars-backward "a-zA-Z0-9_:")
+   ((looking-at "[[:alnum:]_:]")       ; symbol
+    (skip-chars-backward "[:alnum:]_:")
     (cond
      ((and (eq (preceding-char) ?^)    ; $^I
           (eq (char-after (- (point) 2)) ?\$))
@@ -7199,7 +7331,7 @@ Currently it is tuned to C and Perl syntax."
           (eq (current-column) 1))
       (forward-char -1)))              ; =head1
     (if (and (eq (preceding-char) ?\<)
-            (looking-at "\\$?[a-zA-Z0-9_:]+>")) ; <FH>
+             (looking-at "\\$?[[:alnum:]_:]+>")) ; <FH>
        (forward-char -1)))
    ((and (looking-at "=") (eq (preceding-char) ?x)) ; x=
     (forward-char -1))
@@ -7212,15 +7344,15 @@ Currently it is tuned to C and Perl syntax."
           (not (eq (char-after (- (point) 2)) ?\$))) ; $-
       (forward-char -1))
      ((and (eq (following-char) ?\>)
-          (string-match "[a-zA-Z0-9_]" (char-to-string (preceding-char)))
+          (string-match "[[:alnum:]_]" (char-to-string (preceding-char)))
           (save-excursion
             (forward-sexp -1)
             (and (eq (preceding-char) ?\<)
-                 (looking-at "\\$?[a-zA-Z0-9_:]+>")))) ; <FH>
+                 (looking-at "\\$?[[:alnum:]_:]+>")))) ; <FH>
       (search-backward "<"))))
    ((and (eq (following-char) ?\$)
         (eq (preceding-char) ?\<)
-        (looking-at "\\$?[a-zA-Z0-9_:]+>")) ; <$fh>
+        (looking-at "\\$?[[:alnum:]_:]+>")) ; <$fh>
     (forward-char -1)))
   (if (looking-at cperl-have-help-regexp)
       (buffer-substring (match-beginning 0) (match-end 0))))
index c05fd7efc2a870bf7d1319b63f9242c086b6fc1c..96a869930827426ee79d0ab3ede65652e1112135 100644 (file)
@@ -1,6 +1,7 @@
 use 5.024;
 use strict;
 use warnings;
+use utf8;
 
 sub outside {
     say "Line @{[__LINE__]}: package '@{[__PACKAGE__]}'";
@@ -155,4 +156,17 @@ package :: {
 
 Shoved::elsewhere();
 
+# Finally, try unicode identifiers.
+package Erdős::Number;
+
+sub erdős_number {
+    my $name = shift;
+    if ($name eq  "Erdős Pál") {
+       return 0;
+    }
+    else {
+        die "No access to the database. Sorry.";
+    }
+}
+
 1;
index 54012c3918ef11d6b777c7211b2d439fbbb3c063..29b9e3f6fb93a4dcb6e8b07f32c7b3e9479bdf29 100644 (file)
@@ -154,6 +154,22 @@ point in the distant past, and is still broken in perl-mode. "
     (should (equal (get-text-property (match-beginning 0) 'face)
                    'font-lock-keyword-face))))
 
+(ert-deftest cperl-test-fontify-special-variables ()
+  "Test fontification of variables like $^T or ${^ENCODING}.
+These can occur as \"local\" aliases."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  (with-temp-buffer
+    (insert "local ($^I, ${^UNICODE});\n")
+    (goto-char (point-min))
+    (funcall cperl-test-mode)
+    (font-lock-ensure)
+    (search-forward "$")
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-variable-name-face))
+    (search-forward "$")
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-variable-name-face))))
+
 (ert-deftest cperl-test-identify-heredoc ()
   "Test whether a construct containing \"<<\" followed by a
   bareword is properly identified for a here-document if
@@ -297,6 +313,7 @@ the whole string."
 
 (ert-deftest cperl-test-ws-rx ()
   "Tests capture of very simple regular expressions (yawn)."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
   (let ((valid
         '(" " "\t" "\n"))
        (invalid
@@ -306,6 +323,7 @@ the whole string."
 
 (ert-deftest cperl-test-ws+-rx ()
   "Tests sequences of whitespace and comment lines."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
   (let ((valid
         `(" " "\t#\n" "\n# \n"
           ,(concat "# comment\n" "# comment\n" "\n" "#comment\n")))
@@ -316,6 +334,7 @@ the whole string."
 
 (ert-deftest cperl-test-version-regexp ()
   "Tests the regexp for recommended syntax of versions in Perl."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
   (let ((valid
         '("1" "1.1" "1.1_1" "5.032001"
           "v120.100.103"))
@@ -331,6 +350,7 @@ the whole string."
 (ert-deftest cperl-test-package-regexp ()
   "Tests the regular expression of Perl package names with versions.
 Also includes valid cases with whitespace in strange places."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
   (let ((valid
         '("package Foo"
           "package Foo::Bar"
@@ -346,6 +366,284 @@ Also includes valid cases with whitespace in strange places."
     (cperl-test--validate-regexp (rx (eval cperl--package-rx))
                                 valid invalid)))
 
+(ert-deftest cperl-test-identifier-rx ()
+  "Test valid and invalid identifiers (no sigils)."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  (let ((valid
+         '("foo" "FOO" "f_oo" "a123"
+           "manĝis"))                   ; Unicode is allowed!
+        (invalid
+         '("$foo"                       ; no sigils allowed (yet)
+           "Foo::bar"                   ; no package qualifiers allowed
+           "lots_of_€")))               ; € is not alphabetic
+    (cperl-test--validate-regexp (rx (eval cperl--basic-identifier-rx))
+                                 valid invalid)))
+
+;;; Test unicode identifier in various places
+
+(defun cperl--test-unicode-setup (code string)
+  "Insert CODE, prepare it for tests, and find STRING.
+Invoke the appropriate major mode, ensure fontification, and set
+point after the first occurrence of STRING (no regexp!)."
+  (insert code)
+  (funcall cperl-test-mode)
+  (font-lock-ensure)
+  (goto-char (point-min))
+  (search-forward string))
+
+(ert-deftest cperl-test-unicode-labels ()
+  "Verify that non-ASCII labels are processed correctly."
+  (with-temp-buffer
+    (cperl--test-unicode-setup "LABEł: for ($manĝi) { say; }" "LAB")
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-constant-face))))
+
+(ert-deftest cperl-test-unicode-sub ()
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     (concat "use strict;\n"            ; distinguish bob from b-o-f
+             "sub ℏ {\n"
+             "  6.62607015e-34\n"
+             "};")
+     "sub ")                            ; point is before "ℏ"
+
+    ;; Testing fontification
+    ;; FIXME 2021-09-10: This tests succeeds because cperl-mode
+    ;; accepts almost anything as a sub name for fontification.  For
+    ;; example, it fontifies "sub @ {...;}" which is a syntax error in
+    ;; Perl.  I let this pass for the moment.
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-function-name-face))
+
+    ;; Testing `beginning-of-defun'.  Not available in perl-mode,
+    ;; where it jumps to the beginning of the buffer.
+    (when (eq cperl-test-mode #'cperl-mode)
+      (goto-char (point-min))
+      (search-forward "-34")
+      (beginning-of-defun)
+     (should (looking-at "sub")))))
+
+(ert-deftest cperl-test-unicode-varname ()
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     (concat "use strict;\n"
+             "my $π = 3.1415926535897932384626433832795028841971;\n"
+             "\n"
+             "my $manĝi = $π;\n"
+             "__END__\n")
+     "my $") ; perl-mode doesn't fontify the sigil, so include it here
+
+    ;; Testing fontification
+    ;; FIXME 2021-09-10: This test succeeds in cperl-mode because the
+    ;; π character is "not ASCII alphabetic", so it treats $π as a
+    ;; punctuation variable.  The following two `should' forms with a
+    ;; longer variable name were added for stronger verification.
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-variable-name-face))
+    ;; Test both ends of a longer variable name
+    (search-forward "my $")             ; again skip the sigil
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-variable-name-face))
+    (search-forward "manĝi")
+    (should (equal (get-text-property (1- (match-end 0)) 'face)
+                   'font-lock-variable-name-face))))
+
+(ert-deftest cperl-test-unicode-varname-list ()
+  "Verify that all elements of a variable list are fontified."
+
+  (let ((hash-face (if (eq cperl-test-mode #'perl-mode)
+                       'perl-non-scalar-variable
+                     'cperl-hash-face))
+        (array-face (if (eq cperl-test-mode #'perl-mode)
+                        'perl-non-scalar-variable
+                      'cperl-array-face)))
+    (with-temp-buffer
+      (cperl--test-unicode-setup
+       "my (%äsh,@ärräy,$scâlâr);" "%")
+      (should (equal (get-text-property (point) 'face)
+                     hash-face))
+      (search-forward "@")
+      (should (equal (get-text-property (point) 'face)
+                     array-face))
+      (search-forward "scâlâr")
+      (should (equal (get-text-property (match-beginning 0) 'face)
+                     'font-lock-variable-name-face))
+      (should (equal (get-text-property (1- (match-end 0)) 'face)
+                     'font-lock-variable-name-face)))
+
+      ;; Now with package-qualified variables
+    (with-temp-buffer
+      (cperl--test-unicode-setup
+       "local (%Søme::äsh,@Søme::ärräy,$Søme::scâlâr);" "%")
+      (should (equal (get-text-property (point) 'face)
+                     hash-face))
+      (search-forward "Søme::")         ; test basic identifier
+      (should (equal (get-text-property (point) 'face)
+                     hash-face))
+      (search-forward "@")              ; test package name
+      (should (equal (get-text-property (point) 'face)
+                     array-face))
+      (search-forward "Søme::")         ; test basic identifier
+      (should (equal (get-text-property (point) 'face)
+                     array-face))
+      (search-forward "Søme")           ; test package name
+      (should (equal (get-text-property (match-beginning 0) 'face)
+                     'font-lock-variable-name-face))
+      (should (equal (get-text-property (1- (match-end 0)) 'face)
+                     'font-lock-variable-name-face))
+      (search-forward "scâlâr")         ; test basic identifier
+      (should (equal (get-text-property (match-beginning 0) 'face)
+                     'font-lock-variable-name-face))
+      (should (equal (get-text-property (1- (match-end 0)) 'face)
+                     'font-lock-variable-name-face)))))
+
+(ert-deftest cperl-test-unicode-arrays ()
+  "Test fontification of array access."
+  ;; Perl mode just looks at the sigil, for element access
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  ;; simple array element
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "$ärräy[1] = 7;" "$")
+    (should (equal (get-text-property (point) 'face)
+                   'cperl-array-face)))
+  ;; array slice
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "@ärräy[(1..3)] = (4..6);" "@")
+    (should (equal (get-text-property (point) 'face)
+                     'cperl-array-face)))
+  ;; array max index
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "$#ärräy = 1;" "$")
+    (should (equal (get-text-property (point) 'face)
+                   'cperl-array-face)))
+  ;; array dereference
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "@$ärräy = (1,2,3)" "@")
+    (should (equal (get-text-property (1- (point)) 'face)
+                   'cperl-array-face))
+    (should (equal (get-text-property (1+ (point)) 'face)
+                   'font-lock-variable-name-face))))
+
+(ert-deftest cperl-test-unicode-hashes ()
+  "Test fontification of hash access."
+  ;; Perl mode just looks at the sigil, for element access
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  ;; simple hash element
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "$häsh{'a'} = 7;" "$")
+    (should (equal (get-text-property (point) 'face)
+                   'cperl-hash-face)))
+  ;; hash array slice
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "@häsh{(1..3)} = (4..6);" "@")
+    (should (equal (get-text-property (point) 'face)
+                     'cperl-hash-face)))
+  ;; hash subset
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "my %hash = %häsh{'a',2,3};" "= %")
+    (should (equal (get-text-property (point) 'face)
+                   'cperl-hash-face)))
+  ;; hash dereference
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "%$äsh = (key => 'value');" "%")
+    (should (equal (get-text-property (1- (point)) 'face)
+                   'cperl-hash-face))
+    (should (equal (get-text-property (1+ (point)) 'face)
+                   'font-lock-variable-name-face))))
+
+(ert-deftest cperl-test-unicode-hashref ()
+  "Verify that a hashref access disambiguates {s}.
+CPerl mode takes the token \"s\" as a substitution unless
+detected otherwise.  Not for perl-mode: it doesn't stringify
+bareword hash keys and doesn't recognize a substitution
+\"s}foo}bar}\""
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  (with-temp-buffer
+    (cperl--test-unicode-setup "$häshref->{s} # }}" "{")
+    (should (equal (get-text-property (point) 'face)
+            'font-lock-string-face))
+    (should (equal (get-text-property (1+ (point)) 'face)
+            nil))))
+
+(ert-deftest cperl-test-unicode-proto ()
+  ;; perl-mode doesn't fontify prototypes at all
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     (concat "sub prötötyped ($) {\n"
+             "  ...;"
+             "}\n")
+     "prötötyped (")
+
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-string-face))))
+
+(ert-deftest cperl-test-unicode-fhs ()
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     (concat "while (<BAREWÖRD>) {\n"
+             "    ...;)\n"
+             "}\n")
+     "while (<") ; point is before the first char of the handle
+    ;; Testing fontification
+    ;; FIXME 2021-09-10: perl-mode.el and cperl-mode.el handle these
+    ;; completely differently.  perl-mode interprets barewords as
+    ;; constants, cperl-mode does not fontify them.  Both treat
+    ;; non-barewords as globs, which are not fontified by perl-mode,
+    ;; but fontified as strings in cperl-mode.  We keep (and test)
+    ;; that behavior "as is" because both bareword filehandles and
+    ;; <glob> syntax are no longer recommended.
+    (let ((bareword-face
+           (if (equal cperl-test-mode 'perl-mode) 'font-lock-constant-face
+             nil)))
+            (should (equal (get-text-property (point) 'face)
+                     bareword-face)))))
+
+(ert-deftest cperl-test-unicode-hashkeys ()
+  "Test stringification of bareword hash keys.  Not in perl-mode.
+perl-mode generally does not stringify bareword hash keys."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  ;; Plain hash key
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "$häsh { kéy }" "{ ")
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-string-face)))
+  ;; Nested hash key
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "$häsh { kéy } { kèy }" "} { ")
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-string-face)))
+  ;; Key => value
+  (with-temp-buffer
+    (cperl--test-unicode-setup
+     "( kéy => 'value'," "( ")
+    (should (equal (get-text-property (point) 'face)
+                   'font-lock-string-face))))
+
+(ert-deftest cperl-test-word-at-point ()
+  "Test whether the function captures non-ASCII words."
+  (skip-unless (eq cperl-test-mode #'cperl-mode))
+  (let ((words '("rôle" "café" "ångström"
+                 "Data::Dump::dump"
+                 "_underscore")))
+    (dolist (word words)
+      (with-temp-buffer
+        (insert " + ")                  ; this will be the suffix
+        (beginning-of-line)
+        (insert ")")                    ; A non-word char
+        (insert word)
+        (should (string= word (cperl-word-at-point-hard)))))))
+
 ;;; Function test: Building an index for imenu
 
 (ert-deftest cperl-test-imenu-index ()
@@ -369,7 +667,8 @@ created by CPerl mode, so skip it for Perl mode."
                         "Versioned::Package::outer"
                         "lexical"
                         "Versioned::Block::signatured"
-                        "Package::in_package_again")))
+                        "Package::in_package_again"
+                        "Erdős::Number::erdős_number")))
         (dolist (sub expected)
           (should (assoc-string sub index)))))))