From ce7b18ec41c5102f4af27ec22cf873a75f510630 Mon Sep 17 00:00:00 2001 From: Chong Yidong Date: Wed, 9 May 2012 16:33:48 +0800 Subject: [PATCH] Improve RFC 3986 conformance of url package. Fix 2012-04-10 change to url.el. * url-http.el (url-http-create-request): Ignore obsolete attributes slot of url-object. * url-parse.el: Improve RFC 3986 conformance. (url-generic-parse-url): Do not populate the ATTRIBUTES slot, since this is not reliable for general RFC 3986 URIs. Keep the whole path and query inside the FILENAME slot. Improve docstring. (url-recreate-url-attributes): Mark as obsolete. (url-recreate-url): Handle missing scheme and userinfo. * url-util.el (url-encode-url): New function for URL quoting. (url-encoding-table, url-host-allowed-chars) (url-path-allowed-chars): New constants. (url--allowed-chars): New helper function. (url-hexify-string): Use them. * url-vars.el (url-nonrelative-link): Make the regexp stricter. * url.el (url-retrieve-internal): Use url-encode-url. Fixes: debbugs:7017 --- etc/NEWS | 12 +++ lisp/url/ChangeLog | 22 +++++ lisp/url/url-http.el | 3 +- lisp/url/url-parse.el | 208 ++++++++++++++++++++++++------------------ lisp/url/url-util.el | 135 +++++++++++++++++++++------ lisp/url/url-vars.el | 6 +- lisp/url/url.el | 10 +- 7 files changed, 271 insertions(+), 125 deletions(-) diff --git a/etc/NEWS b/etc/NEWS index 9d011df6feb..921f44bbcce 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -202,6 +202,18 @@ server properties. *** New command `tabulated-list-sort', bound to `S', sorts the column at point, or the Nth column if a numeric prefix argument is given. +** URL + +*** Structs made by `url-generic-parse-url' have nil `attributes' slot. +Previously, this slot stored semicolon-separated attribute-value pairs +appended to some imap URLs, but this is not compatible with RFC 3986. +So now the `filename' slot stores the entire path and query components +and the `attributes' slot is always nil. + +*** New function `url-encode-url' for encoding a URI string. +The `url-retrieve' function now uses this to encode its URL argument, +in case that is not properly encoded. + ** Obsolete packages: *** assoc.el diff --git a/lisp/url/ChangeLog b/lisp/url/ChangeLog index 7ce3489cfcc..3980b22d4c1 100644 --- a/lisp/url/ChangeLog +++ b/lisp/url/ChangeLog @@ -1,3 +1,25 @@ +2012-05-09 Chong Yidong + + * url-util.el (url-encode-url): New function for URL quoting. + (url-encoding-table, url-host-allowed-chars) + (url-path-allowed-chars): New constants. + (url--allowed-chars): New helper function. + (url-hexify-string): Use them. + + * url-parse.el: Improve RFC 3986 conformance. + (url-generic-parse-url): Do not populate the ATTRIBUTES slot, + since this is not reliable for general RFC 3986 URIs. Keep the + whole path and query inside the FILENAME slot. Improve docstring. + (url-recreate-url-attributes): Mark as obsolete. + (url-recreate-url): Handle missing scheme and userinfo. + + * url-http.el (url-http-create-request): Ignore obsolete + attributes slot of url-object. + + * url-vars.el (url-nonrelative-link): Make the regexp stricter. + + * url.el (url-retrieve-internal): Use url-encode-url (Bug#7017). + 2012-04-26 Stefan Monnier * url.el (url-retrieve-synchronously): Replace lexical-let by diff --git a/lisp/url/url-http.el b/lisp/url/url-http.el index a4726489814..ff026da2d21 100644 --- a/lisp/url/url-http.el +++ b/lisp/url/url-http.el @@ -223,8 +223,7 @@ request.") (let ((url-basic-auth-storage 'url-http-proxy-basic-auth-storage)) (url-get-authentication url-http-target-url nil 'any nil)))) - (real-fname (concat (url-filename url-http-target-url) - (url-recreate-url-attributes url-http-target-url))) + (real-fname (url-filename url-http-target-url)) (host (url-host url-http-target-url)) (auth (if (cdr-safe (assoc "Authorization" url-http-extra-headers)) nil diff --git a/lisp/url/url-parse.el b/lisp/url/url-parse.el index b91c85c0c3d..40183a4f533 100644 --- a/lisp/url/url-parse.el +++ b/lisp/url/url-parse.el @@ -48,21 +48,31 @@ ;;;###autoload (defun url-recreate-url (urlobj) "Recreate a URL string from the parsed URLOBJ." - (concat (url-type urlobj) ":" (if (url-host urlobj) "//" "") - (if (url-user urlobj) - (concat (url-user urlobj) - (if (url-password urlobj) - (concat ":" (url-password urlobj))) - "@")) - (url-host urlobj) - (if (and (url-port urlobj) - (not (equal (url-port urlobj) - (url-scheme-get-property (url-type urlobj) 'default-port)))) - (format ":%d" (url-port urlobj))) - (or (url-filename urlobj) "/") - (url-recreate-url-attributes urlobj) - (if (url-target urlobj) - (concat "#" (url-target urlobj))))) + (let ((type (url-type urlobj)) + (user (url-user urlobj)) + (pass (url-password urlobj)) + (host (url-host urlobj)) + (port (url-portspec urlobj)) + (file (url-filename urlobj)) + (frag (url-target urlobj))) + (concat (if type (concat type ":")) + (if (url-fullness urlobj) "//") + (if (or user pass) + (concat user + (if pass (concat ":" pass)) + "@")) + host + ;; RFC 3986: "omit the port component and its : delimiter + ;; if port is empty or if its value would be the same as + ;; that of the scheme's default." + (and port + (or (null type) + (not (equal port + (url-scheme-get-property type + 'default-port)))) + (format ":%d" (url-port urlobj))) + (or file "/") + (if frag (concat "#" frag))))) (defun url-recreate-url-attributes (urlobj) "Recreate the attributes of an URL string from the parsed URLOBJ." @@ -73,107 +83,129 @@ (concat (car x) "=" (cdr x)) (car x))) (url-attributes urlobj) ";")))) +(make-obsolete 'url-recreate-url-attributes nil "24.2") ;;;###autoload (defun url-generic-parse-url (url) "Return an URL-struct of the parts of URL. The CL-style struct contains the following fields: -TYPE USER PASSWORD HOST PORTSPEC FILENAME TARGET ATTRIBUTES FULLNESS." - ;; See RFC 3986. - (cond - ((null url) - (url-parse-make-urlobj)) - ((or (not (string-match url-nonrelative-link url)) - (= ?/ (string-to-char url))) - ;; This isn't correct, as a relative URL can be a fragment link - ;; (e.g. "#foo") and many other things (see section 4.2). - ;; However, let's not fix something that isn't broken, especially - ;; when close to a release. - (url-parse-make-urlobj nil nil nil nil nil url)) - (t + +TYPE is the URI scheme (string or nil). +USER is the user name (string or nil). +PASSWORD is the password (string [deprecated] or nil). +HOST is the host (a registered name, IP literal in square + brackets, or IPv4 address in dotted-decimal form). +PORTSPEC is the specified port (a number), or nil. +FILENAME is the path AND the query component of the URI. +TARGET is the fragment identifier component (used to refer to a + subordinate resource, e.g. a part of a webpage). +ATTRIBUTES is nil; this slot originally stored the attribute and + value alists for IMAP URIs, but this feature was removed + since it conflicts with RFC 3986. +FULLNESS is non-nil iff the authority component of the URI is + present. + +The parser follows RFC 3986, except that it also tries to handle +URIs that are not fully specified (e.g. lacking TYPE), and it +does not check for or perform %-encoding. + +Here is an example. The URL + + foo://bob:pass@example.com:42/a/b/c.dtb?type=animal&name=narwhal#nose + +parses to + + TYPE = \"foo\" + USER = \"bob\" + PASSWORD = \"pass\" + HOST = \"example.com\" + PORTSPEC = 42 + FILENAME = \"/a/b/c.dtb?type=animal&name=narwhal\" + TARGET = \"nose\" + ATTRIBUTES = nil + FULLNESS = t" + (if (null url) + (url-parse-make-urlobj) (with-temp-buffer ;; Don't let those temp-buffer modifications accidentally ;; deactivate the mark of the current-buffer. (let ((deactivate-mark nil)) (set-syntax-table url-parse-syntax-table) - (let ((save-pos nil) - (prot nil) - (user nil) - (pass nil) - (host nil) - (port nil) - (file nil) - (refs nil) - (attr nil) - (full nil) + (erase-buffer) + (insert url) + (goto-char (point-min)) + (let ((save-pos (point)) + scheme user pass host port file fragment full (inhibit-read-only t)) - (erase-buffer) - (insert url) - (goto-char (point-min)) - (setq save-pos (point)) ;; 3.1. Scheme - (unless (looking-at "//") - (skip-chars-forward "a-zA-Z+.\\-") - (downcase-region save-pos (point)) - (setq prot (buffer-substring save-pos (point))) - (skip-chars-forward ":") - (setq save-pos (point))) + ;; This is nil for a URI that is not fully specified. + (when (looking-at "\\([a-zA-Z][-a-zA-Z0-9+.]*\\):") + (goto-char (match-end 0)) + (setq save-pos (point)) + (setq scheme (downcase (match-string 1)))) ;; 3.2. Authority (when (looking-at "//") (setq full t) (forward-char 2) (setq save-pos (point)) - (skip-chars-forward "^/") + (skip-chars-forward "^/?#") (setq host (buffer-substring save-pos (point))) + ;; 3.2.1 User Information (if (string-match "^\\([^@]+\\)@" host) (setq user (match-string 1 host) - host (substring host (match-end 0) nil))) - (if (and user (string-match "\\([^:]+\\):\\(.*\\)" user)) + host (substring host (match-end 0)))) + (if (and user (string-match "\\`\\([^:]*\\):\\(.*\\)" user)) (setq pass (match-string 2 user) user (match-string 1 user))) - ;; This gives wrong results for IPv6 literal addresses. - (if (string-match ":\\([0-9+]+\\)" host) - (setq port (string-to-number (match-string 1 host)) - host (substring host 0 (match-beginning 0)))) - (if (string-match ":$" host) - (setq host (substring host 0 (match-beginning 0)))) - (setq host (downcase host) - save-pos (point))) - - (if (not port) - (setq port (url-scheme-get-property prot 'default-port))) - - ;; 3.3. Path - ;; Gross hack to preserve ';' in data URLs + (cond + ;; IPv6 literal address. + ((string-match "^\\(\\[[^]]+\\]\\)\\(?::\\([0-9]*\\)\\)?$" host) + (setq port (match-string 2 host) + host (match-string 1 host))) + ;; Registered name or IPv4 address. + ((string-match ":\\([0-9]*\\)$" host) + (setq port (match-string 1 host) + host (substring host 0 (match-beginning 0))))) + (cond ((equal port "") + (setq port nil)) + (port + (setq port (string-to-number port)))) + (setq host (downcase host))) + + (and (null port) + scheme + (setq port (url-scheme-get-property scheme 'default-port))) + + ;; Now point is on the / ? or # which terminates the + ;; authority, or at the end of the URI, or (if there is no + ;; authority) at the beginning of the absolute path. + (setq save-pos (point)) + (if (string= "data" scheme) + ;; For the "data" URI scheme, all the rest is the FILE. + (setq file (buffer-substring save-pos (point-max))) + ;; For hysterical raisins, our data structure returns the + ;; path and query components together in one slot. + ;; 3.3. Path + (skip-chars-forward "^?#") + ;; 3.4. Query + (when (looking-at "?") + (skip-chars-forward "^#")) + (setq file (buffer-substring save-pos (point))) + ;; 3.5 Fragment + (when (looking-at "#") + (let ((opoint (point))) + (forward-char 1) + (unless (eobp) + (setq fragment (buffer-substring (point) (point-max)))) + (delete-region opoint (point-max))))) - ;; 3.4. Query - (if (string= "data" prot) - (goto-char (point-max)) - ;; Now check for references - (skip-chars-forward "^#") - (if (eobp) - nil - (delete-region - (point) - (progn - (skip-chars-forward "#") - (setq refs (buffer-substring (point) (point-max))) - (point-max)))) - (goto-char save-pos) - (skip-chars-forward "^;") - (unless (eobp) - (setq attr (url-parse-args (buffer-substring (point) (point-max)) - t) - attr (nreverse attr)))) - - (setq file (buffer-substring save-pos (point))) (if (and host (string-match "%[0-9][0-9]" host)) (setq host (url-unhex-string host))) - (url-parse-make-urlobj - prot user pass host port file refs attr full))))))) + (url-parse-make-urlobj scheme user pass host port file + fragment nil full)))))) (defmacro url-bit-for-url (method lookfor url) `(let* ((urlobj (url-generic-parse-url url)) diff --git a/lisp/url/url-util.el b/lisp/url/url-util.el index d12bd5447fa..c8016ef6cdb 100644 --- a/lisp/url/url-util.el +++ b/lisp/url/url-util.el @@ -333,40 +333,117 @@ forbidden in URL encoding." (concat tmp str))) (defconst url-unreserved-chars - '( - ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z + '(?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z ?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 - ?- ?_ ?. ?! ?~ ?* ?' ?\( ?\)) - "A list of characters that are _NOT_ reserved in the URL spec. -This is taken from RFC 2396.") + ?- ?_ ?. ?~) + "List of characters that are unreserved in the URL spec. +This is taken from RFC 3986 (section 2.3).") + +(defconst url-encoding-table + (let ((vec (make-vector 256 nil))) + (dotimes (byte 256) + (aset vec byte (format "%%%02x" byte))) + vec) + "Vector translating bytes to URI-encoded %-sequences.") + +(defun url--allowed-chars (char-list) + "Return an \"allowed character\" mask (a 256-slot vector). +The Nth element is non-nil if character N is in CHAR-LIST. The +result can be passed as the second arg to `url-hexify-string'." + (let ((vec (make-vector 256 nil))) + (dolist (byte char-list) + (ignore-errors (aset vec byte t))) + vec)) ;;;###autoload -(defun url-hexify-string (string) - "Return a new string that is STRING URI-encoded. -First, STRING is converted to utf-8, if necessary. Then, for each -character in the utf-8 string, those found in `url-unreserved-chars' -are left as-is, all others are represented as a three-character -string: \"%\" followed by two lowercase hex digits." - ;; To go faster and avoid a lot of consing, we could do: - ;; - ;; (defconst url-hexify-table - ;; (let ((map (make-vector 256 nil))) - ;; (dotimes (byte 256) (aset map byte - ;; (if (memq byte url-unreserved-chars) - ;; (char-to-string byte) - ;; (format "%%%02x" byte)))) - ;; map)) - ;; - ;; (mapconcat (curry 'aref url-hexify-table) ...) +(defun url-hexify-string (string &optional allowed-chars) + "URI-encode STRING and return the result. +If STRING is multibyte, it is first converted to a utf-8 byte +string. Each byte corresponding to an allowed character is left +as-is, while all other bytes are converted to a three-character +string: \"%\" followed by two lowercase hex digits. + +The allowed characters are specified by ALLOWED-CHARS. If this +argument is nil, the list `url-unreserved-chars' determines the +allowed characters. Otherwise, ALLOWED-CHARS should be a vector +whose Nth element is non-nil if character N is allowed." + (unless allowed-chars + (setq allowed-chars (url--allowed-chars url-unreserved-chars))) (mapconcat (lambda (byte) - (if (memq byte url-unreserved-chars) - (char-to-string byte) - (format "%%%02x" byte))) - (if (multibyte-string-p string) - (encode-coding-string string 'utf-8) - string) - "")) + (if (aref allowed-chars byte) + (char-to-string byte) + (aref url-encoding-table byte))) + (if (multibyte-string-p string) + (encode-coding-string string 'utf-8) + string) + "")) + +(defconst url-host-allowed-chars + ;; Allow % to avoid re-encoding %-encoded sequences. + (url--allowed-chars (append '(?% ?! ?$ ?& ?' ?\( ?\) ?* ?+ ?, ?\; ?=) + url-unreserved-chars)) + "Allowed-character byte mask for the host segment of a URI. +These characters are specified in RFC 3986, Appendix A.") + +(defconst url-path-allowed-chars + (let ((vec (copy-sequence url-host-allowed-chars))) + (aset vec ?/ t) + (aset vec ?: t) + (aset vec ?@ t) + vec) + "Allowed-character byte mask for the path segment of a URI. +These characters are specified in RFC 3986, Appendix A.") + +(defconst url-query-allowed-chars + (let ((vec (copy-sequence url-path-allowed-chars))) + (aset vec ?? t) + vec) + "Allowed-character byte mask for the query segment of a URI. +These characters are specified in RFC 3986, Appendix A.") + +;;;###autoload +(defun url-encode-url (url) + "Return a properly URI-encoded version of URL. +This function also performs URI normalization, e.g. converting +the scheme to lowercase if it is uppercase. Apart from +normalization, if URL is already URI-encoded, this function +should return it unchanged." + (if (multibyte-string-p url) + (setq url (encode-coding-string url 'utf-8))) + (let* ((obj (url-generic-parse-url url)) + (user (url-user obj)) + (pass (url-password obj)) + (host (url-host obj)) + (file (url-filename obj)) + (frag (url-target obj)) + path query) + (if user + (setf (url-user obj) (url-hexify-string user))) + (if pass + (setf (url-password obj) (url-hexify-string pass))) + (when host + ;; No special encoding for IPv6 literals. + (unless (string-match "\\`\\[.*\\]\\'" host) + (setf (url-host obj) + (url-hexify-string host url-host-allowed-chars)))) + ;; Split FILENAME slot into its PATH and QUERY components, and + ;; encode them separately. The PATH component can contain + ;; unreserved characters, %-encodings, and /:@!$&'()*+,;= + (when file + (if (string-match "\\?" file) + (setq path (substring file 0 (match-beginning 0)) + query (substring file (match-end 0))) + (setq path file)) + (setq path (url-hexify-string path url-path-allowed-chars)) + (if query + (setq query (url-hexify-string query url-query-allowed-chars))) + (setf (url-filename obj) + (if query (concat path "?" query) path))) + (if frag + (setf (url-target obj) + (url-hexify-string frag url-query-allowed-chars))) + (url-recreate-url obj))) ;;;###autoload (defun url-file-extension (fname &optional x) diff --git a/lisp/url/url-vars.el b/lisp/url/url-vars.el index ff18049e97b..0d71910849f 100644 --- a/lisp/url/url-vars.el +++ b/lisp/url/url-vars.el @@ -304,8 +304,12 @@ undefined." :type '(choice (const :tag "None" :value nil) string) :group 'url) +;; From RFC3986: Scheme names consist of a sequence of characters +;; beginning with a letter and followed by any combination of letters, +;; digits, plus ("+"), period ("."), or hyphen ("-"). + (defvar url-nonrelative-link - "\\`\\([-a-zA-Z0-9+.]+:\\)" + "\\`\\([a-zA-Z][-a-zA-Z0-9+.]*:\\)" "A regular expression that will match an absolute URL.") (defcustom url-max-redirections 30 diff --git a/lisp/url/url.el b/lisp/url/url.el index 7884882c6e7..6d276273c2d 100644 --- a/lisp/url/url.el +++ b/lisp/url/url.el @@ -125,7 +125,9 @@ variable in the original buffer as a forwarding pointer.") ;;;###autoload (defun url-retrieve (url callback &optional cbargs silent inhibit-cookies) "Retrieve URL asynchronously and call CALLBACK with CBARGS when finished. -URL is either a string or a parsed URL. +URL is either a string or a parsed URL. If it is a string +containing characters that are not valid in a URI, those +characters are percent-encoded; see `url-encode-url'. CALLBACK is called when the object has been completely retrieved, with the current buffer containing the object, and any MIME headers associated @@ -179,10 +181,8 @@ URL-encoded before it's used." (url-do-setup) (url-gc-dead-buffers) (if (stringp url) - (set-text-properties 0 (length url) nil url)) - (when (multibyte-string-p url) - (let ((url-unreserved-chars (append '(?: ?/) url-unreserved-chars))) - (setq url (url-hexify-string url)))) + (set-text-properties 0 (length url) nil url)) + (setq url (url-encode-url url)) (if (not (vectorp url)) (setq url (url-generic-parse-url url))) (if (not (functionp callback)) -- 2.39.2