From fa04c8b87e50a2e2b0d021958f637be8f475d8bc Mon Sep 17 00:00:00 2001 From: Lars Ingebrigtsen Date: Mon, 29 Jul 2019 14:22:31 +0200 Subject: [PATCH] Add an ISO 8601 parsing library * doc/lispref/os.texi (Time Parsing): Document it. * lisp/calendar/iso8601.el: New file. * test/lisp/calendar/iso8601-tests.el: Test ISO8601 parsing functions. --- doc/lispref/os.texi | 13 + etc/NEWS | 9 + lisp/calendar/iso8601.el | 370 ++++++++++++++++++++++++++++ test/lisp/calendar/iso8601-tests.el | 291 ++++++++++++++++++++++ 4 files changed, 683 insertions(+) create mode 100644 lisp/calendar/iso8601.el create mode 100644 test/lisp/calendar/iso8601-tests.el diff --git a/doc/lispref/os.texi b/doc/lispref/os.texi index d397a125738..b3444838d3b 100644 --- a/doc/lispref/os.texi +++ b/doc/lispref/os.texi @@ -1622,6 +1622,19 @@ ISO 8601 string, like ``Fri, 25 Mar 2016 16:24:56 +0100'' or less well-formed time strings as well. @end defun +@vindex ISO 8601 date/time strings +@defun iso8601-parse string +For a more strict function (that will error out upon invalid input), +this function can be used instead. It's able to parse all variants of +the ISO 8601 standard, so in addition to the formats mentioned above, +it also parses things like ``1998W45-3'' (week number) and +``1998-245'' (ordinal day number). To parse durations, there's +@code{iso8601-parse-duration}, and to parse intervals, there's +@code{iso8601-parse-interval}. All these functions return decoded +time structures, except the final one, which returns three of them +(the start, the end, and the duration). +@end defun + @defun format-time-string format-string &optional time zone This function converts @var{time} (or the current time, if diff --git a/etc/NEWS b/etc/NEWS index 2bdbfcb8d08..7c21cc79307 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -2055,6 +2055,15 @@ of various forms, including a new timestamp form '(TICKS . HZ)', where TICKS is an integer and HZ is a positive integer denoting a clock frequency. The old 'encode-time' API is still supported. ++++ +*** A new package to parse ISO 8601 time, date, durations and +intervals has been added. The main function to use is +'iso8601-parse', but there's also 'iso8601-parse-date', +'iso8601-parse-time', 'iso8601-parse-duration' and +'iso8601-parse-interval'. All these functions return decoded time +structures, except the final one, which returns three of them (start, +end and duration). + +++ *** 'time-add', 'time-subtract', and 'time-less-p' now accept infinities and NaNs too, and propagate them or return nil like diff --git a/lisp/calendar/iso8601.el b/lisp/calendar/iso8601.el new file mode 100644 index 00000000000..ab0077ac58d --- /dev/null +++ b/lisp/calendar/iso8601.el @@ -0,0 +1,370 @@ +;;; iso8601.el --- parse ISO 8601 date/time strings -*- lexical-binding:t -*- + +;; Copyright (C) 2019 Free Software Foundation, Inc. + +;; Keywords: dates + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +;;; Commentary: + +;; ISO8601 times basically look like 1985-04-01T15:23:49... Or so +;; you'd think. This is what everybody means when they say "ISO8601", +;; but it's in reality a quite large collection of syntaxes, including +;; week numbers, ordinal dates, durations and intervals. This package +;; has functions for parsing them all. +;; +;; The interface functions are `iso8601-parse', `iso8601-parse-date', +;; `iso8601-parse-time', `iso8601-parse-zone', +;; `iso8601-parse-duration' and `iso8601-parse-interval'. They all +;; return decoded time objects, except the last one, which returns a +;; list of three of them. +;; +;; (iso8601-parse-interval "P1Y2M10DT2H30M/2008W32T153000-01") +;; '((0 0 13 24 5 2007 nil nil -3600) +;; (0 30 15 3 8 2008 nil nil -3600) +;; (0 30 2 10 2 1 nil nil nil)) +;; +;; +;; The standard can be found at: +;; +;; http://www.loc.gov/standards/datetime/iso-tc154-wg5_n0038_iso_wd_8601-1_2016-02-16.pdf +;; +;; The Wikipedia page on the standard is also informative: +;; +;; https://en.wikipedia.org/wiki/ISO_8601 +;; +;; RFC3339 defines the subset that everybody thinks of as "ISO8601". + +;;; Code: + +(require 'time-date) +(require 'cl-lib) + +(defun iso8601--concat-regexps (regexps) + (mapconcat (lambda (regexp) + (concat "\\(?:" + (replace-regexp-in-string "(" "(?:" regexp) + "\\)")) + regexps "\\|")) + +(defconst iso8601--year-match + "\\([-+]\\)?\\([0-9][0-9][0-9][0-9]\\)") +(defconst iso8601--full-date-match + "\\([-+]\\)?\\([0-9][0-9][0-9][0-9]\\)-?\\([0-9][0-9]\\)-?\\([0-9][0-9]\\)") +(defconst iso8601--without-day-match + "\\([-+]\\)?\\([0-9][0-9][0-9][0-9]\\)-\\([0-9][0-9]\\)") +(defconst iso8601--outdated-date-match + "--\\([0-9][0-9]\\)-?\\([0-9][0-9]\\)") +(defconst iso8601--week-date-match + "\\([-+]\\)?\\([0-9][0-9][0-9][0-9]\\)-?W\\([0-9][0-9]\\)-?\\([0-9]\\)?") +(defconst iso8601--ordinal-date-match + "\\([-+]\\)?\\([0-9][0-9][0-9][0-9]\\)-?\\([0-9][0-9][0-9]\\)") +(defconst iso8601--date-match + (iso8601--concat-regexps + (list iso8601--year-match + iso8601--full-date-match + iso8601--without-day-match + iso8601--outdated-date-match + iso8601--week-date-match + iso8601--ordinal-date-match))) + +(defconst iso8601--time-match + "\\([0-9][0-9]\\):?\\([0-9][0-9]\\)?:?\\([0-9][0-9]\\)?\\.?\\([0-9][0-9][0-9]\\)?") + +(defconst iso8601--zone-match + "\\(Z\\|\\([-+]\\)\\([0-9][0-9]\\):?\\([0-9][0-9]\\)?\\)") + +(defconst iso8601--full-time-match + (concat "\\(" (replace-regexp-in-string "(" "(?:" iso8601--time-match) "\\)" + "\\(" iso8601--zone-match "\\)?")) + +(defconst iso8601--combined-match + (concat "\\(" iso8601--date-match "\\)" + "\\(?:T\\(" + (replace-regexp-in-string "(" "(?:" iso8601--time-match) + "\\)" + "\\(" iso8601--zone-match "\\)?\\)?")) + +(defconst iso8601--duration-full-match + "P\\([0-9]+Y\\)?\\([0-9]+M\\)?\\([0-9]+D\\)?\\(T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+S\\)?\\)?") +(defconst iso8601--duration-week-match + "P\\([0-9]+\\)W") +(defconst iso8601--duration-combined-match + (concat "P" iso8601--combined-match)) +(defconst iso8601--duration-match + (iso8601--concat-regexps + (list iso8601--duration-full-match + iso8601--duration-week-match + iso8601--duration-combined-match))) + +(defun iso8601-parse (string) + "Parse an ISO 8601 date/time string and return a `decoded-time' structure. + +The ISO 8601 date/time strings look like \"2008-03-02T13:47:30\", +but shorter, incomplete strings like \"2008-03-02\" are valid, as +well as variants like \"2008W32\" (week number) and +\"2008-234\" (ordinal day number)." + (if (not (iso8601-valid-p string)) + (signal 'wrong-type-argument string) + (let* ((date-string (match-string 1 string)) + (time-string (match-string 2 string)) + (zone-string (match-string 3 string)) + (date (iso8601-parse-date date-string))) + ;; The time portion is optional. + (when time-string + (let ((time (iso8601-parse-time time-string))) + (setf (decoded-time-hour date) (decoded-time-hour time)) + (setf (decoded-time-minute date) (decoded-time-minute time)) + (setf (decoded-time-second date) (decoded-time-second time)))) + ;; The time zone is optional. + (when zone-string + (setf (decoded-time-zone date) + ;; The time zone in decoded times are in seconds. + (* (iso8601-parse-zone zone-string) 60))) + date))) + +(defun iso8601-parse-date (string) + "Parse STRING (which should be on ISO 8601 format) and return a time value." + (cond + ;; Just a year: [-+]YYYY. + ((iso8601--match iso8601--year-match string) + (iso8601--decoded-time + :year (iso8601--adjust-year (match-string 1 string) + (match-string 2 string)))) + ;; Calendar dates: YYYY-MM-DD and variants. + ((iso8601--match iso8601--full-date-match string) + (iso8601--decoded-time + :year (iso8601--adjust-year (match-string 1 string) + (match-string 2 string)) + :month (match-string 3 string) + :day (match-string 4 string))) + ;; Calendar date without day: YYYY-MM. + ((iso8601--match iso8601--without-day-match string) + (iso8601--decoded-time + :year (iso8601--adjust-year (match-string 1 string) + (match-string 2 string)) + :month (match-string 3 string))) + ;; Outdated date without year: --MM-DD + ((iso8601--match iso8601--outdated-date-match string) + (iso8601--decoded-time + :month (match-string 1 string) + :day (match-string 2 string))) + ;; Week dates: YYYY-Www-D + ((iso8601--match iso8601--week-date-match string) + (let* ((year (iso8601--adjust-year (match-string 1 string) + (match-string 2 string))) + (week (string-to-number (match-string 3 string))) + (day-of-week (and (match-string 4 string) + (string-to-number (match-string 4 string)))) + (jan-start (decoded-time-weekday + (decode-time + (iso8601--encode-time + (iso8601--decoded-time :year year + :month 1 + :day 4))))) + (correction (+ (if (zerop jan-start) 7 jan-start) + 3)) + (ordinal (+ (* week 7) (or day-of-week 0) (- correction)))) + (cond + ;; Monday 29 December 2008 is written "2009-W01-1". + ((< ordinal 1) + (setq year (1- year) + ordinal (+ ordinal (if (date-leap-year-p year) + 366 365)))) + ;; Sunday 3 January 2010 is written "2009-W53-7". + ((> ordinal (if (date-leap-year-p year) + 366 365)) + (setq ordinal (- ordinal (if (date-leap-year-p year) + 366 365)) + year (1+ year)))) + (let ((month-day (date-ordinal-to-time year ordinal))) + (iso8601--decoded-time :year year + :month (decoded-time-month month-day) + :day (decoded-time-day month-day))))) + ;; Ordinal dates: YYYY-DDD + ((iso8601--match iso8601--ordinal-date-match string) + (let* ((year (iso8601--adjust-year (match-string 1 string) + (match-string 2 string))) + (ordinal (string-to-number (match-string 3 string))) + (month-day (date-ordinal-to-time year ordinal))) + (iso8601--decoded-time :year year + :month (decoded-time-month month-day) + :day (decoded-time-day month-day)))) + (t + (signal 'wrong-type-argument string)))) + +(defun iso8601--adjust-year (sign year) + (save-match-data + (let ((year (if (stringp year) + (string-to-number year) + year))) + (if (string= sign "-") + ;; -0001 is 2 BCE. + (1- (- year)) + year)))) + +(defun iso8601-parse-time (string) + "Parse STRING, which should be an ISO 8601 time string, and return a time value." + (if (not (iso8601--match iso8601--full-time-match string)) + (signal 'wrong-type-argument string) + (let ((time (match-string 1 string)) + (zone (match-string 2 string))) + (if (not (iso8601--match iso8601--time-match time)) + (signal 'wrong-type-argument string) + (let ((hour (string-to-number (match-string 1 time))) + (minute (and (match-string 2 time) + (string-to-number (match-string 2 time)))) + (second (and (match-string 3 time) + (string-to-number (match-string 3 time)))) + ;; Hm... + (_millisecond (and (match-string 4 time) + (string-to-number (match-string 4 time))))) + (iso8601--decoded-time :hour hour + :minute (or minute 0) + :second (or second 0) + :zone (and zone + (* 60 (iso8601-parse-zone + zone))))))))) + +(defun iso8601-parse-zone (string) + "Parse STRING, which should be an ISO 8601 time zone. +Return the number of minutes." + (if (not (iso8601--match iso8601--zone-match string)) + (signal 'wrong-type-argument string) + (if (match-string 2 string) + ;; HH:MM-ish. + (let ((hour (string-to-number (match-string 3 string))) + (minute (and (match-string 4 string) + (string-to-number (match-string 4 string))))) + (* (if (equal (match-string 2 string) "-") + -1 + 1) + (+ (* hour 60) + (or minute 0)))) + ;; "Z". + 0))) + +(defun iso8601-valid-p (string) + "Say whether STRING is a valid ISO 8601 representation." + (iso8601--match iso8601--combined-match string)) + +(defun iso8601-parse-duration (string) + "Parse ISO 8601 durations on the form P3Y6M4DT12H30M5S." + (cond + ((and (iso8601--match iso8601--duration-full-match string) + ;; Just a "P" isn't valid; there has to be at least one + ;; element, like P1M. + (> (length (match-string 0 string)) 2)) + (iso8601--decoded-time :year (or (match-string 1 string) 0) + :month (or (match-string 2 string) 0) + :day (or (match-string 3 string) 0) + :hour (or (match-string 5 string) 0) + :minute (or (match-string 6 string) 0) + :second (or (match-string 7 string) 0))) + ;; PnW: Weeks. + ((iso8601--match iso8601--duration-week-match string) + (let ((weeks (string-to-number (match-string 1 string)))) + ;; Does this make sense? Hm... + (iso8601--decoded-time :day (* weeks 7)))) + ;; PT